Skip to content

Instantly share code, notes, and snippets.

@BrianAker
Last active August 13, 2024 04:51
Show Gist options
  • Select an option

  • Save BrianAker/b79fb1ecc1c9e7f99da67080324ead17 to your computer and use it in GitHub Desktop.

Select an option

Save BrianAker/b79fb1ecc1c9e7f99da67080324ead17 to your computer and use it in GitHub Desktop.
Script for parsing the filenames of comic books, manga, etc. (e.g., .cbz files). It finds the series names and parses most of the other metadata found in filenames. A later version of it generates ComicInfo.xml. It sorts files into directories based on series names. Chapter-number parsing is not 100% reliable.
#!/usr/bin/perl
use strict;
use warnings;
use File::Path qw(make_path);
use File::Copy qw(move);
# Version string reported in the --help output
my $VERSION = "1.1.16-2024.08.12.00.00";

# Group-size filter: a positive value N keeps groups with N or more files,
# a negative value -N keeps groups with fewer than N files
my $min_group_size = 2;

# Boolean switches, each turned on by its command-line flag
my $create_shift = 0;    # --create-shift: create series directories and move files
my $show_help    = 0;    # --help: print the usage text and exit
my $debug_mode   = 0;    # --debug: print the parsed metadata of every file

# Global table of series groups, filled in by the main loop
my %series_groups;

# Hand-rolled option parsing (+N, -N, --create-shift, --help, --debug).
# A single pass over @ARGV sets the matching option and keeps everything
# else, in its original order, as the list of input files.
{
    my %switch_for = (
        '--create-shift' => \$create_shift,
        '--help'         => \$show_help,
        '--debug'        => \$debug_mode,
    );

    my @positional;
    for my $arg (@ARGV) {
        if (exists $switch_for{$arg}) {
            ${ $switch_for{$arg} } = 1;
        }
        elsif ($arg =~ /^([+-])(\d+)$/) {
            # +N is stored as N, -N as the negative number
            $min_group_size = $1 eq '+' ? $2 : -$2;
        }
        else {
            push @positional, $arg;
        }
    }

    # Only the non-option arguments remain in @ARGV
    @ARGV = @positional;
}
# Print the usage summary, the list of supported options and the script
# version to the currently selected output handle.
sub print_help {
    # The heredoc body must stay flush-left: its text is emitted verbatim
    my $usage_text = <<"END_HELP";
Usage: $0 [options] *.cbz
Options:
+N Only print groups with N or more files.
-N Only print groups with fewer than N files.
--create-shift Create directories based on the series name and move files into them.
--debug Print debug information such as Resolution, ScanInformation, Publisher, Title, Volume, Chapter, Publishing Date, and LanguageISO.
--help Display this help message.
Version: $VERSION
END_HELP
    print $usage_text;
}
# With no input files left there is nothing to do, and --help always wins:
# in either case print the usage text and stop.
if (!@ARGV || $show_help) {
    print_help();
    exit;
}
# Parse a .cbz filename into its metadata fields.
#
# Parameters:
#   $filename - the file name to parse; the scan-group tag is only recognised
#               at the very start of this string
#   $info     - hash reference that receives the parsed fields
#
# Keys written to %$info (each one only when its pattern is found, except
# series_name, which is always set):
#   scan_info, resolution, language_iso, format, publisher_info, volume,
#   chapter, title, publishing_date, series_name
#
# Every recognised token is removed from a working copy of the filename and
# whatever text is left over becomes the series name. The order of the steps
# matters, because each pattern runs on what the previous ones left behind.
#
# Returns nothing; all results are delivered through $info.
sub extract_series_name {
    my ($filename, $info) = @_;

    # ScanInformation: a bracketed tag at the very start, e.g. "[ScanGroup] "
    if ($filename =~ s/^\[([^\]]+)\]\s*//) {
        $info->{scan_info} = $1;
    }

    # Remove the file extension (.cbz, any case)
    $filename =~ s/\.cbz$//i;

    # Resolution, e.g. "(x3200)"; only the digits are stored
    if ($filename =~ s/\(x(\d+)\)//) {
        $info->{resolution} = $1;
    }

    # A bracketed or parenthesised "ENGLISH" tag (any case) sets LanguageISO
    if ($filename =~ s/[\[\(]ENGLISH[\]\)]//i) {
        $info->{language_iso} = "EN";
    }

    # A bracketed or parenthesised "PNG" tag (any case) sets the format
    if ($filename =~ s/[\(\[]png[\)\]]//i) {
        $info->{format} = "PNG";
    }

    # Publisher: the last [] or () group, anchored at the end of the string
    if ($filename =~ s/\s*[\[\(]([^\]\)]+)[\]\)]\s*$//) {
        $info->{publisher_info} = $1;
    }

    # Volume, e.g. "Vol. 5", "Vol. 2.0", "v02", "v02.5"
    if ($filename =~ s/\b(?:Volume|Vol|Vol\.)\s*(\d+(\.\d+)?|v0{0,4}(\d{1,5}(?:\.\d+)?))\b//i) {
        # $3 is the number behind a "v" prefix. Test it with defined() rather
        # than truthiness so that a captured "0" is not discarded in favour
        # of the raw "v0" text in $1.
        $info->{volume} = defined $3 ? $3 : $1;
    }
    elsif ($filename =~ s/\bv0{0,4}(\d{1,5}(\.\d+)?)\b//i) {
        $info->{volume} = $1;
    }

    # Chapter/part/episode/operation number, removed before the title is
    # extracted. Decimal numbers such as "10.5" are captured whole; matching
    # only the integer part would leave ".5" behind in the series name.
    if ($filename =~ s/\b(?:Chapter|Ch\.?|Part|Ep|Ep\.|Episode|Op|Op\.)\s*(\d+(?:\.\d+)?|EX\d+)\b//i) {
        $info->{chapter} = $1;
    }
    elsif ($filename =~ s/\b(EX\d+)\b//i) {
        $info->{chapter} = $1;
    }

    # Title: everything after the first whitespace-delimited "-" separator
    # (the regex engine finds the leftmost match, not the rightmost)
    if ($filename =~ s/\s-\s(.+)$//) {
        $info->{title} = $1;
    }

    # Still no title: split on the colon look-alike character followed by
    # whitespace (presumably standing in for ":" in file names), unless the
    # name itself starts with "re" plus that character
    if (!$info->{title} && $filename !~ /^re꞉/i && $filename =~ s/꞉\s(.+)$//) {
        $info->{title} = $1;
    }

    # Still no title: take text enclosed in ~ ~
    if (!$info->{title} && $filename =~ s/~([^~]+)~//) {
        $info->{title} = $1;
    }

    # Publishing year and month at the end of what is left, e.g. "2024-02"
    if ($filename =~ s/\b(20\d{2}-\d{2})\b$//) {
        $info->{publishing_date} = $1;
    }

    # The remainder is the series name. Removing tokens leaves leading
    # blanks and runs of blanks behind; normalise them so that the same
    # series always yields the same group key and directory name.
    my $series_name = $filename;
    $series_name =~ s/\s+/ /g;
    $series_name =~ s/^ //;
    $series_name =~ s/ $//;
    $info->{series_name} = $series_name;

    return;
}
# Main pass: parse every .cbz argument and file it under its series group.
# Groups are keyed by the upper-cased series name so that spellings which
# differ only in case end up in the same group.
for my $path (@ARGV) {
    # Anything without a .cbz extension is reported and ignored
    unless ($path =~ /\.cbz$/i) {
        print "Warning: '$path' is not a .cbz file. Skipping...\n";
        next;
    }

    my %parsed;
    extract_series_name($path, \%parsed);

    my $spelling = $parsed{series_name};
    my $group    = $series_groups{ uc $spelling } ||= {};

    # Keep the file together with everything parsed from its name
    push @{ $group->{files} }, { filename => $path, %parsed };

    # Tally how often this exact spelling of the series name occurs
    $group->{count}{$spelling}++;
}
# Choose the display spelling for each group: the spelling used by the most
# files. Spellings with equal counts are ordered by string comparison, so the
# chosen name (and therefore the directory name used by --create-shift) does
# not depend on Perl's randomized hash key order.
foreach my $normalized_name (keys %series_groups) {
    my $count_for = $series_groups{$normalized_name}{count};
    my ($most_common_name) = sort {
        $count_for->{$b} <=> $count_for->{$a} or $a cmp $b
    } keys %{$count_for};
    $series_groups{$normalized_name}{most_common_name} = $most_common_name;
}
# Metadata lines printed per file in --debug mode, as [label, hash key] pairs
# in output order
my @debug_fields = (
    [ 'Resolution',      'resolution' ],
    [ 'ScanInformation', 'scan_info' ],
    [ 'Publisher',       'publisher_info' ],
    [ 'Title',           'title' ],
    [ 'Volume',          'volume' ],
    [ 'Chapter',         'chapter' ],
    [ 'Publishing Date', 'publishing_date' ],
    [ 'LanguageISO',     'language_iso' ],
);

# Walk the groups in sorted key order, print each group that passes the
# group-size filter and, with --create-shift, move its files into a directory
# named after the series.
foreach my $normalized_name (sort keys %series_groups) {
    my @files = @{$series_groups{$normalized_name}{files}};

    # Positive filter (+N): skip groups with fewer than N files
    next if ($min_group_size > 0 && @files < $min_group_size);
    # Negative filter (-N): skip groups with N or more files
    next if ($min_group_size < 0 && @files >= abs($min_group_size));

    my $most_common_name = $series_groups{$normalized_name}{most_common_name};
    print "Series: $most_common_name\n";

    foreach my $file_info (@files) {
        print " File: $file_info->{filename}\n";
        next unless $debug_mode;

        foreach my $field (@debug_fields) {
            my ($label, $key) = @{$field};
            my $value = $file_info->{$key};
            # Test for "present and non-empty" rather than truthiness, so
            # that a value of "0" (e.g. chapter 0) is still printed
            next unless defined $value && length $value;
            print " $label: $value\n";
        }
    }
    print "\n";

    # If --create-shift is specified, create the directory and move the files
    if ($create_shift) {
        # The most common series name is used directly as the directory name
        my $dir_name = $most_common_name;
        unless (-d $dir_name) {
            make_path($dir_name) or die "Failed to create directory '$dir_name': $!";
        }
        foreach my $file_info (@files) {
            my $file = $file_info->{filename};
            move($file, "$dir_name/") or die "Failed to move file '$file' to '$dir_name': $!";
        }
        print "Moved files to directory: $dir_name\n";
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment