Last active
August 13, 2024 04:51
-
-
Save BrianAker/b79fb1ecc1c9e7f99da67080324ead17 to your computer and use it in GitHub Desktop.
Script for parsing the filenames of comic books, manga, etc. (i.e., .cbz files). It finds the series names and parses most of the other metadata found in filenames. A later version of it generates ComicInfo.xml. It can sort files into directories based on series names. Chapter-number detection is not 100% reliable.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/perl | |
| use strict; | |
| use warnings; | |
| use File::Path qw(make_path); | |
| use File::Copy qw(move); | |
# Script version number
my $VERSION = "1.1.16-2024.08.12.00.00";

# Runtime settings with their defaults; the command-line pass below may
# override them.
my $min_group_size = 2;    # +N stores N, -N stores -N (sign selects the filter direction)
my $create_shift   = 0;    # set by --create-shift: create directories and move files
my $show_help      = 0;    # set by --help
my $debug_mode     = 0;    # set by --debug: print the parsed metadata per file

# Global hash for storing series information, keyed by upper-cased series name
my %series_groups;

# Custom argument parsing for +N, -N, --create-shift, --help and --debug.
# Options are consumed in a single pass; everything else (the file names)
# is kept, in order, and written back to @ARGV.
{
    my %flag_for = (
        '--create-shift' => \$create_shift,
        '--help'         => \$show_help,
        '--debug'        => \$debug_mode,
    );

    my @remaining_args;
    for my $arg (@ARGV) {
        if ($arg =~ /^([+-])(\d+)$/) {
            # A later +N/-N overrides an earlier one
            $min_group_size = $1 eq '+' ? $2 : -$2;
        }
        elsif (my $flag_ref = $flag_for{$arg}) {
            ${$flag_ref} = 1;
        }
        else {
            push @remaining_args, $arg;
        }
    }
    @ARGV = @remaining_args;
}
# Print the usage summary (program name, supported options, script version)
# to the currently selected output handle. Takes no arguments.
sub print_help {
    my $usage_text = <<"END_HELP";
Usage: $0 [options] *.cbz
Options:
+N Only print groups with N or more files.
-N Only print groups with fewer than N files.
--create-shift Create directories based on the series name and move files into them.
--debug Print debug information such as Resolution, ScanInformation, Publisher, Title, Volume, Chapter, Publishing Date, and LanguageISO.
--help Display this help message.
Version: $VERSION
END_HELP
    print $usage_text;
}
# Help is shown, and the script stops, when it was requested explicitly
# or when no file names are left after option parsing.
my $wants_help = $show_help || !@ARGV;
if ($wants_help) {
    print_help();
    exit;
}
# Function to extract the series name, title, publisher, ScanInformation,
# LanguageISO, volume, chapter, and publishing date from a cbz filename.
#
# Parameters:
#   $filename - the file name to parse (the caller's copy is not modified)
#   $info     - hash reference that is filled in place
#
# Keys written to %$info (only when the matching pattern is found, except
# series_name, which is always set):
#   scan_info, resolution, language_iso, format, publisher_info,
#   volume, chapter, title, publishing_date, series_name
#
# Each recognised piece is removed from the working copy of the name as it
# is extracted; whatever text remains at the end becomes the series name.
# The order of the steps matters because later patterns are anchored to the
# end of what earlier steps left behind.
sub extract_series_name {
    my ($filename, $info) = @_;

    # Extract and remove ScanInformation (e.g., [ScanGroup]) at the start of the filename
    if ($filename =~ s/^\[([^\]]+)\]\s*//) {
        $info->{scan_info} = $1;
    }

    # Remove the file extension (.cbz)
    $filename =~ s/\.cbz$//i;

    # Extract and remove resolution info (e.g., (x3200))
    if ($filename =~ s/\(x(\d+)\)//) {
        $info->{resolution} = $1;
    }

    # Extract and remove the word "ENGLISH" (any case) surrounded by [] or ()
    if ($filename =~ s/[\[\(]ENGLISH[\]\)]//i) {
        $info->{language_iso} = "EN";
    }

    # Extract and remove (PNG) or [PNG] in any case and store in format
    if ($filename =~ s/[\(\[]png[\)\]]//i) {
        $info->{format} = "PNG";
    }

    # Extract and remove publisher information anchored at the end of the string (either [] or ())
    if ($filename =~ s/\s*[\[\(]([^\]\)]+)[\]\)]\s*$//) {
        $info->{publisher_info} = $1;
    }

    # Extract and remove volume information (e.g., Vol. 5, Vol. 2.0, v02, v02.5).
    # $2 is the number behind a "v" prefix with its leading zeros stripped;
    # defined-or is used so that a legitimate "0" is kept instead of falling
    # back to the raw "v0" text in $1.
    if ($filename =~ s/\b(?:Volume|Vol\.?)\s*(\d+(?:\.\d+)?|v0{0,4}(\d{1,5}(?:\.\d+)?))\b//i) {
        $info->{volume} = $2 // $1;    # Store only the numeric part
    }
    elsif ($filename =~ s/\bv0{0,4}(\d{1,5}(?:\.\d+)?)\b//i) {
        $info->{volume} = $1;          # Store only the numeric part
    }

    # Remove chapter/episode/operation information before extracting the title.
    # Decimal chapters (e.g., Ch. 10.5) are accepted so that the fractional
    # part is not left behind in the series name.
    if ($filename =~ s/\b(?:Chapter|Ch\.?|Part|Episode|Ep\.?|Op\.?)\s*(\d+(?:\.\d+)?|EX\d+)\b//i) {
        $info->{chapter} = $1;
    }
    elsif ($filename =~ s/\b(EX\d+)\b//i) {
        $info->{chapter} = $1;
    }

    # Look for a title after " - " and remove it from the series name.
    # The split happens at the first (leftmost) " - "; everything after it
    # becomes the title.
    if ($filename =~ s/\s-\s(.+)$//) {
        $info->{title} = $1;
    }

    # If the title is still empty and the filename does not start with "re"
    # followed by the colon look-alike character (case-insensitive), treat
    # the text after that character plus a space as the title.
    # NOTE(review): the character in these patterns is not an ASCII ':' (it
    # appears to be U+A789); it is matched as raw bytes because the file
    # does not enable "use utf8" -- keep it that way.
    if (!$info->{title} && $filename !~ /^re꞉/i && $filename =~ s/꞉\s(.+)$//) {
        $info->{title} = $1;
    }

    # If the title is still empty, look for titles surrounded by ~ ~
    if (!$info->{title} && $filename =~ s/~([^~]+)~//) {
        $info->{title} = $1;
    }

    # Extract and remove publishing year/month (e.g., 2024-02) at the end of
    # what is left. Surrounding whitespace is tolerated because the removals
    # above can leave trailing blanks behind.
    if ($filename =~ s/\s*\b(20\d{2}-\d{2})\b\s*$//) {
        $info->{publishing_date} = $1;
    }

    # Assign the cleaned filename to series_name
    $info->{series_name} = $filename;

    # Trim trailing spaces
    $info->{series_name} =~ s/\s+$//;
}
# Main script: decode the series name of every .cbz file given on the
# command line and group the files by upper-cased series name.
#
# For each group, %series_groups holds:
#   files            - array of hashes: the original filename plus every
#                      field filled in by extract_series_name
#   count            - how often each original spelling of the name occurred
#   most_common_name - the spelling that occurred most often (set below)
foreach my $filename (@ARGV) {
    # Anything without a .cbz extension is reported and skipped
    unless ($filename =~ /\.cbz$/i) {
        print "Warning: '$filename' is not a .cbz file. Skipping...\n";
        next;
    }

    my %info;
    extract_series_name($filename, \%info);

    # Upper-case the series name so that spellings differing only in case
    # end up in the same group
    my $normalized_name = uc($info{series_name});

    # Store the file together with its parsed information
    push @{ $series_groups{$normalized_name}{files} }, {
        filename => $filename,
        %info,
    };

    # Count this original spelling of the series name
    $series_groups{$normalized_name}{count}{ $info{series_name} }++;
}

# Determine the most common original series name for each group.
# When two spellings are equally common the alphabetically first one wins,
# so the chosen name (and therefore the directory name) does not depend on
# hash ordering and stays the same from run to run.
foreach my $normalized_name (keys %series_groups) {
    my $count_of = $series_groups{$normalized_name}{count};
    my ($most_common_name) = sort {
        $count_of->{$b} <=> $count_of->{$a}
            or $a cmp $b
    } keys %{$count_of};
    $series_groups{$normalized_name}{most_common_name} = $most_common_name;
}
# Walk the groups in sorted order, print each group that passes the
# min_group_size filter and, when --create-shift was given, move its files
# into a directory named after the series.
foreach my $normalized_name (sort keys %series_groups) {
    my @files = @{ $series_groups{$normalized_name}{files} };

    # Positive size: skip groups with fewer than that many files
    next if ($min_group_size > 0 && @files < $min_group_size);

    # Negative size: skip groups with that many files or more
    next if ($min_group_size < 0 && @files >= abs($min_group_size));

    my $most_common_name = $series_groups{$normalized_name}{most_common_name};
    print "Series: $most_common_name\n";

    # Label/key pairs printed per file in debug mode, in this order
    my @debug_fields = (
        [ 'Resolution'      => 'resolution' ],
        [ 'ScanInformation' => 'scan_info' ],
        [ 'Publisher'       => 'publisher_info' ],
        [ 'Title'           => 'title' ],
        [ 'Volume'          => 'volume' ],
        [ 'Chapter'         => 'chapter' ],
        [ 'Publishing Date' => 'publishing_date' ],
        [ 'LanguageISO'     => 'language_iso' ],
    );

    foreach my $file_info (@files) {
        print " File: $file_info->{filename}\n";
        next unless $debug_mode;

        foreach my $field (@debug_fields) {
            my ($label, $key) = @{$field};
            my $value = $file_info->{$key};

            # Test for presence rather than truth so that a legitimate "0"
            # (e.g. Chapter 0, Volume 0) is still printed
            print " $label: $value\n" if defined $value && length $value;
        }
    }
    print "\n";

    # If --create-shift is specified, create the directory and move the files
    if ($create_shift) {
        my $dir_name = $most_common_name;    # Use the most common series name directly as the directory name

        # A file name that consists only of metadata (e.g. "v01.cbz") yields
        # an empty series name; there is no directory to create for it, so
        # leave those files where they are instead of aborting the whole run
        if (!defined $dir_name || $dir_name eq '') {
            print "Warning: empty series name; leaving ", scalar(@files), " file(s) in place.\n";
            next;
        }

        unless (-d $dir_name) {
            make_path($dir_name) or die "Failed to create directory '$dir_name': $!";
        }
        foreach my $file_info (@files) {
            my $file = $file_info->{filename};
            move($file, "$dir_name/") or die "Failed to move file '$file' to '$dir_name': $!";
        }
        print "Moved files to directory: $dir_name\n";
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment