|
#!/usr/bin/env perl |
|
use strict; |
|
use warnings; |
|
use JSON::PP; |
|
use File::Find; |
|
use File::Basename; |
|
use File::stat; |
|
use Time::Piece; |
|
|
|
# This script handles cases where Google Takeout truncated filenames differently |
|
# for the JSON and media files, e.g.: |
|
# JSON: Screenshot_2023-01-24-13-37-45-254_br.com.orig.json |
|
# Media: Screenshot_2023-01-24-13-37-45-254_br.com.origi.jpg |
|
# Title: Screenshot_2023-01-24-13-37-45-254_br.com.original.bank.jpg |
|
# |
|
# Also handles truncated URL-encoded filenames like: |
|
# JSON: http_3A_2F_2Fimagescale.tumblr.com_2Fimage_2F1.json |
|
# Media: http_3A_2F_2Fimagescale.tumblr.com_2Fimage_2F12(3).jpg |
|
|
|
# --- CLI arguments & configuration ------------------------------------------
# Usage: script.pl [ROOT] [LOGFILE] [FILTER]
#   ROOT    - directory tree to scan (default: current directory)
#   LOGFILE - run log destination (default: ./fix_truncated.log)
#   FILTER  - optional substring; only JSON paths containing it are processed
# Set DRY_RUN=1 in the environment to preview changes without touching files.
my $ROOT      = shift(@ARGV) // '.';
my $LOG       = shift(@ARGV) // './fix_truncated.log';
my $TARGET_ID = shift(@ARGV) // '';
my $DRY_RUN   = ($ENV{DRY_RUN} // '0') eq '1';

# Running tallies reported in the final summary.
my %stats = (
    fixed   => 0,
    same    => 0,
    nomedia => 0,
    notime  => 0,
    errors  => 0,
    total   => 0,
    skipped => 0,
);
|
|
|
# Media extensions |
|
my %media_ext = map { lc($_) => 1 } qw(jpg jpeg png gif mp4 m4v webp); |
|
|
|
# Phase 1: Load all media files by directory |
|
print "Loading media files into memory...\n"; |
|
my %media_by_dir; |
|
|
|
find( |
|
sub { |
|
return unless -f $_; |
|
return unless /\.([^.]+)$/ && $media_ext{lc($1)}; |
|
|
|
my $dir = $File::Find::dir; |
|
my $file = $_; |
|
$media_by_dir{$dir}{$file} = 1; |
|
}, |
|
$ROOT |
|
); |
|
|
|
my $media_count = 0; |
|
$media_count += scalar(keys %{$_}) for values %media_by_dir; |
|
print "Loaded $media_count media files from " . scalar(keys %media_by_dir) . " directories\n"; |
|
|
|
# Phase 2: collect JSON sidecars that do NOT follow the standard
# supplemental-metadata naming.  Standard ones are handled by fix_times.pl;
# only the truncated/odd ones are gathered here.
print "Finding non-standard JSON files...\n";

my @json_files;
find(
    sub {
        return unless -f $_;
        return unless /\.json$/i;

        # SKIP standard patterns - these are handled by fix_times.pl
        return if /\.(supplemental-metadata|supplement|supplemen|supp).*\.json$/i;
        return if /\.\.json$/;    # double-dot pattern

        my $path = $File::Find::name;
        # Honour the optional substring filter from the command line.
        return if $TARGET_ID && $path !~ /\Q$TARGET_ID\E/;
        push @json_files, $path;
    },
    $ROOT
);

print "Found " . scalar(@json_files) . " non-standard JSON files\n\n";
|
|
|
# Open the run log; logit() mirrors everything to both STDOUT and this file.
open my $log_fh, '>', $LOG or die "Cannot open log: $!";

# Print a message to STDOUT and append it to the log file.
sub logit {
    my ($msg) = @_;
    print $msg;
    print {$log_fh} $msg;
}

# Run header.
logit("== Immich Takeout Truncated Filename Fixer ==\n");
logit("Root: $ROOT\n");
logit("Log: $LOG\n");
logit("DRY RUN: " . ($DRY_RUN ? "YES" : "NO") . "\n");
logit("Filter: '$TARGET_ID'\n") if $TARGET_ID;
logit("\n");
|
|
|
# Phase 3: for each candidate JSON sidecar, locate the media file(s) it
# describes (same directory, exact or truncated-prefix name match) and set
# their mtime to the timestamp recorded in the JSON.
for my $json_path (@json_files) {
    $stats{total}++;

    my $dir = dirname($json_path);
    my $json_base = basename($json_path);

    # Get JSON stem (remove .json)
    my $json_stem = $json_base;
    $json_stem =~ s/\.json$//i;

    # Remove (N) suffix if present for matching.  Takeout appends "(N)" to
    # duplicate filenames; the matching media file must carry the same N.
    my ($json_stem_clean, $json_dup) = ($json_stem, undef);
    if ($json_stem =~ /^(.+?)\((\d+)\)$/) {
        ($json_stem_clean, $json_dup) = ($1, $2);
    }

    # Read JSON to get title and timestamp
    my ($title, $ts) = read_json_data($json_path);

    # Skip sidecars without a usable epoch-seconds timestamp.
    unless ($ts && $ts =~ /^\d+$/) {
        $stats{notime}++;
        logit("⚠ NoTime: $json_path\n") if $TARGET_ID;
        next;
    }

    # Get media files in this directory (index built in Phase 1).
    my $files_hash = $media_by_dir{$dir};
    unless ($files_hash && %$files_hash) {
        $stats{nomedia}++;
        next;
    }

    # Find matching media file(s) using prefix matching
    my @matches = find_matching_media($json_stem_clean, $json_dup, $title, $files_hash);

    unless (@matches) {
        $stats{nomedia}++;
        # Verbose diagnostics only when a specific filter was requested,
        # to keep full runs quiet.
        if ($TARGET_ID) {
            logit("⚠ NoMedia: $json_path\n");
            logit(" JSON stem: '$json_stem_clean'" . (defined $json_dup ? " dup=($json_dup)" : "") . "\n");
            logit(" Title: '$title'\n") if $title;
            logit(" Available: " . join(", ", sort keys %$files_hash) . "\n");
        }
        next;
    }

    # Apply timestamp to matches
    for my $media_file (@matches) {
        my $full_path = "$dir/$media_file";

        my $st = stat($full_path);
        unless ($st) {
            $stats{errors}++;
            next;
        }

        my $current = $st->mtime;

        # mtime already matches the sidecar - nothing to do.
        if ($current == $ts) {
            $stats{same}++;
            next;
        }

        if ($DRY_RUN) {
            # Dry run: report what would change, count it as fixed.
            my $old_date = localtime($current)->strftime('%Y-%m-%d %H:%M:%S');
            my $new_date = localtime($ts)->strftime('%Y-%m-%d %H:%M:%S');
            logit("✔ [DRY] #$stats{total} $media_file: $old_date -> $new_date\n");
            logit(" JSON: $json_base\n");
            $stats{fixed}++;
        }
        elsif (utime($ts, $ts, $full_path)) {
            $stats{fixed}++;

            # Log every fix when filtering, otherwise only every 100th.
            if ($TARGET_ID || $stats{fixed} % 100 == 0) {
                my $old_date = localtime($current)->strftime('%Y-%m-%d %H:%M:%S');
                my $new_date = localtime($ts)->strftime('%Y-%m-%d %H:%M:%S');
                logit("✔ #$stats{total} $media_file: $old_date -> $new_date\n");
                logit(" JSON: $json_base\n");
            }
        }
        else {
            # utime failed (permissions, read-only FS, ...).
            $stats{errors}++;
        }
    }
}
|
|
|
# Final tallies, mirrored to STDOUT and the log like all other output.
logit("\n== SUMMARY ==\n");
logit(   "Total JSONs: $stats{total} | Fixed: $stats{fixed} | OK: $stats{same} | "
       . "NoMedia: $stats{nomedia} | NoTS: $stats{notime} | Errors: $stats{errors}\n");

close $log_fh;
|
|
|
# ============================================================================ |
|
# Subroutines |
|
# ============================================================================ |
|
|
|
# Locate the media file(s) in %$files_hash that a JSON sidecar stem refers
# to.  Pass 1 looks for an exact stem match (tolerating trailing dots and
# "(N)" duplicate markers); pass 2 falls back to strict prefix matching to
# handle Takeout's filename truncation.  Returns the matching filename(s);
# $title is accepted for symmetry with the caller but not used for matching.
sub find_matching_media {
    my ($json_stem, $json_dup, $title, $files_hash) = @_;

    # Split "name(3)" into ("name", 3); otherwise return (stem, undef).
    my $split_dup = sub {
        my ($stem) = @_;
        return ($1, $2) if $stem =~ /^(.+?)\((\d+)\)$/;
        return ($stem, undef);
    };

    # --- Pass 1: exact stem match, ignoring trailing dots -----------------
    for my $file (keys %$files_hash) {
        next unless $file =~ /^(.+)\.[^.]+$/;
        my $file_stem = $1;

        (my $json_clean = $json_stem) =~ s/\.+$//;
        (my $file_clean = $file_stem) =~ s/\.+$//;

        my ($file_base, $file_dup) = $split_dup->($file_clean);

        # Use the caller-supplied duplicate number if present, otherwise
        # try to peel one off the cleaned JSON stem.
        my ($json_base, $json_dup_check) = ($json_clean, $json_dup);
        ($json_base, $json_dup_check) = $split_dup->($json_clean)
            if !defined $json_dup_check;

        # When either side carries a duplicate marker, both must agree.
        if (defined $json_dup_check || defined $file_dup) {
            next unless defined $json_dup_check
                     && defined $file_dup
                     && $json_dup_check eq $file_dup;
        }

        return ($file) if $json_base eq $file_base;
    }

    # --- Pass 2: strict prefix match for genuinely truncated names --------
    # Require the shared prefix to cover at least 90% of the JSON stem, or
    # at least 40 characters for long names.
    my $stem_len   = length $json_stem;
    my $min_prefix = $stem_len > 40 ? 40 : int($stem_len * 0.9);

    # Screenshot names embed a timestamp; if the JSON stem has one, the
    # media file must start with the identical timestamp.
    my $shot_ts = $json_stem =~ /^Screenshot_(\d{4}-\d{2}-\d{2}-\d{2}-\d{2}-\d{2})/ ? $1 : '';

    my @found;
    for my $file (keys %$files_hash) {
        next unless $file =~ /^(.+)\.[^.]+$/;
        my $file_stem = $1;

        next if $shot_ts && $file_stem !~ /^Screenshot_\Q$shot_ts\E/;

        my ($file_base, $file_dup) = $split_dup->($file_stem);

        # A JSON duplicate marker must be mirrored by the media file.
        if (defined $json_dup) {
            next unless defined $file_dup && $file_dup eq $json_dup;
        }

        # Length of the common leading character run.
        my $limit = length($json_stem) < length($file_base)
                  ? length($json_stem)
                  : length($file_base);
        my $shared = 0;
        $shared++ while $shared < $limit
            && substr($json_stem, $shared, 1) eq substr($file_base, $shared, 1);

        push @found, $file if $shared >= $min_prefix;
    }

    return @found;
}
|
|
|
# Return the number of leading characters shared by two strings.
# NOTE: the original unpacked @_ into "my ($a, $b)" — lexicalizing $a/$b is
# a known Perl pitfall because they are sort()'s comparator package globals;
# shadowing them can silently break any sort block in scope.
sub common_prefix_length {
    my ($first, $second) = @_;

    # Compare only up to the shorter string's length.
    my $max = length($first) < length($second) ? length($first) : length($second);

    my $len = 0;
    while ($len < $max && substr($first, $len, 1) eq substr($second, $len, 1)) {
        $len++;
    }

    return $len;
}
|
|
|
# Read a Takeout JSON sidecar and extract (title, epoch-seconds timestamp).
# Returns (undef, undef) if the file cannot be opened, is not valid JSON,
# or does not decode to a hash.  The original version dereferenced the
# decoded value unconditionally, so a sidecar containing a top-level array
# (or a scalar photoTakenTime/creationTime) would die and abort the whole
# run; every level is now guarded with a ref check.
sub read_json_data {
    my $json_file = shift;

    open my $fh, '<', $json_file or return (undef, undef);
    my $content = do { local $/; <$fh> };
    close $fh;

    my $data = eval { decode_json($content) };
    return (undef, undef) unless ref $data eq 'HASH';

    my $title = $data->{title};

    # Prefer photoTakenTime, fall back to creationTime; skip any node that
    # is not a hash instead of crashing on the dereference.
    my $ts;
    for my $key (qw(photoTakenTime creationTime)) {
        my $node = $data->{$key};
        next unless ref $node eq 'HASH';
        $ts = $node->{timestamp};
        last if $ts;
    }

    return ($title, $ts);
}