Last active
January 20, 2025 22:33
-
-
Save J0hnL0cke/427ab4b8ff1e85ffdfaf2e5bb4979e2c to your computer and use it in GitHub Desktop.
Finds missing pdf files from conversion, then re-fetches them
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # Based on my [previous script](https://gist.github.com/J0hnL0cke/95dbf624465034e399592c5d9690eb11) for converting mhtml -> pdf | |
| # This script scans the output directory, in order to figure out if the conversion has failed for any mhtml files | |
| # Instructions | |
| # Install docker | |
| # Set the number on the `tail` commands (in 2 locations now!) to strip file paths (ie "./Readings/filename.mhtml" -> strip first 8 chars -> "filename.mhtml" ) | |
| # Run in a directory where you have run `mkdir ./out` and have mhtml files in current folder or subfolders | |
| # To customize where the program searches for files, edit the path used by the `find` command | |
#######################################
# Derive the PDF file name expected for an mhtml snapshot.
# Takes the text after the first "<title>" in the file ("<title>" is 7 bytes,
# hence `tail -c+8`), then deletes every character that is not in the sed
# whitelist so the result can be used as a file name.
# The sed expression is intentionally left unchanged: names generated here
# are compared against files already present in ./out.
# Arguments: $1 - path to the .mhtml file
# Outputs:   "<sanitized title>.pdf" on stdout (".pdf" if no title is found)
#######################################
pdf_name_for() {
  local title
  title=$(grep -m 1 -o -e "<title>[^<]*" -- "$1" \
    | tail -c+8 \
    | sed 's/[^a-zA-Z0-9 \_\.\-\/\\]//g')
  printf '%s.pdf\n' "$title"
}

#######################################
# Extract the original page URL recorded in an mhtml snapshot.
# Reads the first "Snapshot-Content-Location: <url>" line, keeps only the
# part after the header name and deletes every character outside the sed
# whitelist (this also removes a trailing carriage return).
# NOTE(review): the whitelist does not contain '?', '=', '&' or '%', so URLs
# with a query string are altered -- confirm this is intended.
# Arguments: $1 - path to the .mhtml file
# Outputs:   the URL on stdout; an empty line if the header is absent
#######################################
snapshot_url_for() {
  local tag
  tag=$(grep -m 1 -e "Snapshot-Content-Location: " -- "$1")
  printf '%s\n' "$tag" \
    | sed -r 's#.*Snapshot-Content-Location: (.*)#\1#' \
    | sed 's/[^a-zA-Z0-9 \_\.\-\:\/\\]//g'
}

#######################################
# List every mhtml file whose converted PDF is absent from ./out.
# Recursively scans the current directory for *.mhtml files and, for each
# one, searches ./out for a file carrying the expected PDF name.
# Outputs: NUL-delimited paths of the mhtml files lacking a PDF
#######################################
find_missing_sources() {
  local src pdf hit
  while IFS= read -r -d '' src; do
    pdf=$(pdf_name_for "$src")
    # One match is enough, so stop searching at the first hit.
    hit=$(find ./out -name "$pdf" -print -quit)
    if [[ -z "$hit" ]]; then
      printf '%s\0' "$src"
    fi
  done < <(find ./ -iname "*.mhtml" -type f -print0)
}

#######################################
# Re-fetch every page whose PDF is missing from ./out.
# Must be run from the directory that contains ./out and the mhtml files.
# Outputs: one "processing file (...)" line per conversion on stdout;
#          warnings and errors on stderr
# Returns: 1 if ./out does not exist, 0 otherwise (a failed conversion is
#          reported but does not stop the remaining ones)
#######################################
main() {
  local src url out_path
  local -a missing=()

  if [[ ! -d ./out ]]; then
    printf 'error: ./out does not exist; create it with: mkdir ./out\n' >&2
    return 1
  fi

  # Collect the complete list before converting anything, so that PDFs
  # written during this run cannot influence which files count as missing.
  while IFS= read -r -d '' src; do
    missing+=("$src")
  done < <(find_missing_sources)

  for src in "${missing[@]}"; do
    url=$(snapshot_url_for "$src")
    if [[ -z "$url" ]]; then
      # Without a URL the output path would be passed in the URL position.
      printf 'skipping %s: no Snapshot-Content-Location found\n' "$src" >&2
      continue
    fi

    # The current directory is mounted at /data below, so this path is the
    # container-side location of ./out/<name>.pdf.
    out_path="./data/out/$(pdf_name_for "$src")"
    printf 'processing file (%s) (%s)\n' "$out_path" "$url"

    # Run wkhtmltopdf in an auto-deleting container as the invoking user.
    # An absolute host path is used for the bind mount.
    sudo docker run --rm --volume "${PWD}:/data" --user "$(id -u):$(id -g)" \
      madnight/docker-alpine-wkhtmltopdf "$url" "$out_path" \
      || printf 'warning: conversion failed for %s\n' "$src" >&2
  done
}

main "$@"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment