J0hnL0cke · January 20, 2025 22:33
diff --git a/mhtml-to-pdf2.sh b/mhtml-to-pdf2.sh
 # Based on my [previous script](https://gist.github.com/J0hnL0cke/95dbf624465034e399592c5d9690eb11) for converting mhtml -> pdf
 # This script scans the output directory, in order to figure out if the conversion has failed for any mhtml files

 # Instructions
 # Install docker
 # Output PDFs are named after each page's <title> text (sanitized), not after the .mhtml file path, so unlike the previous script there is no path-length number to adjust
 # Run in a directory where you have run `mkdir ./out` and have mhtml files in current folder or subfolders
 # To customize where the program searches for files, edit the path used by the `find` command

 # Print the output PDF file name (no directory) for the MHTML file "$1".
 #
 # The name is the text of the first <title> tag, reduced to letters, digits,
 # spaces, "_", "." and "-", with ".pdf" appended. "/" and "\" are dropped on
 # purpose: `find -name` can never match a name containing "/", and such a
 # name would point into a sub-directory that does not exist. When no usable
 # title is found, the MHTML file's own base name is used instead, so the
 # result is never just ".pdf".
 pdf_name_for() {
   local title
   title=$(grep -m 1 -o '<title>[^<]*' -- "$1" | head -n 1)
   title=${title#'<title>'}
   # tr with an explicit set instead of a sed bracket expression: a backslash
   # is literal inside [...], so "\-\/" there is a range, not a literal hyphen.
   title=$(printf '%s' "$title" | LC_ALL=C tr -cd 'a-zA-Z0-9 _.-')
   if [[ -z "$title" ]]; then
     title=${1##*/}
     title=$(printf '%s' "${title%.*}" | LC_ALL=C tr -cd 'a-zA-Z0-9 _.-')
   fi
   printf '%s.pdf\n' "$title"
 }

 # Print the URL recorded in the first "Snapshot-Content-Location: " line of
 # the MHTML file "$1" (empty output when there is no such line).
 #
 # Only control characters (e.g. the trailing CR of a CRLF line) are removed;
 # "?", "=", "&", "%", "#" and "-" are kept so query strings survive intact.
 source_url_for() {
   local header
   header=$(grep -m 1 'Snapshot-Content-Location: ' -- "$1")
   printf '%s' "${header##*Snapshot-Content-Location: }" \
     | LC_ALL=C tr -d '[:cntrl:]'
 }

 # Convert every *.mhtml file below the current directory whose PDF is not
 # yet present somewhere under ./out.
 #
 # Works in two passes, like a retry run: first collect all files whose PDF
 # is missing, then convert each one with wkhtmltopdf inside a throw-away
 # docker container that has the current directory mounted at /data.
 convert_missing_mhtml() {
   local -a sources=() pending=() pending_pdf=()
   local src pdf url i

   # NUL-delimited so any file name (spaces, newlines, glob chars) is safe.
   while IFS= read -r -d '' src; do
     sources+=("$src")
   done < <(find ./ -iname '*.mhtml' -type f -print0)
   (( ${#sources[@]} > 0 )) || return 0

   # The container writes into ./out; make sure it exists.
   mkdir -p ./out || return

   # Pass 1: list the files whose PDF is missing from ./out.
   for src in "${sources[@]}"; do
     pdf=$(pdf_name_for "$src")
     if [[ -z "$(find ./out -name "$pdf")" ]]; then
       pending+=("$src")
       pending_pdf+=("$pdf")
     fi
   done

   # Pass 2: convert them.
   for i in "${!pending[@]}"; do
     src=${pending[i]}
     url=$(source_url_for "$src")
     if [[ -z "$url" ]]; then
       printf 'skipping (%s): no Snapshot-Content-Location header\n' "$src" >&2
       continue
     fi

     # Path handed to wkhtmltopdf inside the container.
     pdf="./data/out/${pending_pdf[i]}"
     echo "processing file ($pdf) ($url)"

     # run wkhtmltopdf on an auto-deleting docker container, and mount the
     # current directory into the container
     sudo docker run --rm --volume "./:/data" --user "$(id -u):$(id -g)" \
       madnight/docker-alpine-wkhtmltopdf "$url" "$pdf"
   done
 }

 convert_missing_mhtml
	# Based on my [previous script](https://gist.github.com/J0hnL0cke/95dbf624465034e399592c5d9690eb11) for converting mhtml -> pdf
	# This script scans the output directory, in order to figure out if the conversion has failed for any mhtml files

	# Instructions
	# Install docker
	# Output PDFs are named after each page's <title> text (sanitized), not after the .mhtml file path, so unlike the previous script there is no path-length number to adjust
	# Run in a directory where you have run `mkdir ./out` and have mhtml files in current folder or subfolders
	# To customize where the program searches for files, edit the path used by the `find` command

	# Print the output PDF file name (no directory) for the MHTML file "$1".
	#
	# The name is the text of the first <title> tag, reduced to letters, digits,
	# spaces, "_", "." and "-", with ".pdf" appended. "/" and "\" are dropped on
	# purpose: `find -name` can never match a name containing "/", and such a
	# name would point into a sub-directory that does not exist. When no usable
	# title is found, the MHTML file's own base name is used instead, so the
	# result is never just ".pdf".
	pdf_name_for() {
	  local title
	  # Real pipelines here: an escaped "\|" would hand "|", "tail" and "sed"
	  # to grep as extra arguments instead of piping into them.
	  title=$(grep -m 1 -o '<title>[^<]*' -- "$1" | head -n 1)
	  title=${title#'<title>'}
	  # tr with an explicit set instead of a sed bracket expression: a backslash
	  # is literal inside [...], so "\-\/" there is a range, not a literal hyphen.
	  title=$(printf '%s' "$title" | LC_ALL=C tr -cd 'a-zA-Z0-9 _.-')
	  if [[ -z "$title" ]]; then
	    title=${1##*/}
	    title=$(printf '%s' "${title%.*}" | LC_ALL=C tr -cd 'a-zA-Z0-9 _.-')
	  fi
	  printf '%s.pdf\n' "$title"
	}

	# Print the URL recorded in the first "Snapshot-Content-Location: " line of
	# the MHTML file "$1" (empty output when there is no such line).
	#
	# Everything after the header name is taken (not just one character), and
	# only control characters (e.g. the trailing CR of a CRLF line) are removed;
	# "?", "=", "&", "%", "#" and "-" are kept so query strings survive intact.
	source_url_for() {
	  local header
	  header=$(grep -m 1 'Snapshot-Content-Location: ' -- "$1")
	  printf '%s' "${header##*Snapshot-Content-Location: }" \
	    | LC_ALL=C tr -d '[:cntrl:]'
	}

	# Convert every *.mhtml file below the current directory whose PDF is not
	# yet present somewhere under ./out.
	#
	# Works in two passes, like a retry run: first collect all files whose PDF
	# is missing, then convert each one with wkhtmltopdf inside a throw-away
	# docker container that has the current directory mounted at /data.
	convert_missing_mhtml() {
	  local -a sources=() pending=() pending_pdf=()
	  local src pdf url i

	  # NUL-delimited so any file name (spaces, newlines, glob chars) is safe.
	  while IFS= read -r -d '' src; do
	    sources+=("$src")
	  done < <(find ./ -iname '*.mhtml' -type f -print0)
	  (( ${#sources[@]} > 0 )) || return 0

	  # The container writes into ./out; make sure it exists.
	  mkdir -p ./out || return

	  # Pass 1: list the files whose PDF is missing from ./out.
	  for src in "${sources[@]}"; do
	    pdf=$(pdf_name_for "$src")
	    if [[ -z "$(find ./out -name "$pdf")" ]]; then
	      pending+=("$src")
	      pending_pdf+=("$pdf")
	    fi
	  done

	  # Pass 2: convert them.
	  for i in "${!pending[@]}"; do
	    src=${pending[i]}
	    url=$(source_url_for "$src")
	    if [[ -z "$url" ]]; then
	      printf 'skipping (%s): no Snapshot-Content-Location header\n' "$src" >&2
	      continue
	    fi

	    # Path handed to wkhtmltopdf inside the container.
	    pdf="./data/out/${pending_pdf[i]}"
	    echo "processing file ($pdf) ($url)"

	    # run wkhtmltopdf on an auto-deleting docker container, and mount the
	    # current directory into the container
	    sudo docker run --rm --volume "./:/data" --user "$(id -u):$(id -g)" \
	      madnight/docker-alpine-wkhtmltopdf "$url" "$pdf"
	  done
	}

	convert_missing_mhtml
No results found