Created
February 12, 2026 06:20
-
-
Save lee2sman/3c38b7767a59e7437cbc7dc93cbf5e05 to your computer and use it in GitHub Desktop.
A fish shell script that automatically converts a PDF to a plaintext output with aspell's autocorrect.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env fish
#
# Convert a PDF to plain text via OCR, then autocorrect the text with aspell.
#
# Usage: <script> input.pdf output.txt
#
# Steps:
#   1. Ghostscript renders every page of the PDF to a PNG.
#   2. Tesseract runs OCR on each PNG.
#   3. The per-page texts are concatenated.
#   4. Every word aspell flags is replaced by aspell's first suggestion,
#      using a generated sed script.
#
# All intermediate files are kept in a private temporary directory, so files
# named page_*.png, page_*.txt, compiled.txt or auto.sed in the current
# directory are never overwritten or deleted (and an output file that happens
# to be called compiled.txt survives the cleanup).

# Remove the scratch directory, if one has been created.
function cleanup
    if set -q workdir; and test -d "$workdir"
        rm -rf -- "$workdir"
    end
end

# Print an error message to stderr, remove scratch files and abort.
function fail
    echo "Error: $argv" >&2
    cleanup
    exit 1
end

# Check input arguments.
# fish lists are 1-indexed, so $argv[0] is invalid; the script name comes
# from `status filename` instead.
if test (count $argv) -lt 2
    echo "Usage: "(basename (status filename))" input.pdf output.txt" >&2
    exit 1
end

set input $argv[1]
set output $argv[2]

# Check if input file exists
if not test -f "$input"
    fail "Input file '$input' not found"
end

# Check dependencies
for cmd in gs aspell sed sort mktemp
    if not command -sq $cmd
        fail "$cmd is not installed"
    end
end

# The Tesseract binary is named `tesseract` on some systems and
# `tesseract-ocr` on others; use whichever is available.
set ocr_cmd
for candidate in tesseract tesseract-ocr
    if command -sq $candidate
        set ocr_cmd $candidate
        break
    end
end
if test -z "$ocr_cmd"
    fail "tesseract (or tesseract-ocr) is not installed"
end

# Private scratch directory for all intermediate files.
set -g workdir (mktemp -d)
if not test -d "$workdir"
    fail "could not create a temporary directory"
end

echo "Converting PDF to PNG pages..."
# %05d keeps the page files in page order when they are globbed below.
# NOTE(review): no -r option is passed, so Ghostscript renders at its default
# resolution; a higher value (e.g. -r300) may improve OCR accuracy - confirm.
gs -dNOPAUSE -dBATCH -sDEVICE=pngalpha -sOutputFile=$workdir/page_%05d.png $input
or fail "Ghostscript could not convert '$input'"

# An unmatched glob in `set` expands to an empty list instead of an error.
set pages $workdir/page_*.png
if test (count $pages) -eq 0
    fail "no pages were rendered from '$input'"
end

echo "Running OCR on each page..."
for img in $pages
    set base (basename $img .png)
    echo " Processing $base"
    # Tesseract appends .txt to the output base name itself.
    $ocr_cmd $img $workdir/$base
    or echo "Warning: OCR failed for $base" >&2
end

echo "Compiling pages..."
set texts $workdir/page_*.txt
if test (count $texts) -eq 0
    fail "OCR produced no text"
end
cat $texts > $workdir/compiled.txt

echo "Building correction wordlist..."
# `aspell list` prints every misspelled occurrence, so de-duplicate first and
# then ask a single `aspell -a` process for suggestions. Each word is prefixed
# with '^' so pipe mode always spell-checks it rather than treating it as a
# command. Misspellings with suggestions come back as
#   & <word> <count> <offset>: <suggestion>, <suggestion>, ...
# and only the first suggestion is used.
# NOTE(review): words and suggestions are assumed to contain no sed
# metacharacters ('/', '&', '\'); \b is a GNU sed extension.
aspell list < $workdir/compiled.txt | sort -u | sed 's/^/^/' | aspell -a | while read -l line
    set -l hit (string match -r '^& (\S+) \d+ \d+: ([^,]+)' -- $line)
    if test (count $hit) -eq 3
        echo "s/\b$hit[2]\b/"(string trim -- $hit[3])"/g"
    end
end > $workdir/auto.sed

echo "Applying autocorrections..."
sed -f $workdir/auto.sed $workdir/compiled.txt > $output
or fail "could not write '$output'"

# Clean up temporary files
echo "Cleaning up..."
cleanup

echo "Done! Output: $output"
Author
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
I'm using Void Linux, and the tesseract package is called
`tesseract-ocr` on my system, but may need to be changed to `tesseract` on yours!