lee2sman · February 12, 2026 06:20 · lee2sman · Feb 12, 2026
diff --git a/convert-pdf-to-text b/convert-pdf-to-text
 #!/usr/bin/env fish

 # Convert a PDF to plain text via OCR, then auto-correct the result.
 #
 # Steps:
 #   1. Ghostscript renders every page of the PDF to a PNG image.
 #   2. Tesseract runs OCR on each image, producing one text file per page.
 #   3. The page texts are concatenated in file-name order.
 #   4. Every word reported by `aspell list` is replaced with the first
 #      suggestion `aspell -a` offers for it; words without a suggestion are
 #      left unchanged.
 #
 # Usage: convert-pdf-to-text input.pdf output.txt
 #
 # All intermediate files live in a private temporary directory, so files in
 # the current directory named page_*.png, page_*.txt, compiled.txt or
 # auto.sed are neither picked up nor deleted.

 # Check input arguments
 if test (count $argv) -lt 2
     # fish lists are 1-indexed, so $argv[0] is not the script name; ask fish
     # for the path of the running script instead.
     set script_name (status filename)
     echo "Usage: $script_name input.pdf output.txt" >&2
     exit 1
 end

 set input $argv[1]
 set output $argv[2]

 # Check if input file exists
 if not test -f "$input"
     echo "Error: Input file '$input' not found" >&2
     exit 1
 end

 # Pick the OCR executable. It is normally installed as `tesseract`;
 # `tesseract-ocr` is kept as a fallback for setups that provide that name.
 set ocr_cmd tesseract
 if not command -sq tesseract; and command -sq tesseract-ocr
     set ocr_cmd tesseract-ocr
 end

 # Check dependencies
 for cmd in gs $ocr_cmd aspell sed
     if not command -sq $cmd
         echo "Error: $cmd is not installed" >&2
         exit 1
     end
 end

 # Private scratch directory for page images, page texts and the sed script.
 set workdir (mktemp -d)
 if not test -d "$workdir"
     echo "Error: could not create a temporary directory" >&2
     exit 1
 end

 echo "Converting PDF to PNG pages..."
 # NOTE(review): no -r option is passed, so Ghostscript uses its default
 # resolution; a higher value may improve OCR quality - confirm before changing.
 if not gs -dNOPAUSE -dBATCH -sDEVICE=pngalpha \
         -sOutputFile="$workdir/page_%03d.png" "$input"
     echo "Error: Ghostscript failed to convert '$input'" >&2
     rm -rf "$workdir"
     exit 1
 end

 # `set` tolerates a wildcard without matches, so an empty result can be
 # reported instead of tripping fish's "no matches for wildcard" error later.
 set pages "$workdir"/page_*.png
 if test (count $pages) -eq 0
     echo "Error: no pages were rendered from '$input'" >&2
     rm -rf "$workdir"
     exit 1
 end

 echo "Running OCR on each page..."
 for img in $pages
     set base (basename "$img" .png)
     echo "  Processing $base"
     # The second argument is the output base name; the step below expects
     # the OCR tool to have written "$base.txt" next to the image.
     $ocr_cmd "$img" "$workdir/$base"
 end

 echo "Compiling pages..."
 set page_texts "$workdir"/page_*.txt
 if test (count $page_texts) -eq 0
     echo "Error: OCR produced no text files" >&2
     rm -rf "$workdir"
     exit 1
 end
 cat $page_texts > "$workdir/compiled.txt"

 echo "Building correction wordlist..."
 # Emit one sed substitution per distinct flagged word. `aspell list` prints a
 # word once per occurrence, so remember what was already handled to avoid
 # spawning `aspell -a` repeatedly for the same word.
 # NOTE(review): words and suggestions are inserted into the sed script
 # verbatim; this assumes they contain no sed metacharacters such as / & \.
 # NOTE(review): \b is not understood by every sed implementation - confirm
 # the target sed supports it.
 set seen_words
 aspell list < "$workdir/compiled.txt" | while read word
     if contains -- "$word" $seen_words
         continue
     end
     set -a seen_words "$word"

     # Lines starting with "&" in `aspell -a` output carry the suggestions
     # after ": ", comma-separated; keep only the first one.
     set suggestion (echo "$word" | aspell -a | grep "^&" | sed 's/.*: //' | cut -d',' -f1 | string trim)
     if test -n "$suggestion"
         echo "s/\b$word\b/$suggestion/g"
     end
 end > "$workdir/auto.sed"

 echo "Applying autocorrections..."
 sed -f "$workdir/auto.sed" "$workdir/compiled.txt" > "$output"

 # Clean up temporary files
 echo "Cleaning up..."
 rm -rf "$workdir"

 echo "Done! Output: $output"
	#!/usr/bin/env fish

	# Convert a PDF to plain text via OCR, then auto-correct the result.
	#
	# Steps:
	#   1. Ghostscript renders every page of the PDF to a PNG image.
	#   2. Tesseract runs OCR on each image, producing one text file per page.
	#   3. The page texts are concatenated in file-name order.
	#   4. Every word reported by `aspell list` is replaced with the first
	#      suggestion `aspell -a` offers for it; words without a suggestion are
	#      left unchanged.
	#
	# Usage: convert-pdf-to-text input.pdf output.txt
	#
	# All intermediate files live in a private temporary directory, so files in
	# the current directory named page_*.png, page_*.txt, compiled.txt or
	# auto.sed are neither picked up nor deleted.

	# Check input arguments
	if test (count $argv) -lt 2
	    # fish lists are 1-indexed, so $argv[0] is not the script name; ask fish
	    # for the path of the running script instead.
	    set script_name (status filename)
	    echo "Usage: $script_name input.pdf output.txt" >&2
	    exit 1
	end

	set input $argv[1]
	set output $argv[2]

	# Check if input file exists
	if not test -f "$input"
	    echo "Error: Input file '$input' not found" >&2
	    exit 1
	end

	# Pick the OCR executable. It is normally installed as `tesseract`;
	# `tesseract-ocr` is kept as a fallback for setups that provide that name.
	set ocr_cmd tesseract
	if not command -sq tesseract; and command -sq tesseract-ocr
	    set ocr_cmd tesseract-ocr
	end

	# Check dependencies
	for cmd in gs $ocr_cmd aspell sed
	    if not command -sq $cmd
	        echo "Error: $cmd is not installed" >&2
	        exit 1
	    end
	end

	# Private scratch directory for page images, page texts and the sed script.
	set workdir (mktemp -d)
	if not test -d "$workdir"
	    echo "Error: could not create a temporary directory" >&2
	    exit 1
	end

	echo "Converting PDF to PNG pages..."
	# NOTE(review): no -r option is passed, so Ghostscript uses its default
	# resolution; a higher value may improve OCR quality - confirm before changing.
	if not gs -dNOPAUSE -dBATCH -sDEVICE=pngalpha \
	        -sOutputFile="$workdir/page_%03d.png" "$input"
	    echo "Error: Ghostscript failed to convert '$input'" >&2
	    rm -rf "$workdir"
	    exit 1
	end

	# `set` tolerates a wildcard without matches, so an empty result can be
	# reported instead of tripping fish's "no matches for wildcard" error later.
	set pages "$workdir"/page_*.png
	if test (count $pages) -eq 0
	    echo "Error: no pages were rendered from '$input'" >&2
	    rm -rf "$workdir"
	    exit 1
	end

	echo "Running OCR on each page..."
	for img in $pages
	    set base (basename "$img" .png)
	    echo " Processing $base"
	    # The second argument is the output base name; the step below expects
	    # the OCR tool to have written "$base.txt" next to the image.
	    $ocr_cmd "$img" "$workdir/$base"
	end

	echo "Compiling pages..."
	set page_texts "$workdir"/page_*.txt
	if test (count $page_texts) -eq 0
	    echo "Error: OCR produced no text files" >&2
	    rm -rf "$workdir"
	    exit 1
	end
	cat $page_texts > "$workdir/compiled.txt"

	echo "Building correction wordlist..."
	# Emit one sed substitution per distinct flagged word. `aspell list` prints a
	# word once per occurrence, so remember what was already handled to avoid
	# spawning `aspell -a` repeatedly for the same word.
	# The stages below must be joined by real pipes (|); a backslash-escaped
	# pipe would be passed to the command as a literal argument.
	# NOTE(review): words and suggestions are inserted into the sed script
	# verbatim; this assumes they contain no sed metacharacters such as / & \.
	# NOTE(review): \b is not understood by every sed implementation - confirm
	# the target sed supports it.
	set seen_words
	aspell list < "$workdir/compiled.txt" | while read word
	    if contains -- "$word" $seen_words
	        continue
	    end
	    set -a seen_words "$word"

	    # Lines starting with "&" in `aspell -a` output carry the suggestions
	    # after ": ", comma-separated; keep only the first one.
	    set suggestion (echo "$word" | aspell -a | grep "^&" | sed 's/.*: //' | cut -d',' -f1 | string trim)
	    if test -n "$suggestion"
	        echo "s/\b$word\b/$suggestion/g"
	    end
	end > "$workdir/auto.sed"

	echo "Applying autocorrections..."
	sed -f "$workdir/auto.sed" "$workdir/compiled.txt" > "$output"

	# Clean up temporary files
	echo "Cleaning up..."
	rm -rf "$workdir"

	echo "Done! Output: $output"
No results found