Created
February 11, 2026 15:38
-
-
Save Bubbu0129/9eca6b3c17d569d58f7c4f600b56336b to your computer and use it in GitHub Desktop.
Remove sensitive information from a PDF downloaded from JSTOR
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Strip every PDF object containing the string "(All use subject to" from a
# PDF downloaded from JSTOR, and drop the document's first page.
#
# Usage: ./jstor.sh <input PDF> <output PDF>
#
# Requires: mutool and perl on PATH.
# Exit status: 0 on success, 1 on a usage error, non-zero if any step fails.

# Stop at the first failing command / unset variable so a broken intermediate
# file is never handed to the next step.
set -euo pipefail

# Check the argument count before reading $1/$2 (mandatory under `set -u`).
if [ $# -ne 2 ]; then
  echo "Usage: $0 <input_file> <output_file>" >&2
  exit 1
fi

INPUT_FILE="$1"
OUTPUT_FILE="$2"

# Intermediates live in a private temp directory instead of fixed names in the
# current directory, so existing decomp.pdf/temp.pdf files are never clobbered
# and nothing is left behind when a step fails.
WORK_DIR="$(mktemp -d "${TMPDIR:-/tmp}/jstor.XXXXXX")"
trap 'rm -rf -- "$WORK_DIR"' EXIT
DECOMP_PDF="$WORK_DIR/decomp.pdf"
STRIPPED_PDF="$WORK_DIR/temp.pdf"

echo "[1/3] Decompressing streams..."
# -d: Decompress content streams (makes text readable for Step 2)
# -i: Compress image streams (keeps binary images compact)
mutool clean -di "$INPUT_FILE" "$DECOMP_PDF"

echo "[2/3] Removing objects containing target string..."
# -0777 : Read the entire file into memory at once (slurp mode).
# -pe   : Auto-print output after executing the code.
# The pattern matches ONE object at a time ("N G obj" up to the first
# "endobj"); the replacement (flag e = evaluate as code) drops the object only
# if it contains the target string and otherwise re-emits it unchanged.
# Matching per object matters: a single pattern of the form
# "obj.*?TARGET.*?endobj" starts at the first object in the file and swallows
# every unrelated object that precedes the one holding the target.
# g: apply to every object; s: let "." match newlines.
perl -0777 -pe 's{(\d+\s+\d+\s+obj.*?endobj)}{ index($1, "(All use subject to") >= 0 ? "" : $1 }gse' \
  "$DECOMP_PDF" > "$STRIPPED_PDF"

echo "[3/3] Merging, Repairing, and removing 1st page..."
# -O garbage : Scans the file and rebuilds the XREF table, ignoring the missing objects from Step 2.
# -O compress: Re-compresses the streams.
# -O sanitize: Reads the Page's content streams to clean up graphics commands.
# 2-N        : Selects page 2 through the end.
mutool merge -O garbage,compress,sanitize -o "$OUTPUT_FILE" "$STRIPPED_PDF" 2-N

# Temp files are removed by the EXIT trap.
echo "Done. Saved to $OUTPUT_FILE"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment