Last active
December 23, 2025 06:13
-
-
Save badbye/79f506013fc3c6d123404f7c287c7e8b to your computer and use it in GitHub Desktop.
将 PDF 转成 JSONL 的脚本
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash
#
# pdftojsonl: a thin wrapper over pdftotext that emits JSONL instead of
# plain text.
#
# The command line is the same as for pdftotext:
#   pdftojsonl.sh [options] <PDF-file> [<jsonl-file>]

set -euo pipefail

# Nothing to do without at least one argument: show the usage line and fail.
(( $# > 0 )) || {
  echo "Usage: pdftojsonl [pdftotext options] <PDF-file> [<jsonl-file>]" >&2
  exit 1
}
# pdftotext options that consume the NEXT command-line token as their value.
# Any other token starting with "-" is forwarded to pdftotext as a bare flag.
# -colspacing is a value-taking option of poppler's pdftotext; without it in
# this list its numeric value would be mistaken for the PDF path.
readonly -a OPTIONS_WITH_VALUES=(
  -f -l -r -x -y -W -H -fixed -linespacing -colspacing -enc -eol -opw -upw
  -marginl -marginr -margint -marginb -cfg
)

#######################################
# Report whether a pdftotext option expects a separate value argument.
# Globals:   OPTIONS_WITH_VALUES (read)
# Arguments: $1 - option token exactly as given on the command line
# Returns:   0 if the option takes a value, 1 otherwise
#######################################
needs_value() {
  local opt="$1"
  local known # loop variable; kept local so it does not leak to the caller

  for known in "${OPTIONS_WITH_VALUES[@]}"; do
    # Right-hand side is quoted, so this is a literal (non-glob) comparison.
    if [[ "$known" == "$opt" ]]; then
      return 0
    fi
  done
  return 1
}
# Results of command-line parsing, consumed by the rest of the script.
pdf_file=""        # input PDF path ("-" is accepted as a positional argument)
output_target=""   # optional second positional argument
start_page=1       # value of -f when given; used to number the emitted pages
pdftotext_args=()  # options (and their values) forwarded verbatim to pdftotext

#######################################
# Split the command line into pdftotext options and positional arguments.
#
# Tokens starting with "-" (and longer than a lone "-") are forwarded to
# pdftotext; options listed by needs_value also swallow the next token.
# A literal "--" is forwarded too and turns every remaining token into a
# positional argument. The first positional is the PDF, the second the
# output target; a third one is an error.
#
# Globals:   pdf_file, output_target, start_page, pdftotext_args (written)
# Arguments: the command-line arguments to parse
# Outputs:   an error message on stderr for a malformed command line
# Returns:   0 on success; exits the shell with status 1 when an option is
#            missing its value or an extra positional argument is found
#######################################
parse_args() {
  local arg value
  local positional_only=0

  # Consuming arguments with `shift` avoids index arithmetic altogether.
  # The previous `((i++))` returned status 1 whenever i was 0, which makes
  # `set -e` abort the script on the very first argument (bash >= 4.1).
  while (( $# > 0 )); do
    arg="$1"
    shift

    if (( ! positional_only )) && [[ "$arg" == "--" ]]; then
      pdftotext_args+=("$arg")
      positional_only=1
      continue
    fi

    # `-?*` (not `-*`): a bare "-" must stay positional, otherwise the
    # "-" handling for input/output later in the script is unreachable.
    if (( ! positional_only )) && [[ "$arg" == -?* ]]; then
      pdftotext_args+=("$arg")
      if needs_value "$arg"; then
        if (( $# == 0 )); then
          echo "Error: $arg requires a value" >&2
          exit 1
        fi
        value="$1"
        shift
        pdftotext_args+=("$value")
        # Remember the first page so output page numbers match the PDF.
        if [[ "$arg" == "-f" ]]; then
          start_page="$value"
        fi
      fi
      continue
    fi

    if [[ -z "$pdf_file" ]]; then
      pdf_file="$arg"
    elif [[ -z "$output_target" ]]; then
      output_target="$arg"
    else
      echo "Error: unexpected extra argument: $arg" >&2
      exit 1
    fi
  done
}

parse_args "$@"
# --- Input validation --------------------------------------------------------
# A PDF argument is mandatory. The special name "-" is let through without a
# file-existence check; anything else must be an existing regular file.
[[ -n "$pdf_file" ]] || {
  echo "Error: missing PDF file argument" >&2
  exit 1
}

case "$pdf_file" in
  -) ;;
  *)
    [[ -f "$pdf_file" ]] || {
      echo "Error: PDF file not found: $pdf_file" >&2
      exit 1
    }
    ;;
esac

# --- Output destination ------------------------------------------------------
# Without an explicit target, derive one from the input name:
#   "-"        -> "-"
#   name.pdf   -> name.jsonl
#   other      -> other.jsonl
if [[ -z "$output_target" ]]; then
  case "$pdf_file" in
    -)     output_target="-" ;;
    *.pdf) output_target="${pdf_file%.pdf}.jsonl" ;;
    *)     output_target="${pdf_file}.jsonl" ;;
  esac
fi

# A target of "-" is written to /dev/stdout; any other value is used as the
# output file path unchanged.
case "$output_target" in
  -) output_path="/dev/stdout" ;;
  *) output_path="$output_target" ;;
esac
# Scratch file that receives the plain-text output of pdftotext. The EXIT trap
# removes it on every exit path, including failures under `set -e`.
tmp_txt=$(mktemp)
trap 'rm -f -- "$tmp_txt"' EXIT

# ${arr[@]+"${arr[@]}"} expands to nothing when no options were collected.
# A plain "${pdftotext_args[@]}" on an empty array is reported as an unbound
# variable under `set -u` by bash releases older than 4.4.
pdftotext ${pdftotext_args[@]+"${pdftotext_args[@]}"} "$pdf_file" "$tmp_txt"

# An empty result means there is nothing to convert into JSONL.
if [[ ! -s "$tmp_txt" ]]; then
  echo "Error: pdftotext produced no output" >&2
  exit 1
fi
#######################################
# Convert form-feed-separated page text into JSONL, one object per page.
#
# Each record (text between form feeds) counts as one page. Pages that contain
# only whitespace are not emitted, but they still advance the page counter so
# that the numbers of the following pages stay aligned with the PDF.
#
# Arguments: $1 - number assigned to the first page in the file
#            $2 - path of the text file to convert
# Outputs:   lines of the form {"page": N, "content": "..."} on stdout
#######################################
pages_to_jsonl() {
  awk -v start_page="$1" '
    BEGIN {
      RS = "\f"   # one record per page
      ORS = ""
      page = 0
      # JSON forbids raw control characters inside strings; precompute the
      # \u00XX escape for each of them.
      for (c = 1; c < 32; c++) {
        ctrl_char[c] = sprintf("%c", c)
        ctrl_esc[c] = sprintf("\\u%04x", c)
      }
    }
    {
      # Count the page BEFORE deciding to skip it; otherwise every page that
      # follows a blank one would be numbered too low.
      page++
      if ($0 ~ /^[[:space:]]*$/) {
        next
      }
      page_num = start_page + page - 1
      text = $0
      # Backslashes first, so escapes added below are not doubled again.
      # "\\\\&" means: a literal backslash followed by the matched text. The
      # shorter "\\\\" collapses to ONE backslash under POSIX gsub rules.
      gsub(/\\/, "\\\\&", text)
      gsub(/"/, "\\\"", text)
      gsub(/\t/, "\\t", text)
      gsub(/\r/, "", text)        # carriage returns are dropped, not escaped
      gsub(/\n/, "\\n", text)
      # Whatever control characters are still present get a \u00XX escape.
      for (c = 1; c < 32; c++) {
        gsub(ctrl_char[c], ctrl_esc[c], text)
      }
      printf "{\"page\": %d, \"content\": \"%s\"}\n", page_num, text
    }
  ' "$2"
}

pages_to_jsonl "$start_page" "$tmp_txt" > "$output_path"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment