Skip to content

Instantly share code, notes, and snippets.

@badbye
Last active December 23, 2025 06:13
Show Gist options
  • Select an option

  • Save badbye/79f506013fc3c6d123404f7c287c7e8b to your computer and use it in GitHub Desktop.

Select an option

Save badbye/79f506013fc3c6d123404f7c287c7e8b to your computer and use it in GitHub Desktop.
将 PDF 转成 JSONL 的脚本
#!/usr/bin/env bash
set -euo pipefail
# pdftojsonl: a thin layer on top of pdftotext whose output is JSONL rather
# than plain text.
# The command line mirrors pdftotext:
#   pdftojsonl.sh [options] <PDF-file> [<jsonl-file>]

# Without any argument there is nothing to convert: show the usage and stop.
if (( $# == 0 )); then
  printf '%s\n' "Usage: pdftojsonl [pdftotext options] <PDF-file> [<jsonl-file>]" >&2
  exit 1
fi
# pdftotext options that are followed by a separate value argument. The
# argument parser consumes the next token as the value for each of these, so
# a value-taking option missing from this list would have its value mistaken
# for the PDF path. -colspacing (Poppler) takes a number and is listed for
# that reason.
readonly OPTIONS_WITH_VALUES=(
  -f -l -r -x -y -W -H -fixed -linespacing -colspacing -enc -eol -opw -upw
  -marginl -marginr -margint -marginb -cfg
)
#######################################
# Reports whether a pdftotext option is followed by a separate value argument.
# Globals:   OPTIONS_WITH_VALUES (read)
# Arguments: $1 - option token to look up, e.g. -f
# Returns:   0 if the token is listed in OPTIONS_WITH_VALUES, 1 otherwise
#######################################
needs_value() {
  local opt="$1"
  # Keep the loop variable local so a lookup never overwrites a global `known`.
  local known
  for known in "${OPTIONS_WITH_VALUES[@]}"; do
    if [[ "$known" == "$opt" ]]; then
      return 0
    fi
  done
  return 1
}
#######################################
# Stores one positional argument: the first fills the PDF path, the second
# the output target.
# Globals:   pdf_file, output_target (read and written)
# Arguments: $1 - positional token
# Outputs:   error message on stderr when both slots are already filled
# Returns:   does not return on error (exits 1)
#######################################
add_positional() {
  if [[ -z "$pdf_file" ]]; then
    pdf_file="$1"
  elif [[ -z "$output_target" ]]; then
    output_target="$1"
  else
    echo "Error: unexpected extra argument: $1" >&2
    exit 1
  fi
}

#######################################
# Splits the command line into pdftotext options and the two positionals.
# Globals:   pdf_file, output_target, start_page, pdftotext_args (written)
# Arguments: the script's command-line arguments
# Outputs:   error messages on stderr
# Returns:   does not return on error (exits 1)
#######################################
parse_args() {
  local -a argv=("$@")
  local argc=$#
  local i=0
  local arg value

  pdf_file=""
  output_target=""
  start_page=1
  pdftotext_args=()

  # The index is advanced with i=$((i + 1)) rather than ((i++)): the latter
  # returns status 1 when i is 0, which aborts the script under `set -e`.
  while ((i < argc)); do
    arg="${argv[i]}"
    if [[ "$arg" == "--" ]]; then
      # Everything after "--" is positional, even if it starts with a dash.
      pdftotext_args+=("$arg")
      i=$((i + 1))
      while ((i < argc)); do
        add_positional "${argv[i]}"
        i=$((i + 1))
      done
      break
    elif [[ "$arg" == -?* ]]; then
      # A dash followed by at least one character is an option. A bare "-"
      # falls through to the positional branch so it can name the PDF file
      # or the output target.
      pdftotext_args+=("$arg")
      if needs_value "$arg"; then
        i=$((i + 1))
        if ((i >= argc)); then
          echo "Error: $arg requires a value" >&2
          exit 1
        fi
        value="${argv[i]}"
        pdftotext_args+=("$value")
        # Remember the first page so emitted page numbers start from it.
        if [[ "$arg" == "-f" ]]; then
          start_page="$value"
        fi
      fi
    else
      add_positional "$arg"
    fi
    i=$((i + 1))
  done
}

parse_args "$@"
# Input validation: a PDF argument must have been supplied, and unless it is
# the literal "-" it has to name an existing regular file.
[[ -n "$pdf_file" ]] || {
  echo "Error: missing PDF file argument" >&2
  exit 1
}
[[ "$pdf_file" == "-" || -f "$pdf_file" ]] || {
  echo "Error: PDF file not found: $pdf_file" >&2
  exit 1
}
# When no output target was given, derive one from the input name:
#   "-"       -> "-"
#   name.pdf  -> name.jsonl
#   anything  -> anything.jsonl
if [[ -z "$output_target" ]]; then
  case "$pdf_file" in
    -)
      output_target="-"
      ;;
    *.pdf)
      output_target="${pdf_file%.pdf}.jsonl"
      ;;
    *)
      output_target="${pdf_file}.jsonl"
      ;;
  esac
fi

# A target of "-" is written to /dev/stdout; any other target is used as the
# output path unchanged.
case "$output_target" in
  -)
    output_path="/dev/stdout"
    ;;
  *)
    output_path="$output_target"
    ;;
esac
# Extract the text into a private temp file, removed again on every exit path.
tmp_txt=$(mktemp)
trap 'rm -f -- "$tmp_txt"' EXIT
# ${arr[@]+"${arr[@]}"} expands to nothing when no options were collected.
# A plain "${arr[@]}" on an empty array is an unbound-variable error under
# `set -u` in bash releases before 4.4.
pdftotext ${pdftotext_args[@]+"${pdftotext_args[@]}"} "$pdf_file" "$tmp_txt"
# An empty result file means there is nothing to convert.
if [[ ! -s "$tmp_txt" ]]; then
  echo "Error: pdftotext produced no output" >&2
  exit 1
fi
#######################################
# Converts page text separated by form feeds into JSONL.
# Arguments: $1 - page number assigned to the first record of the file
#            $2 - path of the text file to read
# Outputs:   one {"page": N, "content": "..."} line on stdout for every
#            record that is not blank
#######################################
pages_to_jsonl() {
  awk -v start_page="$1" '
    BEGIN {
      RS = "\f"
      ORS = ""
    }
    {
      # NR counts every form-feed-delimited record, so a blank page is left
      # out of the output without shifting the numbers of the pages after it.
      if ($0 ~ /^[[:space:]]*$/) {
        next
      }
      text = $0
      # "\\\\&" reaches gsub as \\& : a literal backslash, then the match.
      # A bare "\\\\" collapses to a single backslash in gawk and mawk,
      # which leaves backslashes unescaped.
      gsub(/\\/, "\\\\&", text)
      gsub(/"/, "\\\"", text)
      gsub(/\t/, "\\t", text)
      gsub(/\r/, "", text)
      gsub(/\n/, "\\n", text)
      # JSON does not allow raw control characters inside a string, so any
      # that remain are written as \u00XX escapes.
      for (code = 1; code < 32; code++) {
        ctl = sprintf("%c", code)
        if (index(text, ctl) > 0) {
          gsub(ctl, sprintf("\\u%04x", code), text)
        }
      }
      printf "{\"page\": %d, \"content\": \"%s\"}\n", start_page + NR - 1, text
    }
  ' "$2"
}

pages_to_jsonl "$start_page" "$tmp_txt" > "$output_path"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment