Skip to content

Instantly share code, notes, and snippets.

@hotchpotch
Created December 10, 2025 04:20
Show Gist options
  • Select an option

  • Save hotchpotch/9406e30448cb840bf89d7e0231c2419c to your computer and use it in GitHub Desktop.

Select an option

Save hotchpotch/9406e30448cb840bf89d7e0231c2419c to your computer and use it in GitHub Desktop.
PDF to translate cli wrapper
#!/usr/bin/env python3
"""Translate recent PDFs in Downloads using pdf2zh_next via uvx.
Defaults:
- Looks for PDFs downloaded within the last day.
- Saves outputs to ~/Downloads/0_pdf_translated/{filename}.translated.pdf
- Skips files that already have a translated output.
- Uses pdf2zh_next with Google Translate, Japanese output, no watermark, and
alternating bilingual pages.
Only standard library modules are used. The script delegates translation work
to the external `uvx pdf2zh_next` command.
"""
from __future__ import annotations
import argparse
import re
import shutil
import subprocess
import sys
from datetime import datetime, timedelta
from pathlib import Path
from typing import Iterable, List, Optional, Tuple
# Fallback values for the command-line options.
DEFAULT_DAYS = 1
DEFAULT_INPUT_DIR = Path.home().joinpath("Downloads")
DEFAULT_OUTPUT_DIR = DEFAULT_INPUT_DIR.joinpath("0_pdf_translated")

# Fixed leading part of every pdf2zh_next invocation (launched through uvx).
# It lives near the top of the file so the defaults are easy to tweak; each
# option that takes a value is written on the same line as that value.
PDF2ZH_CMD_PREFIX: List[str] = [
    "uvx", "pdf2zh_next",
    "--google",
    "--lang-out", "ja",
    "--watermark-output-mode", "no_watermark",
    "--dual-translate-first",
    "--use-alternating-pages-dual",
]
def parse_args() -> argparse.Namespace:
    """Parse the command-line options for a translation run.

    Returns:
        A namespace with ``days`` (float look-back window), ``input_dir`` and
        ``output_dir`` (both ``Path``). Each option falls back to the matching
        ``DEFAULT_*`` module constant when it is not given.
    """
    parser = argparse.ArgumentParser(
        description="Translate recent PDFs with pdf2zh_next via uvx."
    )
    parser.add_argument(
        "--days",
        type=float,
        default=DEFAULT_DAYS,
        help="Look back this many days from now (default: 1).",
    )
    parser.add_argument(
        "--input-dir",
        type=Path,
        default=DEFAULT_INPUT_DIR,
        help="Directory to scan for PDFs (default: ~/Downloads).",
    )
    parser.add_argument(
        "--output-dir",
        type=Path,
        default=DEFAULT_OUTPUT_DIR,
        # Must name the directory DEFAULT_OUTPUT_DIR really points at.
        help="Directory to store translated PDFs (default: ~/Downloads/0_pdf_translated).",
    )
    return parser.parse_args()
def strip_trailing_paren_index(stem: str) -> str:
    """Return ``stem`` without a trailing parenthesised number such as `` (2)``.

    Whitespace directly before and after the ``(number)`` group is removed
    together with it. A stem that does not end in such a group is returned
    unchanged.
    """
    trailing_index = re.compile(r"\s*\(\d+\)\s*$")
    return trailing_index.sub("", stem)
def find_recent_pdfs(directory: Path, days: float) -> Iterable[Path]:
    """Yield the ``*.pdf`` entries of ``directory`` modified in the last ``days`` days.

    Entries are produced in sorted path order. The cutoff is ``now - days``;
    an entry whose modification time equals the cutoff is still included.
    Entries that disappear between listing and ``stat`` are skipped.
    """
    oldest_allowed = datetime.now() - timedelta(days=days)
    candidates = list(directory.glob("*.pdf"))
    candidates.sort()
    for candidate in candidates:
        try:
            stat_result = candidate.stat()
        except FileNotFoundError:
            # The file was removed after the directory listing was taken.
            continue
        modified_at = datetime.fromtimestamp(stat_result.st_mtime)
        if modified_at < oldest_allowed:
            continue
        yield candidate
def select_latest_by_base(paths: Iterable[Path]) -> List[Tuple[Path, float, str, str]]:
    """Pick the most recently modified file for each de-duplicated base name.

    Two paths belong to the same group when their stems are equal after a
    trailing ``(number)`` suffix has been stripped. On an mtime tie the path
    seen first wins. Paths that no longer exist are ignored.

    Returns:
        One ``(path, mtime, raw_stem, cleaned_stem)`` tuple per group, in the
        order in which each group was first encountered.
    """
    newest_by_base: dict[str, Tuple[Path, float, str, str]] = {}
    for candidate in paths:
        raw_stem = candidate.stem
        clean_stem = strip_trailing_paren_index(raw_stem)
        try:
            modified = candidate.stat().st_mtime
        except FileNotFoundError:
            continue
        current = newest_by_base.get(clean_stem)
        if current is not None and modified <= current[1]:
            # An equally new or newer file is already recorded for this base.
            continue
        newest_by_base[clean_stem] = (candidate, modified, raw_stem, clean_stem)
    return list(newest_by_base.values())
def ensure_tools_available() -> None:
    """Terminate with exit status 1 unless the ``uvx`` launcher is on PATH.

    A one-line explanation is written to stderr before exiting.
    """
    if shutil.which("uvx") is not None:
        return
    sys.stderr.write("uvx is not installed or not in PATH.\n")
    sys.exit(1)
def cleanup_extras(base_prefixes: Iterable[str], output_dir: Path, keep: Path) -> None:
    """Delete leftover PDFs in ``output_dir`` that belong to the given bases.

    A ``*.pdf`` entry is removed when its file name starts with any of the
    non-empty prefixes and it does not resolve to the same path as ``keep``.

    Args:
        base_prefixes: File-name prefixes identifying generated outputs.
            Empty strings are ignored.
        output_dir: Directory that is scanned (non-recursively).
        keep: The one file that must survive even if it matches a prefix.

    Deletion failures are reported on stderr as warnings and never raised.
    """
    # An empty prefix would match every name (str.startswith("") is always
    # True) and wipe the whole directory. That can happen for a source named
    # e.g. "(1).pdf", whose stem is empty once the index is stripped.
    prefixes = tuple(prefix for prefix in base_prefixes if prefix)
    if not prefixes:
        return
    # NOTE(review): matching is by plain prefix, so an unrelated file whose
    # name merely begins with the same base also matches - confirm whether a
    # stricter boundary is wanted.
    keep_resolved = keep.resolve()  # loop-invariant, resolve only once
    for path in output_dir.glob("*.pdf"):
        if not path.name.startswith(prefixes):
            continue
        try:
            if path.resolve() == keep_resolved:
                continue
        except FileNotFoundError:
            continue
        try:
            path.unlink()
        except OSError as exc:
            sys.stderr.write(f"Warning: failed to remove {path}: {exc}\n")
def make_title_slug(pdf_path: Path) -> Optional[str]:
    """Derive a file-name-safe title slug from the first page of a PDF.

    The first page is dumped with the external ``pdftotext`` tool. Its first
    five lines are joined, and the longest run of ASCII letters, digits and
    spaces is taken, cut to 50 characters, with spaces turned into
    underscores.

    Args:
        pdf_path: PDF to inspect.

    Returns:
        The slug, or ``None`` when ``pdftotext`` is missing, cannot be run,
        exits with an error, or yields no usable text. Title extraction is
        best effort: every failure is a warning on stderr, never an exception.
    """
    if shutil.which("pdftotext") is None:
        sys.stderr.write("Warning: pdftotext not found; skipping title extraction.\n")
        return None
    try:
        # Ask for UTF-8 explicitly and decode leniently. Relying on the locale
        # encoding (text=True) raises UnicodeDecodeError on bytes the locale
        # cannot decode. Replacement characters are harmless here because only
        # ASCII letters, digits and spaces survive the filtering below.
        result = subprocess.run(
            ["pdftotext", "-enc", "UTF-8", "-f", "1", "-l", "1", str(pdf_path), "-"],
            capture_output=True,
            encoding="utf-8",
            errors="replace",
            check=True,
        )
    except subprocess.CalledProcessError as exc:
        sys.stderr.write(f"Warning: pdftotext failed ({exc.returncode}); skipping title extraction.\n")
        return None
    except OSError as exc:
        # The tool disappeared or could not be executed after the check above.
        sys.stderr.write(f"Warning: could not run pdftotext ({exc}); skipping title extraction.\n")
        return None

    head = " ".join(result.stdout.splitlines()[:5])
    head = re.sub(r"\s+", " ", head).strip()
    matches = re.findall(r"[A-Za-z0-9 ]+", head)
    if not matches:
        return None
    # Whitespace is already collapsed to single spaces, so stripping the ends
    # is all the normalisation the winning run still needs.
    candidate = max(matches, key=len).strip()
    if not candidate:
        return None
    return candidate[:50].replace(" ", "_")
def translate_pdf(pdf_path: Path, output_dir: Path, target_path: Path, base_prefix: str) -> bool:
    """Translate one PDF with pdf2zh_next and store the result at ``target_path``.

    The tool is asked to write into ``output_dir``. Afterwards the best
    matching generated file (name starting with the source stem) is moved
    onto ``target_path`` and the remaining outputs for ``base_prefix`` are
    cleaned up.

    Args:
        pdf_path: Source PDF.
        output_dir: Directory pdf2zh_next writes its outputs to.
        target_path: Final location of the translated PDF.
        base_prefix: File-name prefix used to find leftovers to delete.

    Returns:
        True when the translated file ended up at ``target_path``. False when
        the tool failed, no output was found, or the move failed.
    """
    print(f"[translate] {pdf_path.name} -> {target_path.name}")
    # Output is not captured, so the child process inherits stdout/stderr and
    # the user can follow pdf2zh_next progress live.
    result = subprocess.run(
        [*PDF2ZH_CMD_PREFIX, "--output", str(output_dir), str(pdf_path)]
    )
    if result.returncode != 0:
        sys.stderr.write(
            f"pdf2zh_next failed for {pdf_path.name} (exit {result.returncode}).\n"
        )
        return False

    stem = pdf_path.stem

    def rank(candidate: Path) -> tuple:
        """Sort key: no_watermark first, then dual layout, then newest."""
        filename = candidate.name
        return (
            "no_watermark" not in filename,
            "dual" not in filename,
            -candidate.stat().st_mtime,
        )

    candidates = [
        entry for entry in output_dir.glob("*.pdf") if entry.name.startswith(stem)
    ]
    output_file = min(candidates, key=rank) if candidates else None
    if output_file is None:
        sys.stderr.write(f"Translated output for {pdf_path.name} not found in {output_dir}.\n")
        return False

    try:
        output_file.replace(target_path)
    except OSError as exc:
        sys.stderr.write(f"Failed to move output {output_file} -> {target_path}: {exc}\n")
        return False

    leftover_prefixes = [base_prefix, strip_trailing_paren_index(base_prefix)]
    cleanup_extras(leftover_prefixes, output_dir, target_path)
    return True
def main() -> None:
    """Run the CLI: translate each recent PDF that lacks an up-to-date output.

    Exits with status 1 when ``uvx`` is unavailable or the input directory
    does not exist. For every selected source the destination name is
    ``{base}.translated[.{title_slug}].pdf`` inside the output directory.
    A destination at least as new as its source is skipped, after tidying
    leftovers; otherwise the source is (re)translated.
    """
    args = parse_args()
    ensure_tools_available()

    if not args.input_dir.is_dir():
        sys.stderr.write(f"Input directory not found: {args.input_dir}\n")
        sys.exit(1)
    args.output_dir.mkdir(parents=True, exist_ok=True)

    targets = select_latest_by_base(list(find_recent_pdfs(args.input_dir, args.days)))
    if not targets:
        print("No PDFs found within the specified time window.")
        return

    for pdf_path, source_mtime, base_raw, base_clean in targets:
        title_slug = make_title_slug(pdf_path)
        dest_name = (
            f"{base_clean}.translated.{title_slug}.pdf"
            if title_slug
            else f"{base_clean}.translated.pdf"
        )
        dest = args.output_dir / dest_name

        if dest.exists():
            try:
                dest_mtime = dest.stat().st_mtime
            except FileNotFoundError:
                # Removed between exists() and stat(): treat as outdated.
                dest_mtime = 0
            if source_mtime <= dest_mtime:
                cleanup_extras([base_raw, base_clean], args.output_dir, dest)
                print(f"[skip] {dest.name} already exists and is newer or same age.")
                continue
            print(f"[retranslate] {dest.name} is older than source; re-running.")

        translated = translate_pdf(pdf_path, args.output_dir, dest, base_raw)
        print(f"[done] {dest}" if translated else f"[fail] {pdf_path.name}")


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment