Created
December 10, 2025 04:20
-
-
Save hotchpotch/9406e30448cb840bf89d7e0231c2419c to your computer and use it in GitHub Desktop.
CLI wrapper script that translates PDFs via pdf2zh_next
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
"""Translate recent PDFs in Downloads using pdf2zh_next via uvx.

Defaults:
- Looks for PDFs downloaded within the last day.
- Saves outputs to ~/Downloads/0_pdf_translated/<name>.translated.pdf
- Skips files that already have a translated output.
- Uses pdf2zh_next with Google Translate, Japanese output, no watermark, and
  alternating bilingual pages.

Only standard library modules are used. The script delegates translation work
to the external `uvx pdf2zh_next` command.
"""
| from __future__ import annotations | |
| import argparse | |
| import re | |
| import shutil | |
| import subprocess | |
| import sys | |
| from datetime import datetime, timedelta | |
| from pathlib import Path | |
| from typing import Iterable, List, Optional, Tuple | |
# Default configuration
DEFAULT_DAYS = 1  # look-back window, in days
DEFAULT_INPUT_DIR = Path.home() / "Downloads"
DEFAULT_OUTPUT_DIR = DEFAULT_INPUT_DIR / "0_pdf_translated"

# Base command and options passed to pdf2zh_next via uvx.
# Keep this near the top so defaults are easy to tweak.
PDF2ZH_CMD_PREFIX: List[str] = [
    "uvx",
    "pdf2zh_next",
    "--google",  # translation backend: Google Translate
    "--lang-out",
    "ja",  # target language: Japanese
    "--watermark-output-mode",
    "no_watermark",
    "--dual-translate-first",
    "--use-alternating-pages-dual",  # bilingual output on alternating pages
]
def parse_args() -> argparse.Namespace:
    """Parse command-line options.

    Returns:
        Namespace with ``days`` (float), ``input_dir`` (Path) and
        ``output_dir`` (Path) attributes.
    """
    parser = argparse.ArgumentParser(
        description="Translate recent PDFs with pdf2zh_next via uvx."
    )
    parser.add_argument(
        "--days",
        type=float,
        default=DEFAULT_DAYS,
        help="Look back this many days from now (default: 1).",
    )
    parser.add_argument(
        "--input-dir",
        type=Path,
        default=DEFAULT_INPUT_DIR,
        help="Directory to scan for PDFs (default: ~/Downloads).",
    )
    parser.add_argument(
        "--output-dir",
        type=Path,
        default=DEFAULT_OUTPUT_DIR,
        # Fixed: the old help text claimed ~/Downloads/pdf2translated, which
        # did not match DEFAULT_OUTPUT_DIR. Keep this in sync with the constant.
        help="Directory to store translated PDFs (default: ~/Downloads/0_pdf_translated).",
    )
    return parser.parse_args()
def strip_trailing_paren_index(stem: str) -> str:
    """Drop a trailing ' (N)' duplicate-download suffix from *stem*."""
    suffix = re.search(r"\s*\(\d+\)\s*$", stem)
    if suffix is None:
        return stem
    return stem[: suffix.start()]
def find_recent_pdfs(directory: Path, days: float) -> Iterable[Path]:
    """Yield PDFs in *directory* modified within the last *days* days, sorted by name."""
    threshold = datetime.now() - timedelta(days=days)
    for candidate in sorted(directory.glob("*.pdf")):
        try:
            stat_result = candidate.stat()
        except FileNotFoundError:
            # File vanished between glob and stat; skip it.
            continue
        if datetime.fromtimestamp(stat_result.st_mtime) >= threshold:
            yield candidate
def select_latest_by_base(paths: Iterable[Path]) -> List[Tuple[Path, float, str, str]]:
    """Collapse duplicate downloads: keep only the newest file per cleaned base name."""
    newest: dict[str, Tuple[Path, float, str, str]] = {}
    for candidate in paths:
        raw_stem = candidate.stem
        clean_stem = strip_trailing_paren_index(raw_stem)
        try:
            modified = candidate.stat().st_mtime
        except FileNotFoundError:
            # File vanished; ignore it.
            continue
        current = newest.get(clean_stem)
        if current is None or modified > current[1]:
            newest[clean_stem] = (candidate, modified, raw_stem, clean_stem)
    return list(newest.values())
def ensure_tools_available() -> None:
    """Exit with status 1 unless the `uvx` launcher is found on PATH."""
    if shutil.which("uvx") is not None:
        return
    sys.stderr.write("uvx is not installed or not in PATH.\n")
    sys.exit(1)
def cleanup_extras(base_prefixes: Iterable[str], output_dir: Path, keep: Path) -> None:
    """Delete stray generated PDFs matching *base_prefixes*, except *keep*.

    Deletion failures are reported on stderr but never raised.
    """
    prefix_tuple = tuple(base_prefixes)
    if not prefix_tuple:
        return
    for candidate in output_dir.glob("*.pdf"):
        if not candidate.name.startswith(prefix_tuple):
            continue
        try:
            # Resolve both sides so symlinks/relative paths compare correctly.
            is_kept = candidate.resolve() == keep.resolve()
        except FileNotFoundError:
            continue
        if is_kept:
            continue
        try:
            candidate.unlink()
        except OSError as exc:
            sys.stderr.write(f"Warning: failed to remove {candidate}: {exc}\n")
def make_title_slug(pdf_path: Path) -> Optional[str]:
    """Derive a filename-safe slug from the first page of *pdf_path*.

    Delegates text extraction to the external ``pdftotext`` tool.
    Returns None when the tool is missing, fails, or yields no usable
    ASCII text.
    """
    if shutil.which("pdftotext") is None:
        sys.stderr.write("Warning: pdftotext not found; skipping title extraction.\n")
        return None
    try:
        completed = subprocess.run(
            ["pdftotext", "-f", "1", "-l", "1", str(pdf_path), "-"],
            capture_output=True,
            text=True,
            check=True,
        )
    except subprocess.CalledProcessError as exc:
        sys.stderr.write(f"Warning: pdftotext failed ({exc.returncode}); skipping title extraction.\n")
        return None
    # Collapse the first five lines into one whitespace-normalized string.
    head = " ".join(completed.stdout.splitlines()[:5])
    head = re.sub(r"\s+", " ", head).strip()
    if not head:
        return None
    # Take the longest run of ASCII alphanumerics/spaces as the title candidate.
    ascii_runs = re.findall(r"[A-Za-z0-9 ]+", head)
    if not ascii_runs:
        return None
    slug = re.sub(r"\s+", " ", max(ascii_runs, key=len)).strip()
    if not slug:
        return None
    return slug[:50].replace(" ", "_")
def translate_pdf(pdf_path: Path, output_dir: Path, target_path: Path, base_prefix: str) -> bool:
    """Run pdf2zh_next on *pdf_path* and move the best output to *target_path*.

    Returns True when the translated file ends up at *target_path*,
    False on any failure (subprocess error, missing output, move failure).
    """
    command = [*PDF2ZH_CMD_PREFIX, "--output", str(output_dir), str(pdf_path)]
    print(f"[translate] {pdf_path.name} -> {target_path.name}")
    # Inherit stdout/stderr so the user sees pdf2zh_next progress live.
    completed = subprocess.run(command)
    if completed.returncode != 0:
        sys.stderr.write(
            f"pdf2zh_next failed for {pdf_path.name} (exit {completed.returncode}).\n"
        )
        return False

    stem = pdf_path.stem

    def rank(candidate: Path) -> tuple:
        # Prefer watermark-free output, then the dual (bilingual) layout,
        # then the most recently written file.
        name = candidate.name
        return (
            0 if "no_watermark" in name else 1,
            0 if "dual" in name else 1,
            -candidate.stat().st_mtime,
        )

    generated = [p for p in output_dir.glob("*.pdf") if p.name.startswith(stem)]
    if not generated:
        sys.stderr.write(f"Translated output for {pdf_path.name} not found in {output_dir}.\n")
        return False
    best = min(generated, key=rank)
    try:
        best.replace(target_path)
    except OSError as exc:
        sys.stderr.write(f"Failed to move output {best} -> {target_path}: {exc}\n")
        return False
    # Tidy up any other intermediate files pdf2zh_next left behind.
    cleanup_extras([base_prefix, strip_trailing_paren_index(base_prefix)], output_dir, target_path)
    return True
def main() -> None:
    """CLI entry point: find recent PDFs and translate any that need it."""
    args = parse_args()
    ensure_tools_available()

    if not args.input_dir.is_dir():
        sys.stderr.write(f"Input directory not found: {args.input_dir}\n")
        sys.exit(1)
    args.output_dir.mkdir(parents=True, exist_ok=True)

    candidates = select_latest_by_base(list(find_recent_pdfs(args.input_dir, args.days)))
    if not candidates:
        print("No PDFs found within the specified time window.")
        return

    for pdf_path, source_mtime, base_raw, base_clean in candidates:
        slug = make_title_slug(pdf_path)
        dest_name = (
            f"{base_clean}.translated.{slug}.pdf" if slug else f"{base_clean}.translated.pdf"
        )
        dest = args.output_dir / dest_name

        if dest.exists():
            try:
                dest_mtime = dest.stat().st_mtime
            except FileNotFoundError:
                dest_mtime = 0
            if source_mtime <= dest_mtime:
                # Output is already up to date; just tidy stray generated files.
                cleanup_extras([base_raw, base_clean], args.output_dir, dest)
                print(f"[skip] {dest.name} already exists and is newer or same age.")
                continue
            print(f"[retranslate] {dest.name} is older than source; re-running.")

        if translate_pdf(pdf_path, args.output_dir, dest, base_raw):
            print(f"[done] {dest}")
        else:
            print(f"[fail] {pdf_path.name}")
# Allow use both as a script and as an importable module.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment