Created
December 23, 2025 22:30
-
-
Save suobset/17b82ebf416520cab514cb3925b5ed60 to your computer and use it in GitHub Desktop.
pdf_merger.py - Recursively merge PDFs, images (PNG/JPG), and DOCX files into organized output PDFs with auto-generated cover pages, section dividers, and document dividers. Features chronological sorting by filename dates, gitignore-style exclusion files, and flexible output modes (one PDF per subdirectory or single flat merge). Great for organ…
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| """ | |
| pdf_merger - Recursively merge PDFs in a directory with cover pages and dividers. | |
| ================================================================================ | |
| REQUIREMENTS | |
| ================================================================================ | |
| Python packages: | |
| pip install pypdf reportlab pillow | |
| If you're on a system with "externally managed environment" (Debian 12+, | |
| Ubuntu 23.04+, Fedora 38+), either: | |
| 1. Use a venv: | |
| python3 -m venv .venv && source .venv/bin/activate && pip install ... | |
| 2. Use pipx for isolated installs | |
| 3. Use --break-system-packages (not recommended) | |
| 4. Run in a VM/container as root (what I did lol) | |
| System packages (Debian/Ubuntu): | |
| apt install pandoc texlive-latex-base texlive-latex-extra texlive-fonts-recommended | |
| For LibreOffice alternative (instead of pandoc): | |
| apt install libreoffice | |
| Minimal install for PDF-only (no DOCX conversion): | |
| pip install pypdf reportlab pillow | |
| ================================================================================ | |
| MIT License | |
| Copyright (c) 2025 Kushagra Srivastava | |
| Permission is hereby granted, free of charge, to any person obtaining a copy | |
| of this software and associated documentation files (the "Software"), to deal | |
| in the Software without restriction, including without limitation the rights | |
| to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |
| copies of the Software, and to permit persons to whom the Software is | |
| furnished to do so, subject to the following conditions: | |
| The above copyright notice and this permission notice shall be included in all | |
| copies or substantial portions of the Software. | |
| THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
| IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
| FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
| AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
| LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
| OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |
| SOFTWARE. | |
| """ | |
| from __future__ import annotations | |
| import argparse | |
| import os | |
| import subprocess | |
| import tempfile | |
| import shutil | |
| import logging | |
| import sys | |
| import fnmatch | |
| from pathlib import Path | |
| from datetime import datetime | |
| from io import BytesIO | |
| import re | |
| from typing import Optional, Dict, List, Tuple, Union | |
| from pypdf import PdfWriter, PdfReader | |
| from reportlab.lib.pagesizes import letter | |
| from reportlab.pdfgen import canvas | |
| from PIL import Image | |
| # Suppress pypdf warnings about malformed PDFs | |
| logging.getLogger("pypdf").setLevel(logging.ERROR) | |
# Program version string, shown by --version and in run banners.
VERSION = "1.0.0"

# Short usage text printed for -h/--help (and when no arguments are given).
HELP_SHORT = """\
pdf_merger v{version} - Recursively merge PDFs with cover pages and dividers.
Usage:
pdf_merger.py <input_dir> [options]
pdf_merger.py --help-extended Full documentation
Quick Examples:
pdf_merger.py ./documents # One PDF per subdirectory
pdf_merger.py ./documents --flat # Single merged PDF
pdf_merger.py ./documents -o ./out # Custom output directory
pdf_merger.py ./documents -e .pdfignore # Use exclusion file
pdf_merger.py ./documents -s .pdfsingle # Use single-dirs file
Options:
-o, --output DIR Output directory (default: <input>/merged_output)
-e, --exclude-file FILE File with exclusion patterns (like .gitignore)
-s, --single-file FILE File listing directories to merge individually
--exclude PATH [PATH...] Exclude paths directly via command line
--flat Merge everything into one PDF
--no-covers Skip all cover pages and dividers
--no-doc-dividers Skip per-document dividers (keep section dividers)
--docx-converter NAME 'pandoc' (default) or 'libreoffice'
-v, --verbose Verbose output
--help-extended Show full documentation
--version Show version
Run 'pdf_merger.py --help-extended' for detailed documentation.
""".format(version=VERSION)

# Full documentation printed for --help-extended.
HELP_EXTENDED = """\
pdf_merger v{version} - Recursively merge PDFs with cover pages and dividers.
================================================================================
OVERVIEW
================================================================================
pdf_merger recursively scans a directory for PDF, PNG, JPG, and DOCX files,
converts non-PDFs, and merges them into organized output PDFs with optional
cover pages and dividers.
Default behavior creates one output PDF per top-level subdirectory. Use --flat
to merge everything into a single PDF instead.
================================================================================
USAGE
================================================================================
pdf_merger.py <input_dir> [options]
================================================================================
OPTIONS
================================================================================
Input/Output:
<input_dir> Directory to process (required)
-o, --output DIR Output directory (default: <input>/merged_output)
Exclusion/Inclusion:
-e, --exclude-file FILE Path to exclusion file (gitignore-like syntax)
-s, --single-file FILE Path to single-dirs file (directories to process)
--exclude PATH [PATH...] Exclude paths directly (can use multiple times)
Modes:
--flat Merge all files into a single PDF instead of
creating one PDF per top-level subdirectory
Formatting:
--no-covers Skip all cover pages and dividers
--no-doc-dividers Skip individual document dividers but keep
section dividers for subdirectories
Conversion:
--docx-converter NAME Tool for DOCX to PDF conversion:
'pandoc' (default) - requires pandoc + texlive
'libreoffice' - requires libreoffice
Other:
-v, --verbose Show detailed progress including conversions
-h, --help Show quick help
--help-extended Show this extended documentation
--version Show version number
================================================================================
EXCLUSION FILE FORMAT (-e, --exclude-file)
================================================================================
The exclusion file uses gitignore-like syntax. Each line is a pattern:
# This is a comment
drafts/ # Exclude entire 'drafts' directory
*.tmp # Exclude all .tmp files
old/archive/ # Exclude nested directory
secret.pdf # Exclude specific file
**/temp # Exclude 'temp' anywhere in tree
backup-* # Exclude anything starting with 'backup-'
Rules:
- Blank lines and lines starting with # are ignored
- Patterns ending with / match directories only
- * matches anything except /
- ** matches anything including /
- Patterns are relative to input_dir
Example .pdfignore file:
# Ignore temporary files
*.tmp
*.bak
# Ignore drafts folder
drafts/
# Ignore specific file
notes/todo.docx
================================================================================
SINGLE-DIRS FILE FORMAT (-s, --single-file)
================================================================================
The single-dirs file lists directories that should each become their own merged
PDF. One directory path per line, relative to input_dir.
# This is a comment
Documents/Legal
Documents/Financial
Photos/2024
If provided, ONLY these directories are processed (plus any root files).
If not provided, all top-level subdirectories are processed.
Example .pdfsingle file:
# Only merge these specific directories
Beneficiary
Evidence/Bills
Evidence/Contracts
Financial Records
================================================================================
OUTPUT STRUCTURE
================================================================================
Default mode (one PDF per subdirectory):
input_dir/
├── FolderA/
│ ├── file1.pdf
│ └── SubFolder/
│ └── file2.pdf
└── FolderB/
└── file3.pdf
Creates:
merged_output/
├── FolderA.pdf # Contains file1.pdf, SubFolder/file2.pdf
└── FolderB.pdf # Contains file3.pdf
Flat mode (--flat):
Creates:
merged_output/
└── input_dir.pdf # Contains everything
================================================================================
COVER PAGES AND DIVIDERS
================================================================================
By default, each merged PDF includes:
1. MAIN COVER PAGE
- Title (directory name)
- Generation timestamp
- Total document count
- Full table of contents organized by subdirectory
2. SECTION DIVIDERS
- Inserted before each subdirectory's contents
- Shows subdirectory name and file count
- Lists files in that section
3. DOCUMENT DIVIDERS
- Inserted before each individual document
- Shows filename and parent directory
Use --no-covers to skip all of these.
Use --no-doc-dividers to keep section dividers but skip per-document dividers.
================================================================================
FILE SORTING
================================================================================
Files are sorted chronologically when dates can be parsed from filenames.
Supported date formats:
2024-01-15_report.pdf # ISO format
20240115_report.pdf # Compact ISO
January 2024.pdf # Month Year
Jan_2024_report.pdf # Abbreviated month
report_2024-01-15.pdf # Date anywhere in name
Files without parseable dates are sorted alphabetically after dated files.
================================================================================
SUPPORTED FILE TYPES
================================================================================
Natively supported:
.pdf # Merged directly
Converted to PDF:
.png, .jpg, .jpeg # Via Pillow
.docx # Via pandoc or libreoffice
Files without extensions are checked for PDF magic bytes (%PDF).
================================================================================
DEPENDENCIES
================================================================================
Python packages (pip install):
pypdf # PDF reading/writing
reportlab # Cover page generation
pillow # Image conversion
System tools:
pandoc + texlive-latex-extra # For DOCX (default)
OR libreoffice # For DOCX (alternative)
Debian/Ubuntu installation:
apt install pandoc texlive-latex-base texlive-latex-extra
pip install pypdf reportlab pillow
================================================================================
EXAMPLES
================================================================================
Basic usage:
pdf_merger.py ./my_documents
Custom output directory:
pdf_merger.py ./my_documents -o ./printed_output
Using exclusion file:
pdf_merger.py ./my_documents -e .pdfignore
Using both exclusion and single-dirs files:
pdf_merger.py ./my_documents -e .pdfignore -s .pdfsingle
Exclude via command line:
pdf_merger.py ./my_documents --exclude "drafts" "temp" "*.bak"
Merge everything into one PDF:
pdf_merger.py ./my_documents --flat
Minimal output (no covers):
pdf_merger.py ./my_documents --flat --no-covers
Use LibreOffice for DOCX conversion:
pdf_merger.py ./my_documents --docx-converter libreoffice
Verbose output:
pdf_merger.py ./my_documents -v
================================================================================
LICENSE
================================================================================
MIT License - Copyright (c) 2025 Kushagra Srivastava
See source code header for full license text.
""".format(version=VERSION)
def print_help():
    """Print the short usage text, then exit with status 0."""
    sys.stdout.write(HELP_SHORT + "\n")
    sys.exit(0)
def print_help_extended():
    """Print the full extended documentation, then exit with status 0."""
    sys.stdout.write(HELP_EXTENDED + "\n")
    sys.exit(0)
def print_version():
    """Print the program version, then exit with status 0."""
    print("pdf_merger v" + VERSION)
    sys.exit(0)
def parse_args():
    """Parse command-line arguments.

    The custom --help/--help-extended/--version flags are intercepted
    before argparse runs (the parser is built with add_help=False), so
    each of those paths exits the process directly.

    Returns:
        argparse.Namespace with ``exclude`` flattened to a plain list of
        pattern strings.
    """
    # Handle custom help flags before argparse
    if "--help-extended" in sys.argv:
        print_help_extended()
    if "--version" in sys.argv:
        print_version()
    if len(sys.argv) == 1 or (len(sys.argv) == 2 and sys.argv[1] in ["-h", "--help"]):
        print_help()
    parser = argparse.ArgumentParser(add_help=False)
    parser.add_argument("input_dir", type=Path, nargs="?", default=None)
    parser.add_argument("-o", "--output", type=Path, default=None)
    parser.add_argument("-e", "--exclude-file", type=Path, default=None)
    parser.add_argument("-s", "--single-file", type=Path, default=None)
    # default=None (not []) avoids argparse's shared-mutable-default gotcha
    # with action="append"; None is normalized during flattening below.
    parser.add_argument("--exclude", nargs="*", default=None, action="append")
    parser.add_argument("--flat", action="store_true")
    parser.add_argument("--no-covers", action="store_true")
    parser.add_argument("--no-doc-dividers", action="store_true")
    parser.add_argument("--docx-converter", choices=["pandoc", "libreoffice"], default="pandoc")
    parser.add_argument("-v", "--verbose", action="store_true")
    parser.add_argument("-h", "--help", action="store_true")
    parser.add_argument("--help-extended", action="store_true")
    parser.add_argument("--version", action="store_true")
    args = parser.parse_args()
    if args.help:
        print_help()
    if args.input_dir is None:
        # A missing required argument is an error: report it on stderr and
        # exit non-zero (previously this exited 0, hiding the failure from
        # scripts and shells).
        print("Error: input_dir is required\n", file=sys.stderr)
        print(HELP_SHORT)
        sys.exit(1)
    # action="append" with nargs="*" yields a list of lists; flatten it.
    args.exclude = [pat for group in (args.exclude or []) if group for pat in group]
    return args
def parse_pattern_file(filepath: Path) -> List[str]:
    """Parse a gitignore-like pattern file.

    Blank lines and lines whose first non-whitespace character is '#'
    are skipped; every other line is stripped and kept verbatim (inline
    trailing comments are NOT stripped, matching gitignore semantics).

    Args:
        filepath: Path to the pattern file.

    Returns:
        List of pattern strings; empty if the file does not exist
        (a warning is printed, nothing is raised).
    """
    if not filepath.exists():
        print("Warning: Pattern file not found: {}".format(filepath))
        return []
    patterns = []
    # Explicit encoding so parsing does not depend on the system locale.
    with open(filepath, 'r', encoding='utf-8') as f:
        for raw_line in f:
            line = raw_line.strip()
            # Skip empty lines and comments
            if not line or line.startswith('#'):
                continue
            patterns.append(line)
    return patterns
def extract_date_from_filename(filename: str) -> Tuple[int, Tuple[int, int, int], str]:
    """Extract a date key from a filename for chronological sorting.

    Recognized forms (checked in order):
      * leading ISO date:   2024-01-15_x.pdf / 20240115_x.pdf
      * ISO date anywhere:  x_2024-01-15.pdf
      * month name + year:  "January 2024.pdf", "jan_2024_x.pdf", "2024 jan.pdf"

    Returns:
        (priority, (year, month, day), filename) — priority 0 for dated
        files and 1 for undated ones, so dated files sort first and the
        filename acts as a final alphabetical tiebreaker.
    """
    name = Path(filename).stem.lower()

    def _plausible(month: int, day: int) -> bool:
        # Reject digit runs that cannot be calendar dates (e.g. '20249915'
        # would otherwise parse as month 99) so they fall through to the
        # undated bucket instead of producing a bogus sort key.
        return 1 <= month <= 12 and 1 <= day <= 31

    m = re.match(r'^(\d{4})-?(\d{2})-?(\d{2})', name)
    if m:
        y, mo, d = int(m.group(1)), int(m.group(2)), int(m.group(3))
        if _plausible(mo, d):
            return (0, (y, mo, d), filename)
    m = re.search(r'(\d{4})-(\d{2})-(\d{2})', name)
    if m:
        y, mo, d = int(m.group(1)), int(m.group(2)), int(m.group(3))
        if _plausible(mo, d):
            return (0, (y, mo, d), filename)
    months = {
        'jan': 1, 'january': 1, 'feb': 2, 'february': 2, 'mar': 3, 'march': 3,
        'apr': 4, 'april': 4, 'may': 5, 'jun': 6, 'june': 6, 'jul': 7, 'july': 7,
        'aug': 8, 'august': 8, 'sep': 9, 'sept': 9, 'september': 9,
        'oct': 10, 'october': 10, 'nov': 11, 'november': 11, 'dec': 12, 'december': 12
    }
    for month_name, month_num in months.items():
        # "<month> <year>" — day defaults to the 1st.
        m = re.search(r'{}[\s_]*(\d{{4}})'.format(month_name), name)
        if m:
            return (0, (int(m.group(1)), month_num, 1), filename)
        # "<year> <month>"
        m = re.search(r'(\d{{4}})[\s_]*{}'.format(month_name), name)
        if m:
            return (0, (int(m.group(1)), month_num, 1), filename)
    # No parseable date: sort after all dated files, alphabetically.
    return (1, (9999, 99, 99), filename)
def convert_image_to_pdf(img_path: Path, output_path: Path) -> bool:
    """Convert an image file (PNG/JPG) to a single-page PDF via Pillow.

    Images with transparency (RGBA, LA, or palette) are composited onto a
    white background first, since the PDF output has no alpha channel.

    Args:
        img_path: Source image file.
        output_path: Destination PDF path.

    Returns:
        True on success, False on failure (error is printed, not raised).
    """
    try:
        # Context manager closes the underlying file handle even on error
        # (the original leaked it).
        with Image.open(img_path) as img:
            if img.mode in ('RGBA', 'LA', 'P'):
                rgb_img = Image.new('RGB', img.size, (255, 255, 255))
                if img.mode == 'P':
                    img = img.convert('RGBA')
                if img.mode in ('RGBA', 'LA'):
                    # Last band is alpha for both RGBA and LA; use it as the
                    # paste mask (LA previously lost its alpha channel here).
                    rgb_img.paste(img, mask=img.split()[-1])
                else:
                    rgb_img.paste(img)
                img = rgb_img
            elif img.mode != 'RGB':
                img = img.convert('RGB')
            img.save(str(output_path), 'PDF', resolution=100.0)
        return True
    except Exception as e:
        print(" ERROR converting image {}: {}".format(img_path.name, e))
        return False
def convert_docx_to_pdf(docx_path: Path, output_path: Path, converter: str = "pandoc") -> bool:
    """Convert a .docx file to PDF using an external tool.

    Args:
        docx_path: Source document.
        output_path: Destination PDF path.
        converter: "pandoc" (default) or "libreoffice".

    Returns:
        True on success, False on failure (error is printed, not raised).
    """
    try:
        if converter == "pandoc":
            subprocess.run(
                ["pandoc", str(docx_path), "-o", str(output_path)],
                check=True, capture_output=True
            )
        else:
            # LibreOffice only accepts an output *directory* and names the
            # PDF after the source file, so move it if a different target
            # name was requested.
            out_dir = output_path.parent
            subprocess.run(
                ["libreoffice", "--headless", "--convert-to", "pdf",
                 "--outdir", str(out_dir), str(docx_path)],
                check=True, capture_output=True
            )
            expected = out_dir / (docx_path.stem + ".pdf")
            if expected != output_path and expected.exists():
                shutil.move(str(expected), str(output_path))
        # Guard against converters that exit 0 without producing output.
        return output_path.exists()
    except FileNotFoundError:
        # The converter binary itself is not installed / not on PATH.
        print(" ERROR converting docx {}: '{}' executable not found on PATH".format(
            docx_path.name, converter))
        return False
    except subprocess.CalledProcessError as e:
        stderr = e.stderr.decode() if e.stderr else str(e)
        print(" ERROR converting docx {}: {}".format(docx_path.name, stderr))
        return False
    except Exception as e:
        print(" ERROR converting docx {}: {}".format(docx_path.name, e))
        return False
def create_main_cover(title: str, structure: Dict[str, List[Path]], base_dir: Path) -> bytes:
    """Render the main cover page (title, timestamp, full TOC) as PDF bytes.

    The table of contents lists every file grouped by subdirectory, adding
    extra pages as needed when a section runs past the bottom margin.
    """
    buf = BytesIO()
    page = canvas.Canvas(buf, pagesize=letter)
    page_w, page_h = letter

    # Title block: name, generation timestamp, total document count.
    page.setFont("Helvetica-Bold", 28)
    page.drawCentredString(page_w / 2, page_h - 80, title)
    page.setFont("Helvetica", 11)
    stamp = datetime.now().strftime('%B %d, %Y at %H:%M')
    page.drawCentredString(page_w / 2, page_h - 105, "Generated: {}".format(stamp))
    doc_count = sum(len(entries) for entries in structure.values())
    page.setFont("Helvetica", 12)
    page.drawCentredString(page_w / 2, page_h - 125, "Total: {} documents".format(doc_count))

    # Table of contents, one heading per subdirectory.
    y = page_h - 170
    page.setFont("Helvetica-Bold", 14)
    page.drawString(72, y, "Contents:")
    y -= 25
    for subdir, entries in structure.items():
        if y < 100:
            # Start a fresh page before a heading runs off the bottom.
            page.showPage()
            y = page_h - 72
        page.setFont("Helvetica-Bold", 11)
        heading = subdir if subdir else "(root)"
        page.drawString(72, y, "{} ({} files)".format(heading, len(entries)))
        y -= 18
        page.setFont("Helvetica", 9)
        for entry in entries:
            if y < 72:
                page.showPage()
                y = page_h - 72
                page.setFont("Helvetica", 9)
            page.drawString(100, y, "* {}".format(entry.name))
            y -= 13
        y -= 8
    page.save()
    buf.seek(0)
    return buf.read()
def create_subdir_divider(subdir_name: str, files: List[Path]) -> bytes:
    """Render a one-page section divider for a subdirectory as PDF bytes.

    Shows the subdirectory name, its document count, and up to 15 of its
    filenames, with an "... and N more" line when the list is truncated.
    """
    buf = BytesIO()
    page = canvas.Canvas(buf, pagesize=letter)
    page_w, page_h = letter
    page.setFont("Helvetica-Bold", 24)
    page.drawCentredString(page_w / 2, page_h / 2 + 40, subdir_name)
    page.setFont("Helvetica", 14)
    page.drawCentredString(page_w / 2, page_h / 2, "{} documents".format(len(files)))
    page.setFont("Helvetica", 10)
    y = page_h / 2 - 40
    for entry in files[:15]:
        page.drawCentredString(page_w / 2, y, entry.name)
        y -= 16
    overflow = len(files) - 15
    if overflow > 0:
        page.drawCentredString(page_w / 2, y, "... and {} more".format(overflow))
    page.save()
    buf.seek(0)
    return buf.read()
def create_document_divider(doc_name: str, subdir: Optional[str] = None) -> bytes:
    """Render a one-page divider for an individual document as PDF bytes.

    Filenames longer than 50 characters are split onto two centred lines
    at the last space near the midpoint, falling back to a hard mid-split
    when the name contains no usable space.

    Args:
        doc_name: Filename to display.
        subdir: Optional parent-directory label shown under the title.

    Returns:
        The rendered single-page PDF as bytes.
    """
    buffer = BytesIO()
    c = canvas.Canvas(buffer, pagesize=letter)
    width, height = letter
    c.setFont("Helvetica-Bold", 18)
    if len(doc_name) > 50:
        mid = len(doc_name) // 2
        split_idx = doc_name.rfind(' ', 0, mid + 10)
        if split_idx == -1:
            split_idx = mid  # no space to break on: split mid-word
        line1 = doc_name[:split_idx].strip()
        line2 = doc_name[split_idx:].strip()
        c.drawCentredString(width / 2, height / 2 + 20, line1)
        c.drawCentredString(width / 2, height / 2 - 5, line2)
    else:
        c.drawCentredString(width / 2, height / 2 + 10, doc_name)
    if subdir:
        c.setFont("Helvetica", 11)
        c.drawCentredString(width / 2, height / 2 - 30, "from: {}".format(subdir))
    c.save()
    buffer.seek(0)
    return buffer.read()
def add_pdf_to_writer(writer: PdfWriter, pdf_bytes_or_path: Union[bytes, Path]) -> int:
    """Append every page of a PDF (raw bytes or a file path) to *writer*.

    Returns:
        The number of pages appended, or 0 when the PDF cannot be read —
        the error is printed rather than raised, so one bad file does not
        abort the whole merge.
    """
    try:
        if isinstance(pdf_bytes_or_path, bytes):
            source = BytesIO(pdf_bytes_or_path)
        else:
            source = str(pdf_bytes_or_path)
        pages = PdfReader(source).pages
        for page in pages:
            writer.add_page(page)
        return len(pages)
    except Exception as e:
        print(" ERROR reading PDF: {}".format(e))
        return 0
class PDFMerger:
    """Recursively collects PDF/image/DOCX files under a directory and
    merges them into output PDFs with optional covers and dividers.

    Lifecycle: construct with configuration, then call run(); run()
    creates/cleans a scratch temp directory used for file conversions.
    """

    def __init__(self, input_dir: Path, output_dir: Path,
                 exclude_patterns: List[str], single_dirs: Optional[List[str]],
                 docx_converter: str = "pandoc", no_covers: bool = False,
                 no_doc_dividers: bool = False, verbose: bool = False):
        """Configure a merger run.

        Args:
            input_dir: Root directory to scan (resolved to absolute).
            output_dir: Where merged PDFs are written; auto-added to the
                exclusion patterns when it lies inside input_dir.
                NOTE(review): the caller's exclude_patterns list is kept by
                reference and mutated by that append — confirm no caller
                reuses the list.
            exclude_patterns: gitignore-like patterns relative to input_dir.
            single_dirs: Optional explicit list of directories to process
                (relative paths); None means "all top-level subdirectories".
            docx_converter: "pandoc" or "libreoffice".
            no_covers: Skip all cover pages and dividers.
            no_doc_dividers: Skip per-document dividers only.
            verbose: Print per-file/per-section progress.
        """
        self.input_dir = input_dir.resolve()
        self.output_dir = output_dir.resolve()
        self.exclude_patterns = exclude_patterns
        self.single_dirs = single_dirs
        self.docx_converter = docx_converter
        self.no_covers = no_covers
        self.no_doc_dividers = no_doc_dividers
        self.verbose = verbose
        # Scratch directory for converted files; created in run().
        self.temp_dir: Optional[Path] = None
        # Always exclude output dir
        try:
            rel = self.output_dir.relative_to(self.input_dir)
            self.exclude_patterns.append(str(rel))
        except ValueError:
            # Output dir lives outside input_dir; nothing to exclude.
            pass

    def matches_pattern(self, rel_path: str, pattern: str) -> bool:
        """Check if a relative path matches a gitignore-like pattern."""
        # Handle directory patterns (ending with /)
        is_dir_pattern = pattern.endswith('/')
        if is_dir_pattern:
            pattern = pattern[:-1]
        # Handle ** (matches any path)
        if '**' in pattern:
            # Convert ** to regex-like matching
            # NOTE(review): the second replace() also rewrites the '*' inside
            # the '.*' produced by the first replace, yielding '.[^/]*' — so
            # '**' does NOT actually span multiple path segments as the help
            # text claims, and regex metacharacters like '.' in the pattern
            # are not escaped. Consider a placeholder-based translation.
            regex_pattern = pattern.replace('**', '.*').replace('*', '[^/]*')
            if re.match(regex_pattern, rel_path):
                return True
            # Also check if any parent matches
            parts = rel_path.split('/')
            for i in range(len(parts)):
                partial = '/'.join(parts[:i+1])
                if re.match(regex_pattern, partial):
                    return True
            return False
        # Handle simple glob patterns
        if '*' in pattern or '?' in pattern:
            # Check against full path and basename
            if fnmatch.fnmatch(rel_path, pattern):
                return True
            if fnmatch.fnmatch(Path(rel_path).name, pattern):
                return True
            return False
        # Exact match or prefix match for directories
        if rel_path == pattern:
            return True
        if rel_path.startswith(pattern + '/'):
            return True
        return False

    def is_excluded(self, path: Path) -> bool:
        """Check if a path matches any exclusion pattern."""
        try:
            rel_path = str(path.relative_to(self.input_dir))
        except ValueError:
            # Path lies outside input_dir; patterns are relative, so no match.
            return False
        for pattern in self.exclude_patterns:
            if self.matches_pattern(rel_path, pattern):
                return True
        return False

    def convert_file_if_needed(self, fpath: Path) -> Optional[Path]:
        """Convert non-PDF files, return path to PDF.

        Images and DOCX files are converted into self.temp_dir; PDFs and
        extensionless files are returned unchanged. Returns None when the
        conversion fails or the extension is unsupported.
        """
        ext = fpath.suffix.lower()
        if ext in {'.png', '.jpg', '.jpeg'}:
            out_pdf = self.temp_dir / (fpath.stem + '.pdf')
            if self.verbose:
                print(" Converting image: {}".format(fpath.name))
            if convert_image_to_pdf(fpath, out_pdf):
                return out_pdf
            return None
        elif ext == '.docx':
            out_pdf = self.temp_dir / (fpath.stem + '.pdf')
            if self.verbose:
                print(" Converting docx: {}".format(fpath.name))
            if convert_docx_to_pdf(fpath, out_pdf, self.docx_converter):
                return out_pdf
            return None
        elif ext == '.pdf' or ext == '':
            # Extensionless files were already %PDF-sniffed by
            # collect_files_with_structure before reaching here.
            return fpath
        return None

    def collect_files_with_structure(self, directory: Path) -> Dict[str, List[Path]]:
        """Collect files organized by subdirectory.

        Returns:
            Mapping of subdirectory path (relative to *directory*; "" for
            the root itself) to its date-sorted list of mergeable files.
        """
        structure: Dict[str, List[Path]] = {}
        for root, dirs, filenames in os.walk(directory):
            root_path = Path(root)
            # Prune excluded directories in place so os.walk skips them.
            dirs[:] = [d for d in dirs if not self.is_excluded(root_path / d)]
            rel_to_base = root_path.relative_to(directory)
            subdir_name = str(rel_to_base) if str(rel_to_base) != "." else ""
            files = []
            for fname in filenames:
                if fname.startswith('.'):
                    # Skip hidden files.
                    continue
                fpath = root_path / fname
                if self.is_excluded(fpath):
                    continue
                ext = fpath.suffix.lower()
                if ext in {'.pdf', '.png', '.jpg', '.jpeg', '.docx'}:
                    files.append(fpath)
                elif ext == '':
                    # Extensionless file: accept only if it starts with the
                    # PDF magic bytes.
                    try:
                        with open(fpath, 'rb') as f:
                            if f.read(4) == b'%PDF':
                                files.append(fpath)
                    except:
                        # Best-effort sniff; unreadable files are skipped.
                        pass
            if files:
                # Dated files first (chronologically), undated ones after.
                files.sort(key=lambda p: extract_date_from_filename(p.name))
                structure[subdir_name] = files
        return structure

    def process_directory(self, dir_path: Path, output_name: str) -> Optional[Path]:
        """Process a directory into a single merged PDF with dividers.

        Returns:
            Path of the written "<output_name>.pdf", or None if the
            directory contains no mergeable files.
        """
        print("\n Processing: {}/".format(dir_path.name))
        structure = self.collect_files_with_structure(dir_path)
        if not structure:
            print(" No files found")
            return None
        total_files = sum(len(f) for f in structure.values())
        print(" Found {} files in {} sections".format(total_files, len(structure)))
        writer = PdfWriter()
        if not self.no_covers:
            # Make the directory name read better as a title.
            title = output_name.replace('-', ' - ').replace('_', ' ')
            cover_bytes = create_main_cover(title, structure, dir_path)
            add_pdf_to_writer(writer, cover_bytes)
        # Root-level files ("" key) first, then subdirectories alphabetically.
        sorted_subdirs = sorted(structure.keys(), key=lambda x: (x != "", x))
        for subdir in sorted_subdirs:
            files = structure[subdir]
            subdir_display = subdir if subdir else "(Root Documents)"
            if self.verbose:
                print(" Section: {} ({} files)".format(subdir_display, len(files)))
            # Section divider is skipped when the root is the only section.
            if not self.no_covers and (len(structure) > 1 or subdir != ""):
                divider_bytes = create_subdir_divider(subdir_display, files)
                add_pdf_to_writer(writer, divider_bytes)
            for fpath in files:
                if not self.no_covers and not self.no_doc_dividers:
                    doc_divider = create_document_divider(fpath.name, subdir if subdir else None)
                    add_pdf_to_writer(writer, doc_divider)
                pdf_path = self.convert_file_if_needed(fpath)
                if pdf_path:
                    add_pdf_to_writer(writer, pdf_path)
        output_path = self.output_dir / "{}.pdf".format(output_name)
        with open(output_path, 'wb') as f:
            writer.write(f)
        size_mb = output_path.stat().st_size / (1024 * 1024)
        print(" Created: {} ({:.2f} MB)".format(output_path.name, size_mb))
        return output_path

    def run_flat(self) -> List[Path]:
        """Merge everything into a single PDF."""
        output_name = self.input_dir.name.replace(' ', '_')
        result = self.process_directory(self.input_dir, output_name)
        return [result] if result else []

    def run_split(self) -> List[Path]:
        """Create one PDF per top-level subdirectory or per single_dirs entry.

        Files sitting directly in input_dir (outside any subdirectory) are
        merged into an extra "Root.pdf".
        """
        created_files = []
        # Determine which directories to process
        if self.single_dirs:
            # Use explicit list from single-file
            dirs_to_process = []
            for dir_str in self.single_dirs:
                dir_path = self.input_dir / dir_str
                if dir_path.exists() and dir_path.is_dir():
                    dirs_to_process.append(dir_path)
                else:
                    print(" Warning: Directory not found: {}".format(dir_str))
        else:
            # Default: all top-level subdirectories
            dirs_to_process = [d for d in sorted(self.input_dir.iterdir())
                               if d.is_dir() and not self.is_excluded(d)]
        # Handle files directly in input_dir (not in subdirs)
        root_files = []
        for item in self.input_dir.iterdir():
            if item.is_file() and not item.name.startswith('.') and not self.is_excluded(item):
                ext = item.suffix.lower()
                # NOTE(review): unlike collect_files_with_structure, root
                # files with no extension are not magic-byte sniffed here,
                # so extensionless root PDFs are silently skipped.
                if ext in {'.pdf', '.png', '.jpg', '.jpeg', '.docx'}:
                    root_files.append(item)
        if root_files:
            print("\n Processing: (root files)")
            print(" Found {} files".format(len(root_files)))
            writer = PdfWriter()
            root_files.sort(key=lambda p: extract_date_from_filename(p.name))
            if not self.no_covers:
                structure = {"": root_files}
                title = "{} - Root".format(self.input_dir.name)
                cover_bytes = create_main_cover(title, structure, self.input_dir)
                add_pdf_to_writer(writer, cover_bytes)
            for fpath in root_files:
                if not self.no_covers and not self.no_doc_dividers:
                    doc_divider = create_document_divider(fpath.name)
                    add_pdf_to_writer(writer, doc_divider)
                pdf_path = self.convert_file_if_needed(fpath)
                if pdf_path:
                    add_pdf_to_writer(writer, pdf_path)
            output_path = self.output_dir / "Root.pdf"
            with open(output_path, 'wb') as f:
                writer.write(f)
            size_mb = output_path.stat().st_size / (1024 * 1024)
            print(" Created: {} ({:.2f} MB)".format(output_path.name, size_mb))
            created_files.append(output_path)
        # Process directories
        for subdir in dirs_to_process:
            # Sanitize the directory name into a filesystem-safe PDF name.
            output_name = subdir.name.replace(' ', '_').replace(':', '_')
            # For nested paths from single_dirs, use full relative path
            if self.single_dirs:
                try:
                    rel = subdir.relative_to(self.input_dir)
                    output_name = str(rel).replace('/', '-').replace(' ', '_').replace(':', '_')
                except ValueError:
                    pass
            result = self.process_directory(subdir, output_name)
            if result:
                created_files.append(result)
        return created_files

    def run(self, flat: bool = False) -> List[Path]:
        """Run the merger.

        Creates the output directory and a scratch temp directory, runs
        flat or split mode, always cleans up the temp directory, then
        prints a size summary.

        Args:
            flat: True merges everything into one PDF; False creates one
                PDF per subdirectory (plus Root.pdf for loose files).

        Returns:
            List of created PDF paths (empty if input_dir is missing).
        """
        print("PDF Merger v{}".format(VERSION))
        print("Input: {}".format(self.input_dir))
        print("Output: {}".format(self.output_dir))
        print("=" * 60)
        if not self.input_dir.exists():
            print("ERROR: Input directory not found: {}".format(self.input_dir))
            return []
        self.output_dir.mkdir(parents=True, exist_ok=True)
        self.temp_dir = Path(tempfile.mkdtemp(prefix="pdf_merger_"))
        if self.verbose:
            print("Temp: {}".format(self.temp_dir))
        if self.exclude_patterns:
            print("Exclusions: {}".format(self.exclude_patterns))
        if self.single_dirs:
            print("Single dirs: {}".format(self.single_dirs))
        try:
            if flat:
                created_files = self.run_flat()
            else:
                created_files = self.run_split()
        finally:
            # Temp conversions are always removed, even on error.
            shutil.rmtree(self.temp_dir)
            if self.verbose:
                print("\nTemp files cleaned up.")
        print("\n" + "=" * 60)
        print("SUMMARY: Created {} PDFs".format(len(created_files)))
        print("Location: {}".format(self.output_dir))
        print("=" * 60 + "\n")
        total_size = 0
        for f in sorted(created_files):
            size_mb = f.stat().st_size / (1024 * 1024)
            total_size += size_mb
            print(" {:<50} {:>6.2f} MB".format(f.name, size_mb))
        print(" " + "-" * 58)
        print(" {:<50} {:>6.2f} MB".format("TOTAL", total_size))
        return created_files
def main():
    """Entry point: parse CLI arguments, build a PDFMerger, and run it."""
    args = parse_args()
    output_dir = args.output if args.output else args.input_dir / "merged_output"

    # Combine exclusion patterns from the CLI and the optional pattern file.
    exclude_patterns = list(args.exclude)
    if args.exclude_file:
        exclude_patterns.extend(parse_pattern_file(args.exclude_file))

    # Optional explicit list of directories to merge individually.
    single_dirs = parse_pattern_file(args.single_file) if args.single_file else None

    merger = PDFMerger(
        input_dir=args.input_dir,
        output_dir=output_dir,
        exclude_patterns=exclude_patterns,
        single_dirs=single_dirs,
        docx_converter=args.docx_converter,
        no_covers=args.no_covers,
        no_doc_dividers=args.no_doc_dividers,
        verbose=args.verbose,
    )
    merger.run(flat=args.flat)


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment