Created
December 23, 2025 22:30
-
-
Save suobset/17b82ebf416520cab514cb3925b5ed60 to your computer and use it in GitHub Desktop.
pdf_merger.py - Recursively merge PDFs, images (PNG/JPG), and DOCX files into organized output PDFs with auto-generated cover pages, section dividers, and document dividers. Features chronological sorting by filename dates, gitignore-style exclusion files, and flexible output modes (one PDF per subdirectory or single flat merge). Great for organ…
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| """ | |
| pdf_merger - Recursively merge PDFs in a directory with cover pages and dividers. | |
| ================================================================================ | |
| REQUIREMENTS | |
| ================================================================================ | |
| Python packages: | |
| pip install pypdf reportlab pillow | |
| If you're on a system with "externally managed environment" (Debian 12+, | |
| Ubuntu 23.04+, Fedora 38+), either: | |
| 1. Use a venv: | |
| python3 -m venv .venv && source .venv/bin/activate && pip install ... | |
| 2. Use pipx for isolated installs | |
| 3. Use --break-system-packages (not recommended) | |
| 4. Run in a VM/container as root (what I did lol) | |
| System packages (Debian/Ubuntu): | |
| apt install pandoc texlive-latex-base texlive-latex-extra texlive-fonts-recommended | |
| For LibreOffice alternative (instead of pandoc): | |
| apt install libreoffice | |
| Minimal install for PDF-only (no DOCX conversion): | |
| pip install pypdf reportlab pillow | |
| ================================================================================ | |
| MIT License | |
| Copyright (c) 2025 Kushagra Srivastava | |
| Permission is hereby granted, free of charge, to any person obtaining a copy | |
| of this software and associated documentation files (the "Software"), to deal | |
| in the Software without restriction, including without limitation the rights | |
| to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |
| copies of the Software, and to permit persons to whom the Software is | |
| furnished to do so, subject to the following conditions: | |
| The above copyright notice and this permission notice shall be included in all | |
| copies or substantial portions of the Software. | |
| THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
| IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
| FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
| AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
| LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
| OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |
| SOFTWARE. | |
| """ | |
| from __future__ import annotations | |
| import argparse | |
| import os | |
| import subprocess | |
| import tempfile | |
| import shutil | |
| import logging | |
| import sys | |
| import fnmatch | |
| from pathlib import Path | |
| from datetime import datetime | |
| from io import BytesIO | |
| import re | |
| from typing import Optional, Dict, List, Tuple, Union | |
| from pypdf import PdfWriter, PdfReader | |
| from reportlab.lib.pagesizes import letter | |
| from reportlab.pdfgen import canvas | |
| from PIL import Image | |
| # Suppress pypdf warnings about malformed PDFs | |
| logging.getLogger("pypdf").setLevel(logging.ERROR) | |
# Program version string, shown by --version and in run banners.
VERSION = "1.0.0"

# Short usage text printed for -h/--help (and when no arguments are given).
HELP_SHORT = """\
pdf_merger v{version} - Recursively merge PDFs with cover pages and dividers.
Usage:
pdf_merger.py <input_dir> [options]
pdf_merger.py --help-extended Full documentation
Quick Examples:
pdf_merger.py ./documents # One PDF per subdirectory
pdf_merger.py ./documents --flat # Single merged PDF
pdf_merger.py ./documents -o ./out # Custom output directory
pdf_merger.py ./documents -e .pdfignore # Use exclusion file
pdf_merger.py ./documents -s .pdfsingle # Use single-dirs file
Options:
-o, --output DIR Output directory (default: <input>/merged_output)
-e, --exclude-file FILE File with exclusion patterns (like .gitignore)
-s, --single-file FILE File listing directories to merge individually
--exclude PATH [PATH...] Exclude paths directly via command line
--flat Merge everything into one PDF
--no-covers Skip all cover pages and dividers
--no-doc-dividers Skip per-document dividers (keep section dividers)
--docx-converter NAME 'pandoc' (default) or 'libreoffice'
-v, --verbose Verbose output
--help-extended Show full documentation
--version Show version
Run 'pdf_merger.py --help-extended' for detailed documentation.
""".format(version=VERSION)

# Full documentation printed for --help-extended.
HELP_EXTENDED = """\
pdf_merger v{version} - Recursively merge PDFs with cover pages and dividers.
================================================================================
OVERVIEW
================================================================================
pdf_merger recursively scans a directory for PDF, PNG, JPG, and DOCX files,
converts non-PDFs, and merges them into organized output PDFs with optional
cover pages and dividers.
Default behavior creates one output PDF per top-level subdirectory. Use --flat
to merge everything into a single PDF instead.
================================================================================
USAGE
================================================================================
pdf_merger.py <input_dir> [options]
================================================================================
OPTIONS
================================================================================
Input/Output:
<input_dir> Directory to process (required)
-o, --output DIR Output directory (default: <input>/merged_output)
Exclusion/Inclusion:
-e, --exclude-file FILE Path to exclusion file (gitignore-like syntax)
-s, --single-file FILE Path to single-dirs file (directories to process)
--exclude PATH [PATH...] Exclude paths directly (can use multiple times)
Modes:
--flat Merge all files into a single PDF instead of
creating one PDF per top-level subdirectory
Formatting:
--no-covers Skip all cover pages and dividers
--no-doc-dividers Skip individual document dividers but keep
section dividers for subdirectories
Conversion:
--docx-converter NAME Tool for DOCX to PDF conversion:
'pandoc' (default) - requires pandoc + texlive
'libreoffice' - requires libreoffice
Other:
-v, --verbose Show detailed progress including conversions
-h, --help Show quick help
--help-extended Show this extended documentation
--version Show version number
================================================================================
EXCLUSION FILE FORMAT (-e, --exclude-file)
================================================================================
The exclusion file uses gitignore-like syntax. Each line is a pattern:
# This is a comment
drafts/ # Exclude entire 'drafts' directory
*.tmp # Exclude all .tmp files
old/archive/ # Exclude nested directory
secret.pdf # Exclude specific file
**/temp # Exclude 'temp' anywhere in tree
backup-* # Exclude anything starting with 'backup-'
Rules:
- Blank lines and lines starting with # are ignored
- Patterns ending with / match directories only
- * matches anything except /
- ** matches anything including /
- Patterns are relative to input_dir
Example .pdfignore file:
# Ignore temporary files
*.tmp
*.bak
# Ignore drafts folder
drafts/
# Ignore specific file
notes/todo.docx
================================================================================
SINGLE-DIRS FILE FORMAT (-s, --single-file)
================================================================================
The single-dirs file lists directories that should each become their own merged
PDF. One directory path per line, relative to input_dir.
# This is a comment
Documents/Legal
Documents/Financial
Photos/2024
If provided, ONLY these directories are processed (plus any root files).
If not provided, all top-level subdirectories are processed.
Example .pdfsingle file:
# Only merge these specific directories
Beneficiary
Evidence/Bills
Evidence/Contracts
Financial Records
================================================================================
OUTPUT STRUCTURE
================================================================================
Default mode (one PDF per subdirectory):
input_dir/
├── FolderA/
│ ├── file1.pdf
│ └── SubFolder/
│ └── file2.pdf
└── FolderB/
└── file3.pdf
Creates:
merged_output/
├── FolderA.pdf # Contains file1.pdf, SubFolder/file2.pdf
└── FolderB.pdf # Contains file3.pdf
Flat mode (--flat):
Creates:
merged_output/
└── input_dir.pdf # Contains everything
================================================================================
COVER PAGES AND DIVIDERS
================================================================================
By default, each merged PDF includes:
1. MAIN COVER PAGE
- Title (directory name)
- Generation timestamp
- Total document count
- Full table of contents organized by subdirectory
2. SECTION DIVIDERS
- Inserted before each subdirectory's contents
- Shows subdirectory name and file count
- Lists files in that section
3. DOCUMENT DIVIDERS
- Inserted before each individual document
- Shows filename and parent directory
Use --no-covers to skip all of these.
Use --no-doc-dividers to keep section dividers but skip per-document dividers.
================================================================================
FILE SORTING
================================================================================
Files are sorted chronologically when dates can be parsed from filenames.
Supported date formats:
2024-01-15_report.pdf # ISO format
20240115_report.pdf # Compact ISO
January 2024.pdf # Month Year
Jan_2024_report.pdf # Abbreviated month
report_2024-01-15.pdf # Date anywhere in name
Files without parseable dates are sorted alphabetically after dated files.
================================================================================
SUPPORTED FILE TYPES
================================================================================
Natively supported:
.pdf # Merged directly
Converted to PDF:
.png, .jpg, .jpeg # Via Pillow
.docx # Via pandoc or libreoffice
Files without extensions are checked for PDF magic bytes (%PDF).
================================================================================
DEPENDENCIES
================================================================================
Python packages (pip install):
pypdf # PDF reading/writing
reportlab # Cover page generation
pillow # Image conversion
System tools:
pandoc + texlive-latex-extra # For DOCX (default)
OR libreoffice # For DOCX (alternative)
Debian/Ubuntu installation:
apt install pandoc texlive-latex-base texlive-latex-extra
pip install pypdf reportlab pillow
================================================================================
EXAMPLES
================================================================================
Basic usage:
pdf_merger.py ./my_documents
Custom output directory:
pdf_merger.py ./my_documents -o ./printed_output
Using exclusion file:
pdf_merger.py ./my_documents -e .pdfignore
Using both exclusion and single-dirs files:
pdf_merger.py ./my_documents -e .pdfignore -s .pdfsingle
Exclude via command line:
pdf_merger.py ./my_documents --exclude "drafts" "temp" "*.bak"
Merge everything into one PDF:
pdf_merger.py ./my_documents --flat
Minimal output (no covers):
pdf_merger.py ./my_documents --flat --no-covers
Use LibreOffice for DOCX conversion:
pdf_merger.py ./my_documents --docx-converter libreoffice
Verbose output:
pdf_merger.py ./my_documents -v
================================================================================
LICENSE
================================================================================
MIT License - Copyright (c) 2025 Kushagra Srivastava
See source code header for full license text.
""".format(version=VERSION)
def print_help():
    """Print the short usage text, then exit with status 0."""
    sys.stdout.write(HELP_SHORT + "\n")
    sys.exit(0)
def print_help_extended():
    """Print the full extended documentation, then exit with status 0."""
    sys.stdout.write(HELP_EXTENDED + "\n")
    sys.exit(0)
def print_version():
    """Print the program version, then exit with status 0."""
    print("pdf_merger v" + VERSION)
    sys.exit(0)
def parse_args():
    """Parse command-line arguments.

    The custom --help/--help-extended/--version flags are intercepted
    before argparse runs (the parser is built with add_help=False), so
    each of those paths exits the process directly.

    Returns:
        argparse.Namespace with ``exclude`` flattened to a plain list of
        pattern strings.
    """
    # Handle custom help flags before argparse
    if "--help-extended" in sys.argv:
        print_help_extended()
    if "--version" in sys.argv:
        print_version()
    if len(sys.argv) == 1 or (len(sys.argv) == 2 and sys.argv[1] in ["-h", "--help"]):
        print_help()
    parser = argparse.ArgumentParser(add_help=False)
    parser.add_argument("input_dir", type=Path, nargs="?", default=None)
    parser.add_argument("-o", "--output", type=Path, default=None)
    parser.add_argument("-e", "--exclude-file", type=Path, default=None)
    parser.add_argument("-s", "--single-file", type=Path, default=None)
    # default=None (not []) avoids argparse's shared-mutable-default gotcha
    # with action="append"; None is normalized during flattening below.
    parser.add_argument("--exclude", nargs="*", default=None, action="append")
    parser.add_argument("--flat", action="store_true")
    parser.add_argument("--no-covers", action="store_true")
    parser.add_argument("--no-doc-dividers", action="store_true")
    parser.add_argument("--docx-converter", choices=["pandoc", "libreoffice"], default="pandoc")
    parser.add_argument("-v", "--verbose", action="store_true")
    parser.add_argument("-h", "--help", action="store_true")
    parser.add_argument("--help-extended", action="store_true")
    parser.add_argument("--version", action="store_true")
    args = parser.parse_args()
    if args.help:
        print_help()
    if args.input_dir is None:
        # A missing required argument is an error: report it on stderr and
        # exit non-zero (previously this exited 0, hiding the failure from
        # scripts and shells).
        print("Error: input_dir is required\n", file=sys.stderr)
        print(HELP_SHORT)
        sys.exit(1)
    # action="append" with nargs="*" yields a list of lists; flatten it.
    args.exclude = [pat for group in (args.exclude or []) if group for pat in group]
    return args
def parse_pattern_file(filepath: Path) -> List[str]:
    """Parse a gitignore-like pattern file.

    Blank lines and lines whose first non-whitespace character is '#'
    are skipped; every other line is stripped and kept verbatim (inline
    trailing comments are NOT stripped, matching gitignore semantics).

    Args:
        filepath: Path to the pattern file.

    Returns:
        List of pattern strings; empty if the file does not exist
        (a warning is printed, nothing is raised).
    """
    if not filepath.exists():
        print("Warning: Pattern file not found: {}".format(filepath))
        return []
    patterns = []
    # Explicit encoding so parsing does not depend on the system locale.
    with open(filepath, 'r', encoding='utf-8') as f:
        for raw_line in f:
            line = raw_line.strip()
            # Skip empty lines and comments
            if not line or line.startswith('#'):
                continue
            patterns.append(line)
    return patterns
def extract_date_from_filename(filename: str) -> Tuple[int, Tuple[int, int, int], str]:
    """Extract a date key from a filename for chronological sorting.

    Recognized forms (checked in order):
      * leading ISO date:   2024-01-15_x.pdf / 20240115_x.pdf
      * ISO date anywhere:  x_2024-01-15.pdf
      * month name + year:  "January 2024.pdf", "jan_2024_x.pdf", "2024 jan.pdf"

    Returns:
        (priority, (year, month, day), filename) — priority 0 for dated
        files and 1 for undated ones, so dated files sort first and the
        filename acts as a final alphabetical tiebreaker.
    """
    name = Path(filename).stem.lower()

    def _plausible(month: int, day: int) -> bool:
        # Reject digit runs that cannot be calendar dates (e.g. '20249915'
        # would otherwise parse as month 99) so they fall through to the
        # undated bucket instead of producing a bogus sort key.
        return 1 <= month <= 12 and 1 <= day <= 31

    m = re.match(r'^(\d{4})-?(\d{2})-?(\d{2})', name)
    if m:
        y, mo, d = int(m.group(1)), int(m.group(2)), int(m.group(3))
        if _plausible(mo, d):
            return (0, (y, mo, d), filename)
    m = re.search(r'(\d{4})-(\d{2})-(\d{2})', name)
    if m:
        y, mo, d = int(m.group(1)), int(m.group(2)), int(m.group(3))
        if _plausible(mo, d):
            return (0, (y, mo, d), filename)
    months = {
        'jan': 1, 'january': 1, 'feb': 2, 'february': 2, 'mar': 3, 'march': 3,
        'apr': 4, 'april': 4, 'may': 5, 'jun': 6, 'june': 6, 'jul': 7, 'july': 7,
        'aug': 8, 'august': 8, 'sep': 9, 'sept': 9, 'september': 9,
        'oct': 10, 'october': 10, 'nov': 11, 'november': 11, 'dec': 12, 'december': 12
    }
    for month_name, month_num in months.items():
        # "<month> <year>" — day defaults to the 1st.
        m = re.search(r'{}[\s_]*(\d{{4}})'.format(month_name), name)
        if m:
            return (0, (int(m.group(1)), month_num, 1), filename)
        # "<year> <month>"
        m = re.search(r'(\d{{4}})[\s_]*{}'.format(month_name), name)
        if m:
            return (0, (int(m.group(1)), month_num, 1), filename)
    # No parseable date: sort after all dated files, alphabetically.
    return (1, (9999, 99, 99), filename)
def convert_image_to_pdf(img_path: Path, output_path: Path) -> bool:
    """Convert an image file (PNG/JPG) to a single-page PDF via Pillow.

    Images with transparency (RGBA, LA, or palette) are composited onto a
    white background first, since the PDF output has no alpha channel.

    Args:
        img_path: Source image file.
        output_path: Destination PDF path.

    Returns:
        True on success, False on failure (error is printed, not raised).
    """
    try:
        # Context manager closes the underlying file handle even on error
        # (the original leaked it).
        with Image.open(img_path) as img:
            if img.mode in ('RGBA', 'LA', 'P'):
                rgb_img = Image.new('RGB', img.size, (255, 255, 255))
                if img.mode == 'P':
                    img = img.convert('RGBA')
                if img.mode in ('RGBA', 'LA'):
                    # Last band is alpha for both RGBA and LA; use it as the
                    # paste mask (LA previously lost its alpha channel here).
                    rgb_img.paste(img, mask=img.split()[-1])
                else:
                    rgb_img.paste(img)
                img = rgb_img
            elif img.mode != 'RGB':
                img = img.convert('RGB')
            img.save(str(output_path), 'PDF', resolution=100.0)
        return True
    except Exception as e:
        print(" ERROR converting image {}: {}".format(img_path.name, e))
        return False
def convert_docx_to_pdf(docx_path: Path, output_path: Path, converter: str = "pandoc") -> bool:
    """Convert a .docx file to PDF using an external tool.

    Args:
        docx_path: Source document.
        output_path: Destination PDF path.
        converter: "pandoc" (default) or "libreoffice".

    Returns:
        True on success, False on failure (error is printed, not raised).
    """
    try:
        if converter == "pandoc":
            subprocess.run(
                ["pandoc", str(docx_path), "-o", str(output_path)],
                check=True, capture_output=True
            )
        else:
            # LibreOffice only accepts an output *directory* and names the
            # PDF after the source file, so move it if a different target
            # name was requested.
            out_dir = output_path.parent
            subprocess.run(
                ["libreoffice", "--headless", "--convert-to", "pdf",
                 "--outdir", str(out_dir), str(docx_path)],
                check=True, capture_output=True
            )
            expected = out_dir / (docx_path.stem + ".pdf")
            if expected != output_path and expected.exists():
                shutil.move(str(expected), str(output_path))
        # Guard against converters that exit 0 without producing output.
        return output_path.exists()
    except FileNotFoundError:
        # The converter binary itself is not installed / not on PATH.
        print(" ERROR converting docx {}: '{}' executable not found on PATH".format(
            docx_path.name, converter))
        return False
    except subprocess.CalledProcessError as e:
        stderr = e.stderr.decode() if e.stderr else str(e)
        print(" ERROR converting docx {}: {}".format(docx_path.name, stderr))
        return False
    except Exception as e:
        print(" ERROR converting docx {}: {}".format(docx_path.name, e))
        return False
def create_main_cover(title: str, structure: Dict[str, List[Path]], base_dir: Path) -> bytes:
    """Render the main cover page (title, timestamp, full TOC) as PDF bytes.

    The table of contents lists every file grouped by subdirectory, adding
    extra pages as needed when a section runs past the bottom margin.
    """
    buf = BytesIO()
    page = canvas.Canvas(buf, pagesize=letter)
    page_w, page_h = letter

    # Title block: name, generation timestamp, total document count.
    page.setFont("Helvetica-Bold", 28)
    page.drawCentredString(page_w / 2, page_h - 80, title)
    page.setFont("Helvetica", 11)
    stamp = datetime.now().strftime('%B %d, %Y at %H:%M')
    page.drawCentredString(page_w / 2, page_h - 105, "Generated: {}".format(stamp))
    doc_count = sum(len(entries) for entries in structure.values())
    page.setFont("Helvetica", 12)
    page.drawCentredString(page_w / 2, page_h - 125, "Total: {} documents".format(doc_count))

    # Table of contents, one heading per subdirectory.
    y = page_h - 170
    page.setFont("Helvetica-Bold", 14)
    page.drawString(72, y, "Contents:")
    y -= 25
    for subdir, entries in structure.items():
        if y < 100:
            # Start a fresh page before a heading runs off the bottom.
            page.showPage()
            y = page_h - 72
        page.setFont("Helvetica-Bold", 11)
        heading = subdir if subdir else "(root)"
        page.drawString(72, y, "{} ({} files)".format(heading, len(entries)))
        y -= 18
        page.setFont("Helvetica", 9)
        for entry in entries:
            if y < 72:
                page.showPage()
                y = page_h - 72
                page.setFont("Helvetica", 9)
            page.drawString(100, y, "* {}".format(entry.name))
            y -= 13
        y -= 8
    page.save()
    buf.seek(0)
    return buf.read()
def create_subdir_divider(subdir_name: str, files: List[Path]) -> bytes:
    """Render a one-page section divider for a subdirectory as PDF bytes.

    Shows the subdirectory name, its document count, and up to 15 of its
    filenames, with an "... and N more" line when the list is truncated.
    """
    buf = BytesIO()
    page = canvas.Canvas(buf, pagesize=letter)
    page_w, page_h = letter
    page.setFont("Helvetica-Bold", 24)
    page.drawCentredString(page_w / 2, page_h / 2 + 40, subdir_name)
    page.setFont("Helvetica", 14)
    page.drawCentredString(page_w / 2, page_h / 2, "{} documents".format(len(files)))
    page.setFont("Helvetica", 10)
    y = page_h / 2 - 40
    for entry in files[:15]:
        page.drawCentredString(page_w / 2, y, entry.name)
        y -= 16
    overflow = len(files) - 15
    if overflow > 0:
        page.drawCentredString(page_w / 2, y, "... and {} more".format(overflow))
    page.save()
    buf.seek(0)
    return buf.read()
def create_document_divider(doc_name: str, subdir: Optional[str] = None) -> bytes:
    """Render a one-page divider for an individual document as PDF bytes.

    Filenames longer than 50 characters are split onto two centred lines
    at the last space near the midpoint, falling back to a hard mid-split
    when the name contains no usable space.

    Args:
        doc_name: Filename to display.
        subdir: Optional parent-directory label shown under the title.

    Returns:
        The rendered single-page PDF as bytes.
    """
    buffer = BytesIO()
    c = canvas.Canvas(buffer, pagesize=letter)
    width, height = letter
    c.setFont("Helvetica-Bold", 18)
    if len(doc_name) > 50:
        mid = len(doc_name) // 2
        split_idx = doc_name.rfind(' ', 0, mid + 10)
        if split_idx == -1:
            split_idx = mid  # no space to break on: split mid-word
        line1 = doc_name[:split_idx].strip()
        line2 = doc_name[split_idx:].strip()
        c.drawCentredString(width / 2, height / 2 + 20, line1)
        c.drawCentredString(width / 2, height / 2 - 5, line2)
    else:
        c.drawCentredString(width / 2, height / 2 + 10, doc_name)
    if subdir:
        c.setFont("Helvetica", 11)
        c.drawCentredString(width / 2, height / 2 - 30, "from: {}".format(subdir))
    c.save()
    buffer.seek(0)
    return buffer.read()
def add_pdf_to_writer(writer: PdfWriter, pdf_bytes_or_path: Union[bytes, Path]) -> int:
    """Append every page of a PDF (raw bytes or a file path) to *writer*.

    Returns:
        The number of pages appended, or 0 when the PDF cannot be read —
        the error is printed rather than raised, so one bad file does not
        abort the whole merge.
    """
    try:
        if isinstance(pdf_bytes_or_path, bytes):
            source = BytesIO(pdf_bytes_or_path)
        else:
            source = str(pdf_bytes_or_path)
        pages = PdfReader(source).pages
        for page in pages:
            writer.add_page(page)
        return len(pages)
    except Exception as e:
        print(" ERROR reading PDF: {}".format(e))
        return 0
class PDFMerger:
    """Recursively collects PDF/image/DOCX files under a directory and
    merges them into output PDFs with optional covers and dividers.

    Lifecycle: construct with configuration, then call run(); run()
    creates/cleans a scratch temp directory used for file conversions.
    """

    def __init__(self, input_dir: Path, output_dir: Path,
                 exclude_patterns: List[str], single_dirs: Optional[List[str]],
                 docx_converter: str = "pandoc", no_covers: bool = False,
                 no_doc_dividers: bool = False, verbose: bool = False):
        """Configure a merger run.

        Args:
            input_dir: Root directory to scan (resolved to absolute).
            output_dir: Where merged PDFs are written; auto-added to the
                exclusion patterns when it lies inside input_dir.
                NOTE(review): the caller's exclude_patterns list is kept by
                reference and mutated by that append — confirm no caller
                reuses the list.
            exclude_patterns: gitignore-like patterns relative to input_dir.
            single_dirs: Optional explicit list of directories to process
                (relative paths); None means "all top-level subdirectories".
            docx_converter: "pandoc" or "libreoffice".
            no_covers: Skip all cover pages and dividers.
            no_doc_dividers: Skip per-document dividers only.
            verbose: Print per-file/per-section progress.
        """
        self.input_dir = input_dir.resolve()
        self.output_dir = output_dir.resolve()
        self.exclude_patterns = exclude_patterns
        self.single_dirs = single_dirs
        self.docx_converter = docx_converter
        self.no_covers = no_covers
        self.no_doc_dividers = no_doc_dividers
        self.verbose = verbose
        # Scratch directory for converted files; created in run().
        self.temp_dir: Optional[Path] = None
        # Always exclude output dir
        try:
            rel = self.output_dir.relative_to(self.input_dir)
            self.exclude_patterns.append(str(rel))
        except ValueError:
            # Output dir lives outside input_dir; nothing to exclude.
            pass

    def matches_pattern(self, rel_path: str, pattern: str) -> bool:
        """Check if a relative path matches a gitignore-like pattern."""
        # Handle directory patterns (ending with /)
        is_dir_pattern = pattern.endswith('/')
        if is_dir_pattern:
            pattern = pattern[:-1]
        # Handle ** (matches any path)
        if '**' in pattern:
            # Convert ** to regex-like matching
            # NOTE(review): the second replace() also rewrites the '*' inside
            # the '.*' produced by the first replace, yielding '.[^/]*' — so
            # '**' does NOT actually span multiple path segments as the help
            # text claims, and regex metacharacters like '.' in the pattern
            # are not escaped. Consider a placeholder-based translation.
            regex_pattern = pattern.replace('**', '.*').replace('*', '[^/]*')
            if re.match(regex_pattern, rel_path):
                return True
            # Also check if any parent matches
            parts = rel_path.split('/')
            for i in range(len(parts)):
                partial = '/'.join(parts[:i+1])
                if re.match(regex_pattern, partial):
                    return True
            return False
        # Handle simple glob patterns
        if '*' in pattern or '?' in pattern:
            # Check against full path and basename
            if fnmatch.fnmatch(rel_path, pattern):
                return True
            if fnmatch.fnmatch(Path(rel_path).name, pattern):
                return True
            return False
        # Exact match or prefix match for directories
        if rel_path == pattern:
            return True
        if rel_path.startswith(pattern + '/'):
            return True
        return False

    def is_excluded(self, path: Path) -> bool:
        """Check if a path matches any exclusion pattern."""
        try:
            rel_path = str(path.relative_to(self.input_dir))
        except ValueError:
            # Path lies outside input_dir; patterns are relative, so no match.
            return False
        for pattern in self.exclude_patterns:
            if self.matches_pattern(rel_path, pattern):
                return True
        return False

    def convert_file_if_needed(self, fpath: Path) -> Optional[Path]:
        """Convert non-PDF files, return path to PDF.

        Images and DOCX files are converted into self.temp_dir; PDFs and
        extensionless files are returned unchanged. Returns None when the
        conversion fails or the extension is unsupported.
        """
        ext = fpath.suffix.lower()
        if ext in {'.png', '.jpg', '.jpeg'}:
            out_pdf = self.temp_dir / (fpath.stem + '.pdf')
            if self.verbose:
                print(" Converting image: {}".format(fpath.name))
            if convert_image_to_pdf(fpath, out_pdf):
                return out_pdf
            return None
        elif ext == '.docx':
            out_pdf = self.temp_dir / (fpath.stem + '.pdf')
            if self.verbose:
                print(" Converting docx: {}".format(fpath.name))
            if convert_docx_to_pdf(fpath, out_pdf, self.docx_converter):
                return out_pdf
            return None
        elif ext == '.pdf' or ext == '':
            # Extensionless files were already %PDF-sniffed by
            # collect_files_with_structure before reaching here.
            return fpath
        return None

    def collect_files_with_structure(self, directory: Path) -> Dict[str, List[Path]]:
        """Collect files organized by subdirectory.

        Returns:
            Mapping of subdirectory path (relative to *directory*; "" for
            the root itself) to its date-sorted list of mergeable files.
        """
        structure: Dict[str, List[Path]] = {}
        for root, dirs, filenames in os.walk(directory):
            root_path = Path(root)
            # Prune excluded directories in place so os.walk skips them.
            dirs[:] = [d for d in dirs if not self.is_excluded(root_path / d)]
            rel_to_base = root_path.relative_to(directory)
            subdir_name = str(rel_to_base) if str(rel_to_base) != "." else ""
            files = []
            for fname in filenames:
                if fname.startswith('.'):
                    # Skip hidden files.
                    continue
                fpath = root_path / fname
                if self.is_excluded(fpath):
                    continue
                ext = fpath.suffix.lower()
                if ext in {'.pdf', '.png', '.jpg', '.jpeg', '.docx'}:
                    files.append(fpath)
                elif ext == '':
                    # Extensionless file: accept only if it starts with the
                    # PDF magic bytes.
                    try:
                        with open(fpath, 'rb') as f:
                            if f.read(4) == b'%PDF':
                                files.append(fpath)
                    except:
                        # Best-effort sniff; unreadable files are skipped.
                        pass
            if files:
                # Dated files first (chronologically), undated ones after.
                files.sort(key=lambda p: extract_date_from_filename(p.name))
                structure[subdir_name] = files
        return structure

    def process_directory(self, dir_path: Path, output_name: str) -> Optional[Path]:
        """Process a directory into a single merged PDF with dividers.

        Returns:
            Path of the written "<output_name>.pdf", or None if the
            directory contains no mergeable files.
        """
        print("\n Processing: {}/".format(dir_path.name))
        structure = self.collect_files_with_structure(dir_path)
        if not structure:
            print(" No files found")
            return None
        total_files = sum(len(f) for f in structure.values())
        print(" Found {} files in {} sections".format(total_files, len(structure)))
        writer = PdfWriter()
        if not self.no_covers:
            # Make the directory name read better as a title.
            title = output_name.replace('-', ' - ').replace('_', ' ')
            cover_bytes = create_main_cover(title, structure, dir_path)
            add_pdf_to_writer(writer, cover_bytes)
        # Root-level files ("" key) first, then subdirectories alphabetically.
        sorted_subdirs = sorted(structure.keys(), key=lambda x: (x != "", x))
        for subdir in sorted_subdirs:
            files = structure[subdir]
            subdir_display = subdir if subdir else "(Root Documents)"
            if self.verbose:
                print(" Section: {} ({} files)".format(subdir_display, len(files)))
            # Section divider is skipped when the root is the only section.
            if not self.no_covers and (len(structure) > 1 or subdir != ""):
                divider_bytes = create_subdir_divider(subdir_display, files)
                add_pdf_to_writer(writer, divider_bytes)
            for fpath in files:
                if not self.no_covers and not self.no_doc_dividers:
                    doc_divider = create_document_divider(fpath.name, subdir if subdir else None)
                    add_pdf_to_writer(writer, doc_divider)
                pdf_path = self.convert_file_if_needed(fpath)
                if pdf_path:
                    add_pdf_to_writer(writer, pdf_path)
        output_path = self.output_dir / "{}.pdf".format(output_name)
        with open(output_path, 'wb') as f:
            writer.write(f)
        size_mb = output_path.stat().st_size / (1024 * 1024)
        print(" Created: {} ({:.2f} MB)".format(output_path.name, size_mb))
        return output_path

    def run_flat(self) -> List[Path]:
        """Merge everything into a single PDF."""
        output_name = self.input_dir.name.replace(' ', '_')
        result = self.process_directory(self.input_dir, output_name)
        return [result] if result else []

    def run_split(self) -> List[Path]:
        """Create one PDF per top-level subdirectory or per single_dirs entry.

        Files sitting directly in input_dir (outside any subdirectory) are
        merged into an extra "Root.pdf".
        """
        created_files = []
        # Determine which directories to process
        if self.single_dirs:
            # Use explicit list from single-file
            dirs_to_process = []
            for dir_str in self.single_dirs:
                dir_path = self.input_dir / dir_str
                if dir_path.exists() and dir_path.is_dir():
                    dirs_to_process.append(dir_path)
                else:
                    print(" Warning: Directory not found: {}".format(dir_str))
        else:
            # Default: all top-level subdirectories
            dirs_to_process = [d for d in sorted(self.input_dir.iterdir())
                               if d.is_dir() and not self.is_excluded(d)]
        # Handle files directly in input_dir (not in subdirs)
        root_files = []
        for item in self.input_dir.iterdir():
            if item.is_file() and not item.name.startswith('.') and not self.is_excluded(item):
                ext = item.suffix.lower()
                # NOTE(review): unlike collect_files_with_structure, root
                # files with no extension are not magic-byte sniffed here,
                # so extensionless root PDFs are silently skipped.
                if ext in {'.pdf', '.png', '.jpg', '.jpeg', '.docx'}:
                    root_files.append(item)
        if root_files:
            print("\n Processing: (root files)")
            print(" Found {} files".format(len(root_files)))
            writer = PdfWriter()
            root_files.sort(key=lambda p: extract_date_from_filename(p.name))
            if not self.no_covers:
                structure = {"": root_files}
                title = "{} - Root".format(self.input_dir.name)
                cover_bytes = create_main_cover(title, structure, self.input_dir)
                add_pdf_to_writer(writer, cover_bytes)
            for fpath in root_files:
                if not self.no_covers and not self.no_doc_dividers:
                    doc_divider = create_document_divider(fpath.name)
                    add_pdf_to_writer(writer, doc_divider)
                pdf_path = self.convert_file_if_needed(fpath)
                if pdf_path:
                    add_pdf_to_writer(writer, pdf_path)
            output_path = self.output_dir / "Root.pdf"
            with open(output_path, 'wb') as f:
                writer.write(f)
            size_mb = output_path.stat().st_size / (1024 * 1024)
            print(" Created: {} ({:.2f} MB)".format(output_path.name, size_mb))
            created_files.append(output_path)
        # Process directories
        for subdir in dirs_to_process:
            # Sanitize the directory name into a filesystem-safe PDF name.
            output_name = subdir.name.replace(' ', '_').replace(':', '_')
            # For nested paths from single_dirs, use full relative path
            if self.single_dirs:
                try:
                    rel = subdir.relative_to(self.input_dir)
                    output_name = str(rel).replace('/', '-').replace(' ', '_').replace(':', '_')
                except ValueError:
                    pass
            result = self.process_directory(subdir, output_name)
            if result:
                created_files.append(result)
        return created_files

    def run(self, flat: bool = False) -> List[Path]:
        """Run the merger.

        Creates the output directory and a scratch temp directory, runs
        flat or split mode, always cleans up the temp directory, then
        prints a size summary.

        Args:
            flat: True merges everything into one PDF; False creates one
                PDF per subdirectory (plus Root.pdf for loose files).

        Returns:
            List of created PDF paths (empty if input_dir is missing).
        """
        print("PDF Merger v{}".format(VERSION))
        print("Input: {}".format(self.input_dir))
        print("Output: {}".format(self.output_dir))
        print("=" * 60)
        if not self.input_dir.exists():
            print("ERROR: Input directory not found: {}".format(self.input_dir))
            return []
        self.output_dir.mkdir(parents=True, exist_ok=True)
        self.temp_dir = Path(tempfile.mkdtemp(prefix="pdf_merger_"))
        if self.verbose:
            print("Temp: {}".format(self.temp_dir))
        if self.exclude_patterns:
            print("Exclusions: {}".format(self.exclude_patterns))
        if self.single_dirs:
            print("Single dirs: {}".format(self.single_dirs))
        try:
            if flat:
                created_files = self.run_flat()
            else:
                created_files = self.run_split()
        finally:
            # Temp conversions are always removed, even on error.
            shutil.rmtree(self.temp_dir)
            if self.verbose:
                print("\nTemp files cleaned up.")
        print("\n" + "=" * 60)
        print("SUMMARY: Created {} PDFs".format(len(created_files)))
        print("Location: {}".format(self.output_dir))
        print("=" * 60 + "\n")
        total_size = 0
        for f in sorted(created_files):
            size_mb = f.stat().st_size / (1024 * 1024)
            total_size += size_mb
            print(" {:<50} {:>6.2f} MB".format(f.name, size_mb))
        print(" " + "-" * 58)
        print(" {:<50} {:>6.2f} MB".format("TOTAL", total_size))
        return created_files
def main():
    """Entry point: parse CLI arguments, build a PDFMerger, and run it."""
    args = parse_args()
    output_dir = args.output if args.output else args.input_dir / "merged_output"

    # Combine exclusion patterns from the CLI and the optional pattern file.
    exclude_patterns = list(args.exclude)
    if args.exclude_file:
        exclude_patterns.extend(parse_pattern_file(args.exclude_file))

    # Optional explicit list of directories to merge individually.
    single_dirs = parse_pattern_file(args.single_file) if args.single_file else None

    merger = PDFMerger(
        input_dir=args.input_dir,
        output_dir=output_dir,
        exclude_patterns=exclude_patterns,
        single_dirs=single_dirs,
        docx_converter=args.docx_converter,
        no_covers=args.no_covers,
        no_doc_dividers=args.no_doc_dividers,
        verbose=args.verbose,
    )
    merger.run(flat=args.flat)


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment