Skip to content

Instantly share code, notes, and snippets.

@suobset
Created December 23, 2025 22:30
Show Gist options
  • Select an option

  • Save suobset/17b82ebf416520cab514cb3925b5ed60 to your computer and use it in GitHub Desktop.

Select an option

Save suobset/17b82ebf416520cab514cb3925b5ed60 to your computer and use it in GitHub Desktop.
pdf_merger.py - Recursively merge PDFs, images (PNG/JPG), and DOCX files into organized output PDFs with auto-generated cover pages, section dividers, and document dividers. Features chronological sorting by filename dates, gitignore-style exclusion files, and flexible output modes (one PDF per subdirectory or single flat merge). Great for organ…
#!/usr/bin/env python3
"""
pdf_merger - Recursively merge PDFs in a directory with cover pages and dividers.
================================================================================
REQUIREMENTS
================================================================================
Python packages:
pip install pypdf reportlab pillow
If you're on a system with "externally managed environment" (Debian 12+,
Ubuntu 23.04+, Fedora 38+), either:
1. Use a venv:
python3 -m venv .venv && source .venv/bin/activate && pip install ...
2. Use pipx for isolated installs
3. Use --break-system-packages (not recommended)
4. Run in a VM/container as root (what I did lol)
System packages (Debian/Ubuntu):
apt install pandoc texlive-latex-base texlive-latex-extra texlive-fonts-recommended
For LibreOffice alternative (instead of pandoc):
apt install libreoffice
Minimal install for PDF-only (no DOCX conversion):
pip install pypdf reportlab pillow
================================================================================
MIT License
Copyright (c) 2025 Kushagra Srivastava
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
"""
from __future__ import annotations
import argparse
import os
import subprocess
import tempfile
import shutil
import logging
import sys
import fnmatch
from pathlib import Path
from datetime import datetime
from io import BytesIO
import re
from typing import Optional, Dict, List, Tuple, Union
from pypdf import PdfWriter, PdfReader
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas
from PIL import Image
# Suppress pypdf warnings about malformed PDFs
# (only records at ERROR level or above from the "pypdf" logger are emitted).
logging.getLogger("pypdf").setLevel(logging.ERROR)
# Version string interpolated into both help texts and printed by --version
# and by the run banner.
VERSION = "1.0.0"
HELP_SHORT = """\
pdf_merger v{version} - Recursively merge PDFs with cover pages and dividers.
Usage:
pdf_merger.py <input_dir> [options]
pdf_merger.py --help-extended Full documentation
Quick Examples:
pdf_merger.py ./documents # One PDF per subdirectory
pdf_merger.py ./documents --flat # Single merged PDF
pdf_merger.py ./documents -o ./out # Custom output directory
pdf_merger.py ./documents -e .pdfignore # Use exclusion file
pdf_merger.py ./documents -s .pdfsingle # Use single-dirs file
Options:
-o, --output DIR Output directory (default: <input>/merged_output)
-e, --exclude-file FILE File with exclusion patterns (like .gitignore)
-s, --single-file FILE File listing directories to merge individually
--exclude PATH [PATH...] Exclude paths directly via command line
--flat Merge everything into one PDF
--no-covers Skip all cover pages and dividers
--no-doc-dividers Skip per-document dividers (keep section dividers)
--docx-converter NAME 'pandoc' (default) or 'libreoffice'
-v, --verbose Verbose output
--help-extended Show full documentation
--version Show version
Run 'pdf_merger.py --help-extended' for detailed documentation.
""".format(version=VERSION)
HELP_EXTENDED = """\
pdf_merger v{version} - Recursively merge PDFs with cover pages and dividers.
================================================================================
OVERVIEW
================================================================================
pdf_merger recursively scans a directory for PDF, PNG, JPG, and DOCX files,
converts non-PDFs, and merges them into organized output PDFs with optional
cover pages and dividers.
Default behavior creates one output PDF per top-level subdirectory. Use --flat
to merge everything into a single PDF instead.
================================================================================
USAGE
================================================================================
pdf_merger.py <input_dir> [options]
================================================================================
OPTIONS
================================================================================
Input/Output:
<input_dir> Directory to process (required)
-o, --output DIR Output directory (default: <input>/merged_output)
Exclusion/Inclusion:
-e, --exclude-file FILE Path to exclusion file (gitignore-like syntax)
-s, --single-file FILE Path to single-dirs file (directories to process)
--exclude PATH [PATH...] Exclude paths directly (can use multiple times)
Modes:
--flat Merge all files into a single PDF instead of
creating one PDF per top-level subdirectory
Formatting:
--no-covers Skip all cover pages and dividers
--no-doc-dividers Skip individual document dividers but keep
section dividers for subdirectories
Conversion:
--docx-converter NAME Tool for DOCX to PDF conversion:
'pandoc' (default) - requires pandoc + texlive
'libreoffice' - requires libreoffice
Other:
-v, --verbose Show detailed progress including conversions
-h, --help Show quick help
--help-extended Show this extended documentation
--version Show version number
================================================================================
EXCLUSION FILE FORMAT (-e, --exclude-file)
================================================================================
The exclusion file uses gitignore-like syntax. Each line is a pattern:
# This is a comment
drafts/ # Exclude entire 'drafts' directory
*.tmp # Exclude all .tmp files
old/archive/ # Exclude nested directory
secret.pdf # Exclude specific file
**/temp # Exclude 'temp' anywhere in tree
backup-* # Exclude anything starting with 'backup-'
Rules:
- Blank lines and lines starting with # are ignored
- Patterns ending with / match directories only
- * matches anything except /
- ** matches anything including /
- Patterns are relative to input_dir
Example .pdfignore file:
# Ignore temporary files
*.tmp
*.bak
# Ignore drafts folder
drafts/
# Ignore specific file
notes/todo.docx
================================================================================
SINGLE-DIRS FILE FORMAT (-s, --single-file)
================================================================================
The single-dirs file lists directories that should each become their own merged
PDF. One directory path per line, relative to input_dir.
# This is a comment
Documents/Legal
Documents/Financial
Photos/2024
If provided, ONLY these directories are processed (plus any root files).
If not provided, all top-level subdirectories are processed.
Example .pdfsingle file:
# Only merge these specific directories
Beneficiary
Evidence/Bills
Evidence/Contracts
Financial Records
================================================================================
OUTPUT STRUCTURE
================================================================================
Default mode (one PDF per subdirectory):
input_dir/
├── FolderA/
│ ├── file1.pdf
│ └── SubFolder/
│ └── file2.pdf
└── FolderB/
└── file3.pdf
Creates:
merged_output/
├── FolderA.pdf # Contains file1.pdf, SubFolder/file2.pdf
└── FolderB.pdf # Contains file3.pdf
Flat mode (--flat):
Creates:
merged_output/
└── input_dir.pdf # Contains everything
================================================================================
COVER PAGES AND DIVIDERS
================================================================================
By default, each merged PDF includes:
1. MAIN COVER PAGE
- Title (directory name)
- Generation timestamp
- Total document count
- Full table of contents organized by subdirectory
2. SECTION DIVIDERS
- Inserted before each subdirectory's contents
- Shows subdirectory name and file count
- Lists files in that section
3. DOCUMENT DIVIDERS
- Inserted before each individual document
- Shows filename and parent directory
Use --no-covers to skip all of these.
Use --no-doc-dividers to keep section dividers but skip per-document dividers.
================================================================================
FILE SORTING
================================================================================
Files are sorted chronologically when dates can be parsed from filenames.
Supported date formats:
2024-01-15_report.pdf # ISO format
20240115_report.pdf # Compact ISO
January 2024.pdf # Month Year
Jan_2024_report.pdf # Abbreviated month
report_2024-01-15.pdf # Date anywhere in name
Files without parseable dates are sorted alphabetically after dated files.
================================================================================
SUPPORTED FILE TYPES
================================================================================
Natively supported:
.pdf # Merged directly
Converted to PDF:
.png, .jpg, .jpeg # Via Pillow
.docx # Via pandoc or libreoffice
Files without extensions are checked for PDF magic bytes (%PDF).
================================================================================
DEPENDENCIES
================================================================================
Python packages (pip install):
pypdf # PDF reading/writing
reportlab # Cover page generation
pillow # Image conversion
System tools:
pandoc + texlive-latex-extra # For DOCX (default)
OR libreoffice # For DOCX (alternative)
Debian/Ubuntu installation:
apt install pandoc texlive-latex-base texlive-latex-extra
pip install pypdf reportlab pillow
================================================================================
EXAMPLES
================================================================================
Basic usage:
pdf_merger.py ./my_documents
Custom output directory:
pdf_merger.py ./my_documents -o ./printed_output
Using exclusion file:
pdf_merger.py ./my_documents -e .pdfignore
Using both exclusion and single-dirs files:
pdf_merger.py ./my_documents -e .pdfignore -s .pdfsingle
Exclude via command line:
pdf_merger.py ./my_documents --exclude "drafts" "temp" "*.bak"
Merge everything into one PDF:
pdf_merger.py ./my_documents --flat
Minimal output (no covers):
pdf_merger.py ./my_documents --flat --no-covers
Use LibreOffice for DOCX conversion:
pdf_merger.py ./my_documents --docx-converter libreoffice
Verbose output:
pdf_merger.py ./my_documents -v
================================================================================
LICENSE
================================================================================
MIT License - Copyright (c) 2025 Kushagra Srivastava
See source code header for full license text.
""".format(version=VERSION)
def print_help():
    """Write the short usage summary to stdout, then exit with status 0."""
    sys.stdout.write(HELP_SHORT + "\n")
    raise SystemExit(0)
def print_help_extended():
    """Write the full documentation to stdout, then exit with status 0."""
    sys.stdout.write(HELP_EXTENDED + "\n")
    raise SystemExit(0)
def print_version():
    """Write the program name and version to stdout, then exit with status 0."""
    banner = "pdf_merger v{}".format(VERSION)
    sys.stdout.write(banner + "\n")
    raise SystemExit(0)
def parse_args():
    """Parse the command line and return the populated namespace.

    The custom flags ``--help-extended``, ``--version`` and a lone
    ``-h``/``--help`` are handled before argparse runs so that they work even
    when the rest of the command line is incomplete.

    Returns:
        argparse.Namespace whose ``exclude`` attribute is a flat list of
        strings (every ``--exclude`` occurrence merged, in order).

    Exits:
        Status 0 after printing help or version information.
        Status 2 (with the message on stderr) when ``input_dir`` is missing,
        so calling scripts can tell a usage error from success.
    """
    argv = sys.argv[1:]

    # Handle custom help flags before argparse
    if "--help-extended" in argv:
        print_help_extended()
    if "--version" in argv:
        print_version()
    if not argv or argv in (["-h"], ["--help"]):
        print_help()

    parser = argparse.ArgumentParser(add_help=False)
    parser.add_argument("input_dir", type=Path, nargs="?", default=None)
    parser.add_argument("-o", "--output", type=Path, default=None)
    parser.add_argument("-e", "--exclude-file", type=Path, default=None)
    parser.add_argument("-s", "--single-file", type=Path, default=None)
    # No mutable default here: with action="append" a missing option simply
    # yields None, which is normalised to an empty list below.
    parser.add_argument("--exclude", nargs="*", action="append", default=None)
    parser.add_argument("--flat", action="store_true")
    parser.add_argument("--no-covers", action="store_true")
    parser.add_argument("--no-doc-dividers", action="store_true")
    parser.add_argument("--docx-converter", choices=["pandoc", "libreoffice"], default="pandoc")
    parser.add_argument("-v", "--verbose", action="store_true")
    parser.add_argument("-h", "--help", action="store_true")
    parser.add_argument("--help-extended", action="store_true")
    parser.add_argument("--version", action="store_true")
    args = parser.parse_args(argv)

    if args.help:
        print_help()
    if args.input_dir is None:
        # A usage error must not be reported as success (exit status 0).
        print("Error: input_dir is required\n", file=sys.stderr)
        print(HELP_SHORT, file=sys.stderr)
        sys.exit(2)

    # Flatten the nested list produced by action="append" + nargs="*"
    args.exclude = [path for group in (args.exclude or []) for path in group]
    return args
def parse_pattern_file(filepath: Path) -> List[str]:
    """Parse a gitignore-like pattern file.

    Args:
        filepath: Path of the text file to read, one pattern per line.

    Returns:
        The patterns in file order, with surrounding whitespace removed.
        Blank lines and lines whose first non-blank character is ``#`` are
        skipped. An empty list is returned (after printing a warning) when
        ``filepath`` is not an existing regular file.
    """
    # is_file() rather than exists(): a directory would make open() raise.
    if not filepath.is_file():
        print("Warning: Pattern file not found: {}".format(filepath))
        return []
    # "utf-8-sig" decodes identically on every platform and drops a leading
    # BOM, which would otherwise hide a '#' or corrupt the first pattern.
    with open(filepath, 'r', encoding='utf-8-sig') as f:
        stripped = (line.strip() for line in f)
        return [line for line in stripped if line and not line.startswith('#')]
# Month names and abbreviations recognised in file names (matched lowercase).
_MONTH_NUMBERS = {
    'jan': 1, 'january': 1, 'feb': 2, 'february': 2, 'mar': 3, 'march': 3,
    'apr': 4, 'april': 4, 'may': 5, 'jun': 6, 'june': 6, 'jul': 7, 'july': 7,
    'aug': 8, 'august': 8, 'sep': 9, 'sept': 9, 'september': 9,
    'oct': 10, 'october': 10, 'nov': 11, 'november': 11, 'dec': 12, 'december': 12
}
# Longest names first so "september" is tried before "sept" and "sep".
_MONTH_ALTERNATION = "|".join(sorted(_MONTH_NUMBERS, key=len, reverse=True))
_DATE_AT_START = re.compile(r'^(\d{4})-?(\d{2})-?(\d{2})')
_DATE_ANYWHERE = re.compile(r'(\d{4})-(\d{2})-(\d{2})')
# The letter look-arounds keep a month from matching inside a longer word
# (e.g. "mar" in "2024_market" or "may" in "dismay 2024").
_MONTH_THEN_YEAR = re.compile(
    r'(?<![a-z])({})[\s_-]*(\d{{4}})'.format(_MONTH_ALTERNATION))
_YEAR_THEN_MONTH = re.compile(
    r'(\d{{4}})[\s_-]*({})(?![a-z])'.format(_MONTH_ALTERNATION))


def _is_real_date(year: int, month: int, day: int) -> bool:
    """Return True when year/month/day form a valid calendar date."""
    try:
        datetime(year, month, day)
    except ValueError:
        return False
    return True


def extract_date_from_filename(filename: str) -> Tuple[int, Tuple[int, int, int], str]:
    """Extract date from filename for chronological sorting.

    Recognised forms, checked in this order on the lowercased stem:
      1. ``YYYY-MM-DD`` or ``YYYYMMDD`` at the start of the name.
      2. ``YYYY-MM-DD`` anywhere in the name.
      3. Month name followed by a four-digit year (``Jan_2024``, ``january 2024``).
      4. Four-digit year followed by a month name (``2024 Jan``).
    Month and year may be separated by spaces, underscores or hyphens.
    Numeric dates that are not real calendar dates are ignored.

    Args:
        filename: File name (with or without extension).

    Returns:
        A sort key ``(rank, (year, month, day), filename)``. ``rank`` is 0 for
        dated names and 1 for undated ones, so undated files sort after dated
        files and then alphabetically by ``filename``. Month-only dates use
        day 1; undated names carry the placeholder ``(9999, 99, 99)``.
    """
    name = Path(filename).stem.lower()

    numeric_hits = []
    leading = _DATE_AT_START.match(name)
    if leading:
        numeric_hits.append(leading)
    numeric_hits.extend(_DATE_ANYWHERE.finditer(name))
    for hit in numeric_hits:
        year, month, day = (int(part) for part in hit.groups())
        if _is_real_date(year, month, day):
            return (0, (year, month, day), filename)

    hit = _MONTH_THEN_YEAR.search(name)
    if hit:
        return (0, (int(hit.group(2)), _MONTH_NUMBERS[hit.group(1)], 1), filename)
    hit = _YEAR_THEN_MONTH.search(name)
    if hit:
        return (0, (int(hit.group(1)), _MONTH_NUMBERS[hit.group(2)], 1), filename)

    return (1, (9999, 99, 99), filename)
def convert_image_to_pdf(img_path: Path, output_path: Path) -> bool:
    """Convert image to PDF using Pillow.

    Images with transparency (modes ``RGBA``, ``LA`` and palette ``P``) are
    composited onto a white background; every other non-RGB mode is converted
    to RGB before saving.

    Args:
        img_path: Image file to read.
        output_path: Destination PDF path.

    Returns:
        True on success. On any failure the error is printed and False is
        returned; no exception propagates.
    """
    try:
        # The context manager releases the file handle once the page is saved.
        with Image.open(img_path) as img:
            if img.mode in ('RGBA', 'LA', 'P'):
                # Normalise to RGBA first so that LA and palette transparency
                # also provide an alpha band to use as the paste mask.
                rgba = img.convert('RGBA')
                page = Image.new('RGB', rgba.size, (255, 255, 255))
                page.paste(rgba, mask=rgba.split()[-1])
            elif img.mode != 'RGB':
                page = img.convert('RGB')
            else:
                page = img
            page.save(str(output_path), 'PDF', resolution=100.0)
        return True
    except Exception as e:
        print(" ERROR converting image {}: {}".format(img_path.name, e))
        return False
def convert_docx_to_pdf(docx_path: Path, output_path: Path, converter: str = "pandoc") -> bool:
    """Convert docx to PDF using pandoc or libreoffice.

    Args:
        docx_path: The .docx file to convert.
        output_path: Where the resulting PDF must end up.
        converter: ``"pandoc"`` runs pandoc; any other value runs libreoffice
            in headless mode.

    Returns:
        True only when the converter ran successfully AND ``output_path``
        exists afterwards. Failures are printed and reported as False; no
        exception propagates.
    """
    try:
        # Remove a stale file first so that a leftover from an earlier
        # conversion can never be mistaken for this run's result.
        if output_path.exists():
            output_path.unlink()
        if converter == "pandoc":
            subprocess.run(
                ["pandoc", str(docx_path), "-o", str(output_path)],
                check=True, capture_output=True
            )
        else:
            out_dir = output_path.parent
            subprocess.run(
                ["libreoffice", "--headless", "--convert-to", "pdf",
                 "--outdir", str(out_dir), str(docx_path)],
                check=True, capture_output=True
            )
            # libreoffice names its output after the input stem; move it to
            # the requested location when the two differ.
            expected = out_dir / (docx_path.stem + ".pdf")
            if expected != output_path and expected.exists():
                shutil.move(str(expected), str(output_path))
    except subprocess.CalledProcessError as e:
        # errors="replace": undecodable converter output must not raise here.
        stderr = e.stderr.decode(errors="replace") if e.stderr else str(e)
        print(" ERROR converting docx {}: {}".format(docx_path.name, stderr))
        return False
    except Exception as e:
        print(" ERROR converting docx {}: {}".format(docx_path.name, e))
        return False
    # A zero exit status does not prove that a PDF was actually written.
    if not output_path.exists():
        print(" ERROR converting docx {}: {}".format(
            docx_path.name, "converter produced no output file"))
        return False
    return True
def create_main_cover(title: str, structure: Dict[str, List[Path]], base_dir: Path) -> bytes:
    """Create main cover page with full document listing.

    The cover shows the title, a generation timestamp, the total number of
    files and a table of contents grouped by section, continuing onto extra
    pages when the listing does not fit.

    Args:
        title: Heading printed at the top of the first page.
        structure: Mapping of section name ("" for the root) to its files.
        base_dir: Accepted for interface compatibility; not used here.

    Returns:
        The rendered PDF as bytes.
    """
    buffer = BytesIO()
    c = canvas.Canvas(buffer, pagesize=letter)
    width, height = letter

    c.setFont("Helvetica-Bold", 28)
    c.drawCentredString(width / 2, height - 80, title)
    c.setFont("Helvetica", 11)
    c.drawCentredString(width / 2, height - 105, "Generated: {}".format(
        datetime.now().strftime('%B %d, %Y at %H:%M')))
    total = sum(len(files) for files in structure.values())
    c.setFont("Helvetica", 12)
    c.drawCentredString(width / 2, height - 125, "Total: {} documents".format(total))

    y = height - 170
    c.setFont("Helvetica-Bold", 14)
    c.drawString(72, y, "Contents:")
    y -= 25

    # List sections root-first, then alphabetically: the same key that
    # PDFMerger.process_directory sorts by, so the contents page follows the
    # order in which the sections appear in the merged document.
    for subdir in sorted(structure, key=lambda name: (name != "", name)):
        files = structure[subdir]
        # Start a new page early so a section heading is not left stranded
        # at the very bottom of a page.
        if y < 100:
            c.showPage()
            y = height - 72
        c.setFont("Helvetica-Bold", 11)
        subdir_display = subdir if subdir else "(root)"
        c.drawString(72, y, "{} ({} files)".format(subdir_display, len(files)))
        y -= 18
        c.setFont("Helvetica", 9)
        for f in files:
            if y < 72:
                c.showPage()
                y = height - 72
                # The font is selected again after each page break.
                c.setFont("Helvetica", 9)
            c.drawString(100, y, "* {}".format(f.name))
            y -= 13
        y -= 8

    c.save()
    return buffer.getvalue()
def create_subdir_divider(subdir_name: str, files: List[Path]) -> bytes:
    """Render a one-page section divider and return it as PDF bytes.

    The page shows the section name, the number of files in it and the names
    of at most the first 15 files, followed by a "... and N more" line when
    the section holds more than that.
    """
    max_listed = 15
    out = BytesIO()
    page = canvas.Canvas(out, pagesize=letter)
    page_width, page_height = letter
    center_x = page_width / 2

    page.setFont("Helvetica-Bold", 24)
    page.drawCentredString(center_x, page_height / 2 + 40, subdir_name)
    page.setFont("Helvetica", 14)
    page.drawCentredString(center_x, page_height / 2, "{} documents".format(len(files)))

    page.setFont("Helvetica", 10)
    cursor_y = page_height / 2 - 40
    for entry in files[:max_listed]:
        page.drawCentredString(center_x, cursor_y, entry.name)
        cursor_y -= 16
    overflow = len(files) - max_listed
    if overflow > 0:
        page.drawCentredString(center_x, cursor_y, "... and {} more".format(overflow))

    page.save()
    return out.getvalue()
def create_document_divider(doc_name: str, subdir: str = None) -> bytes:
    """Render a one-page divider announcing a single document.

    Names of up to 50 characters are drawn on one line. Longer names are
    split in two, at the last space found before ``len // 2 + 10`` or, when
    there is none, at the midpoint. When ``subdir`` is given, a smaller
    "from: <subdir>" line is added underneath. Returns the page as PDF bytes.
    """
    out = BytesIO()
    page = canvas.Canvas(out, pagesize=letter)
    page_width, page_height = letter
    center_x = page_width / 2

    page.setFont("Helvetica-Bold", 18)
    if len(doc_name) <= 50:
        title_lines = [(page_height / 2 + 10, doc_name)]
    else:
        half = len(doc_name) // 2
        cut = doc_name.rfind(' ', 0, half + 10)
        if cut == -1:
            cut = half
        title_lines = [
            (page_height / 2 + 20, doc_name[:cut].strip()),
            (page_height / 2 - 5, doc_name[cut:].strip()),
        ]
    for line_y, text in title_lines:
        page.drawCentredString(center_x, line_y, text)

    if subdir:
        page.setFont("Helvetica", 11)
        page.drawCentredString(center_x, page_height / 2 - 30, "from: {}".format(subdir))

    page.save()
    return out.getvalue()
def add_pdf_to_writer(writer: PdfWriter, pdf_bytes_or_path: Union[bytes, Path]) -> int:
    """Add PDF pages to writer. Returns number of pages added.

    Args:
        writer: Destination writer; pages are appended in document order.
        pdf_bytes_or_path: Either the raw bytes of a PDF or a path to one.

    Returns:
        The number of pages that were actually appended. On a read or append
        error the problem is printed (naming the source) and the count reached
        so far is returned; no exception propagates.
    """
    if isinstance(pdf_bytes_or_path, bytes):
        source = BytesIO(pdf_bytes_or_path)
        label = "(in-memory page)"
    else:
        source = str(pdf_bytes_or_path)
        label = source

    added = 0
    try:
        # Materialise the page list before touching the writer so that a file
        # whose page tree cannot be read contributes nothing at all.
        pages = list(PdfReader(source).pages)
        for page in pages:
            writer.add_page(page)
            added += 1
    except Exception as e:
        print(" ERROR reading PDF {}: {}".format(label, e))
    return added
class PDFMerger:
    """Merge the PDF, image and DOCX files below a directory into output PDFs.

    Depending on the mode passed to :meth:`run`, either one PDF per
    subdirectory (plus ``Root.pdf`` for files directly in the input directory)
    or a single PDF for the whole tree is written to ``output_dir``.
    """

    # Extensions that are merged directly (.pdf) or after conversion.
    SUPPORTED_EXTENSIONS = frozenset({'.pdf', '.png', '.jpg', '.jpeg', '.docx'})

    def __init__(self, input_dir: Path, output_dir: Path,
                 exclude_patterns: List[str], single_dirs: Optional[List[str]],
                 docx_converter: str = "pandoc", no_covers: bool = False,
                 no_doc_dividers: bool = False, verbose: bool = False):
        """Store the configuration.

        Args:
            input_dir: Directory tree to scan.
            output_dir: Directory that receives the merged PDFs.
            exclude_patterns: gitignore-like patterns relative to ``input_dir``.
            single_dirs: Directories (relative to ``input_dir``) to merge one
                by one, or None to use every top-level subdirectory.
            docx_converter: ``"pandoc"`` or ``"libreoffice"``.
            no_covers: Skip the cover page and all dividers.
            no_doc_dividers: Skip per-document dividers only.
            verbose: Print per-section and per-conversion progress.
        """
        self.input_dir = input_dir.resolve()
        self.output_dir = output_dir.resolve()
        # Copy: the output directory is appended below and the caller's list
        # must not change behind its back.
        self.exclude_patterns = list(exclude_patterns)
        self.single_dirs = single_dirs
        self.docx_converter = docx_converter
        self.no_covers = no_covers
        self.no_doc_dividers = no_doc_dividers
        self.verbose = verbose
        self.temp_dir = None
        # Running number that keeps temporary conversion outputs unique.
        self._temp_index = 0
        # Always exclude output dir
        try:
            rel = self.output_dir.relative_to(self.input_dir)
            self.exclude_patterns.append(rel.as_posix())
        except ValueError:
            # Output directory lies outside the input tree: nothing to exclude.
            pass

    @staticmethod
    def _glob_to_regex(pattern: str) -> str:
        """Translate a pattern containing ``**`` into a regular expression.

        ``**/`` matches zero or more leading directories, a remaining ``**``
        matches anything, ``*`` and ``?`` match within one path component and
        every other character is matched literally.
        """
        pieces = []
        i = 0
        while i < len(pattern):
            if pattern.startswith('**/', i):
                pieces.append('(?:.*/)?')
                i += 3
            elif pattern.startswith('**', i):
                pieces.append('.*')
                i += 2
            elif pattern[i] == '*':
                pieces.append('[^/]*')
                i += 1
            elif pattern[i] == '?':
                pieces.append('[^/]')
                i += 1
            else:
                pieces.append(re.escape(pattern[i]))
                i += 1
        return ''.join(pieces)

    def matches_pattern(self, rel_path: str, pattern: str) -> bool:
        """Check if a relative path matches a gitignore-like pattern.

        Args:
            rel_path: Path relative to the input directory, using ``/``.
            pattern: One exclusion pattern.

        Returns:
            True when the path, or one of its parent directories where noted,
            matches:
              * patterns with ``**`` are matched in full against the path and
                each of its parents;
              * other patterns with ``*`` or ``?`` are matched with fnmatch
                against the whole path and against the final component;
              * plain patterns match the identical path and everything
                below it.

        NOTE: a trailing ``/`` is only stripped; a file with the same name as
        the pattern matches too.
        """
        if pattern.endswith('/'):
            pattern = pattern[:-1]

        # Handle ** (matches any path)
        if '**' in pattern:
            regex = re.compile(self._glob_to_regex(pattern))
            parts = rel_path.split('/')
            # Testing every parent lets "**/temp" also exclude "a/temp/x.pdf".
            return any(regex.fullmatch('/'.join(parts[:end]))
                       for end in range(1, len(parts) + 1))

        # Handle simple glob patterns
        if '*' in pattern or '?' in pattern:
            return (fnmatch.fnmatch(rel_path, pattern)
                    or fnmatch.fnmatch(Path(rel_path).name, pattern))

        # Exact match or prefix match for directories
        return rel_path == pattern or rel_path.startswith(pattern + '/')

    def is_excluded(self, path: Path) -> bool:
        """Check if a path matches any exclusion pattern.

        Paths outside the input directory are never excluded.
        """
        try:
            # as_posix(): patterns are written with '/', also on Windows.
            rel_path = path.relative_to(self.input_dir).as_posix()
        except ValueError:
            return False
        return any(self.matches_pattern(rel_path, pattern)
                   for pattern in self.exclude_patterns)

    def _is_mergeable(self, fpath: Path) -> bool:
        """Return True for supported extensions and extension-less PDF files."""
        ext = fpath.suffix.lower()
        if ext in self.SUPPORTED_EXTENSIONS:
            return True
        if ext != '':
            return False
        # No extension: accept the file only if it starts with the PDF magic.
        try:
            with open(fpath, 'rb') as f:
                return f.read(4) == b'%PDF'
        except OSError:
            return False

    def _next_temp_pdf(self, fpath: Path) -> Path:
        """Return a fresh path in the temp directory for a converted file."""
        # The running number keeps "a/scan.png" and "b/scan.jpg" from
        # overwriting each other's output.
        self._temp_index += 1
        return self.temp_dir / "{:06d}_{}.pdf".format(self._temp_index, fpath.stem)

    def convert_file_if_needed(self, fpath: Path) -> Optional[Path]:
        """Convert non-PDF files, return path to PDF.

        Returns:
            ``fpath`` itself for ``.pdf`` and extension-less files, the path of
            the converted temporary PDF for images and DOCX files, or None
            when conversion failed or the extension is not supported.
        """
        ext = fpath.suffix.lower()
        if ext == '.pdf' or ext == '':
            return fpath
        if ext in {'.png', '.jpg', '.jpeg'}:
            out_pdf = self._next_temp_pdf(fpath)
            if self.verbose:
                print(" Converting image: {}".format(fpath.name))
            return out_pdf if convert_image_to_pdf(fpath, out_pdf) else None
        if ext == '.docx':
            out_pdf = self._next_temp_pdf(fpath)
            if self.verbose:
                print(" Converting docx: {}".format(fpath.name))
            if convert_docx_to_pdf(fpath, out_pdf, self.docx_converter):
                return out_pdf
            return None
        return None

    def collect_files_with_structure(self, directory: Path) -> Dict[str, List[Path]]:
        """Collect files organized by subdirectory.

        Returns:
            Mapping of section name (path relative to ``directory`` with ``/``
            separators, "" for ``directory`` itself) to its mergeable files,
            sorted by :func:`extract_date_from_filename`. Sections without
            files are omitted; hidden and excluded entries are skipped.
        """
        structure = {}
        for root, dirs, filenames in os.walk(directory):
            root_path = Path(root)
            # Pruning in place stops os.walk from descending into exclusions.
            dirs[:] = [d for d in dirs if not self.is_excluded(root_path / d)]
            rel_to_base = root_path.relative_to(directory).as_posix()
            subdir_name = rel_to_base if rel_to_base != "." else ""
            files = []
            for fname in filenames:
                if fname.startswith('.'):
                    continue
                fpath = root_path / fname
                if self.is_excluded(fpath):
                    continue
                if self._is_mergeable(fpath):
                    files.append(fpath)
            if files:
                files.sort(key=lambda p: extract_date_from_filename(p.name))
                structure[subdir_name] = files
        return structure

    def _append_documents(self, writer: PdfWriter, files: List[Path],
                          subdir: Optional[str] = None) -> None:
        """Append each file, preceded by its divider page when enabled."""
        for fpath in files:
            # Convert first: a file that cannot be converted must not leave an
            # orphan divider page in the output.
            pdf_path = self.convert_file_if_needed(fpath)
            if pdf_path is None:
                continue
            if not self.no_covers and not self.no_doc_dividers:
                add_pdf_to_writer(writer, create_document_divider(fpath.name, subdir))
            add_pdf_to_writer(writer, pdf_path)

    def _write_output(self, writer: PdfWriter, output_name: str) -> Path:
        """Write the writer to ``<output_dir>/<output_name>.pdf`` and report it."""
        output_path = self.output_dir / "{}.pdf".format(output_name)
        with open(output_path, 'wb') as f:
            writer.write(f)
        size_mb = output_path.stat().st_size / (1024 * 1024)
        print(" Created: {} ({:.2f} MB)".format(output_path.name, size_mb))
        return output_path

    def process_directory(self, dir_path: Path, output_name: str) -> Optional[Path]:
        """Process a directory into a single merged PDF with dividers.

        Returns:
            Path of the written PDF, or None when the directory holds no
            mergeable files.
        """
        print("\n Processing: {}/".format(dir_path.name))
        structure = self.collect_files_with_structure(dir_path)
        if not structure:
            print(" No files found")
            return None
        total_files = sum(len(f) for f in structure.values())
        print(" Found {} files in {} sections".format(total_files, len(structure)))

        writer = PdfWriter()
        if not self.no_covers:
            title = output_name.replace('-', ' - ').replace('_', ' ')
            add_pdf_to_writer(writer, create_main_cover(title, structure, dir_path))

        # Root section first, then the subdirectories in alphabetical order.
        for subdir in sorted(structure, key=lambda x: (x != "", x)):
            files = structure[subdir]
            subdir_display = subdir if subdir else "(Root Documents)"
            if self.verbose:
                print(" Section: {} ({} files)".format(subdir_display, len(files)))
            # A lone root section needs no divider: the cover already names it.
            if not self.no_covers and (len(structure) > 1 or subdir != ""):
                add_pdf_to_writer(writer, create_subdir_divider(subdir_display, files))
            self._append_documents(writer, files, subdir if subdir else None)

        return self._write_output(writer, output_name)

    def run_flat(self) -> List[Path]:
        """Merge everything into a single PDF."""
        output_name = self.input_dir.name.replace(' ', '_')
        result = self.process_directory(self.input_dir, output_name)
        return [result] if result else []

    def run_split(self) -> List[Path]:
        """Create one PDF per top-level subdirectory or per single_dirs entry.

        Files directly inside the input directory are merged into ``Root.pdf``.
        """
        created_files = []

        # Determine which directories to process
        if self.single_dirs:
            # Use explicit list from single-file
            dirs_to_process = []
            for dir_str in self.single_dirs:
                dir_path = self.input_dir / dir_str
                if dir_path.is_dir():
                    dirs_to_process.append(dir_path)
                else:
                    print(" Warning: Directory not found: {}".format(dir_str))
        else:
            # Default: all top-level subdirectories
            dirs_to_process = [d for d in sorted(self.input_dir.iterdir())
                               if d.is_dir() and not self.is_excluded(d)]

        # Handle files directly in input_dir (not in subdirs)
        root_files = [item for item in self.input_dir.iterdir()
                      if item.is_file()
                      and not item.name.startswith('.')
                      and not self.is_excluded(item)
                      and self._is_mergeable(item)]
        if root_files:
            print("\n Processing: (root files)")
            print(" Found {} files".format(len(root_files)))
            writer = PdfWriter()
            root_files.sort(key=lambda p: extract_date_from_filename(p.name))
            if not self.no_covers:
                title = "{} - Root".format(self.input_dir.name)
                cover_bytes = create_main_cover(title, {"": root_files}, self.input_dir)
                add_pdf_to_writer(writer, cover_bytes)
            self._append_documents(writer, root_files)
            created_files.append(self._write_output(writer, "Root"))

        # Process directories
        for subdir in dirs_to_process:
            output_name = subdir.name.replace(' ', '_').replace(':', '_')
            # For nested paths from single_dirs, use full relative path
            if self.single_dirs:
                try:
                    rel = subdir.relative_to(self.input_dir).as_posix()
                    output_name = rel.replace('/', '-').replace(' ', '_').replace(':', '_')
                except ValueError:
                    pass
            result = self.process_directory(subdir, output_name)
            if result:
                created_files.append(result)
        return created_files

    def run(self, flat: bool = False) -> List[Path]:
        """Run the merger.

        Args:
            flat: Merge the whole tree into one PDF instead of one PDF per
                subdirectory.

        Returns:
            Paths of the PDFs that were written; empty when the input
            directory does not exist or nothing was merged.
        """
        print("PDF Merger v{}".format(VERSION))
        print("Input: {}".format(self.input_dir))
        print("Output: {}".format(self.output_dir))
        print("=" * 60)

        # is_dir(): a regular file passed as input cannot be scanned either.
        if not self.input_dir.is_dir():
            print("ERROR: Input directory not found: {}".format(self.input_dir))
            return []

        self.output_dir.mkdir(parents=True, exist_ok=True)
        self.temp_dir = Path(tempfile.mkdtemp(prefix="pdf_merger_"))
        self._temp_index = 0
        if self.verbose:
            print("Temp: {}".format(self.temp_dir))
            if self.exclude_patterns:
                print("Exclusions: {}".format(self.exclude_patterns))
            if self.single_dirs:
                print("Single dirs: {}".format(self.single_dirs))

        try:
            if flat:
                created_files = self.run_flat()
            else:
                created_files = self.run_split()
        finally:
            # A cleanup problem must not hide the outcome of the merge itself.
            shutil.rmtree(self.temp_dir, ignore_errors=True)
            if self.verbose:
                print("\nTemp files cleaned up.")

        print("\n" + "=" * 60)
        print("SUMMARY: Created {} PDFs".format(len(created_files)))
        print("Location: {}".format(self.output_dir))
        print("=" * 60 + "\n")
        total_size = 0
        for f in sorted(created_files):
            size_mb = f.stat().st_size / (1024 * 1024)
            total_size += size_mb
            print(" {:<50} {:>6.2f} MB".format(f.name, size_mb))
        print(" " + "-" * 58)
        print(" {:<50} {:>6.2f} MB".format("TOTAL", total_size))
        return created_files
def main():
    """Command-line entry point: parse the arguments and run the merger.

    The output directory defaults to ``<input_dir>/merged_output``. Exclusion
    patterns given with ``--exclude`` come first, followed by those read from
    the exclusion file. The single-dirs list stays None unless a single-dirs
    file was given.
    """
    cli = parse_args()
    destination = cli.output or (cli.input_dir / "merged_output")

    # Collect exclusion patterns
    exclusions = [*cli.exclude]
    if cli.exclude_file:
        exclusions += parse_pattern_file(cli.exclude_file)

    # Collect single dirs
    wanted_dirs = parse_pattern_file(cli.single_file) if cli.single_file else None

    PDFMerger(
        input_dir=cli.input_dir,
        output_dir=destination,
        exclude_patterns=exclusions,
        single_dirs=wanted_dirs,
        docx_converter=cli.docx_converter,
        no_covers=cli.no_covers,
        no_doc_dividers=cli.no_doc_dividers,
        verbose=cli.verbose,
    ).run(flat=cli.flat)


# Run only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment