Skip to content

Instantly share code, notes, and snippets.

@dantetemplar
Created December 13, 2025 16:25
Show Gist options
  • Select an option

  • Save dantetemplar/7f9787493140b1cf17626540e1ed5de3 to your computer and use it in GitHub Desktop.

Select an option

Save dantetemplar/7f9787493140b1cf17626540e1ed5de3 to your computer and use it in GitHub Desktop.
Testing different approaches to auto-cropping scanned images; DocAligner worked best for my purposes.
"""Auto-crop scanned PDF documents by detecting content boundaries."""
import argparse
import io
import sys
import time
from pathlib import Path
import cv2
import numpy as np
import pymupdf
from PIL import Image
# Module-level cache for the DocAligner model instance. It starts as None and is
# assigned on the first call to detect_content_bbox_docaligner(), which then
# reuses it on later calls instead of constructing a new model each time.
_docaligner_model = None
def detect_content_bbox_docaligner(image: Image.Image) -> tuple[int, int, int, int]:
    """
    Detect document boundaries using DocAligner (https://github.com/DocsaidLab/DocAligner).

    Uses a deep learning model for document corner detection and converts the
    detected corners into an axis-aligned bounding box.

    Args:
        image: Page image to analyse; it is converted to RGB before detection.

    Returns:
        ``(x0, y0, x1, y1)`` in pixel coordinates, clamped to the image bounds.
        The full-image box ``(0, 0, W, H)`` is returned when no corners are
        detected or when the detected box has no area.

    Raises:
        ImportError: If the ``docaligner`` package is not installed.
    """
    global _docaligner_model
    try:
        from docaligner import DocAligner
    except ImportError as err:
        raise ImportError("docaligner package required. Install with: pip install docaligner") from err

    # Lazy-load the model once and reuse it for every subsequent page.
    if _docaligner_model is None:
        _docaligner_model = DocAligner()

    rgb = np.array(image.convert("RGB"))
    H, W = rgb.shape[:2]

    # Convert RGB to BGR for docaligner
    img_bgr = cv2.cvtColor(rgb, cv2.COLOR_RGB2BGR)

    # Get document corners (expected as a 4x2 array: [[x, y], ...])
    corners = _docaligner_model(img_bgr)
    if corners is None or len(corners) == 0:
        return (0, 0, W, H)

    # Normalise to an (N, 2) float array so list results are handled as well.
    points = np.asarray(corners, dtype=float).reshape(-1, 2)

    # Clamp to the image so a corner predicted slightly outside the frame
    # cannot produce a box larger than the page (same policy as the other
    # corner-based detectors in this file).
    x0 = int(np.clip(points[:, 0].min(), 0, W))
    x1 = int(np.clip(points[:, 0].max(), 0, W))
    y0 = int(np.clip(points[:, 1].min(), 0, H))
    y1 = int(np.clip(points[:, 1].max(), 0, H))

    # A zero-area box cannot be used as a crop region; keep the whole page.
    if x1 <= x0 or y1 <= y0:
        return (0, 0, W, H)
    return (x0, y0, x1, y1)
def detect_content_bbox_docscan(image: Image.Image) -> tuple[int, int, int, int]:
    """
    Detect document boundaries using docscan (https://github.com/danielgatis/docscan).

    Uses rembg for background removal and OpenCV contour detection: the image
    is passed through ``docscan.doc.scan`` as PNG bytes, the result is
    Otsu-thresholded, and the bounding rectangle of the largest external
    contour is returned.

    Args:
        image: Page image to analyse.

    Returns:
        ``(x0, y0, x1, y1)`` in pixel coordinates, clamped to the input image
        bounds. The full-image box ``(0, 0, W, H)`` is returned when docscan
        yields no data, the data cannot be decoded, no contour is found, or the
        resulting box has no area.

    Raises:
        ImportError: If the ``docscan`` package is not installed.
    """
    try:
        from docscan.doc import scan
    except ImportError as err:
        raise ImportError("docscan package required. Install with: pip install docscan") from err

    # Only the dimensions are needed here, so read them from the image
    # directly instead of materialising a full RGB array.
    W, H = image.size
    full_page = (0, 0, W, H)

    # Serialise the PIL image to PNG bytes for docscan.
    buffer = io.BytesIO()
    image.save(buffer, format="PNG")

    # Process with docscan (removes background and finds document)
    processed_data = scan(buffer.getvalue())
    # An empty payload would make cv2.imdecode raise, so treat it like None.
    if processed_data is None or len(processed_data) == 0:
        return full_page

    # Decode the processed image
    processed_img = cv2.imdecode(np.frombuffer(processed_data, np.uint8), cv2.IMREAD_COLOR)
    if processed_img is None:
        return full_page

    # Convert to grayscale and find contours
    gray = cv2.cvtColor(processed_img, cv2.COLOR_BGR2GRAY)
    _, thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    if not contours:
        return full_page

    # Bounding rectangle of the largest contour.
    x, y, w, h = cv2.boundingRect(max(contours, key=cv2.contourArea))

    # NOTE(review): the box is measured on docscan's output image; this assumes
    # that output shares the input's pixel grid -- confirm against docscan.
    # Clamping keeps the result inside the original image either way.
    x0 = int(np.clip(x, 0, W))
    y0 = int(np.clip(y, 0, H))
    x1 = int(np.clip(x + w, 0, W))
    y1 = int(np.clip(y + h, 0, H))
    if x1 <= x0 or y1 <= y0:
        return full_page
    return (x0, y0, x1, y1)
def detect_content_bbox_fmeow(image: Image.Image) -> tuple[int, int, int, int]:
    """
    Locate the document with document-scanner (https://github.com/dantetemplar/updated-fMeow-document-scanner).

    The scanner (Hough line transform plus connectivity analysis) is run on the
    HSV value channel first and, if that yields no corners, on the saturation
    channel. The detected corners are reduced to an axis-aligned box clamped to
    the image; when neither channel produces corners the full-image box
    ``(0, 0, W, H)`` is returned.

    Raises:
        ImportError: If the ``doc_scanner`` module cannot be imported.
    """
    try:
        from doc_scanner import scanner
    except ImportError:
        raise ImportError("document-scanner package required. Install with: pip install git+https://github.com/dantetemplar/updated-fMeow-document-scanner")

    pixels = np.array(image.convert("RGB"))
    height, width = pixels.shape[:2]

    # RGB -> BGR -> HSV, matching OpenCV's expected channel order.
    hsv = cv2.cvtColor(cv2.cvtColor(pixels, cv2.COLOR_RGB2BGR), cv2.COLOR_BGR2HSV)

    # Channel 2 (value/intensity) is tried first, channel 1 (saturation) second.
    for channel in (2, 1):
        attempt = scanner(hsv[:, :, channel])
        attempt.scan()
        if attempt.corners is not None:
            points = np.array(attempt.coordinates())
            break
    else:
        # Neither channel produced corners: keep the whole image.
        return (0, 0, width, height)

    # Collapse the corner points into a clamped bounding box.
    xs = points[:, 0]
    ys = points[:, 1]
    left = int(np.clip(np.min(xs), 0, width))
    right = int(np.clip(np.max(xs), 0, width))
    top = int(np.clip(np.min(ys), 0, height))
    bottom = int(np.clip(np.max(ys), 0, height))
    return (left, top, right, bottom)
def detect_content_bbox_endalk(image: Image.Image) -> tuple[int, int, int, int]:
    """
    Locate the document with document-scanner (https://github.com/endalk200/document-scanner).

    That scanner uses LSD (Line Segment Detector) for corner detection. The
    image is downscaled to a height of 500 pixels before the contour is
    requested, and the contour is then scaled back to the original resolution
    and reduced to an axis-aligned box clamped to the image.

    Raises:
        ImportError: If the ``scan`` module cannot be imported.
    """
    try:
        from scan import DocScanner
    except ImportError:
        raise ImportError("document-scanner package required. Install with: pip install document-scanner")

    pixels = np.array(image.convert("RGB"))
    height, width = pixels.shape[:2]

    # OpenCV works with BGR channel order.
    bgr = cv2.cvtColor(pixels, cv2.COLOR_RGB2BGR)

    # Non-interactive scanner instance.
    finder = DocScanner(interactive=False)

    # The contour is requested on a copy rescaled to a fixed height; the
    # factor is kept so the result can be mapped back afterwards.
    target_height = 500.0
    factor = height / target_height
    small = cv2.resize(bgr, (int(width / factor), int(target_height)))

    # Contour on the small image, scaled back up to original pixel units.
    contour = (finder.get_contour(small) * factor).astype(int)

    xs = contour[:, 0]
    ys = contour[:, 1]
    left = int(np.clip(np.min(xs), 0, width))
    right = int(np.clip(np.max(xs), 0, width))
    top = int(np.clip(np.min(ys), 0, height))
    bottom = int(np.clip(np.max(ys), 0, height))
    return (left, top, right, bottom)
# Registry mapping each method name to its bounding-box detection function.
# autocrop_pdf_bytes() looks the requested method up here, and the command-line
# parser takes the allowed --method choices from these keys. Every detector takes
# a PIL image and returns an (x0, y0, x1, y1) tuple.
DETECTION_METHODS = {
"docaligner": detect_content_bbox_docaligner,
"danielgatis": detect_content_bbox_docscan,  # docscan-based detector
"fmeow": detect_content_bbox_fmeow,
"endalk": detect_content_bbox_endalk,
}
def autocrop_pdf_bytes(pdf_bytes: bytes, dpi: int = 300, method: str = "docaligner") -> bytes:
    """Render each page to an image, detect its content box, and crop the page.

    Each page is rasterised at ``dpi``, passed to the selected detector, and
    the detected pixel box is converted to PDF points and applied as the
    page's CropBox. Page content is not re-encoded; only the visible area
    changes. Per-page and total detection timings are printed to stdout.

    Args:
        pdf_bytes: Raw bytes of the source PDF.
        dpi: Resolution used to rasterise pages for detection.
        method: Key into ``DETECTION_METHODS`` selecting the detector.

    Returns:
        The bytes of the PDF with updated crop boxes.

    Raises:
        ValueError: If ``method`` is not a key of ``DETECTION_METHODS``.
    """
    if method not in DETECTION_METHODS:
        raise ValueError(f"Unknown method: {method}. Available methods: {', '.join(DETECTION_METHODS.keys())}")
    detect_func = DETECTION_METHODS[method]

    # Pixel -> PDF point conversion factor (PDF user space is 72 units/inch).
    scale = 72.0 / dpi

    # The context manager closes the document even if detection or saving fails.
    with pymupdf.open(stream=pdf_bytes, filetype="pdf") as src:
        total_pages = len(src)
        total_time = 0.0

        for page_num, page in enumerate(src, start=1):
            pix = page.get_pixmap(dpi=dpi)
            img = Image.open(io.BytesIO(pix.tobytes("png")))

            # Time the detection only (rendering is excluded).
            start_time = time.perf_counter()
            x0, y0, x1, y1 = detect_func(img)
            elapsed = time.perf_counter() - start_time
            total_time += elapsed
            print(f"Page {page_num}/{total_pages}: Detection took {elapsed:.3f}s", flush=True)

            # The pixmap origin corresponds to the top-left of the currently
            # visible area, so offset by the existing crop box before applying.
            # NOTE(review): rotated pages are not handled -- set_cropbox expects
            # unrotated coordinates; confirm whether inputs can be rotated.
            visible = page.cropbox
            crop_rect = pymupdf.Rect(
                visible.x0 + x0 * scale,
                visible.y0 + y0 * scale,
                visible.x0 + x1 * scale,
                visible.y0 + y1 * scale,
            )
            # Pixmap dimensions are rounded to whole pixels, so a full-image
            # box can land slightly outside the page; intersect to stay inside,
            # otherwise set_cropbox raises ValueError.
            crop_rect = crop_rect & visible
            if crop_rect.is_empty:
                print(
                    f"Page {page_num}/{total_pages}: Detected box is empty, leaving page uncropped",
                    file=sys.stderr,
                )
                continue
            page.set_cropbox(crop_rect)  # Modifies the page in place

        # Avoid dividing by zero for a document without pages.
        average = total_time / total_pages if total_pages else 0.0
        print(f"Total detection time: {total_time:.3f}s (average: {average:.3f}s per page)")

        out = io.BytesIO()
        src.save(out, garbage=3, deflate=True)
    return out.getvalue()
def autocrop_pdf(input_path: Path, output_path: Path | None = None, dpi: int = 300, method: str = "docaligner") -> None:
"""Auto-crop all pages in a scanned PDF document."""
if not input_path.exists():
print(f"Error: File not found: {input_path}", file=sys.stderr)
sys.exit(1)
if not input_path.suffix.lower() == ".pdf":
print(f"Error: Input file must be a PDF: {input_path}", file=sys.stderr)
sys.exit(1)
if output_path is None:
output_path = input_path.parent / f"{input_path.stem}_cropped{input_path.suffix}"
print(f"Processing PDF: {input_path} (method: {method})")
pdf_bytes = input_path.read_bytes()
cropped_pdf_bytes = autocrop_pdf_bytes(pdf_bytes, dpi=dpi, method=method)
print(f"Saving cropped PDF to: {output_path}")
output_path.write_bytes(cropped_pdf_bytes)
print(f"Successfully created cropped PDF: {output_path}")
if __name__ == "__main__":
    # Command-line entry point: parse the arguments and crop the given PDF.
    cli = argparse.ArgumentParser(description="Auto-crop scanned PDF documents by detecting content boundaries")
    cli.add_argument("input", type=Path, help="Path to input PDF file")
    cli.add_argument("-o", "--output", type=Path, help="Path to output PDF file (default: input_cropped.pdf)")
    cli.add_argument("-d", "--dpi", type=int, default=300, help="DPI for rendering PDF pages (default: 300)")
    cli.add_argument(
        "-m",
        "--method",
        type=str,
        default="docaligner",
        # Allowed values come straight from the detector registry.
        choices=list(DETECTION_METHODS),
        help="Detection method to use (default: docaligner). Options: docaligner, danielgatis, fmeow, endalk",
    )
    options = cli.parse_args()
    autocrop_pdf(
        input_path=options.input,
        output_path=options.output,
        dpi=options.dpi,
        method=options.method,
    )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment