Testing different approaches to auto-crop scanned images; the best for my purpose was docaligner.
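For quick comparison, here is a minimal benchmarking sketch (not part of the gist itself): it renders the first page of a PDF and times each detector in the DETECTION_METHODS mapping defined in the script below. It assumes the script is saved locally as autocrop_scan.py (an illustrative module name) and that the optional detector packages are installed.

import io
import time

import pymupdf
from PIL import Image

from autocrop_scan import DETECTION_METHODS  # hypothetical module name for this gist

doc = pymupdf.open("scan.pdf")
pix = doc[0].get_pixmap(dpi=300)
img = Image.open(io.BytesIO(pix.tobytes("png")))

for name, detect in DETECTION_METHODS.items():
    start = time.perf_counter()
    try:
        bbox = detect(img)
    except ImportError as exc:
        # Detector package not installed; skip this method
        print(f"{name}: skipped ({exc})")
        continue
    print(f"{name}: bbox={bbox}, took {time.perf_counter() - start:.3f}s")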
| """Auto-crop scanned PDF documents by detecting content boundaries.""" | |
| import argparse | |
| import io | |
| import sys | |
| import time | |
| from pathlib import Path | |
| import cv2 | |
| import numpy as np | |
| import pymupdf | |
| from PIL import Image | |
| # Global model instance for docaligner (lazy-loaded) | |
| _docaligner_model = None | |
def detect_content_bbox_docaligner(image: Image.Image) -> tuple[int, int, int, int]:
    """
    Detect document boundaries using DocAligner (https://github.com/DocsaidLab/DocAligner).

    Uses a deep learning model for document corner detection.
    """
    global _docaligner_model
    try:
        from docaligner import DocAligner
    except ImportError:
        raise ImportError("docaligner package required. Install with: pip install docaligner")

    # Lazy-load model
    if _docaligner_model is None:
        _docaligner_model = DocAligner()

    rgb = np.array(image.convert("RGB"))
    H, W = rgb.shape[:2]

    # Convert RGB to BGR for docaligner
    img_bgr = cv2.cvtColor(rgb, cv2.COLOR_RGB2BGR)

    # Get document corners
    corners = _docaligner_model(img_bgr)  # 4x2 array: [[x, y], ...]
    if corners is None or len(corners) == 0:
        return (0, 0, W, H)

    # Convert corners to bounding box
    x_coords = corners[:, 0]
    y_coords = corners[:, 1]
    x0 = int(np.min(x_coords))
    x1 = int(np.max(x_coords))
    y0 = int(np.min(y_coords))
    y1 = int(np.max(y_coords))
    return (x0, y0, x1, y1)
def detect_content_bbox_docscan(image: Image.Image) -> tuple[int, int, int, int]:
    """
    Detect document boundaries using docscan (https://github.com/danielgatis/docscan).

    Uses rembg for background removal and OpenCV contour detection.
    """
    try:
        from docscan.doc import scan
    except ImportError:
        raise ImportError("docscan package required. Install with: pip install docscan")

    rgb = np.array(image.convert("RGB"))
    H, W = rgb.shape[:2]

    # Convert PIL Image to bytes
    img_bytes = io.BytesIO()
    image.save(img_bytes, format="PNG")
    img_bytes.seek(0)
    input_data = img_bytes.read()

    # Process with docscan (removes background and finds document)
    processed_data = scan(input_data)
    if processed_data is None:
        return (0, 0, W, H)

    # Decode the processed image
    img_array = np.frombuffer(processed_data, np.uint8)
    processed_img = cv2.imdecode(img_array, cv2.IMREAD_COLOR)
    if processed_img is None:
        return (0, 0, W, H)

    # Convert to grayscale and find contours
    gray = cv2.cvtColor(processed_img, cv2.COLOR_BGR2GRAY)
    _, thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    if not contours:
        return (0, 0, W, H)

    # Get largest contour
    largest_contour = max(contours, key=cv2.contourArea)
    x, y, w, h = cv2.boundingRect(largest_contour)
    return (x, y, x + w, y + h)
def detect_content_bbox_fmeow(image: Image.Image) -> tuple[int, int, int, int]:
    """
    Detect document boundaries using document-scanner
    (https://github.com/dantetemplar/updated-fMeow-document-scanner).

    Uses Hough line transform and connectivity analysis.
    """
    try:
        from doc_scanner import scanner
    except ImportError:
        raise ImportError(
            "document-scanner package required. Install with: "
            "pip install git+https://github.com/dantetemplar/updated-fMeow-document-scanner"
        )

    rgb = np.array(image.convert("RGB"))
    H, W = rgb.shape[:2]

    # Convert RGB to HSV
    img_bgr = cv2.cvtColor(rgb, cv2.COLOR_RGB2BGR)
    hsv = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2HSV)

    # Try intensity channel first
    intensity_scanner = scanner(hsv[:, :, 2])
    intensity_scanner.scan()
    if intensity_scanner.corners is not None:
        corners = intensity_scanner.coordinates()
    else:
        # Fall back to saturation channel
        saturation_scanner = scanner(hsv[:, :, 1])
        saturation_scanner.scan()
        if saturation_scanner.corners is not None:
            corners = saturation_scanner.coordinates()
        else:
            return (0, 0, W, H)

    # Convert corners to bounding box
    corners_array = np.array(corners)
    x_coords = corners_array[:, 0]
    y_coords = corners_array[:, 1]
    x0 = int(np.clip(np.min(x_coords), 0, W))
    x1 = int(np.clip(np.max(x_coords), 0, W))
    y0 = int(np.clip(np.min(y_coords), 0, H))
    y1 = int(np.clip(np.max(y_coords), 0, H))
    return (x0, y0, x1, y1)
def detect_content_bbox_endalk(image: Image.Image) -> tuple[int, int, int, int]:
    """
    Detect document boundaries using document-scanner (https://github.com/endalk200/document-scanner).

    Uses LSD (Line Segment Detector) for corner detection.
    """
    try:
        from scan import DocScanner
    except ImportError:
        raise ImportError("document-scanner package required. Install with: pip install document-scanner")

    rgb = np.array(image.convert("RGB"))
    H, W = rgb.shape[:2]

    # Convert RGB to BGR for OpenCV
    img_bgr = cv2.cvtColor(rgb, cv2.COLOR_RGB2BGR)

    # Create scanner instance (non-interactive)
    doc_scanner = DocScanner(interactive=False)

    # The scanner expects an image rescaled to a fixed height, so resize first
    RESCALED_HEIGHT = 500.0
    ratio = H / RESCALED_HEIGHT
    rescaled = cv2.resize(img_bgr, (int(W / ratio), int(RESCALED_HEIGHT)))

    # Get the document contour and scale it back to the original size
    screen_cnt = doc_scanner.get_contour(rescaled)
    screen_cnt = (screen_cnt * ratio).astype(int)

    # Convert to bounding box
    x_coords = screen_cnt[:, 0]
    y_coords = screen_cnt[:, 1]
    x0 = int(np.clip(np.min(x_coords), 0, W))
    x1 = int(np.clip(np.max(x_coords), 0, W))
    y0 = int(np.clip(np.min(y_coords), 0, H))
    y1 = int(np.clip(np.max(y_coords), 0, H))
    return (x0, y0, x1, y1)
# Method mapping for detection functions
DETECTION_METHODS = {
    "docaligner": detect_content_bbox_docaligner,
    "danielgatis": detect_content_bbox_docscan,
    "fmeow": detect_content_bbox_fmeow,
    "endalk": detect_content_bbox_endalk,
}
def autocrop_pdf_bytes(pdf_bytes: bytes, dpi: int = 300, method: str = "docaligner") -> bytes:
    """Convert each page to an image, auto-crop it, and rebuild the PDF."""
    if method not in DETECTION_METHODS:
        raise ValueError(f"Unknown method: {method}. Available methods: {', '.join(DETECTION_METHODS.keys())}")
    detect_func = DETECTION_METHODS[method]

    src = pymupdf.open(stream=pdf_bytes, filetype="pdf")
    total_pages = len(src)
    total_time = 0.0
    page_num = 0

    for page in src:
        page_num += 1
        pix = page.get_pixmap(dpi=dpi)
        img = Image.open(io.BytesIO(pix.tobytes("png")))

        # Time the detection
        start_time = time.perf_counter()
        x0, y0, x1, y1 = detect_func(img)
        elapsed = time.perf_counter() - start_time
        total_time += elapsed
        print(f"Page {page_num}/{total_pages}: Detection took {elapsed:.3f}s", flush=True)

        # Convert pixel coordinates to PDF points and crop the page in place
        scale = 72.0 / dpi
        crop_rect = pymupdf.Rect(x0 * scale, y0 * scale, x1 * scale, y1 * scale)
        page.set_cropbox(crop_rect)

    print(f"Total detection time: {total_time:.3f}s (average: {total_time / total_pages:.3f}s per page)")

    out = io.BytesIO()
    src.save(out, garbage=3, deflate=True)
    src.close()
    return out.getvalue()
def autocrop_pdf(input_path: Path, output_path: Path | None = None, dpi: int = 300, method: str = "docaligner") -> None:
    """Auto-crop all pages in a scanned PDF document."""
    if not input_path.exists():
        print(f"Error: File not found: {input_path}", file=sys.stderr)
        sys.exit(1)
    if input_path.suffix.lower() != ".pdf":
        print(f"Error: Input file must be a PDF: {input_path}", file=sys.stderr)
        sys.exit(1)

    if output_path is None:
        output_path = input_path.parent / f"{input_path.stem}_cropped{input_path.suffix}"

    print(f"Processing PDF: {input_path} (method: {method})")
    pdf_bytes = input_path.read_bytes()
    cropped_pdf_bytes = autocrop_pdf_bytes(pdf_bytes, dpi=dpi, method=method)

    print(f"Saving cropped PDF to: {output_path}")
    output_path.write_bytes(cropped_pdf_bytes)
    print(f"Successfully created cropped PDF: {output_path}")
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Auto-crop scanned PDF documents by detecting content boundaries")
    parser.add_argument("input", type=Path, help="Path to input PDF file")
    parser.add_argument("-o", "--output", type=Path, help="Path to output PDF file (default: <input>_cropped.pdf)")
    parser.add_argument(
        "-d",
        "--dpi",
        type=int,
        default=300,
        help="DPI for rendering PDF pages (default: 300)",
    )
    parser.add_argument(
        "-m",
        "--method",
        type=str,
        default="docaligner",
        choices=list(DETECTION_METHODS.keys()),
        help="Detection method to use (default: docaligner). Options: docaligner, danielgatis, fmeow, endalk",
    )
    args = parser.parse_args()
    autocrop_pdf(args.input, args.output, args.dpi, args.method)
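Usage note: a minimal sketch of calling the script programmatically, again assuming it is saved as autocrop_scan.py (an illustrative name, not part of the gist). The command-line equivalent would be python autocrop_scan.py scan.pdf -m docaligner.

from pathlib import Path

from autocrop_scan import autocrop_pdf  # hypothetical module name for this gist

# Writes scan_cropped.pdf next to the input, cropping with the DocAligner detector
autocrop_pdf(Path("scan.pdf"), dpi=300, method="docaligner")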