Testing different approaches to auto-crop scanned images; the best for my purpose was docaligner.
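For quick comparison, here is a minimal benchmarking sketch (not part of the gist itself): it renders the first page of a PDF and times each detector in the DETECTION_METHODS mapping defined in the script below. It assumes the script is saved locally as autocrop_scan.py (an illustrative module name) and that the optional detector packages are installed.

import io
import time

import pymupdf
from PIL import Image

from autocrop_scan import DETECTION_METHODS  # hypothetical module name for this gist

doc = pymupdf.open("scan.pdf")
pix = doc[0].get_pixmap(dpi=300)
img = Image.open(io.BytesIO(pix.tobytes("png")))

for name, detect in DETECTION_METHODS.items():
    start = time.perf_counter()
    try:
        bbox = detect(img)
    except ImportError as exc:
        # Detector package not installed; skip this method
        print(f"{name}: skipped ({exc})")
        continue
    print(f"{name}: bbox={bbox}, took {time.perf_counter() - start:.3f}s")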
| """Auto-crop scanned PDF documents by detecting content boundaries.""" | |
| import argparse | |
| import io | |
| import sys | |
| import time | |
| from pathlib import Path | |
| import cv2 | |
| import numpy as np | |
| import pymupdf | |
| from PIL import Image | |
| # Global model instance for docaligner (lazy-loaded) | |
| _docaligner_model = None | |
def detect_content_bbox_docaligner(image: Image.Image) -> tuple[int, int, int, int]:
    """
    Detect document boundaries using DocAligner (https://github.com/DocsaidLab/DocAligner).

    Uses a deep learning model for document corner detection.
    """
    global _docaligner_model
    try:
        from docaligner import DocAligner
    except ImportError:
        raise ImportError("docaligner package required. Install with: pip install docaligner")

    # Lazy-load model
    if _docaligner_model is None:
        _docaligner_model = DocAligner()

    rgb = np.array(image.convert("RGB"))
    H, W = rgb.shape[:2]

    # Convert RGB to BGR for docaligner
    img_bgr = cv2.cvtColor(rgb, cv2.COLOR_RGB2BGR)

    # Get document corners
    corners = _docaligner_model(img_bgr)  # 4x2 array: [[x, y], ...]
    if corners is None or len(corners) == 0:
        return (0, 0, W, H)

    # Convert corners to bounding box
    x_coords = corners[:, 0]
    y_coords = corners[:, 1]
    x0 = int(np.min(x_coords))
    x1 = int(np.max(x_coords))
    y0 = int(np.min(y_coords))
    y1 = int(np.max(y_coords))
    return (x0, y0, x1, y1)
def detect_content_bbox_docscan(image: Image.Image) -> tuple[int, int, int, int]:
    """
    Detect document boundaries using docscan (https://github.com/danielgatis/docscan).

    Uses rembg for background removal and OpenCV contour detection.
    """
    try:
        from docscan.doc import scan
    except ImportError:
        raise ImportError("docscan package required. Install with: pip install docscan")

    rgb = np.array(image.convert("RGB"))
    H, W = rgb.shape[:2]

    # Convert PIL Image to bytes
    img_bytes = io.BytesIO()
    image.save(img_bytes, format="PNG")
    img_bytes.seek(0)
    input_data = img_bytes.read()

    # Process with docscan (removes background and finds document)
    processed_data = scan(input_data)
    if processed_data is None:
        return (0, 0, W, H)

    # Decode the processed image
    img_array = np.frombuffer(processed_data, np.uint8)
    processed_img = cv2.imdecode(img_array, cv2.IMREAD_COLOR)
    if processed_img is None:
        return (0, 0, W, H)

    # Convert to grayscale and find contours
    gray = cv2.cvtColor(processed_img, cv2.COLOR_BGR2GRAY)
    _, thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    if not contours:
        return (0, 0, W, H)

    # Get largest contour
    largest_contour = max(contours, key=cv2.contourArea)
    x, y, w, h = cv2.boundingRect(largest_contour)
    return (x, y, x + w, y + h)
def detect_content_bbox_fmeow(image: Image.Image) -> tuple[int, int, int, int]:
    """
    Detect document boundaries using document-scanner
    (https://github.com/dantetemplar/updated-fMeow-document-scanner).

    Uses Hough line transform and connectivity analysis.
    """
    try:
        from doc_scanner import scanner
    except ImportError:
        raise ImportError(
            "document-scanner package required. Install with: "
            "pip install git+https://github.com/dantetemplar/updated-fMeow-document-scanner"
        )

    rgb = np.array(image.convert("RGB"))
    H, W = rgb.shape[:2]

    # Convert RGB to HSV
    img_bgr = cv2.cvtColor(rgb, cv2.COLOR_RGB2BGR)
    hsv = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2HSV)

    # Try intensity channel first
    intensity_scanner = scanner(hsv[:, :, 2])
    intensity_scanner.scan()
    if intensity_scanner.corners is not None:
        corners = intensity_scanner.coordinates()
    else:
        # Fall back to saturation channel
        saturation_scanner = scanner(hsv[:, :, 1])
        saturation_scanner.scan()
        if saturation_scanner.corners is not None:
            corners = saturation_scanner.coordinates()
        else:
            return (0, 0, W, H)

    # Convert corners to bounding box
    corners_array = np.array(corners)
    x_coords = corners_array[:, 0]
    y_coords = corners_array[:, 1]
    x0 = int(np.clip(np.min(x_coords), 0, W))
    x1 = int(np.clip(np.max(x_coords), 0, W))
    y0 = int(np.clip(np.min(y_coords), 0, H))
    y1 = int(np.clip(np.max(y_coords), 0, H))
    return (x0, y0, x1, y1)
def detect_content_bbox_endalk(image: Image.Image) -> tuple[int, int, int, int]:
    """
    Detect document boundaries using document-scanner (https://github.com/endalk200/document-scanner).

    Uses LSD (Line Segment Detector) for corner detection.
    """
    try:
        from scan import DocScanner
    except ImportError:
        raise ImportError("document-scanner package required. Install with: pip install document-scanner")

    rgb = np.array(image.convert("RGB"))
    H, W = rgb.shape[:2]

    # Convert RGB to BGR for OpenCV
    img_bgr = cv2.cvtColor(rgb, cv2.COLOR_RGB2BGR)

    # Create scanner instance (non-interactive)
    doc_scanner = DocScanner(interactive=False)

    # The scanner expects an image rescaled to a fixed height, so resize first
    RESCALED_HEIGHT = 500.0
    ratio = H / RESCALED_HEIGHT
    rescaled = cv2.resize(img_bgr, (int(W / ratio), int(RESCALED_HEIGHT)))

    # Get the document contour and scale it back to the original size
    screen_cnt = doc_scanner.get_contour(rescaled)
    screen_cnt = (screen_cnt * ratio).astype(int)

    # Convert to bounding box
    x_coords = screen_cnt[:, 0]
    y_coords = screen_cnt[:, 1]
    x0 = int(np.clip(np.min(x_coords), 0, W))
    x1 = int(np.clip(np.max(x_coords), 0, W))
    y0 = int(np.clip(np.min(y_coords), 0, H))
    y1 = int(np.clip(np.max(y_coords), 0, H))
    return (x0, y0, x1, y1)
# Method mapping for detection functions
DETECTION_METHODS = {
    "docaligner": detect_content_bbox_docaligner,
    "danielgatis": detect_content_bbox_docscan,
    "fmeow": detect_content_bbox_fmeow,
    "endalk": detect_content_bbox_endalk,
}
def autocrop_pdf_bytes(pdf_bytes: bytes, dpi: int = 300, method: str = "docaligner") -> bytes:
    """Convert each page to an image, auto-crop it, and rebuild the PDF."""
    if method not in DETECTION_METHODS:
        raise ValueError(f"Unknown method: {method}. Available methods: {', '.join(DETECTION_METHODS.keys())}")
    detect_func = DETECTION_METHODS[method]

    src = pymupdf.open(stream=pdf_bytes, filetype="pdf")
    total_pages = len(src)
    total_time = 0.0
    page_num = 0

    for page in src:
        page_num += 1
        pix = page.get_pixmap(dpi=dpi)
        img = Image.open(io.BytesIO(pix.tobytes("png")))

        # Time the detection
        start_time = time.perf_counter()
        x0, y0, x1, y1 = detect_func(img)
        elapsed = time.perf_counter() - start_time
        total_time += elapsed
        print(f"Page {page_num}/{total_pages}: Detection took {elapsed:.3f}s", flush=True)

        # Convert pixel coordinates to PDF points and crop the page in place
        scale = 72.0 / dpi
        crop_rect = pymupdf.Rect(x0 * scale, y0 * scale, x1 * scale, y1 * scale)
        page.set_cropbox(crop_rect)

    print(f"Total detection time: {total_time:.3f}s (average: {total_time / total_pages:.3f}s per page)")

    out = io.BytesIO()
    src.save(out, garbage=3, deflate=True)
    src.close()
    return out.getvalue()
def autocrop_pdf(input_path: Path, output_path: Path | None = None, dpi: int = 300, method: str = "docaligner") -> None:
    """Auto-crop all pages in a scanned PDF document."""
    if not input_path.exists():
        print(f"Error: File not found: {input_path}", file=sys.stderr)
        sys.exit(1)
    if input_path.suffix.lower() != ".pdf":
        print(f"Error: Input file must be a PDF: {input_path}", file=sys.stderr)
        sys.exit(1)

    if output_path is None:
        output_path = input_path.parent / f"{input_path.stem}_cropped{input_path.suffix}"

    print(f"Processing PDF: {input_path} (method: {method})")
    pdf_bytes = input_path.read_bytes()
    cropped_pdf_bytes = autocrop_pdf_bytes(pdf_bytes, dpi=dpi, method=method)

    print(f"Saving cropped PDF to: {output_path}")
    output_path.write_bytes(cropped_pdf_bytes)
    print(f"Successfully created cropped PDF: {output_path}")
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Auto-crop scanned PDF documents by detecting content boundaries")
    parser.add_argument("input", type=Path, help="Path to input PDF file")
    parser.add_argument("-o", "--output", type=Path, help="Path to output PDF file (default: <input>_cropped.pdf)")
    parser.add_argument(
        "-d",
        "--dpi",
        type=int,
        default=300,
        help="DPI for rendering PDF pages (default: 300)",
    )
    parser.add_argument(
        "-m",
        "--method",
        type=str,
        default="docaligner",
        choices=list(DETECTION_METHODS.keys()),
        help="Detection method to use (default: docaligner). Options: docaligner, danielgatis, fmeow, endalk",
    )
    args = parser.parse_args()
    autocrop_pdf(args.input, args.output, args.dpi, args.method)
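Usage note: a minimal sketch of calling the script programmatically, again assuming it is saved as autocrop_scan.py (an illustrative name, not part of the gist). The command-line equivalent would be python autocrop_scan.py scan.pdf -m docaligner.

from pathlib import Path

from autocrop_scan import autocrop_pdf  # hypothetical module name for this gist

# Writes scan_cropped.pdf next to the input, cropping with the DocAligner detector
autocrop_pdf(Path("scan.pdf"), dpi=300, method="docaligner")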