adithya-r-prabhu · January 8, 2026 10:16
diff --git a/gistfile1.txt b/gistfile1.txt
 import fitz  # PyMuPDF
 import cv2
 import numpy as np
 from PIL import Image, ImageOps
 import os

 # -----------------------------
 # INPUT/OUTPUT
 # -----------------------------
 input_pdf = "input.pdf"
 output_pdf = "output_fixed.pdf"

 # Folder that receives one JPEG per rendered page. The files are not
 # deleted by this script.
 tmp_folder = "tmp_pages"
 os.makedirs(tmp_folder, exist_ok=True)

 # -----------------------------
 # PARAMETERS
 # -----------------------------
 zoom = 3               # 3x zoom (~216 dpi, good clarity without huge size)
 threshold_value = 40   # Gray level at or below which a pixel counts as dark
 min_area = 20000       # Ignore small regions (text, thin lines)
 dark_ratio_min = 0.6   # Only invert if >60% dark pixels
 jpeg_quality = 90      # Save images as JPEG to reduce size


 # -----------------------------
 # HELPERS
 # -----------------------------
 def _invert_dark_blocks(rgb, *, dark_threshold, area_min, ratio_min):
     """Return a copy of *rgb* with large, mostly dark rectangles inverted.

     Args:
         rgb: height x width x channels uint8 array in RGB order (it is fed
             to ``cv2.COLOR_RGB2GRAY``). The array itself is not modified.
         dark_threshold: gray level at or below which a pixel is dark.
         area_min: bounding boxes with fewer pixels than this are skipped.
         ratio_min: a box is inverted only if its share of dark pixels is
             strictly greater than this value.

     Returns:
         A new, writable array with the same shape and dtype as *rgb*.
     """
     gray = cv2.cvtColor(rgb, cv2.COLOR_RGB2GRAY)

     # THRESH_BINARY_INV sets the mask to 255 wherever gray <= dark_threshold.
     _, mask = cv2.threshold(gray, dark_threshold, 255, cv2.THRESH_BINARY_INV)
     contours, _ = cv2.findContours(
         mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
     )

     # Collect every qualifying box first and invert once afterwards. Boxes of
     # different contours can overlap; inverting them one after another would
     # flip the overlap twice and make the result depend on contour order.
     flip = np.zeros(gray.shape, dtype=bool)
     for contour in contours:
         x, y, w, h = cv2.boundingRect(contour)

         # Ignore small regions (text, thin lines, borders)
         if w * h < area_min:
             continue

         box = (slice(y, y + h), slice(x, x + w))

         # The threshold mask already marks the dark pixels of the page as
         # rendered, so no second crop / grayscale conversion is needed.
         dark_ratio = np.count_nonzero(mask[box]) / (w * h)
         if dark_ratio > ratio_min:
             flip[box] = True

     # np.frombuffer() yields a read-only view, so work on a copy.
     fixed = rgb.copy()
     fixed[flip] = 255 - fixed[flip]
     return fixed


 # -----------------------------
 # PROCESS PDF
 # -----------------------------
 # Context managers close both documents even if a page fails to process.
 with fitz.open(input_pdf) as doc, fitz.open() as new_doc:
     mat = fitz.Matrix(zoom, zoom)  # same render scale for every page

     for page_num, page in enumerate(doc, start=1):
         # Render page at high DPI
         pix = page.get_pixmap(matrix=mat, alpha=False)

         # Convert to numpy image
         img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(
             pix.height, pix.width, pix.n
         )

         fixed = _invert_dark_blocks(
             img,
             dark_threshold=threshold_value,
             area_min=min_area,
             ratio_min=dark_ratio_min,
         )

         # Save page as JPEG to reduce file size
         temp_path = os.path.join(tmp_folder, f"page_{page_num}.jpg")
         Image.fromarray(fixed).save(
             temp_path, quality=jpeg_quality, dpi=(zoom * 72, zoom * 72)
         )

         # Insert JPEG back into PDF on a page of the original size
         new_page = new_doc.new_page(
             width=page.rect.width, height=page.rect.height
         )
         new_page.insert_image(page.rect, filename=temp_path)

     # Save final compressed PDF
     new_doc.save(output_pdf)

 print("✅ Done! Fixed PDF saved as:", output_pdf)
	import fitz # PyMuPDF
	import cv2
	import numpy as np
	from PIL import Image, ImageOps
	import os

	# -----------------------------
	# INPUT/OUTPUT
	# -----------------------------
	input_pdf = "input.pdf"
	output_pdf = "output_fixed.pdf"

	# Temporary folder to store images (one JPEG per page; not deleted here)
	tmp_folder = "tmp_pages"
	os.makedirs(tmp_folder, exist_ok=True)

	# -----------------------------
	# PARAMETERS
	# -----------------------------
	zoom = 3  # 3x zoom (~216 dpi, good clarity without huge size)
	threshold_value = 40  # Gray level at or below which a pixel counts as dark
	min_area = 20000  # Ignore small regions (text, thin lines)
	dark_ratio_min = 0.6  # Only invert if >60% dark pixels
	jpeg_quality = 90  # Save images as JPEG to reduce size

	# -----------------------------
	# OPEN PDF
	# -----------------------------
	# The context managers close both documents even when a page fails.
	with fitz.open(input_pdf) as doc, fitz.open() as new_doc:
	    mat = fitz.Matrix(zoom, zoom)  # identical for every page

	    for page_num, page in enumerate(doc, start=1):
	        # Render page at high DPI
	        pix = page.get_pixmap(matrix=mat, alpha=False)

	        # Convert to numpy image
	        img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(
	            pix.height, pix.width, pix.n
	        )
	        gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)

	        # Threshold: mask is 255 wherever gray <= threshold_value
	        _, mask = cv2.threshold(
	            gray, threshold_value, 255, cv2.THRESH_BINARY_INV
	        )

	        # Find contours of dark regions
	        contours, _ = cv2.findContours(
	            mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
	        )

	        # Mark every box to invert, then invert once: overlapping boxes
	        # would otherwise be flipped twice and end up dark again.
	        to_invert = np.zeros(gray.shape, dtype=bool)
	        for cnt in contours:
	            x, y, w, h = cv2.boundingRect(cnt)

	            # Ignore small regions (text, thin lines, borders)
	            if w * h < min_area:
	                continue

	            # Darkness ratio of the box, read from the threshold mask
	            dark_pixels = np.count_nonzero(mask[y:y + h, x:x + w])
	            dark_ratio = dark_pixels / (w * h)

	            # Only invert if mostly dark (solid black block)
	            if dark_ratio > dark_ratio_min:
	                to_invert[y:y + h, x:x + w] = True

	        # np.frombuffer() returns a read-only view, so modify a copy.
	        fixed = img.copy()
	        fixed[to_invert] = 255 - fixed[to_invert]
	        pil_img = Image.fromarray(fixed)

	        # Save page as JPEG to reduce file size
	        temp_path = os.path.join(tmp_folder, f"page_{page_num}.jpg")
	        pil_img.save(
	            temp_path, quality=jpeg_quality, dpi=(zoom * 72, zoom * 72)
	        )

	        # Insert JPEG back into PDF
	        new_page = new_doc.new_page(
	            width=page.rect.width, height=page.rect.height
	        )
	        new_page.insert_image(page.rect, filename=temp_path)

	    # Save final compressed PDF
	    new_doc.save(output_pdf)

	print("✅ Done! Fixed PDF saved as:", output_pdf)
No results found