Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Select an option

  • Save adithya-r-prabhu/79038ccbd027ce635be631b4a6919dc3 to your computer and use it in GitHub Desktop.

Select an option

Save adithya-r-prabhu/79038ccbd027ce635be631b4a6919dc3 to your computer and use it in GitHub Desktop.
invert black parts in page
import fitz # PyMuPDF
import cv2
import numpy as np
from PIL import Image, ImageOps
import os
# -----------------------------
# INPUT/OUTPUT
# -----------------------------
input_pdf = "input.pdf"
output_pdf = "output_fixed.pdf"

# Folder that receives one intermediate JPEG per processed page.
# The JPEGs are left on disk after the run.
tmp_folder = "tmp_pages"
os.makedirs(tmp_folder, exist_ok=True)

# -----------------------------
# PARAMETERS
# -----------------------------
zoom = 3  # render scale: 3 x 72 dpi = 216 dpi
threshold_value = 40  # gray level used to classify a pixel as dark
min_area = 20000  # skip bounding boxes smaller than this (text, thin lines)
dark_ratio_min = 0.6  # invert a box only if more than 60% of it is dark
jpeg_quality = 90  # pages are stored as JPEG to keep the output small

# -----------------------------
# PROCESS PDF
# -----------------------------
# The render matrix does not depend on the page, so build it once.
render_matrix = fitz.Matrix(zoom, zoom)
render_dpi = zoom * 72

# Context managers make sure both documents are closed even when a page fails.
with fitz.open(input_pdf) as doc, fitz.open() as new_doc:
    for page_num, page in enumerate(doc, start=1):
        # Render the page to an RGB raster (no alpha channel).
        pix = page.get_pixmap(matrix=render_matrix, alpha=False)
        img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(
            pix.height, pix.width, pix.n
        )
        gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)

        # Threshold, then take the outer contours of the dark regions.
        _, mask = cv2.threshold(
            gray, threshold_value, 255, cv2.THRESH_BINARY_INV
        )
        contours, _ = cv2.findContours(
            mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
        )

        # Every decision is taken on the untouched render, so the outcome
        # does not depend on the order in which contours are visited.
        is_dark = gray < threshold_value
        to_invert = np.zeros(gray.shape, dtype=bool)

        for cnt in contours:
            x, y, w, h = cv2.boundingRect(cnt)

            # Ignore small regions (text, thin lines, borders).
            if w * h < min_area:
                continue

            box = is_dark[y:y + h, x:x + w]
            dark_ratio = np.count_nonzero(box) / box.size

            # Only mostly-dark boxes (solid black blocks) are selected.
            if dark_ratio > dark_ratio_min:
                to_invert[y:y + h, x:x + w] = True

        # Invert each selected pixel exactly once. Overlapping boxes must not
        # flip a pixel twice, which would turn it back to black.
        fixed = img.copy()  # the frombuffer view is read-only
        fixed[to_invert] = 255 - fixed[to_invert]
        pil_img = Image.fromarray(fixed)

        # Save the page as JPEG to reduce file size.
        temp_path = os.path.join(tmp_folder, f"page_{page_num}.jpg")
        pil_img.save(
            temp_path, quality=jpeg_quality, dpi=(render_dpi, render_dpi)
        )

        # Insert the JPEG into a new page of the same size as the source page.
        new_page = new_doc.new_page(
            width=page.rect.width, height=page.rect.height
        )
        new_page.insert_image(page.rect, filename=temp_path)

    # Save the final compressed PDF while the output document is still open.
    new_doc.save(output_pdf)

print("✅ Done! Fixed PDF saved as:", output_pdf)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment