Skip to content

Instantly share code, notes, and snippets.

@rahimnathwani
Created February 3, 2026 04:15
Show Gist options
  • Select an option

  • Save rahimnathwani/324a8d510c801e36ce6bd5885e710fc6 to your computer and use it in GitHub Desktop.

Select an option

Save rahimnathwani/324a8d510c801e36ce6bd5885e710fc6 to your computer and use it in GitHub Desktop.
Highlight non-M rows in Hilldale report card PDF
#!/usr/bin/env uv run
# /// script
# dependencies = [
# "pymupdf",
# ]
# ///
import fitz
def _page_lines(page):
    """Flatten a page's text into a list of ``(bbox, text)`` pairs, one per line.

    ``page.get_text("dict")`` is organised as blocks -> lines -> spans; the
    spans of each line are joined with single spaces and the result stripped.
    """
    lines = []
    for block in page.get_text("dict")["blocks"]:
        # Blocks without a "lines" entry have no text to inspect.
        for line in block.get("lines", ()):
            text = " ".join(span["text"] for span in line["spans"]).strip()
            lines.append((fitz.Rect(line["bbox"]), text))
    return lines


def highlight_rows(
    input_pdf,
    output_pdf,
    *,
    rating_colors=None,
    rating_min_x=450,
    left_margin=36,
    right_padding=5,
):
    """Highlight every report-card row whose rating is one of the target codes.

    A line counts as a rating-column entry when it starts to the right of
    ``rating_min_x`` and its text is at most two characters long. Entries whose
    text is a key of ``rating_colors`` get a highlight annotation that spans
    from ``left_margin`` to just past the rating, and vertically covers the
    rating line plus every content line that overlaps it vertically.

    Args:
        input_pdf: Path of the PDF to read.
        output_pdf: Path the annotated PDF is saved to.
        rating_colors: Mapping of rating text to an RGB tuple (components in
            0-1). Defaults to E -> light green, R -> yellow, P -> pink. Keys
            longer than two characters can never match, because of the length
            check described above.
        rating_min_x: Lines starting to the right of this x coordinate are
            candidates for the rating column.
        left_margin: x coordinate where each highlight starts.
        right_padding: Extra width added to the right of the rating's box.
    """
    if rating_colors is None:
        rating_colors = {
            "E": (0.56, 0.93, 0.56),  # light green
            "R": (1, 1, 0),  # yellow
            "P": (1, 0.75, 0.8),  # pink
        }

    # The context manager closes the document even if processing or saving
    # fails; previously the handle was never closed.
    with fitz.open(input_pdf) as doc:
        for page in doc:
            rating_lines = []
            content_boxes = []
            for bbox, text in _page_lines(page):
                if bbox.x0 > rating_min_x and len(text) <= 2:
                    # Short text on the right side is a rating-column entry.
                    # Non-target ratings (anything not in rating_colors) are
                    # dropped here so they are never treated as row content
                    # for a neighbouring rating.
                    if text in rating_colors:
                        rating_lines.append((bbox, text))
                else:
                    content_boxes.append(bbox)

            for r_bbox, rating in rating_lines:
                # The row is the rating line plus every content line sharing
                # vertical extent with it:
                #   max(y0_a, y0_b) < min(y1_a, y1_b)
                row_boxes = [r_bbox]
                row_boxes.extend(
                    c_bbox
                    for c_bbox in content_boxes
                    if max(r_bbox.y0, c_bbox.y0) < min(r_bbox.y1, c_bbox.y1)
                )
                top = min(box.y0 for box in row_boxes)
                bottom = max(box.y1 for box in row_boxes)

                # Only the vertical extent comes from the row; horizontally the
                # highlight always runs from the fixed left margin to slightly
                # past the rating so all rows line up.
                row_rect = fitz.Rect(
                    left_margin, top, r_bbox.x1 + right_padding, bottom
                )
                annot = page.add_highlight_annot(row_rect)
                annot.set_colors(stroke=rating_colors[rating])
                annot.update()

        doc.save(output_pdf)

    print(f"Processed PDF saved as: {output_pdf}")
# Script entry point: annotate "Report Card.pdf" (relative to the current
# working directory) and write the result next to it.
if __name__ == "__main__":
    highlight_rows(
        input_pdf="Report Card.pdf",
        output_pdf="Report Card_Highlighted.pdf",
    )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment