Created
February 3, 2026 04:15
-
-
Save rahimnathwani/324a8d510c801e36ce6bd5885e710fc6 to your computer and use it in GitHub Desktop.
Highlight non-M rows in Hilldale report card PDF
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env uv run | |
| # /// script | |
| # dependencies = [ | |
| # "pymupdf", | |
| # ] | |
| # /// | |
| import fitz | |
def highlight_rows(
    input_pdf,
    output_pdf,
    *,
    rating_x_min=450,
    left_margin=36,
    right_pad=5,
):
    """Highlight every row of a report-card PDF whose rating is E, R or P.

    A text line counts as a rating-column entry when its left edge lies to
    the right of ``rating_x_min`` and its text is at most two characters
    long. Entries whose text is exactly "E", "R" or "P" get a highlight
    annotation (light green, yellow and pink respectively) that spans from
    ``left_margin`` to just past the rating and covers every other text line
    that vertically overlaps the rating. Any other short right-hand entry
    (e.g. "M") is neither highlighted nor treated as row content.

    Args:
        input_pdf: Path of the PDF to read.
        output_pdf: Path the annotated PDF is saved to.
        rating_x_min: x coordinate a line's left edge must exceed to be
            considered part of the rating column.
        left_margin: x coordinate where each highlight band starts.
        right_pad: Extra width added to the right of the rating's bbox.
    """
    # Highlight colours as RGB triples with components in the range 0-1.
    rating_colors = {
        "E": (0.56, 0.93, 0.56),  # light green
        "R": (1, 1, 0),  # yellow
        "P": (1, 0.75, 0.8),  # pink
    }

    # The context manager guarantees the document is closed even when
    # extraction, annotation or saving raises.
    with fitz.open(input_pdf) as doc:
        for page in doc:
            rating_lines = []  # (bbox, rating letter) for E/R/P entries
            content_boxes = []  # bboxes of everything that is not a rating

            # "dict" output is nested as blocks -> lines -> spans; blocks
            # without a "lines" entry contribute nothing.
            for block in page.get_text("dict")["blocks"]:
                for line in block.get("lines", ()):
                    text = " ".join(
                        span["text"] for span in line["spans"]
                    ).strip()
                    bbox = fitz.Rect(line["bbox"])

                    if bbox.x0 > rating_x_min and len(text) <= 2:
                        # Short text on the right side is a rating-column
                        # entry. Only the target ratings are kept; others
                        # are dropped so they never count as row content
                        # for a neighbouring rating.
                        if text in rating_colors:
                            rating_lines.append((bbox, text))
                    else:
                        content_boxes.append(bbox)

            for r_bbox, rating in rating_lines:
                # Grow the vertical extent to include every content line
                # that shares some vertical range with the rating itself.
                y0, y1 = r_bbox.y0, r_bbox.y1
                for c_bbox in content_boxes:
                    if max(r_bbox.y0, c_bbox.y0) < min(r_bbox.y1, c_bbox.y1):
                        y0 = min(y0, c_bbox.y0)
                        y1 = max(y1, c_bbox.y1)

                # Horizontal extent is fixed (margin -> rating) rather than
                # derived from the text, so all bands line up.
                band = fitz.Rect(left_margin, y0, r_bbox.x1 + right_pad, y1)
                annot = page.add_highlight_annot(band)
                annot.set_colors(stroke=rating_colors[rating])
                annot.update()

        doc.save(output_pdf)

    print(f"Processed PDF saved as: {output_pdf}")
if __name__ == "__main__":
    # Script entry point: annotate the report card in the working directory
    # and write the result next to it under a new name.
    highlight_rows(
        input_pdf="Report Card.pdf",
        output_pdf="Report Card_Highlighted.pdf",
    )
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment