rahimnathwani · February 3, 2026 04:15
diff --git a/highlight_report_card.py b/highlight_report_card.py
 #!/usr/bin/env uv run
 # /// script
 # dependencies = [
 #   "pymupdf",
 # ]
 # ///

 import fitz

 def highlight_rows(input_pdf, output_pdf):
    """Highlight report-card rows according to their rating letter.

    A text line counts as a rating when it starts right of x = 450, is at
    most two characters long, and reads exactly "E", "R" or "P". For each
    such line a highlight annotation is added that runs from x = 36 to just
    past the rating letter and spans the rating line plus every other line
    that overlaps it vertically. The result is saved to ``output_pdf`` and
    a confirmation message is printed.

    Args:
        input_pdf: Path of the PDF to read.
        output_pdf: Path the annotated PDF is written to.
    """
    # Highlight colours as RGB components in the 0-1 range.
    rating_colors = {
        "E": (0.56, 0.93, 0.56),  # light green
        "R": (1, 1, 0),  # yellow
        "P": (1, 0.75, 0.8),  # pink
    }

    # The context manager closes the document even if processing fails.
    with fitz.open(input_pdf) as doc:
        for page in doc:
            rating_lines = []  # (bbox, rating letter) pairs to highlight
            content_boxes = []  # bboxes of every line outside the rating column

            # "dict" output is nested as blocks -> lines -> spans.
            for block in page.get_text("dict")["blocks"]:
                # Blocks without a "lines" entry contribute nothing.
                for line in block.get("lines", ()):
                    text = " ".join(span["text"] for span in line["spans"]).strip()
                    bbox = fitz.Rect(line["bbox"])

                    # Short text right of x = 450 is a rating-column entry
                    # (ratings were observed at x0 of about 525).
                    if bbox.x0 > 450 and len(text) <= 2:
                        # Other column entries (M, -, ...) are dropped on
                        # purpose so they never count as row content.
                        if text in rating_colors:
                            rating_lines.append((bbox, text))
                    else:
                        content_boxes.append(bbox)

            for r_bbox, rating in rating_lines:
                # Grow the vertical extent over every content line that
                # overlaps the rating line vertically.
                y0, y1 = r_bbox.y0, r_bbox.y1
                for c_bbox in content_boxes:
                    if max(r_bbox.y0, c_bbox.y0) < min(r_bbox.y1, c_bbox.y1):
                        y0 = min(y0, c_bbox.y0)
                        y1 = max(y1, c_bbox.y1)

                # Horizontal extent is fixed: left margin (36) to slightly
                # past the rating's right edge.
                final_rect = fitz.Rect(36, y0, r_bbox.x1 + 5, y1)

                annot = page.add_highlight_annot(final_rect)
                annot.set_colors(stroke=rating_colors[rating])
                annot.update()

        doc.save(output_pdf)

    print(f"Processed PDF saved as: {output_pdf}")

 if __name__ == "__main__":
    # Script entry point: annotate the report card that sits in the
    # current working directory and write the highlighted copy beside it.
    highlight_rows(
        input_pdf="Report Card.pdf",
        output_pdf="Report Card_Highlighted.pdf",
    )
	#!/usr/bin/env uv run
	# /// script
	# dependencies = [
	# "pymupdf",
	# ]
	# ///

	import fitz

	def highlight_rows(input_pdf, output_pdf):
	    """Highlight report-card rows according to their rating letter.

	    A text line counts as a rating when it starts right of x = 450, is at
	    most two characters long, and reads exactly "E", "R" or "P". For each
	    such line a highlight annotation is added that runs from x = 36 to just
	    past the rating letter and spans the rating line plus every other line
	    that overlaps it vertically. The result is saved to ``output_pdf`` and
	    a confirmation message is printed.

	    Args:
	        input_pdf: Path of the PDF to read.
	        output_pdf: Path the annotated PDF is written to.
	    """
	    # Highlight colours as RGB components in the 0-1 range.
	    rating_colors = {
	        "E": (0.56, 0.93, 0.56),  # light green
	        "R": (1, 1, 0),  # yellow
	        "P": (1, 0.75, 0.8),  # pink
	    }

	    # The context manager closes the document even if processing fails.
	    with fitz.open(input_pdf) as doc:
	        for page in doc:
	            rating_lines = []  # (bbox, rating letter) pairs to highlight
	            content_boxes = []  # bboxes of every line outside the rating column

	            # "dict" output is nested as blocks -> lines -> spans.
	            for block in page.get_text("dict")["blocks"]:
	                # Blocks without a "lines" entry contribute nothing.
	                for line in block.get("lines", ()):
	                    text = " ".join(span["text"] for span in line["spans"]).strip()
	                    bbox = fitz.Rect(line["bbox"])

	                    # Short text right of x = 450 is a rating-column entry
	                    # (ratings were observed at x0 of about 525).
	                    if bbox.x0 > 450 and len(text) <= 2:
	                        # Other column entries (M, -, ...) are dropped on
	                        # purpose so they never count as row content.
	                        if text in rating_colors:
	                            rating_lines.append((bbox, text))
	                    else:
	                        content_boxes.append(bbox)

	            for r_bbox, rating in rating_lines:
	                # Grow the vertical extent over every content line that
	                # overlaps the rating line vertically.
	                y0, y1 = r_bbox.y0, r_bbox.y1
	                for c_bbox in content_boxes:
	                    if max(r_bbox.y0, c_bbox.y0) < min(r_bbox.y1, c_bbox.y1):
	                        y0 = min(y0, c_bbox.y0)
	                        y1 = max(y1, c_bbox.y1)

	                # Horizontal extent is fixed: left margin (36) to slightly
	                # past the rating's right edge.
	                final_rect = fitz.Rect(36, y0, r_bbox.x1 + 5, y1)

	                annot = page.add_highlight_annot(final_rect)
	                annot.set_colors(stroke=rating_colors[rating])
	                annot.update()

	        doc.save(output_pdf)

	    print(f"Processed PDF saved as: {output_pdf}")

	if __name__ == "__main__":
	    # Script entry point: annotate the report card that sits in the
	    # current working directory and write the highlighted copy beside it.
	    highlight_rows(
	        input_pdf="Report Card.pdf",
	        output_pdf="Report Card_Highlighted.pdf",
	    )
No results found