nuhmanpk · December 25, 2025 11:52
diff --git a/README.md b/README.md
diff --git a/sir_parser.py b/sir_parser.py
 from pdf2image import convert_from_path
 import pytesseract
 import pandas as pd
 import cv2
 import numpy as np
 import logging
 from tqdm import tqdm
 import re
 from googletrans import Translator

 # Emit timestamped INFO-level log lines for progress reporting.
 logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s | %(levelname)s | %(message)s"
 )

 # Input PDF to OCR and the CSV file the extracted rows are written to.
 pdf_path = "../path_to_sir.pdf"
 csv_path = "output_clean_translated.csv"

 # Shared googletrans client used by translate().
 translator = Translator()

 # Rasterise the PDF at 300 DPI; all page images are held in one list.
 logging.info("Starting PDF to image conversion")
 pages = convert_from_path(pdf_path, dpi=300)
 logging.info(f"Total pages detected: {len(pages)}")

 # records: completed voter rows; current: the row currently being assembled.
 records = []
 current = {}

 def translate(text):
    """Translate Malayalam *text* to English on a best-effort basis.

    Args:
        text: Malayalam string to translate.

    Returns:
        The English translation, or "" when *text* is empty or the
        translation call fails for any reason.
    """
    # Nothing to translate; skip the translator call entirely.
    if not text:
        return ""
    try:
        return translator.translate(text, src="ml", dest="en").text
    except Exception as exc:
        # Stay best-effort, but record the failure instead of hiding it.
        # Unlike a bare ``except:``, this lets KeyboardInterrupt and
        # SystemExit propagate so a long run can still be interrupted.
        logging.warning("Translation failed for %r: %s", text, exc)
        return ""

 def flush():
    """Move the in-progress record into ``records`` and start a fresh one.

    Does nothing when ``current`` is empty.
    """
    global current
    if not current:
        return
    records.append(current)
    current = {}

 for page in tqdm(pages, desc="Processing pages"):
    # pdf2image returns PIL images, whose arrays are in RGB channel order, so
    # the grayscale conversion must be RGB2GRAY; BGR2GRAY would apply the red
    # and blue luminance weights to the wrong channels.
    img = np.array(page)
    gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
    # With THRESH_OTSU the threshold is chosen automatically, so the explicit
    # threshold argument (0) is ignored.
    gray = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]

    # OCR the binarised page as Malayalam text.
    text = pytesseract.image_to_string(
        gray,
        lang="mal",
        config="--oem 3 --psm 6"
    )

    # Keep only non-blank lines, trimmed of surrounding whitespace.
    lines = [raw.strip() for raw in text.split("\n") if raw.strip()]

    for line in lines:
        # "Name" label: close the previous record and start a new one.
        if line.startswith("പേര്"):
            flush()
            # Drop the label and colon, keeping only the value.
            value = re.sub(r".*?:", "", line).strip()
            current["name_ml"] = value
            current["name_en"] = translate(value)

        # "Father's name" / "Husband's name" label.
        elif "അച്ഛന്റെ പേര്" in line or "ഭര്‍ത്താവിന്റെ പേര്" in line:
            value = re.sub(r".*?:", "", line).strip()
            current["relative_name_ml"] = value
            current["relative_name_en"] = translate(value)

        # "House number" label; stored as-is, not translated.
        elif "വീട്ടു നമ്പര്‍" in line:
            value = re.sub(r".*?:", "", line).strip()
            current["house_no"] = value

        # "Age" label; the gender field is looked for on the same line.
        elif "പ്രായം" in line:
            age = re.search(r"പ്രായം\s*:\s*(\d+)", line)
            gender = re.search(r"ലിംഗം\s*:\s*(\S+)", line)
            if age:
                current["age"] = age.group(1)
            if gender:
                current["gender_ml"] = gender.group(1)
                current["gender_en"] = translate(gender.group(1))

    # Close the last record of this page so it is not merged with the next page.
    flush()

 logging.info(f"Total voters extracted: {len(records)}")

 # Build one row per extracted record and write it out without the index column.
 df = pd.DataFrame(records)
 # utf-8-sig prepends a BOM; presumably so spreadsheet tools detect the
 # encoding of the Malayalam text (TODO confirm intent).
 df.to_csv(csv_path, index=False, encoding="utf-8-sig")

 logging.info("Structured + translated CSV saved successfully")
	from pdf2image import convert_from_path
	import pytesseract
	import pandas as pd
	import cv2
	import numpy as np
	import logging
	from tqdm import tqdm
	import re
	from googletrans import Translator

	# Emit timestamped INFO-level log lines for progress reporting.
	logging.basicConfig(
	    level=logging.INFO,
	    # Plain "|" separators: a backslash before the pipe is an invalid
	    # escape sequence and would put a literal backslash in every log line.
	    format="%(asctime)s | %(levelname)s | %(message)s"
	)

	# Input PDF to OCR and the CSV file the extracted rows are written to.
	pdf_path = "../path_to_sir.pdf"
	csv_path = "output_clean_translated.csv"

	# Shared googletrans client used by translate().
	translator = Translator()

	# Rasterise the PDF at 300 DPI; all page images are held in one list.
	logging.info("Starting PDF to image conversion")
	pages = convert_from_path(pdf_path, dpi=300)
	logging.info(f"Total pages detected: {len(pages)}")

	# records: completed voter rows; current: the row currently being assembled.
	records = []
	current = {}

	def translate(text):
	    """Translate Malayalam *text* to English on a best-effort basis.

	    Args:
	        text: Malayalam string to translate.

	    Returns:
	        The English translation, or "" when *text* is empty or the
	        translation call fails for any reason.
	    """
	    # Nothing to translate; skip the translator call entirely.
	    if not text:
	        return ""
	    try:
	        return translator.translate(text, src="ml", dest="en").text
	    except Exception as exc:
	        # Stay best-effort, but record the failure instead of hiding it.
	        # Unlike a bare ``except:``, this lets KeyboardInterrupt and
	        # SystemExit propagate so a long run can still be interrupted.
	        logging.warning("Translation failed for %r: %s", text, exc)
	        return ""

	def flush():
	    """Move the in-progress record into ``records`` and start a fresh one.

	    Does nothing when ``current`` is empty.
	    """
	    global current
	    if current:
	        records.append(current)
	        current = {}

	for page in tqdm(pages, desc="Processing pages"):
	    # pdf2image returns PIL images, whose arrays are in RGB channel order,
	    # so the grayscale conversion must be RGB2GRAY; BGR2GRAY would apply
	    # the red and blue luminance weights to the wrong channels.
	    img = np.array(page)
	    gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
	    # With THRESH_OTSU the threshold is chosen automatically, so the
	    # explicit threshold argument (0) is ignored.
	    gray = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]

	    # OCR the binarised page as Malayalam text.
	    text = pytesseract.image_to_string(
	        gray,
	        lang="mal",
	        config="--oem 3 --psm 6"
	    )

	    # Keep only non-blank lines, trimmed of surrounding whitespace.
	    lines = [raw.strip() for raw in text.split("\n") if raw.strip()]

	    for line in lines:
	        # "Name" label: close the previous record and start a new one.
	        if line.startswith("പേര്"):
	            flush()
	            # Drop the label and colon, keeping only the value.
	            value = re.sub(r".*?:", "", line).strip()
	            current["name_ml"] = value
	            current["name_en"] = translate(value)

	        # "Father's name" / "Husband's name" label.
	        elif "അച്ഛന്റെ പേര്" in line or "ഭര്‍ത്താവിന്റെ പേര്" in line:
	            value = re.sub(r".*?:", "", line).strip()
	            current["relative_name_ml"] = value
	            current["relative_name_en"] = translate(value)

	        # "House number" label; stored as-is, not translated.
	        elif "വീട്ടു നമ്പര്‍" in line:
	            value = re.sub(r".*?:", "", line).strip()
	            current["house_no"] = value

	        # "Age" label; the gender field is looked for on the same line.
	        # ``\s*`` tolerates any amount of whitespace around the colon.
	        elif "പ്രായം" in line:
	            age = re.search(r"പ്രായം\s*:\s*(\d+)", line)
	            gender = re.search(r"ലിംഗം\s*:\s*(\S+)", line)
	            if age:
	                current["age"] = age.group(1)
	            if gender:
	                current["gender_ml"] = gender.group(1)
	                current["gender_en"] = translate(gender.group(1))

	    # Close the last record of this page so it is not merged with the next.
	    flush()

	logging.info(f"Total voters extracted: {len(records)}")

	# Build one row per extracted record and write it out without the index column.
	df = pd.DataFrame(records)
	# utf-8-sig prepends a BOM; presumably so spreadsheet tools detect the
	# encoding of the Malayalam text (TODO confirm intent).
	df.to_csv(csv_path, index=False, encoding="utf-8-sig")

	logging.info("Structured + translated CSV saved successfully")
No results found