aont · January 6, 2026 10:29
diff --git a/0 A Simple Python Tool for Controlled PDF Text Extraction.md b/0 A Simple Python Tool for Controlled PDF Text Extraction.md
diff --git a/main.py b/main.py
 #!/usr/bin/env python3
 from __future__ import annotations

 import math
 import sys
 from typing import Iterator, Optional, Tuple

 from pypdf import PdfReader

 # =========================
 # Extraction conditions (adjust only here if needed)
 # =========================
 # Master switch for the font filter. While this is False, is_target_text()
 # accepts every fragment and the two settings below are not consulted.
 ENABLE_FONT_FILTER = False

 # Absolute tolerance (abs_tol) for the math.isclose font-size comparison.
 SIZE_TOL = 1e-6

 # (font name, font size) pairs that are accepted while the filter is enabled.
 TARGET_FONTS = {
    ("Fuga", 12.945840000000032),
    ("Hoge", 12.555059999999997),
 }


 def _normalize_font_name(raw) -> Optional[str]:
    """
    Turn a font identifier into a plain string without its leading slash.

    ``raw`` is stringified with ``str()`` and exactly one leading "/" is
    dropped when present.  A ``None`` input, or a result that ends up
    empty, yields ``None``.
    Example: NameObject('/Hoge') -> 'Hoge'
    """
    if raw is None:
        return None
    text = str(raw)
    name = text[1:] if text.startswith("/") else text
    # An empty name carries no information, so report it as "no font".
    return name if name else None


 def is_target_text(font_name: Optional[str], font_size: Optional[float]) -> bool:
    """
    Decide whether a text fragment should be extracted.

    Returns True unconditionally while ENABLE_FONT_FILTER is off.  With the
    filter on, a fragment is accepted only when both its font name and font
    size are known and some (name, size) pair in TARGET_FONTS has an equal
    name and a size within SIZE_TOL (absolute tolerance) of ``font_size``.
    """
    if not ENABLE_FONT_FILTER:
        return True

    # Missing font information can never match a configured target.
    if font_name is None or font_size is None:
        return False

    return any(
        font_name == name
        and math.isclose(font_size, size, rel_tol=0.0, abs_tol=SIZE_TOL)
        for name, size in TARGET_FONTS
    )


 def extract_text_stream(fp) -> Iterator[str]:
    """
    Lazily yield post-processed text lines from the PDF opened as ``fp``.

    - Only fragments accepted by is_target_text() are kept
    - Every '.' is replaced with '.\\n' before the page text is split into lines
    - A line ending with '-' is not yielded; the '-' is removed and the rest
      is prepended to the next line (this also works across page boundaries)
    - Whatever is still buffered after the last page is yielded at the end
    """
    pending = ""  # Text held back from a line that ended with a hyphen

    for page in PdfReader(fp).pages:
        fragments: list[str] = []

        def visitor_text(
            text: str,
            cm,  # current transformation matrix
            tm,  # text matrix
            font_dict,
            font_size: float,
        ):
            # Nothing to collect for an empty fragment
            if not text:
                return

            # Looking up the font is best-effort: any failure while reading
            # font_dict is treated the same as "no font information"
            try:
                raw_name = font_dict.get("/BaseFont") if font_dict else None
            except Exception:
                raw_name = None

            name = _normalize_font_name(raw_name)
            size = None if font_size is None else float(font_size)

            if is_target_text(name, size):
                fragments.append(text)

        # The visitor is invoked for each text fragment during extraction;
        # extraction_mode is deliberately not passed because it may be
        # unsupported depending on the pypdf version
        page.extract_text(visitor_text=visitor_text)

        page_text = "".join(fragments)
        if not page_text:
            continue

        for piece in page_text.replace(".", ".\n").splitlines():
            # Prepending an empty buffer is a no-op, so no check is needed
            joined = pending + piece
            pending = ""

            if joined.endswith("-"):
                pending = joined[:-1]
            else:
                yield joined

        # The buffer is intentionally not flushed at page boundaries, so a
        # word hyphenated across two pages is still joined

    if pending:
        yield pending


 def main(pdf_path: str) -> None:
    """
    Extract text from the PDF at ``pdf_path`` and write it to standard output.

    Each line produced by extract_text_stream() is encoded as UTF-8 and
    written to the binary stdout buffer, followed by a newline byte.
    """
    with open(pdf_path, "rb") as pdf_file:
        for line in extract_text_stream(pdf_file):
            encoded = line.encode("utf-8")
            sys.stdout.buffer.write(encoded + b"\n")


 if __name__ == "__main__":
    # Script entry point: the first command-line argument is the PDF path.
    # Without it, sys.argv[1] would fail with an IndexError traceback, so
    # print a usage line to stderr and exit with status 2 instead.
    if len(sys.argv) < 2:
        sys.stderr.write(f"usage: {sys.argv[0]} PDF_PATH\n")
        sys.exit(2)
    path = sys.argv[1]
    main(path)
	#!/usr/bin/env python3
	from __future__ import annotations

	import math
	import sys
	from typing import Iterator, Optional, Tuple

	from pypdf import PdfReader

	# =========================
	# Extraction conditions (adjust only here if needed)
	# =========================
	# Master switch for the font filter. While this is False, is_target_text()
	# accepts every fragment and the two settings below are not consulted.
	ENABLE_FONT_FILTER = False

	# Absolute tolerance (abs_tol) for the math.isclose font-size comparison.
	SIZE_TOL = 1e-6

	# (font name, font size) pairs that are accepted while the filter is enabled.
	TARGET_FONTS = {
	    ("Fuga", 12.945840000000032),
	    ("Hoge", 12.555059999999997),
	}


	def _normalize_font_name(raw) -> Optional[str]:
	    """
	    Turn a font identifier into a plain string without its leading slash.

	    ``raw`` is stringified with ``str()`` and exactly one leading "/" is
	    dropped when present.  A ``None`` input, or a result that ends up
	    empty, yields ``None``.
	    Example: NameObject('/Hoge') -> 'Hoge'
	    """
	    if raw is None:
	        return None
	    text = str(raw)
	    name = text[1:] if text.startswith("/") else text
	    # An empty name carries no information, so report it as "no font".
	    return name if name else None


	def is_target_text(font_name: Optional[str], font_size: Optional[float]) -> bool:
	    """
	    Decide whether a text fragment should be extracted.

	    Returns True unconditionally while ENABLE_FONT_FILTER is off.  With the
	    filter on, a fragment is accepted only when both its font name and font
	    size are known and some (name, size) pair in TARGET_FONTS has an equal
	    name and a size within SIZE_TOL (absolute tolerance) of ``font_size``.
	    """
	    if not ENABLE_FONT_FILTER:
	        return True

	    # Missing font information can never match a configured target.
	    if font_name is None or font_size is None:
	        return False

	    return any(
	        font_name == name
	        and math.isclose(font_size, size, rel_tol=0.0, abs_tol=SIZE_TOL)
	        for name, size in TARGET_FONTS
	    )


	def extract_text_stream(fp) -> Iterator[str]:
	    """
	    Lazily yield post-processed text lines from the PDF opened as ``fp``.

	    - Only fragments accepted by is_target_text() are kept
	    - Every '.' is replaced with '.\\n' before the page text is split into lines
	    - A line ending with '-' is not yielded; the '-' is removed and the rest
	      is prepended to the next line (this also works across page boundaries)
	    - Whatever is still buffered after the last page is yielded at the end
	    """
	    pending = ""  # Text held back from a line that ended with a hyphen

	    for page in PdfReader(fp).pages:
	        fragments: list[str] = []

	        def visitor_text(
	            text: str,
	            cm,  # current transformation matrix
	            tm,  # text matrix
	            font_dict,
	            font_size: float,
	        ):
	            # Nothing to collect for an empty fragment
	            if not text:
	                return

	            # Looking up the font is best-effort: any failure while reading
	            # font_dict is treated the same as "no font information"
	            try:
	                raw_name = font_dict.get("/BaseFont") if font_dict else None
	            except Exception:
	                raw_name = None

	            name = _normalize_font_name(raw_name)
	            size = None if font_size is None else float(font_size)

	            if is_target_text(name, size):
	                fragments.append(text)

	        # The visitor is invoked for each text fragment during extraction;
	        # extraction_mode is deliberately not passed because it may be
	        # unsupported depending on the pypdf version
	        page.extract_text(visitor_text=visitor_text)

	        page_text = "".join(fragments)
	        if not page_text:
	            continue

	        for piece in page_text.replace(".", ".\n").splitlines():
	            # Prepending an empty buffer is a no-op, so no check is needed
	            joined = pending + piece
	            pending = ""

	            if joined.endswith("-"):
	                pending = joined[:-1]
	            else:
	                yield joined

	        # The buffer is intentionally not flushed at page boundaries, so a
	        # word hyphenated across two pages is still joined

	    if pending:
	        yield pending


	def main(pdf_path: str) -> None:
	    """
	    Extract text from the PDF at ``pdf_path`` and write it to standard output.

	    Each line produced by extract_text_stream() is encoded as UTF-8 and
	    written to the binary stdout buffer, followed by a newline byte.
	    """
	    with open(pdf_path, "rb") as pdf_file:
	        for line in extract_text_stream(pdf_file):
	            encoded = line.encode("utf-8")
	            sys.stdout.buffer.write(encoded + b"\n")


	if __name__ == "__main__":
	    # Script entry point: the first command-line argument is the PDF path.
	    # Without it, sys.argv[1] would fail with an IndexError traceback, so
	    # print a usage line to stderr and exit with status 2 instead.
	    if len(sys.argv) < 2:
	        sys.stderr.write(f"usage: {sys.argv[0]} PDF_PATH\n")
	        sys.exit(2)
	    path = sys.argv[1]
	    main(path)
No results found