Skip to content

Instantly share code, notes, and snippets.

@albertbuchard
Last active January 4, 2026 19:20
Show Gist options
  • Select an option

  • Save albertbuchard/991d36041e1b2c93c00ec1b8d09f2716 to your computer and use it in GitHub Desktop.

Select an option

Save albertbuchard/991d36041e1b2c93c00ec1b8d09f2716 to your computer and use it in GitHub Desktop.
textrepo: A Python Script to Concatenate All Files in a Repository into a Single Text File, Ignoring Specified Patterns
#!/usr/bin/env python3
"""
textrepo — dump a repository into a single text file, optimized for LLM input.
Features
- Always prints a folder tree of the discovered files (after ignores).
- Emits a per-file section for every *text-eligible* file (even if skipped/truncated).
- Respects .gitignore by default (disable with --no-gitignore).
- Supports extra ignore patterns and an additional ignore file (gitignore-ish).
- Detects and omits binary / non-text files from the per-file sections
(they still appear in the tree).
- Skips prior textrepo dumps (by magic marker) to avoid recursive inclusion.
- Deletes the output file before writing (prevents self-inclusion / stale output).
- Truncates each file by max lines and approx max bytes.
- Optional --no-line-break to remove all line breaks from emitted file contents.
- Optional --compact to minimize added characters (short header, ASCII tree, compact framing).
- Optional --max-total-bytes to stop emitting content past a total output budget
(headers still emitted for eligible files, consistent with legacy behavior).
- Streams output (does not build the entire dump in memory).
Usage:
textrepo /path/to/repo output.txt
Examples:
textrepo . repo.txt
textrepo . repo.txt --extra-ignore "*.log" --extra-ignore "Saved/" --max-lines 300
textrepo . repo.txt --no-gitignore
textrepo . repo.txt --ignore-file .textrepoignore
textrepo . repo.txt --no-line-break
textrepo . repo.txt --compact --max-bytes 8000 --max-lines 200
Notes on ignore semantics:
- This is a lightweight “gitignore-ish” matcher, not a full gitignore spec.
- Rules support:
- comments (#) and empty lines
- negation with leading '!'
- directory-only rules with trailing '/'
- anchored rules with leading '/'
- glob matching via fnmatch
- Pattern behavior:
- If the rule contains a '/', it is matched against the full relative path, and also '**/<pattern>'.
- If the rule does not contain '/', it matches any path component (basename-style).
"""
from __future__ import annotations
import argparse
import fnmatch
import os
import sys
from dataclasses import dataclass
from enum import Enum
from pathlib import Path
from typing import Iterable, List, Optional, Set, Tuple
# ----------------------- Dump marker (for self-skip) --------------------------
# First line written to every dump; later runs detect it and skip the file
# to avoid recursively including a previous dump.
DUMP_MAGIC_LINE = "TEXTREPO_DUMP_V1"
# Header emitted by older versions of this tool; still recognized for skipping.
DUMP_LEGACY_PREFIX = "REPOSITORY SNAPSHOT"
# ---------------------- Gitignore-ish pattern handling ------------------------
@dataclass(frozen=True)
class IgnoreRule:
    """A single parsed gitignore-ish rule."""

    negate: bool    # True for '!'-prefixed rules (un-ignore on match)
    pattern: str    # glob pattern with '!' / leading '/' / trailing '/' stripped
    dir_only: bool  # True if the rule had a trailing '/' (directories only)
    anchored: bool  # True if the rule had a leading '/' (match from root)


def _normalize_relpath(relpath: str) -> str:
    """Normalize *relpath* to a posix-style relative path.

    Converts OS separators to '/' and removes any leading './' prefixes.

    Bug fix: the previous ``lstrip("./")`` stripped any run of '.' and '/'
    characters, mangling dotfile names (".git" -> "git", ".env" -> "env")
    and silently breaking every dotfile ignore rule, including the built-in
    ".git/" default.
    """
    p = relpath.replace(os.sep, "/")
    while p.startswith("./"):
        p = p[2:]
    return p


def parse_ignore_file(path: Path) -> List[IgnoreRule]:
    """Parse a gitignore-like file into IgnoreRule objects.

    Supports comments (#), blank lines, '!' negation, leading '/' anchoring,
    and trailing '/' directory-only rules. An unreadable file yields an
    empty list (best-effort behavior).
    """
    rules: List[IgnoreRule] = []
    try:
        lines = path.read_text(encoding="utf-8", errors="replace").splitlines()
    except OSError:
        return rules
    for raw in lines:
        s = raw.strip()
        if not s or s.startswith("#"):
            continue
        negate = s.startswith("!")
        if negate:
            s = s[1:].strip()
        if not s:
            continue
        anchored = s.startswith("/")
        if anchored:
            s = s[1:]
        dir_only = s.endswith("/")
        if dir_only:
            s = s[:-1]
        if not s:
            # Rule reduced to nothing (e.g. "!" or "/" alone): skip it.
            continue
        rules.append(IgnoreRule(negate=negate, pattern=s, dir_only=dir_only, anchored=anchored))
    return rules
def _match_rule(rule: IgnoreRule, rel_posix: str, is_dir: bool) -> bool:
    """Return True when *rule* matches the relative posix path *rel_posix*."""
    if rule.dir_only and not is_dir:
        # Directory-only rules never match plain files.
        return False
    pattern = rule.pattern
    if rule.anchored:
        # Anchored rules ("/foo") are matched from the repo root only.
        return fnmatch.fnmatchcase(rel_posix, pattern)
    if "/" in pattern:
        # Path-style rule: try the full relative path, then any depth below it.
        if fnmatch.fnmatchcase(rel_posix, pattern):
            return True
        return fnmatch.fnmatchcase(rel_posix, f"**/{pattern}")
    # Basename-style rule: a match on any single path component suffices
    # (gitignore-like behavior).
    components = rel_posix.split("/") if rel_posix else []
    return any(fnmatch.fnmatchcase(component, pattern) for component in components)


def is_ignored(rel_posix: str, is_dir: bool, rules: List[IgnoreRule]) -> bool:
    """Gitignore-style resolution: the last matching rule decides; '!' rules un-ignore."""
    verdict = False
    for rule in rules:
        if _match_rule(rule, rel_posix, is_dir):
            verdict = not rule.negate
    return verdict
# ---------------------------- File reading helpers ----------------------------
# Extensions treated as binary without content probing (lowercase, with dot).
COMMON_BINARY_EXTS = {
    # images
    ".png", ".jpg", ".jpeg", ".gif", ".bmp", ".tga", ".psd", ".exr", ".webp", ".ico",
    # audio/video
    ".mp3", ".wav", ".ogg", ".flac", ".mp4", ".mov", ".avi", ".mkv", ".m4a",
    # archives
    ".zip", ".7z", ".rar", ".tar", ".gz", ".bz2", ".xz",
    # executables / libs
    ".exe", ".dll", ".so", ".dylib",
    # common binary-ish assets
    ".uasset", ".umap", ".pak",
    # documents often not useful as plain text
    ".pdf",
}


class ProbeKind(Enum):
    """Classification of a probed file."""

    TEXT = "text"              # readable text; content will be emitted
    BINARY = "binary"          # binary/non-text; omitted from per-file sections
    UNREADABLE = "unreadable"  # probe could not open/read the file


def probe_binary_kind(path: Path, ext: str) -> ProbeKind:
    """
    Decide whether a file is binary/non-text.
    - Returns BINARY if extension is in COMMON_BINARY_EXTS or content looks binary.
    - Returns UNREADABLE if the probe can't read the file.
    - Returns TEXT otherwise.
    Important: we treat UNREADABLE separately so it is NOT silently omitted.

    Improvement: a chunk that decodes as valid UTF-8 is accepted as text even
    when mostly non-ASCII. The old printable-byte-ratio heuristic alone
    misclassified e.g. CJK or heavily accented UTF-8 text files as binary.
    """
    if ext in COMMON_BINARY_EXTS:
        return ProbeKind.BINARY
    try:
        with path.open("rb") as f:
            chunk = f.read(8192)
    except OSError:
        return ProbeKind.UNREADABLE
    if not chunk:
        return ProbeKind.TEXT
    if b"\x00" in chunk:
        return ProbeKind.BINARY
    # Valid UTF-8 counts as text regardless of how non-ASCII it is.
    try:
        chunk.decode("utf-8")
        return ProbeKind.TEXT
    except UnicodeDecodeError as e:
        # The 8 KiB read may have split a multi-byte sequence at the chunk
        # boundary; a failure only within the last few bytes is still UTF-8.
        if e.start >= len(chunk) - 3:
            return ProbeKind.TEXT
    # Fallback heuristic for legacy 8-bit encodings: ratio of printable bytes.
    printable = sum(b in b"\t\n\r" or 32 <= b <= 126 for b in chunk)
    if printable / len(chunk) < 0.80:
        return ProbeKind.BINARY
    return ProbeKind.TEXT
def starts_with_textrepo_dump_marker(path: Path) -> bool:
    """
    True if file content starts with this tool's marker (or a legacy header),
    allowing us to skip previously generated dumps to avoid recursion.
    Unreadable files are reported as non-dumps.
    """
    try:
        with path.open("rb") as fh:
            head_bytes = fh.read(4096)
    except OSError:
        return False
    head = head_bytes.decode("utf-8", errors="replace")
    # Tolerate a UTF-8 BOM before the marker line.
    if head.startswith("\ufeff"):
        head = head.lstrip("\ufeff")
    return head.startswith((DUMP_MAGIC_LINE, DUMP_LEGACY_PREFIX))
def _read_capped(fh, encoding: str, max_bytes: int, max_lines: int, no_line_break: bool) -> Tuple[str, bool, int, int]:
    # Accumulate lines from an open text handle until either cap is hit.
    pieces: List[str] = []
    byte_count = 0
    line_count = 0
    was_truncated = False
    for line in fh:
        if line_count >= max_lines:
            was_truncated = True
            break
        if no_line_break:
            # Strip all line breaks from the emitted content.
            line = line.replace("\n", "").replace("\r", "")
        encoded_len = len(line.encode(encoding, errors="replace"))
        if byte_count + encoded_len > max_bytes:
            remaining = max(0, max_bytes - byte_count)
            if remaining > 0:
                # Approximate the byte budget by slicing characters
                # (byte-perfect precision is not required here).
                pieces.append(line[: max(1, min(len(line), remaining))])
            was_truncated = True
            break
        pieces.append(line)
        byte_count += encoded_len
        line_count += 1
    return ("".join(pieces), was_truncated, byte_count, line_count)


def read_text_snippet(
    path: Path,
    max_bytes: int,
    max_lines: int,
    no_line_break: bool,
) -> Tuple[str, bool, int, int]:
    """
    Read a capped snippet of *path*, trying UTF-8 first and latin-1 second.

    Returns: (text, truncated, approx_bytes_read, lines_read)
    Truncation uses:
    - max_lines cap (exact)
    - max_bytes cap (approx; based on encoding of each line)
    """
    last_error: Optional[Exception] = None
    for encoding in ("utf-8", "latin-1"):
        try:
            with path.open("r", encoding=encoding, errors="strict") as fh:
                return _read_capped(fh, encoding, max_bytes, max_lines, no_line_break)
        except (UnicodeDecodeError, OSError) as err:
            last_error = err
            continue
    raise last_error if last_error else IOError("Failed to read file")
# ------------------------------ Tree formatting ------------------------------
def build_tree(paths: List[str], ascii_only: bool) -> str:
    """
    Render a deterministic tree view from posix-style relative paths.
    Directories are listed (sorted) before files (sorted) at each level.
    """
    root: dict = {}
    for rel in paths:
        segments = rel.split("/")
        cursor = root
        for seg in segments[:-1]:
            cursor = cursor.setdefault(seg, {})
        cursor.setdefault("__files__", []).append(segments[-1])

    if ascii_only:
        mid_branch, last_branch = "|-- ", "`-- "
        mid_extend, last_extend = "| ", " "
    else:
        mid_branch, last_branch = "├── ", "└── "
        mid_extend, last_extend = "│ ", " "

    def _render(node: dict, prefix: str = "") -> List[str]:
        out: List[str] = []
        subdirs = sorted(k for k in node if k != "__files__")
        leaf_files = sorted(node.get("__files__", []))
        for idx, name in enumerate(subdirs):
            # A directory is drawn as "last" only when no files follow it.
            last = (idx == len(subdirs) - 1) and not leaf_files
            out.append(prefix + (last_branch if last else mid_branch) + name + "/")
            out.extend(_render(node[name], prefix + (last_extend if last else mid_extend)))
        for idx, name in enumerate(leaf_files):
            last = idx == len(leaf_files) - 1
            out.append(prefix + (last_branch if last else mid_branch) + name)
        return out

    return "\n".join(_render(root))
# --------------------------------- Defaults ----------------------------------
# Built-in ignore patterns applied on every run, in addition to .gitignore
# and user-supplied patterns (gitignore-ish syntax; trailing '/' = directory).
DEFAULT_EXTRA_IGNORES = [
    # VCS / IDE
    ".git/", ".hg/", ".svn/", ".idea/", ".vscode/",
    # Python
    "__pycache__/", "*.pyc", "*.pyo", "*.pyd",
    "venv/", ".venv/", ".tox/", ".pytest_cache/", ".mypy_cache/", ".ruff_cache/", ".hypothesis/",
    ".coverage", "coverage/",
    # OS / editor junk
    ".DS_Store", "Thumbs.db",
    # Node / web
    "node_modules/", "dist/", "build/", ".next/", ".turbo/", ".parcel-cache/",
    "package-lock.json", "yarn.lock", "pnpm-lock.yaml",
    # Cloudflare workers / wrangler
    ".wrangler/",
    # Terraform / direnv / misc caches
    ".terraform/", ".direnv/", ".cache/",
]
# --------------------------------- Core model --------------------------------
@dataclass(frozen=True)
class FileEntry:
    """A discovered file together with its probe classification."""

    rel: str             # posix-style path relative to the repository root
    abs_path: Path       # absolute path on disk
    size: Optional[int]  # byte size from stat(), or None if stat failed
    kind: ProbeKind      # TEXT / BINARY / UNREADABLE probe verdict
    is_prior_dump: bool  # True if the file starts with a textrepo dump marker
@dataclass(frozen=True)
class DumpConfig:
    """Resolved configuration for a single dump invocation."""

    root: Path                          # repository root to walk
    output_path: Path                   # destination dump file
    respect_gitignore: bool             # read <root>/.gitignore when True
    extra_ignores: List[str]            # extra ignore patterns from the CLI
    ignore_file: Optional[Path]         # optional additional ignore file
    include_globs: Optional[List[str]]  # if set, only paths matching these globs
    max_bytes: int                      # per-file content byte cap (approx)
    max_lines: int                      # per-file content line cap
    max_total_bytes: Optional[int]      # total output budget; None = unlimited
    no_line_break: bool                 # strip line breaks from file contents
    compact: bool                       # minimal framing + ASCII tree
    quiet: bool                         # suppress mirroring output to stdout
class Emitter:
    """
    Streaming writer that tracks the UTF-8 byte count of everything emitted,
    optionally mirroring each write to stdout.
    """

    def __init__(self, out_fh, also_stdout: bool):
        self._sink = out_fh               # destination file handle
        self._mirror_stdout = also_stdout
        self.total_bytes = 0              # UTF-8 bytes written so far

    def write(self, s: str) -> None:
        """Write *s* to the sink (and stdout if mirroring), updating the byte count."""
        self._sink.write(s)
        if self._mirror_stdout:
            sys.stdout.write(s)
        self.total_bytes += len(s.encode("utf-8", errors="replace"))

    def writeln(self, s: str = "") -> None:
        """Write *s* followed by a newline."""
        self.write(f"{s}\n")
# -------------------------------- Implementation ------------------------------
def _delete_output_file_if_exists(output_path: Path) -> None:
    """
    Remove a stale output file (or dangling symlink) before writing,
    preventing self-inclusion and stale content.

    Raises RuntimeError if deletion fails or the path is a directory.
    """
    try:
        if not (output_path.exists() or output_path.is_symlink()):
            return
        if output_path.is_dir():
            # Converted to RuntimeError by the handler below (IsADirectoryError
            # is an OSError subclass).
            raise IsADirectoryError(f"Output path is a directory: {output_path}")
        output_path.unlink()
    except OSError as e:
        raise RuntimeError(f"Failed to delete existing output file '{output_path}': {e}") from e
def _build_rules(cfg: DumpConfig) -> List[IgnoreRule]:
    """
    Assemble the effective ignore rule list, in order: .gitignore (if enabled),
    the extra ignore file, built-in defaults, then --extra-ignore patterns.
    Later rules win downstream (last-match semantics in is_ignored).
    """
    collected: List[IgnoreRule] = []
    if cfg.respect_gitignore:
        gitignore_path = cfg.root / ".gitignore"
        if gitignore_path.exists():
            collected.extend(parse_ignore_file(gitignore_path))
    if cfg.ignore_file and cfg.ignore_file.exists():
        collected.extend(parse_ignore_file(cfg.ignore_file))

    def _add_pattern(raw: str) -> None:
        # Parse one gitignore-ish pattern string into an IgnoreRule.
        text = raw.strip()
        if not text or text.startswith("#"):
            return
        negated = text.startswith("!")
        if negated:
            text = text[1:].strip()
        is_anchored = text.startswith("/")
        if is_anchored:
            text = text[1:]
        is_dir_only = text.endswith("/")
        if is_dir_only:
            text = text[:-1]
        if text:
            collected.append(
                IgnoreRule(negate=negated, pattern=text, dir_only=is_dir_only, anchored=is_anchored)
            )

    for raw in (*DEFAULT_EXTRA_IGNORES, *cfg.extra_ignores):
        _add_pattern(raw)
    return collected
def collect_files(root: Path, rules: List[IgnoreRule], include_globs: Optional[List[str]], exclude_rel: Set[str]) -> List[str]:
    """
    Walk *root* and return sorted posix-style relative paths that survive
    ignore rules, explicit exclusions, and (optionally) include globs.
    Symlinks are not followed; unreadable directories are skipped.
    """
    root = root.resolve()

    def _swallow_walk_error(_err: OSError) -> None:
        # Best-effort traversal: directories we cannot read are skipped.
        return

    selected: List[str] = []
    for dirpath, dirnames, filenames in os.walk(root, onerror=_swallow_walk_error, followlinks=False):
        rel_dir = _normalize_relpath(str(Path(dirpath).relative_to(root)))
        if rel_dir == ".":
            rel_dir = ""

        def _rel(child: str) -> str:
            return _normalize_relpath(str(Path(rel_dir) / child)) if rel_dir else _normalize_relpath(child)

        # Prune ignored directories in place so os.walk never descends into them.
        dirnames[:] = [d for d in dirnames if not is_ignored(_rel(d), is_dir=True, rules=rules)]

        for fname in filenames:
            rel = _rel(fname)
            if rel in exclude_rel:
                continue
            if is_ignored(rel, is_dir=False, rules=rules):
                continue
            if include_globs and not any(fnmatch.fnmatchcase(rel, g) for g in include_globs):
                continue
            selected.append(rel)

    selected.sort()
    return selected
def plan_files(cfg: DumpConfig, rules: List[IgnoreRule]) -> Tuple[List[str], List[FileEntry], int]:
    """
    Collect and classify every file to be dumped.

    Returns (all relative paths for the tree, per-file entries, count of
    binary files that will be omitted from per-file sections).
    """
    # Never include the output file itself if it lives under the root.
    excluded: Set[str] = set()
    try:
        out_rel = _normalize_relpath(str(cfg.output_path.relative_to(cfg.root)))
        if out_rel:
            excluded.add(out_rel)
    except ValueError:
        # Output path lies outside the repository root; nothing to exclude.
        pass

    rel_paths = collect_files(cfg.root, rules, cfg.include_globs, exclude_rel=excluded)

    entries: List[FileEntry] = []
    binary_count = 0
    for rel in rel_paths:
        abs_path = cfg.root / rel
        try:
            size: Optional[int] = abs_path.stat().st_size
        except OSError:
            size = None
        kind = probe_binary_kind(abs_path, abs_path.suffix.lower())
        # Binary/non-text files are omitted from per-file sections entirely
        # (they still appear in the tree).
        if kind == ProbeKind.BINARY:
            binary_count += 1
        # Only readable text files can be prior dumps worth skipping.
        prior_dump = kind == ProbeKind.TEXT and starts_with_textrepo_dump_marker(abs_path)
        entries.append(FileEntry(rel=rel, abs_path=abs_path, size=size, kind=kind, is_prior_dump=prior_dump))
    return rel_paths, entries, binary_count
def emit_header(cfg: DumpConfig, em: Emitter, tree_text: str, total_in_tree: int, emitted_sections: int, omitted_binary: int) -> None:
    """Emit the dump header (magic line, stats, limits) and the folder tree."""
    if cfg.compact:
        # Minimal header to reduce overhead.
        # T=tree count, E=eligible sections count, B=binary omitted count.
        budget = "-" if cfg.max_total_bytes is None else str(cfg.max_total_bytes)
        header_lines = [
            DUMP_MAGIC_LINE,
            f"R {cfg.root}",
            f"T {total_in_tree} E {emitted_sections} B {omitted_binary}",
            f"L b={cfg.max_bytes} l={cfg.max_lines} t={budget} nlb={int(cfg.no_line_break)} gi={int(cfg.respect_gitignore)}",
            "TREE",
            tree_text,
            "--",
        ]
    else:
        header_lines = [
            DUMP_MAGIC_LINE,
            "REPOSITORY SNAPSHOT",
            f"Root: {cfg.root}",
            f"Files in tree (after ignores): {total_in_tree}",
            f"Files emitted in sections: {emitted_sections}",
            f"Binary/non-text omitted from sections: {omitted_binary}",
            f"Per-file limits: max_bytes={cfg.max_bytes}, max_lines={cfg.max_lines}",
            f"No line breaks in file contents: {cfg.no_line_break}",
            f"Respect .gitignore: {cfg.respect_gitignore}",
            f"Extra ignore-file: {str(cfg.ignore_file) if cfg.ignore_file else '(none)'}",
            "",
            "FOLDER STRUCTURE (filtered)",
            tree_text,
            "",
            "=" * 80,
            "",
        ]
    for text_line in header_lines:
        em.writeln(text_line)
def emit_file_section(cfg: DumpConfig, em: Emitter, entry: FileEntry) -> None:
    """
    Emit one file's header and content/skip markers.
    Caller must enforce "omit binary/non-text from sections"; this function
    assumes *entry* is already text-eligible (TEXT or UNREADABLE).

    Output shapes:
      compact: "@@ <rel>[<TAB><size>]" header, raw body, optional
               "~ <lines> <bytes>" truncation marker, closing "@@".
      verbose: "FILE:" / "Size:" header, dashed rule, BEGIN/END framing.

    Skip markers (header is always emitted first, even when content is skipped):
      "! total" / "[SKIPPED CONTENT] max_total_bytes reached."  — budget hit
      "! dump"  / "[SKIPPED CONTENT] previous textrepo dump..." — prior dump
      "! err"   / "[SKIPPED CONTENT] read error..."             — read failure
    """
    # Header
    if cfg.compact:
        # @@ <rel>[\t<size>]
        if entry.size is None:
            em.writeln(f"@@ {entry.rel}")
        else:
            em.writeln(f"@@ {entry.rel}\t{entry.size}")
    else:
        em.writeln(f"FILE: {entry.rel}")
        if entry.size is not None:
            em.writeln(f"Size: {entry.size} bytes")
        em.writeln("-" * 80)
    # max-total-bytes check (after header, consistent with legacy behavior)
    if cfg.max_total_bytes is not None and em.total_bytes >= cfg.max_total_bytes:
        if cfg.compact:
            em.writeln("! total")
            em.writeln("@@")
        else:
            em.writeln("[SKIPPED CONTENT] max_total_bytes reached.")
            em.writeln()
        return
    # Prior dump skip: avoid recursively including an earlier textrepo output.
    if entry.is_prior_dump:
        if cfg.compact:
            em.writeln("! dump")
            em.writeln("@@")
        else:
            em.writeln("[SKIPPED CONTENT] previous textrepo dump detected.")
            em.writeln()
        return
    # Unreadable probe: we still try to read; if it fails, we show a read error.
    try:
        text, truncated, bytes_read, lines_read = read_text_snippet(
            entry.abs_path,
            max_bytes=cfg.max_bytes,
            max_lines=cfg.max_lines,
            no_line_break=cfg.no_line_break,
        )
    except Exception as e:
        if cfg.compact:
            em.writeln(f"! err {type(e).__name__}")
            em.writeln("@@")
        else:
            em.writeln(f"[SKIPPED CONTENT] read error: {type(e).__name__}: {e}")
            em.writeln()
        return
    if cfg.compact:
        # Emit content directly; close with @@
        body = text.rstrip("\n")
        if body:
            em.writeln(body)
        if truncated:
            # ~ <lines> <bytes>
            em.writeln(f"~ {lines_read} {bytes_read}")
        em.writeln("@@")
    else:
        em.writeln("BEGIN")
        em.writeln(text.rstrip("\n"))
        if truncated:
            em.writeln(f"[TRUNCATED] showed ~{lines_read} lines / ~{bytes_read} bytes.")
        em.writeln("END")
        em.writeln()
def format_repo(cfg: DumpConfig) -> None:
    """
    Produce the dump: resolve paths, delete any stale output file, plan the
    file list, then stream the header and every per-file section.

    Fix: the original computed an `emitted_sections` variable by re-filtering
    an already-filtered list and never used it; that dead line is removed.
    """
    # Rebuild the (frozen) config with resolved root and output paths.
    cfg = DumpConfig(
        root=cfg.root.resolve(),
        output_path=cfg.output_path.expanduser().resolve(),
        respect_gitignore=cfg.respect_gitignore,
        extra_ignores=cfg.extra_ignores,
        ignore_file=cfg.ignore_file,
        include_globs=cfg.include_globs,
        max_bytes=cfg.max_bytes,
        max_lines=cfg.max_lines,
        max_total_bytes=cfg.max_total_bytes,
        no_line_break=cfg.no_line_break,
        compact=cfg.compact,
        quiet=cfg.quiet,
    )
    # Delete first so the old dump can neither be re-included nor go stale.
    _delete_output_file_if_exists(cfg.output_path)
    rules = _build_rules(cfg)
    rel_paths, entries, omitted_binary = plan_files(cfg, rules)
    tree_text = build_tree(rel_paths, ascii_only=cfg.compact) if rel_paths else "(no files)"
    # Binary/non-text files are omitted from per-file sections entirely;
    # UNREADABLE entries stay (their section shows a read error).
    eligible_entries = [e for e in entries if e.kind != ProbeKind.BINARY]
    cfg.output_path.parent.mkdir(parents=True, exist_ok=True)
    with cfg.output_path.open("w", encoding="utf-8", errors="replace") as out_f:
        em = Emitter(out_f, also_stdout=(not cfg.quiet))
        emit_header(
            cfg=cfg,
            em=em,
            tree_text=tree_text,
            total_in_tree=len(rel_paths),
            emitted_sections=len(eligible_entries),
            omitted_binary=omitted_binary,
        )
        for entry in eligible_entries:
            emit_file_section(cfg, em, entry)
# ----------------------------------- CLI -------------------------------------
def parse_args(argv: Optional[List[str]] = None) -> DumpConfig:
    """Parse CLI arguments, validate the root directory, and build a DumpConfig."""
    parser = argparse.ArgumentParser(prog="textrepo", add_help=True)
    parser.add_argument("repository_root_directory", type=str)
    parser.add_argument("output_file_path", type=str)
    parser.add_argument("--no-gitignore", action="store_true", help="Do NOT read/respect .gitignore.")
    parser.add_argument(
        "--extra-ignore",
        action="append",
        default=[],
        help="Additional ignore pattern (repeatable). Supports '!' negation and trailing '/' for dirs.",
    )
    parser.add_argument(
        "--ignore-file",
        type=str,
        default=None,
        help="Path to an additional ignore file (gitignore-like patterns).",
    )
    parser.add_argument(
        "--include",
        action="append",
        default=None,
        help="Only include files matching these glob(s) (repeatable). Example: --include 'src/**' --include '*.md'",
    )
    parser.add_argument("--max-bytes", type=int, default=12_000, help="Max bytes of content to emit per file (approx).")
    parser.add_argument("--max-lines", type=int, default=250, help="Max lines of content to emit per file.")
    parser.add_argument(
        "--max-total-bytes",
        type=int,
        default=None,
        help="Stop emitting file contents once total output reaches this many bytes (headers still emitted).",
    )
    parser.add_argument("--no-line-break", action="store_true", help="Remove all line breaks from emitted file contents.")
    parser.add_argument("--compact", action="store_true", help="Reduce added characters (short header + ASCII tree).")
    parser.add_argument("--quiet", action="store_true", help="Do not print to stdout (still writes output file).")
    ns = parser.parse_args(argv)

    repo_root = Path(os.path.expanduser(ns.repository_root_directory))
    if not repo_root.exists():
        raise SystemExit(f"Error: {repo_root} does not exist.")
    if not repo_root.is_dir():
        raise SystemExit(f"Error: {repo_root} is not a directory.")

    return DumpConfig(
        root=repo_root,
        output_path=Path(os.path.expanduser(ns.output_file_path)),
        respect_gitignore=not ns.no_gitignore,
        extra_ignores=ns.extra_ignore or [],
        ignore_file=Path(os.path.expanduser(ns.ignore_file)) if ns.ignore_file else None,
        include_globs=ns.include,
        max_bytes=max(512, ns.max_bytes),  # floor keeps per-file sections meaningful
        max_lines=max(1, ns.max_lines),
        max_total_bytes=ns.max_total_bytes,
        no_line_break=ns.no_line_break,
        compact=ns.compact,
        quiet=ns.quiet,
    )
def main(argv: Optional[List[str]] = None) -> int:
    """CLI entry point; returns a process exit code (0 ok, 130 SIGINT, 2 error)."""
    try:
        format_repo(parse_args(argv))
    except KeyboardInterrupt:
        # Conventional exit code for interruption by Ctrl-C.
        return 130
    except Exception as e:
        print(f"Error: {type(e).__name__}: {e}", file=sys.stderr)
        return 2
    return 0
if __name__ == "__main__":
    # Script entry point: exit the process with main()'s return code.
    raise SystemExit(main())
@albertbuchard
Copy link
Author

If you plan on using it often:

chmod +x textrepo.py
sudo mv textrepo.py /usr/local/bin/textrepo

Use it:

textrepo ~/my/repo my-repo-content.txt

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment