Last active
January 4, 2026 19:20
-
-
Save albertbuchard/991d36041e1b2c93c00ec1b8d09f2716 to your computer and use it in GitHub Desktop.
textrepo: A Python Script to Concatenate All Files in a Repository into a Single Text File, Ignoring Specified Patterns
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| """ | |
| textrepo — dump a repository into a single text file, optimized for LLM input. | |
| Features | |
| - Always prints a folder tree of the discovered files (after ignores). | |
| - Emits a per-file section for every *text-eligible* file (even if skipped/truncated). | |
| - Respects .gitignore by default (disable with --no-gitignore). | |
| - Supports extra ignore patterns and an additional ignore file (gitignore-ish). | |
| - Detects and omits binary / non-text files from the per-file sections | |
| (they still appear in the tree). | |
| - Skips prior textrepo dumps (by magic marker) to avoid recursive inclusion. | |
| - Deletes the output file before writing (prevents self-inclusion / stale output). | |
| - Truncates each file by max lines and approx max bytes. | |
| - Optional --no-line-break to remove all line breaks from emitted file contents. | |
| - Optional --compact to minimize added characters (short header, ASCII tree, compact framing). | |
| - Optional --max-total-bytes to stop emitting content past a total output budget | |
| (headers still emitted for eligible files, consistent with legacy behavior). | |
| - Streams output (does not build the entire dump in memory). | |
| Usage: | |
| textrepo /path/to/repo output.txt | |
| Examples: | |
| textrepo . repo.txt | |
| textrepo . repo.txt --extra-ignore "*.log" --extra-ignore "Saved/" --max-lines 300 | |
| textrepo . repo.txt --no-gitignore | |
| textrepo . repo.txt --ignore-file .textrepoignore | |
| textrepo . repo.txt --no-line-break | |
| textrepo . repo.txt --compact --max-bytes 8000 --max-lines 200 | |
| Notes on ignore semantics: | |
| - This is a lightweight “gitignore-ish” matcher, not a full gitignore spec. | |
| - Rules support: | |
| - comments (#) and empty lines | |
| - negation with leading '!' | |
| - directory-only rules with trailing '/' | |
| - anchored rules with leading '/' | |
| - glob matching via fnmatch | |
| - Pattern behavior: | |
| - If the rule contains a '/', it is matched against the full relative path, and also '**/<pattern>'. | |
| - If the rule does not contain '/', it matches any path component (basename-style). | |
| """ | |
| from __future__ import annotations | |
| import argparse | |
| import fnmatch | |
| import os | |
| import sys | |
| from dataclasses import dataclass | |
| from enum import Enum | |
| from pathlib import Path | |
| from typing import Iterable, List, Optional, Set, Tuple | |
# ----------------------- Dump marker (for self-skip) --------------------------
# First line written to every dump; lets a later run detect and skip its own output.
DUMP_MAGIC_LINE = "TEXTREPO_DUMP_V1"
# Header used by earlier versions of this tool; still recognized for self-skip.
DUMP_LEGACY_PREFIX = "REPOSITORY SNAPSHOT"
| # ---------------------- Gitignore-ish pattern handling ------------------------ | |
@dataclass(frozen=True)
class IgnoreRule:
    # One parsed gitignore-ish rule (see module docstring for semantics).
    negate: bool    # rule came from a '!pattern' line; a match un-ignores the path
    pattern: str    # glob pattern with leading '!', leading '/', trailing '/' stripped
    dir_only: bool  # original pattern ended with '/'; matches directories only
    anchored: bool  # original pattern started with '/'; matched from repo root only
| def _normalize_relpath(relpath: str) -> str: | |
| # Always operate on posix-style, relative paths. | |
| return relpath.replace(os.sep, "/").lstrip("./") | |
def parse_ignore_file(path: Path) -> List[IgnoreRule]:
    """Parse a gitignore-like file into a list of IgnoreRule objects.

    Comment ('#') and blank lines are skipped; an unreadable file yields an
    empty rule list. Supports '!' negation, leading '/' anchoring, and a
    trailing '/' for directory-only rules.
    """
    try:
        raw_lines = path.read_text(encoding="utf-8", errors="replace").splitlines()
    except OSError:
        return []
    parsed: List[IgnoreRule] = []
    for raw in raw_lines:
        text = raw.strip()
        if not text or text.startswith("#"):
            continue
        is_negated = text.startswith("!")
        if is_negated:
            text = text[1:].strip()
            if not text:
                continue
        is_anchored = text.startswith("/")
        if is_anchored:
            text = text[1:]
        only_dirs = text.endswith("/")
        if only_dirs:
            text = text[:-1]
        if text:
            parsed.append(
                IgnoreRule(negate=is_negated, pattern=text, dir_only=only_dirs, anchored=is_anchored)
            )
    return parsed
| def _match_rule(rule: IgnoreRule, rel_posix: str, is_dir: bool) -> bool: | |
| if rule.dir_only and not is_dir: | |
| return False | |
| pat = rule.pattern | |
| # Anchored: match only from repo root. | |
| if rule.anchored: | |
| # "/foo" means "foo" relative to root | |
| return fnmatch.fnmatchcase(rel_posix, pat) | |
| # Unanchored: | |
| # - If pattern contains '/', match against full rel path and also '**/pattern'. | |
| # - If pattern has no '/', match against any path component. | |
| if "/" in pat: | |
| if fnmatch.fnmatchcase(rel_posix, pat): | |
| return True | |
| return fnmatch.fnmatchcase(rel_posix, f"**/{pat}") | |
| # Component/basename style: | |
| # Check each component (gitignore-like basename behavior). | |
| parts = rel_posix.split("/") if rel_posix else [] | |
| return any(fnmatch.fnmatchcase(part, pat) for part in parts) | |
def is_ignored(rel_posix: str, is_dir: bool, rules: List[IgnoreRule]) -> bool:
    """Apply *rules* in order; the last matching rule decides (gitignore-style).

    A matching negated ('!') rule un-ignores the path again.
    """
    verdict = False
    for rule in rules:
        if _match_rule(rule, rel_posix, is_dir):
            verdict = not rule.negate
    return verdict
| # ---------------------------- File reading helpers ---------------------------- | |
# Extensions treated as binary without probing content (lowercase, with dot).
COMMON_BINARY_EXTS = {
    # images
    ".png", ".jpg", ".jpeg", ".gif", ".bmp", ".tga", ".psd", ".exr", ".webp", ".ico",
    # audio/video
    ".mp3", ".wav", ".ogg", ".flac", ".mp4", ".mov", ".avi", ".mkv", ".m4a",
    # archives
    ".zip", ".7z", ".rar", ".tar", ".gz", ".bz2", ".xz",
    # executables / libs
    ".exe", ".dll", ".so", ".dylib",
    # common binary-ish assets (Unreal Engine)
    ".uasset", ".umap", ".pak",
    # documents often not useful as plain text
    ".pdf",
}
class ProbeKind(Enum):
    # Verdict from probing a file to decide whether its content is dumpable text.
    TEXT = "text"              # decodable text; eligible for a per-file section
    BINARY = "binary"          # binary/non-text; shown in tree, omitted from sections
    UNREADABLE = "unreadable"  # probe read failed; still gets a section (with error)
def probe_binary_kind(path: Path, ext: str) -> ProbeKind:
    """
    Classify *path* as TEXT, BINARY, or UNREADABLE.

    BINARY when the extension is a known binary type, or when the first 8 KiB
    contains a NUL byte or falls under 80% printable bytes. UNREADABLE when
    the probe read itself fails — kept distinct from BINARY so such files are
    NOT silently omitted from the output.
    """
    if ext in COMMON_BINARY_EXTS:
        return ProbeKind.BINARY
    try:
        with path.open("rb") as fh:
            head = fh.read(8192)
    except OSError:
        return ProbeKind.UNREADABLE
    if not head:
        # Empty files count as text.
        return ProbeKind.TEXT
    if b"\x00" in head:
        return ProbeKind.BINARY
    # Heuristic: fraction of printable (tab/newline/CR or ASCII 32..126) bytes.
    printable_count = 0
    for byte in head:
        if byte in b"\t\n\r" or 32 <= byte <= 126:
            printable_count += 1
    return ProbeKind.BINARY if printable_count / len(head) < 0.80 else ProbeKind.TEXT
def starts_with_textrepo_dump_marker(path: Path) -> bool:
    """
    Detect a previously generated dump (current magic line or legacy header)
    so it can be skipped, avoiding recursive inclusion of old output.
    """
    try:
        with path.open("rb") as fh:
            prefix = fh.read(4096)
    except OSError:
        return False
    decoded = prefix.decode("utf-8", errors="replace")
    if decoded.startswith("\ufeff"):
        # Drop a UTF-8 BOM before comparing against the markers.
        decoded = decoded.lstrip("\ufeff")
    return decoded.startswith((DUMP_MAGIC_LINE, DUMP_LEGACY_PREFIX))
def read_text_snippet(
    path: Path,
    max_bytes: int,
    max_lines: int,
    no_line_break: bool,
) -> Tuple[str, bool, int, int]:
    """
    Read a bounded snippet of a text file.

    Tries UTF-8 first, then Latin-1 (which cannot fail to decode); re-raises
    the last error if every attempt fails.

    Returns: (text, truncated, approx_bytes_read, lines_read)
    Truncation uses:
      - max_lines cap (exact)
      - max_bytes cap (approx; based on encoding of each line)

    Note: when the byte budget is hit mid-line, the partial line appended via
    a character-count approximation is NOT reflected in the returned byte and
    line counters — callers must treat those counts as approximate.
    """
    encodings = ("utf-8", "latin-1")
    last_err: Optional[Exception] = None
    for enc in encodings:
        try:
            out_parts: List[str] = []
            bytes_so_far = 0
            lines_so_far = 0
            truncated = False
            # errors="strict" so genuinely mis-encoded content raises
            # UnicodeDecodeError and we fall through to the next encoding.
            with path.open("r", encoding=enc, errors="strict") as f:
                for line in f:
                    if lines_so_far >= max_lines:
                        truncated = True
                        break
                    if no_line_break:
                        # remove all line breaks
                        line = line.replace("\n", "").replace("\r", "")
                    line_bytes = len(line.encode(enc, errors="replace"))
                    if bytes_so_far + line_bytes > max_bytes:
                        remaining = max(0, max_bytes - bytes_so_far)
                        if remaining > 0:
                            # approximate slice by chars (byte-perfect not required)
                            out_parts.append(line[: max(1, min(len(line), remaining))])
                        truncated = True
                        break
                    out_parts.append(line)
                    bytes_so_far += line_bytes
                    lines_so_far += 1
            return ("".join(out_parts), truncated, bytes_so_far, lines_so_far)
        except (UnicodeDecodeError, OSError) as e:
            last_err = e
            continue
    raise last_err if last_err else IOError("Failed to read file")
| # ------------------------------ Tree formatting ------------------------------ | |
def build_tree(paths: List[str], ascii_only: bool) -> str:
    """
    Render a deterministic tree view of posix-style relative paths.

    Directories are listed (sorted) before files at each level; the result is
    a single newline-joined string.
    """
    # Nested dicts keyed by directory name; "__files__" holds leaf file names.
    root: dict = {}
    for rel in paths:
        segments = rel.split("/")
        cursor = root
        for seg in segments[:-1]:
            cursor = cursor.setdefault(seg, {})
        cursor.setdefault("__files__", []).append(segments[-1])

    if ascii_only:
        mid_branch, last_branch = "|-- ", "`-- "
        mid_indent, last_indent = "|   ", "    "
    else:
        mid_branch, last_branch = "├── ", "└── "
        mid_indent, last_indent = "│   ", "    "

    def walk(node: dict, prefix: str = "") -> List[str]:
        out: List[str] = []
        subdirs = sorted(k for k in node if k != "__files__")
        leaf_files = sorted(node.get("__files__", []))
        for idx, name in enumerate(subdirs):
            # A directory is "last" only when it is the final dir AND no files
            # follow it at this level.
            at_end = idx == len(subdirs) - 1 and not leaf_files
            out.append(prefix + (last_branch if at_end else mid_branch) + name + "/")
            out.extend(walk(node[name], prefix + (last_indent if at_end else mid_indent)))
        for idx, name in enumerate(leaf_files):
            at_end = idx == len(leaf_files) - 1
            out.append(prefix + (last_branch if at_end else mid_branch) + name)
        return out

    return "\n".join(walk(root))
| # --------------------------------- Defaults ---------------------------------- | |
# Built-in ignore patterns, always applied (before CLI --extra-ignore rules).
DEFAULT_EXTRA_IGNORES = [
    # VCS / IDE
    ".git/", ".hg/", ".svn/", ".idea/", ".vscode/",
    # Python
    "__pycache__/", "*.pyc", "*.pyo", "*.pyd",
    "venv/", ".venv/", ".tox/", ".pytest_cache/", ".mypy_cache/", ".ruff_cache/", ".hypothesis/",
    ".coverage", "coverage/",
    # OS / editor junk
    ".DS_Store", "Thumbs.db",
    # Node / web
    "node_modules/", "dist/", "build/", ".next/", ".turbo/", ".parcel-cache/",
    "package-lock.json", "yarn.lock", "pnpm-lock.yaml",
    # Cloudflare workers / wrangler
    ".wrangler/",
    # Terraform / direnv / misc caches
    ".terraform/", ".direnv/", ".cache/",
]
| # --------------------------------- Core model -------------------------------- | |
@dataclass(frozen=True)
class FileEntry:
    # One discovered file (after ignore filtering) with probe results attached.
    rel: str              # posix-style path relative to the repo root
    abs_path: Path        # absolute on-disk path
    size: Optional[int]   # st_size in bytes, or None if stat() failed
    kind: ProbeKind       # TEXT / BINARY / UNREADABLE probe verdict
    is_prior_dump: bool   # True if the file starts with a textrepo dump marker
@dataclass(frozen=True)
class DumpConfig:
    # All settings for one dump run; paths are resolved in format_repo().
    root: Path                          # repository root to walk
    output_path: Path                   # destination text file
    respect_gitignore: bool             # read <root>/.gitignore when True
    extra_ignores: List[str]            # extra gitignore-ish patterns from the CLI
    ignore_file: Optional[Path]         # optional additional ignore file
    include_globs: Optional[List[str]]  # if set, only paths matching these globs
    max_bytes: int                      # per-file content byte cap (approximate)
    max_lines: int                      # per-file content line cap
    max_total_bytes: Optional[int]      # overall output budget; None = unlimited
    no_line_break: bool                 # strip all newlines from emitted content
    compact: bool                       # terse header + ASCII tree framing
    quiet: bool                         # suppress mirroring output to stdout
class Emitter:
    """
    Streaming writer that mirrors output and counts emitted UTF-8 bytes.
    """

    def __init__(self, out_fh, also_stdout: bool):
        self._out = out_fh
        self._mirror = also_stdout
        # Running UTF-8 byte count of everything written so far.
        self.total_bytes = 0

    def write(self, s: str) -> None:
        """Write *s* to the file (and stdout when mirroring), tracking bytes."""
        self._out.write(s)
        encoded = s.encode("utf-8", errors="replace")
        self.total_bytes += len(encoded)
        if self._mirror:
            sys.stdout.write(s)

    def writeln(self, s: str = "") -> None:
        """Write *s* followed by a newline."""
        self.write(f"{s}\n")
| # -------------------------------- Implementation ------------------------------ | |
| def _delete_output_file_if_exists(output_path: Path) -> None: | |
| try: | |
| if output_path.exists() or output_path.is_symlink(): | |
| if output_path.is_dir(): | |
| raise IsADirectoryError(f"Output path is a directory: {output_path}") | |
| output_path.unlink() | |
| except OSError as e: | |
| raise RuntimeError(f"Failed to delete existing output file '{output_path}': {e}") from e | |
def _build_rules(cfg: DumpConfig) -> List[IgnoreRule]:
    """
    Assemble the full ignore-rule list, in precedence order: .gitignore
    (optional), the extra ignore file (optional), built-in defaults, then
    CLI --extra-ignore patterns. Later rules win (last match decides).
    """
    rules: List[IgnoreRule] = []
    if cfg.respect_gitignore:
        gitignore_path = cfg.root / ".gitignore"
        if gitignore_path.exists():
            rules.extend(parse_ignore_file(gitignore_path))
    if cfg.ignore_file and cfg.ignore_file.exists():
        rules.extend(parse_ignore_file(cfg.ignore_file))

    def add_rule(pat: str) -> None:
        # Parse one gitignore-ish pattern string (same grammar as
        # parse_ignore_file) and append the resulting rule.
        text = pat.strip()
        if not text or text.startswith("#"):
            return
        negated = text.startswith("!")
        if negated:
            text = text[1:].strip()
        anchored = text.startswith("/")
        if anchored:
            text = text[1:]
        dir_only = text.endswith("/")
        if dir_only:
            text = text[:-1]
        if text:
            rules.append(IgnoreRule(negate=negated, pattern=text, dir_only=dir_only, anchored=anchored))

    for pattern in DEFAULT_EXTRA_IGNORES:
        add_rule(pattern)
    for pattern in cfg.extra_ignores:
        add_rule(pattern)
    return rules
def collect_files(root: Path, rules: List[IgnoreRule], include_globs: Optional[List[str]], exclude_rel: Set[str]) -> List[str]:
    """
    Walk *root* and return sorted posix-style relative paths of every file
    that survives the ignore rules, the explicit exclude set, and (when
    given) the --include globs. Ignored directories are pruned in place so
    os.walk never descends into them.
    """
    root = root.resolve()
    found: List[str] = []

    def swallow_walk_error(_err: OSError) -> None:
        # Best-effort traversal: unreadable directories are simply skipped.
        return

    for dirpath, dirnames, filenames in os.walk(root, onerror=swallow_walk_error, followlinks=False):
        rel_dir = _normalize_relpath(str(Path(dirpath).relative_to(root)))
        if rel_dir == ".":
            rel_dir = ""

        def to_rel(name: str) -> str:
            return _normalize_relpath(str(Path(rel_dir) / name)) if rel_dir else _normalize_relpath(name)

        # Prune ignored directories in place so os.walk skips them entirely.
        dirnames[:] = [d for d in dirnames if not is_ignored(to_rel(d), is_dir=True, rules=rules)]

        for name in filenames:
            rel = to_rel(name)
            if rel in exclude_rel:
                continue
            if is_ignored(rel, is_dir=False, rules=rules):
                continue
            if include_globs and not any(fnmatch.fnmatchcase(rel, g) for g in include_globs):
                continue
            found.append(rel)

    found.sort()
    return found
def plan_files(cfg: DumpConfig, rules: List[IgnoreRule]) -> Tuple[List[str], List[FileEntry], int]:
    """
    Discover all files and probe each one.

    Returns a tuple of (relative paths for the tree, one FileEntry per path,
    count of binary/non-text files that will be omitted from sections).
    The output file itself is excluded when it lives under the root.
    """
    excluded: Set[str] = set()
    try:
        out_rel = _normalize_relpath(str(cfg.output_path.relative_to(cfg.root)))
        if out_rel:
            excluded.add(out_rel)
    except ValueError:
        # Output path lives outside the repo root; nothing to exclude.
        pass

    rel_paths = collect_files(cfg.root, rules, cfg.include_globs, exclude_rel=excluded)
    entries: List[FileEntry] = []
    binary_count = 0
    for rel in rel_paths:
        abs_path = cfg.root / rel
        try:
            size: Optional[int] = abs_path.stat().st_size
        except OSError:
            size = None
        kind = probe_binary_kind(abs_path, abs_path.suffix.lower())
        if kind == ProbeKind.BINARY:
            # Binary files appear in the tree but get no per-file section.
            binary_count += 1
        # Only readable text files can be a prior textrepo dump.
        prior_dump = kind == ProbeKind.TEXT and starts_with_textrepo_dump_marker(abs_path)
        entries.append(FileEntry(rel=rel, abs_path=abs_path, size=size, kind=kind, is_prior_dump=prior_dump))
    return rel_paths, entries, binary_count
def emit_header(cfg: DumpConfig, em: Emitter, tree_text: str, total_in_tree: int, emitted_sections: int, omitted_binary: int) -> None:
    """
    Write the dump preamble: magic marker, run summary, and the folder tree.
    Compact mode uses terse single-letter keys to minimize added characters.
    """
    # Both modes start with the magic marker so later runs can self-skip.
    em.writeln(DUMP_MAGIC_LINE)
    if cfg.compact:
        em.writeln(f"R {cfg.root}")
        # T=tree count, E=eligible sections count, B=binary omitted count
        em.writeln(f"T {total_in_tree} E {emitted_sections} B {omitted_binary}")
        total_budget = "-" if cfg.max_total_bytes is None else str(cfg.max_total_bytes)
        em.writeln(f"L b={cfg.max_bytes} l={cfg.max_lines} t={total_budget} nlb={int(cfg.no_line_break)} gi={int(cfg.respect_gitignore)}")
        em.writeln("TREE")
        em.writeln(tree_text)
        em.writeln("--")
        return
    em.writeln("REPOSITORY SNAPSHOT")
    em.writeln(f"Root: {cfg.root}")
    em.writeln(f"Files in tree (after ignores): {total_in_tree}")
    em.writeln(f"Files emitted in sections: {emitted_sections}")
    em.writeln(f"Binary/non-text omitted from sections: {omitted_binary}")
    em.writeln(f"Per-file limits: max_bytes={cfg.max_bytes}, max_lines={cfg.max_lines}")
    em.writeln(f"No line breaks in file contents: {cfg.no_line_break}")
    em.writeln(f"Respect .gitignore: {cfg.respect_gitignore}")
    em.writeln(f"Extra ignore-file: {str(cfg.ignore_file) if cfg.ignore_file else '(none)'}")
    em.writeln()
    em.writeln("FOLDER STRUCTURE (filtered)")
    em.writeln(tree_text)
    em.writeln()
    em.writeln("=" * 80)
    em.writeln()
def emit_file_section(cfg: DumpConfig, em: Emitter, entry: FileEntry) -> None:
    """
    Emit one file's header and content/skip markers.

    Caller must enforce "omit binary/non-text from sections" — this function
    assumes *entry* is text-eligible (TEXT or UNREADABLE).

    Output order in both modes: header first, then exactly one of a
    total-budget skip marker, a prior-dump skip marker, a read-error marker,
    or the (possibly truncated) file content.
    """
    # Header
    if cfg.compact:
        # @@ <rel>[\t<size>]
        if entry.size is None:
            em.writeln(f"@@ {entry.rel}")
        else:
            em.writeln(f"@@ {entry.rel}\t{entry.size}")
    else:
        em.writeln(f"FILE: {entry.rel}")
        if entry.size is not None:
            em.writeln(f"Size: {entry.size} bytes")
        em.writeln("-" * 80)
    # max-total-bytes check (after header, consistent with legacy behavior)
    if cfg.max_total_bytes is not None and em.total_bytes >= cfg.max_total_bytes:
        if cfg.compact:
            em.writeln("! total")
            em.writeln("@@")
        else:
            em.writeln("[SKIPPED CONTENT] max_total_bytes reached.")
            em.writeln()
        return
    # Prior dump skip
    if entry.is_prior_dump:
        if cfg.compact:
            em.writeln("! dump")
            em.writeln("@@")
        else:
            em.writeln("[SKIPPED CONTENT] previous textrepo dump detected.")
            em.writeln()
        return
    # Unreadable probe: we still try to read; if it fails, we show a read error.
    try:
        text, truncated, bytes_read, lines_read = read_text_snippet(
            entry.abs_path,
            max_bytes=cfg.max_bytes,
            max_lines=cfg.max_lines,
            no_line_break=cfg.no_line_break,
        )
    except Exception as e:
        # Read failed after the header was already written; emit an error
        # marker so the section is visibly present but content-free.
        if cfg.compact:
            em.writeln(f"! err {type(e).__name__}")
            em.writeln("@@")
        else:
            em.writeln(f"[SKIPPED CONTENT] read error: {type(e).__name__}: {e}")
            em.writeln()
        return
    if cfg.compact:
        # Emit content directly; close with @@
        body = text.rstrip("\n")
        if body:
            em.writeln(body)
        if truncated:
            # ~ <lines> <bytes>
            em.writeln(f"~ {lines_read} {bytes_read}")
        em.writeln("@@")
    else:
        em.writeln("BEGIN")
        em.writeln(text.rstrip("\n"))
        if truncated:
            em.writeln(f"[TRUNCATED] showed ~{lines_read} lines / ~{bytes_read} bytes.")
        em.writeln("END")
        em.writeln()
def format_repo(cfg: DumpConfig) -> None:
    """
    Top-level driver: resolve paths, delete any stale output, plan the file
    list, then stream the header and each eligible per-file section.

    Binary/non-text files are omitted from sections entirely; unreadable
    files still get a section (their read error is reported inside it).

    Fix: the previous version computed a second, redundant filter of the
    eligible entries into an unused ``emitted_sections`` variable; the
    single ``eligible_entries`` list is now used for both the header count
    and the emission loop.
    """
    # Rebuild the (frozen) config with fully resolved paths so all later
    # relative_to() / comparison logic operates on absolute paths.
    cfg = DumpConfig(
        root=cfg.root.resolve(),
        output_path=cfg.output_path.expanduser().resolve(),
        respect_gitignore=cfg.respect_gitignore,
        extra_ignores=cfg.extra_ignores,
        ignore_file=cfg.ignore_file,
        include_globs=cfg.include_globs,
        max_bytes=cfg.max_bytes,
        max_lines=cfg.max_lines,
        max_total_bytes=cfg.max_total_bytes,
        no_line_break=cfg.no_line_break,
        compact=cfg.compact,
        quiet=cfg.quiet,
    )
    # Delete first so a previous dump can never include itself.
    _delete_output_file_if_exists(cfg.output_path)
    rules = _build_rules(cfg)
    rel_paths, entries, omitted_binary = plan_files(cfg, rules)
    tree_text = build_tree(rel_paths, ascii_only=cfg.compact) if rel_paths else "(no files)"
    eligible_entries = [e for e in entries if e.kind != ProbeKind.BINARY]
    cfg.output_path.parent.mkdir(parents=True, exist_ok=True)
    with cfg.output_path.open("w", encoding="utf-8", errors="replace") as out_f:
        em = Emitter(out_f, also_stdout=(not cfg.quiet))
        emit_header(
            cfg=cfg,
            em=em,
            tree_text=tree_text,
            total_in_tree=len(rel_paths),
            emitted_sections=len(eligible_entries),
            omitted_binary=omitted_binary,
        )
        for entry in eligible_entries:
            # Binary/non-text entries were filtered above; unreadable files
            # remain and will report their read error in the section body.
            emit_file_section(cfg, em, entry)
| # ----------------------------------- CLI ------------------------------------- | |
def parse_args(argv: Optional[List[str]] = None) -> DumpConfig:
    """
    Build the CLI parser, parse *argv*, validate the repository root, and
    return a DumpConfig. Raises SystemExit with a message on bad input.
    """
    parser = argparse.ArgumentParser(prog="textrepo", add_help=True)
    parser.add_argument("repository_root_directory", type=str)
    parser.add_argument("output_file_path", type=str)
    parser.add_argument("--no-gitignore", action="store_true", help="Do NOT read/respect .gitignore.")
    parser.add_argument(
        "--extra-ignore",
        action="append",
        default=[],
        help="Additional ignore pattern (repeatable). Supports '!' negation and trailing '/' for dirs.",
    )
    parser.add_argument(
        "--ignore-file",
        type=str,
        default=None,
        help="Path to an additional ignore file (gitignore-like patterns).",
    )
    parser.add_argument(
        "--include",
        action="append",
        default=None,
        help="Only include files matching these glob(s) (repeatable). Example: --include 'src/**' --include '*.md'",
    )
    parser.add_argument("--max-bytes", type=int, default=12_000, help="Max bytes of content to emit per file (approx).")
    parser.add_argument("--max-lines", type=int, default=250, help="Max lines of content to emit per file.")
    parser.add_argument(
        "--max-total-bytes",
        type=int,
        default=None,
        help="Stop emitting file contents once total output reaches this many bytes (headers still emitted).",
    )
    parser.add_argument("--no-line-break", action="store_true", help="Remove all line breaks from emitted file contents.")
    parser.add_argument("--compact", action="store_true", help="Reduce added characters (short header + ASCII tree).")
    parser.add_argument("--quiet", action="store_true", help="Do not print to stdout (still writes output file).")
    ns = parser.parse_args(argv)

    repo_root = Path(os.path.expanduser(ns.repository_root_directory))
    if not repo_root.exists():
        raise SystemExit(f"Error: {repo_root} does not exist.")
    if not repo_root.is_dir():
        raise SystemExit(f"Error: {repo_root} is not a directory.")

    extra_ignore_path = Path(os.path.expanduser(ns.ignore_file)) if ns.ignore_file else None
    return DumpConfig(
        root=repo_root,
        output_path=Path(os.path.expanduser(ns.output_file_path)),
        respect_gitignore=not ns.no_gitignore,
        extra_ignores=ns.extra_ignore or [],
        ignore_file=extra_ignore_path,
        include_globs=ns.include,
        # Clamp limits to sane minimums so pathological values cannot
        # produce useless or empty dumps.
        max_bytes=max(512, ns.max_bytes),
        max_lines=max(1, ns.max_lines),
        max_total_bytes=ns.max_total_bytes,
        no_line_break=ns.no_line_break,
        compact=ns.compact,
        quiet=ns.quiet,
    )
def main(argv: Optional[List[str]] = None) -> int:
    """
    CLI entry point. Exit codes: 0 on success, 130 on Ctrl-C, 2 on any
    other error (reported to stderr).
    """
    try:
        format_repo(parse_args(argv))
    except KeyboardInterrupt:
        return 130
    except Exception as e:  # top-level boundary: report and exit non-zero
        print(f"Error: {type(e).__name__}: {e}", file=sys.stderr)
        return 2
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
Author
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
If you plan on using it often, put the script on your PATH as `textrepo`.
Then use it:
textrepo ~/my/repo my-repo-content.txt