Skip to content

Instantly share code, notes, and snippets.

@albertbuchard
Last active January 4, 2026 19:20
Show Gist options
  • Select an option

  • Save albertbuchard/991d36041e1b2c93c00ec1b8d09f2716 to your computer and use it in GitHub Desktop.

Select an option

Save albertbuchard/991d36041e1b2c93c00ec1b8d09f2716 to your computer and use it in GitHub Desktop.
textrepo: A Python Script to Concatenate All Files in a Repository into a Single Text File, Ignoring Specified Patterns
#!/usr/bin/env python3
"""
textrepo — dump a repository into a single text file, optimized for LLM input.
Features
- Always prints a folder tree of the discovered files (after ignores).
- Emits a per-file section for every *text-eligible* file (even if skipped/truncated).
- Respects .gitignore by default (disable with --no-gitignore).
- Supports extra ignore patterns and an additional ignore file (gitignore-ish).
- Detects and omits binary / non-text files from the per-file sections
(they still appear in the tree).
- Skips prior textrepo dumps (by magic marker) to avoid recursive inclusion.
- Deletes the output file before writing (prevents self-inclusion / stale output).
- Truncates each file by max lines and approx max bytes.
- Optional --no-line-break to remove all line breaks from emitted file contents.
- Optional --compact to minimize added characters (short header, ASCII tree, compact framing).
- Optional --max-total-bytes to stop emitting content past a total output budget
(headers still emitted for eligible files, consistent with legacy behavior).
- Streams output (does not build the entire dump in memory).
Usage:
textrepo /path/to/repo output.txt
Examples:
textrepo . repo.txt
textrepo . repo.txt --extra-ignore "*.log" --extra-ignore "Saved/" --max-lines 300
textrepo . repo.txt --no-gitignore
textrepo . repo.txt --ignore-file .textrepoignore
textrepo . repo.txt --no-line-break
textrepo . repo.txt --compact --max-bytes 8000 --max-lines 200
Notes on ignore semantics:
- This is a lightweight “gitignore-ish” matcher, not a full gitignore spec.
- Rules support:
- comments (#) and empty lines
- negation with leading '!'
- directory-only rules with trailing '/'
- anchored rules with leading '/'
- glob matching via fnmatch
- Pattern behavior:
- If the rule contains a '/', it is matched against the full relative path, and also '**/<pattern>'.
- If the rule does not contain '/', it matches any path component (basename-style).
"""
from __future__ import annotations
import argparse
import fnmatch
import os
import sys
from dataclasses import dataclass
from enum import Enum
from pathlib import Path
from typing import Iterable, List, Optional, Set, Tuple
# ----------------------- Dump marker (for self-skip) --------------------------
# First line written to every dump; later runs detect it and skip the file
# to avoid recursively including a previous dump.
DUMP_MAGIC_LINE = "TEXTREPO_DUMP_V1"
# Header emitted by older versions of this tool; still recognized for skipping.
DUMP_LEGACY_PREFIX = "REPOSITORY SNAPSHOT"
# ---------------------- Gitignore-ish pattern handling ------------------------
@dataclass(frozen=True)
class IgnoreRule:
    """A single parsed gitignore-ish rule."""

    negate: bool    # True for '!'-prefixed rules (un-ignore on match)
    pattern: str    # glob pattern with '!' / leading '/' / trailing '/' stripped
    dir_only: bool  # True if the rule had a trailing '/' (directories only)
    anchored: bool  # True if the rule had a leading '/' (match from root)


def _normalize_relpath(relpath: str) -> str:
    """Normalize *relpath* to a posix-style relative path.

    Converts OS separators to '/' and removes any leading './' prefixes.

    Bug fix: the previous ``lstrip("./")`` stripped any run of '.' and '/'
    characters, mangling dotfile names (".git" -> "git", ".env" -> "env")
    and silently breaking every dotfile ignore rule, including the built-in
    ".git/" default.
    """
    p = relpath.replace(os.sep, "/")
    while p.startswith("./"):
        p = p[2:]
    return p


def parse_ignore_file(path: Path) -> List[IgnoreRule]:
    """Parse a gitignore-like file into IgnoreRule objects.

    Supports comments (#), blank lines, '!' negation, leading '/' anchoring,
    and trailing '/' directory-only rules. An unreadable file yields an
    empty list (best-effort behavior).
    """
    rules: List[IgnoreRule] = []
    try:
        lines = path.read_text(encoding="utf-8", errors="replace").splitlines()
    except OSError:
        return rules
    for raw in lines:
        s = raw.strip()
        if not s or s.startswith("#"):
            continue
        negate = s.startswith("!")
        if negate:
            s = s[1:].strip()
        if not s:
            continue
        anchored = s.startswith("/")
        if anchored:
            s = s[1:]
        dir_only = s.endswith("/")
        if dir_only:
            s = s[:-1]
        if not s:
            # Rule reduced to nothing (e.g. "!" or "/" alone): skip it.
            continue
        rules.append(IgnoreRule(negate=negate, pattern=s, dir_only=dir_only, anchored=anchored))
    return rules
def _match_rule(rule: IgnoreRule, rel_posix: str, is_dir: bool) -> bool:
    """Return True when *rule* matches the relative posix path *rel_posix*."""
    if rule.dir_only and not is_dir:
        # Directory-only rules never match plain files.
        return False
    pattern = rule.pattern
    if rule.anchored:
        # Anchored rules ("/foo") are matched from the repo root only.
        return fnmatch.fnmatchcase(rel_posix, pattern)
    if "/" in pattern:
        # Path-style rule: try the full relative path, then any depth below it.
        if fnmatch.fnmatchcase(rel_posix, pattern):
            return True
        return fnmatch.fnmatchcase(rel_posix, f"**/{pattern}")
    # Basename-style rule: a match on any single path component suffices
    # (gitignore-like behavior).
    components = rel_posix.split("/") if rel_posix else []
    return any(fnmatch.fnmatchcase(component, pattern) for component in components)


def is_ignored(rel_posix: str, is_dir: bool, rules: List[IgnoreRule]) -> bool:
    """Gitignore-style resolution: the last matching rule decides; '!' rules un-ignore."""
    verdict = False
    for rule in rules:
        if _match_rule(rule, rel_posix, is_dir):
            verdict = not rule.negate
    return verdict
# ---------------------------- File reading helpers ----------------------------
# Extensions treated as binary without content probing (lowercase, with dot).
COMMON_BINARY_EXTS = {
    # images
    ".png", ".jpg", ".jpeg", ".gif", ".bmp", ".tga", ".psd", ".exr", ".webp", ".ico",
    # audio/video
    ".mp3", ".wav", ".ogg", ".flac", ".mp4", ".mov", ".avi", ".mkv", ".m4a",
    # archives
    ".zip", ".7z", ".rar", ".tar", ".gz", ".bz2", ".xz",
    # executables / libs
    ".exe", ".dll", ".so", ".dylib",
    # common binary-ish assets
    ".uasset", ".umap", ".pak",
    # documents often not useful as plain text
    ".pdf",
}


class ProbeKind(Enum):
    """Classification of a probed file."""

    TEXT = "text"              # readable text; content will be emitted
    BINARY = "binary"          # binary/non-text; omitted from per-file sections
    UNREADABLE = "unreadable"  # probe could not open/read the file


def probe_binary_kind(path: Path, ext: str) -> ProbeKind:
    """
    Decide whether a file is binary/non-text.
    - Returns BINARY if extension is in COMMON_BINARY_EXTS or content looks binary.
    - Returns UNREADABLE if the probe can't read the file.
    - Returns TEXT otherwise.
    Important: we treat UNREADABLE separately so it is NOT silently omitted.

    Improvement: a chunk that decodes as valid UTF-8 is accepted as text even
    when mostly non-ASCII. The old printable-byte-ratio heuristic alone
    misclassified e.g. CJK or heavily accented UTF-8 text files as binary.
    """
    if ext in COMMON_BINARY_EXTS:
        return ProbeKind.BINARY
    try:
        with path.open("rb") as f:
            chunk = f.read(8192)
    except OSError:
        return ProbeKind.UNREADABLE
    if not chunk:
        return ProbeKind.TEXT
    if b"\x00" in chunk:
        return ProbeKind.BINARY
    # Valid UTF-8 counts as text regardless of how non-ASCII it is.
    try:
        chunk.decode("utf-8")
        return ProbeKind.TEXT
    except UnicodeDecodeError as e:
        # The 8 KiB read may have split a multi-byte sequence at the chunk
        # boundary; a failure only within the last few bytes is still UTF-8.
        if e.start >= len(chunk) - 3:
            return ProbeKind.TEXT
    # Fallback heuristic for legacy 8-bit encodings: ratio of printable bytes.
    printable = sum(b in b"\t\n\r" or 32 <= b <= 126 for b in chunk)
    if printable / len(chunk) < 0.80:
        return ProbeKind.BINARY
    return ProbeKind.TEXT
def starts_with_textrepo_dump_marker(path: Path) -> bool:
    """
    True if file content starts with this tool's marker (or a legacy header),
    allowing us to skip previously generated dumps to avoid recursion.
    Unreadable files are reported as non-dumps.
    """
    try:
        with path.open("rb") as fh:
            head_bytes = fh.read(4096)
    except OSError:
        return False
    head = head_bytes.decode("utf-8", errors="replace")
    # Tolerate a UTF-8 BOM before the marker line.
    if head.startswith("\ufeff"):
        head = head.lstrip("\ufeff")
    return head.startswith((DUMP_MAGIC_LINE, DUMP_LEGACY_PREFIX))
def _read_capped(fh, encoding: str, max_bytes: int, max_lines: int, no_line_break: bool) -> Tuple[str, bool, int, int]:
    # Accumulate lines from an open text handle until either cap is hit.
    pieces: List[str] = []
    byte_count = 0
    line_count = 0
    was_truncated = False
    for line in fh:
        if line_count >= max_lines:
            was_truncated = True
            break
        if no_line_break:
            # Strip all line breaks from the emitted content.
            line = line.replace("\n", "").replace("\r", "")
        encoded_len = len(line.encode(encoding, errors="replace"))
        if byte_count + encoded_len > max_bytes:
            remaining = max(0, max_bytes - byte_count)
            if remaining > 0:
                # Approximate the byte budget by slicing characters
                # (byte-perfect precision is not required here).
                pieces.append(line[: max(1, min(len(line), remaining))])
            was_truncated = True
            break
        pieces.append(line)
        byte_count += encoded_len
        line_count += 1
    return ("".join(pieces), was_truncated, byte_count, line_count)


def read_text_snippet(
    path: Path,
    max_bytes: int,
    max_lines: int,
    no_line_break: bool,
) -> Tuple[str, bool, int, int]:
    """
    Read a capped snippet of *path*, trying UTF-8 first and latin-1 second.

    Returns: (text, truncated, approx_bytes_read, lines_read)
    Truncation uses:
    - max_lines cap (exact)
    - max_bytes cap (approx; based on encoding of each line)
    """
    last_error: Optional[Exception] = None
    for encoding in ("utf-8", "latin-1"):
        try:
            with path.open("r", encoding=encoding, errors="strict") as fh:
                return _read_capped(fh, encoding, max_bytes, max_lines, no_line_break)
        except (UnicodeDecodeError, OSError) as err:
            last_error = err
            continue
    raise last_error if last_error else IOError("Failed to read file")
# ------------------------------ Tree formatting ------------------------------
def build_tree(paths: List[str], ascii_only: bool) -> str:
    """
    Render a deterministic tree view from posix-style relative paths.
    Directories are listed (sorted) before files (sorted) at each level.
    """
    root: dict = {}
    for rel in paths:
        segments = rel.split("/")
        cursor = root
        for seg in segments[:-1]:
            cursor = cursor.setdefault(seg, {})
        cursor.setdefault("__files__", []).append(segments[-1])

    if ascii_only:
        mid_branch, last_branch = "|-- ", "`-- "
        mid_extend, last_extend = "| ", " "
    else:
        mid_branch, last_branch = "├── ", "└── "
        mid_extend, last_extend = "│ ", " "

    def _render(node: dict, prefix: str = "") -> List[str]:
        out: List[str] = []
        subdirs = sorted(k for k in node if k != "__files__")
        leaf_files = sorted(node.get("__files__", []))
        for idx, name in enumerate(subdirs):
            # A directory is drawn as "last" only when no files follow it.
            last = (idx == len(subdirs) - 1) and not leaf_files
            out.append(prefix + (last_branch if last else mid_branch) + name + "/")
            out.extend(_render(node[name], prefix + (last_extend if last else mid_extend)))
        for idx, name in enumerate(leaf_files):
            last = idx == len(leaf_files) - 1
            out.append(prefix + (last_branch if last else mid_branch) + name)
        return out

    return "\n".join(_render(root))
# --------------------------------- Defaults ----------------------------------
# Built-in ignore patterns applied on every run, in addition to .gitignore
# and user-supplied patterns (gitignore-ish syntax; trailing '/' = directory).
DEFAULT_EXTRA_IGNORES = [
    # VCS / IDE
    ".git/", ".hg/", ".svn/", ".idea/", ".vscode/",
    # Python
    "__pycache__/", "*.pyc", "*.pyo", "*.pyd",
    "venv/", ".venv/", ".tox/", ".pytest_cache/", ".mypy_cache/", ".ruff_cache/", ".hypothesis/",
    ".coverage", "coverage/",
    # OS / editor junk
    ".DS_Store", "Thumbs.db",
    # Node / web
    "node_modules/", "dist/", "build/", ".next/", ".turbo/", ".parcel-cache/",
    "package-lock.json", "yarn.lock", "pnpm-lock.yaml",
    # Cloudflare workers / wrangler
    ".wrangler/",
    # Terraform / direnv / misc caches
    ".terraform/", ".direnv/", ".cache/",
]
# --------------------------------- Core model --------------------------------
@dataclass(frozen=True)
class FileEntry:
    """A discovered file together with its probe classification."""

    rel: str             # posix-style path relative to the repository root
    abs_path: Path       # absolute path on disk
    size: Optional[int]  # byte size from stat(), or None if stat failed
    kind: ProbeKind      # TEXT / BINARY / UNREADABLE probe verdict
    is_prior_dump: bool  # True if the file starts with a textrepo dump marker
@dataclass(frozen=True)
class DumpConfig:
    """Resolved configuration for a single dump invocation."""

    root: Path                          # repository root to walk
    output_path: Path                   # destination dump file
    respect_gitignore: bool             # read <root>/.gitignore when True
    extra_ignores: List[str]            # extra ignore patterns from the CLI
    ignore_file: Optional[Path]         # optional additional ignore file
    include_globs: Optional[List[str]]  # if set, only paths matching these globs
    max_bytes: int                      # per-file content byte cap (approx)
    max_lines: int                      # per-file content line cap
    max_total_bytes: Optional[int]      # total output budget; None = unlimited
    no_line_break: bool                 # strip line breaks from file contents
    compact: bool                       # minimal framing + ASCII tree
    quiet: bool                         # suppress mirroring output to stdout
class Emitter:
    """
    Streaming writer that tracks the UTF-8 byte count of everything emitted,
    optionally mirroring each write to stdout.
    """

    def __init__(self, out_fh, also_stdout: bool):
        self._sink = out_fh               # destination file handle
        self._mirror_stdout = also_stdout
        self.total_bytes = 0              # UTF-8 bytes written so far

    def write(self, s: str) -> None:
        """Write *s* to the sink (and stdout if mirroring), updating the byte count."""
        self._sink.write(s)
        if self._mirror_stdout:
            sys.stdout.write(s)
        self.total_bytes += len(s.encode("utf-8", errors="replace"))

    def writeln(self, s: str = "") -> None:
        """Write *s* followed by a newline."""
        self.write(f"{s}\n")
# -------------------------------- Implementation ------------------------------
def _delete_output_file_if_exists(output_path: Path) -> None:
    """
    Remove a stale output file (or dangling symlink) before writing,
    preventing self-inclusion and stale content.

    Raises RuntimeError if deletion fails or the path is a directory.
    """
    try:
        if not (output_path.exists() or output_path.is_symlink()):
            return
        if output_path.is_dir():
            # Converted to RuntimeError by the handler below (IsADirectoryError
            # is an OSError subclass).
            raise IsADirectoryError(f"Output path is a directory: {output_path}")
        output_path.unlink()
    except OSError as e:
        raise RuntimeError(f"Failed to delete existing output file '{output_path}': {e}") from e
def _build_rules(cfg: DumpConfig) -> List[IgnoreRule]:
    """
    Assemble the effective ignore rule list, in order: .gitignore (if enabled),
    the extra ignore file, built-in defaults, then --extra-ignore patterns.
    Later rules win downstream (last-match semantics in is_ignored).
    """
    collected: List[IgnoreRule] = []
    if cfg.respect_gitignore:
        gitignore_path = cfg.root / ".gitignore"
        if gitignore_path.exists():
            collected.extend(parse_ignore_file(gitignore_path))
    if cfg.ignore_file and cfg.ignore_file.exists():
        collected.extend(parse_ignore_file(cfg.ignore_file))

    def _add_pattern(raw: str) -> None:
        # Parse one gitignore-ish pattern string into an IgnoreRule.
        text = raw.strip()
        if not text or text.startswith("#"):
            return
        negated = text.startswith("!")
        if negated:
            text = text[1:].strip()
        is_anchored = text.startswith("/")
        if is_anchored:
            text = text[1:]
        is_dir_only = text.endswith("/")
        if is_dir_only:
            text = text[:-1]
        if text:
            collected.append(
                IgnoreRule(negate=negated, pattern=text, dir_only=is_dir_only, anchored=is_anchored)
            )

    for raw in (*DEFAULT_EXTRA_IGNORES, *cfg.extra_ignores):
        _add_pattern(raw)
    return collected
def collect_files(root: Path, rules: List[IgnoreRule], include_globs: Optional[List[str]], exclude_rel: Set[str]) -> List[str]:
    """
    Walk *root* and return sorted posix-style relative paths that survive
    ignore rules, explicit exclusions, and (optionally) include globs.
    Symlinks are not followed; unreadable directories are skipped.
    """
    root = root.resolve()

    def _swallow_walk_error(_err: OSError) -> None:
        # Best-effort traversal: directories we cannot read are skipped.
        return

    selected: List[str] = []
    for dirpath, dirnames, filenames in os.walk(root, onerror=_swallow_walk_error, followlinks=False):
        rel_dir = _normalize_relpath(str(Path(dirpath).relative_to(root)))
        if rel_dir == ".":
            rel_dir = ""

        def _rel(child: str) -> str:
            return _normalize_relpath(str(Path(rel_dir) / child)) if rel_dir else _normalize_relpath(child)

        # Prune ignored directories in place so os.walk never descends into them.
        dirnames[:] = [d for d in dirnames if not is_ignored(_rel(d), is_dir=True, rules=rules)]

        for fname in filenames:
            rel = _rel(fname)
            if rel in exclude_rel:
                continue
            if is_ignored(rel, is_dir=False, rules=rules):
                continue
            if include_globs and not any(fnmatch.fnmatchcase(rel, g) for g in include_globs):
                continue
            selected.append(rel)

    selected.sort()
    return selected
def plan_files(cfg: DumpConfig, rules: List[IgnoreRule]) -> Tuple[List[str], List[FileEntry], int]:
    """
    Collect and classify every file to be dumped.

    Returns (all relative paths for the tree, per-file entries, count of
    binary files that will be omitted from per-file sections).
    """
    # Never include the output file itself if it lives under the root.
    excluded: Set[str] = set()
    try:
        out_rel = _normalize_relpath(str(cfg.output_path.relative_to(cfg.root)))
        if out_rel:
            excluded.add(out_rel)
    except ValueError:
        # Output path lies outside the repository root; nothing to exclude.
        pass

    rel_paths = collect_files(cfg.root, rules, cfg.include_globs, exclude_rel=excluded)

    entries: List[FileEntry] = []
    binary_count = 0
    for rel in rel_paths:
        abs_path = cfg.root / rel
        try:
            size: Optional[int] = abs_path.stat().st_size
        except OSError:
            size = None
        kind = probe_binary_kind(abs_path, abs_path.suffix.lower())
        # Binary/non-text files are omitted from per-file sections entirely
        # (they still appear in the tree).
        if kind == ProbeKind.BINARY:
            binary_count += 1
        # Only readable text files can be prior dumps worth skipping.
        prior_dump = kind == ProbeKind.TEXT and starts_with_textrepo_dump_marker(abs_path)
        entries.append(FileEntry(rel=rel, abs_path=abs_path, size=size, kind=kind, is_prior_dump=prior_dump))
    return rel_paths, entries, binary_count
def emit_header(cfg: DumpConfig, em: Emitter, tree_text: str, total_in_tree: int, emitted_sections: int, omitted_binary: int) -> None:
    """Emit the dump header (magic line, stats, limits) and the folder tree."""
    if cfg.compact:
        # Minimal header to reduce overhead.
        # T=tree count, E=eligible sections count, B=binary omitted count.
        budget = "-" if cfg.max_total_bytes is None else str(cfg.max_total_bytes)
        header_lines = [
            DUMP_MAGIC_LINE,
            f"R {cfg.root}",
            f"T {total_in_tree} E {emitted_sections} B {omitted_binary}",
            f"L b={cfg.max_bytes} l={cfg.max_lines} t={budget} nlb={int(cfg.no_line_break)} gi={int(cfg.respect_gitignore)}",
            "TREE",
            tree_text,
            "--",
        ]
    else:
        header_lines = [
            DUMP_MAGIC_LINE,
            "REPOSITORY SNAPSHOT",
            f"Root: {cfg.root}",
            f"Files in tree (after ignores): {total_in_tree}",
            f"Files emitted in sections: {emitted_sections}",
            f"Binary/non-text omitted from sections: {omitted_binary}",
            f"Per-file limits: max_bytes={cfg.max_bytes}, max_lines={cfg.max_lines}",
            f"No line breaks in file contents: {cfg.no_line_break}",
            f"Respect .gitignore: {cfg.respect_gitignore}",
            f"Extra ignore-file: {str(cfg.ignore_file) if cfg.ignore_file else '(none)'}",
            "",
            "FOLDER STRUCTURE (filtered)",
            tree_text,
            "",
            "=" * 80,
            "",
        ]
    for text_line in header_lines:
        em.writeln(text_line)
def emit_file_section(cfg: DumpConfig, em: Emitter, entry: FileEntry) -> None:
    """
    Emit one file's header and content/skip markers.
    Caller must enforce "omit binary/non-text from sections"; this function
    assumes *entry* is already text-eligible (TEXT or UNREADABLE).

    Output shapes:
      compact: "@@ <rel>[<TAB><size>]" header, raw body, optional
               "~ <lines> <bytes>" truncation marker, closing "@@".
      verbose: "FILE:" / "Size:" header, dashed rule, BEGIN/END framing.

    Skip markers (header is always emitted first, even when content is skipped):
      "! total" / "[SKIPPED CONTENT] max_total_bytes reached."  — budget hit
      "! dump"  / "[SKIPPED CONTENT] previous textrepo dump..." — prior dump
      "! err"   / "[SKIPPED CONTENT] read error..."             — read failure
    """
    # Header
    if cfg.compact:
        # @@ <rel>[\t<size>]
        if entry.size is None:
            em.writeln(f"@@ {entry.rel}")
        else:
            em.writeln(f"@@ {entry.rel}\t{entry.size}")
    else:
        em.writeln(f"FILE: {entry.rel}")
        if entry.size is not None:
            em.writeln(f"Size: {entry.size} bytes")
        em.writeln("-" * 80)
    # max-total-bytes check (after header, consistent with legacy behavior)
    if cfg.max_total_bytes is not None and em.total_bytes >= cfg.max_total_bytes:
        if cfg.compact:
            em.writeln("! total")
            em.writeln("@@")
        else:
            em.writeln("[SKIPPED CONTENT] max_total_bytes reached.")
            em.writeln()
        return
    # Prior dump skip: avoid recursively including an earlier textrepo output.
    if entry.is_prior_dump:
        if cfg.compact:
            em.writeln("! dump")
            em.writeln("@@")
        else:
            em.writeln("[SKIPPED CONTENT] previous textrepo dump detected.")
            em.writeln()
        return
    # Unreadable probe: we still try to read; if it fails, we show a read error.
    try:
        text, truncated, bytes_read, lines_read = read_text_snippet(
            entry.abs_path,
            max_bytes=cfg.max_bytes,
            max_lines=cfg.max_lines,
            no_line_break=cfg.no_line_break,
        )
    except Exception as e:
        if cfg.compact:
            em.writeln(f"! err {type(e).__name__}")
            em.writeln("@@")
        else:
            em.writeln(f"[SKIPPED CONTENT] read error: {type(e).__name__}: {e}")
            em.writeln()
        return
    if cfg.compact:
        # Emit content directly; close with @@
        body = text.rstrip("\n")
        if body:
            em.writeln(body)
        if truncated:
            # ~ <lines> <bytes>
            em.writeln(f"~ {lines_read} {bytes_read}")
        em.writeln("@@")
    else:
        em.writeln("BEGIN")
        em.writeln(text.rstrip("\n"))
        if truncated:
            em.writeln(f"[TRUNCATED] showed ~{lines_read} lines / ~{bytes_read} bytes.")
        em.writeln("END")
        em.writeln()
def format_repo(cfg: DumpConfig) -> None:
    """
    Produce the dump: resolve paths, delete any stale output file, plan the
    file list, then stream the header and every per-file section.

    Fix: the original computed an `emitted_sections` variable by re-filtering
    an already-filtered list and never used it; that dead line is removed.
    """
    # Rebuild the (frozen) config with resolved root and output paths.
    cfg = DumpConfig(
        root=cfg.root.resolve(),
        output_path=cfg.output_path.expanduser().resolve(),
        respect_gitignore=cfg.respect_gitignore,
        extra_ignores=cfg.extra_ignores,
        ignore_file=cfg.ignore_file,
        include_globs=cfg.include_globs,
        max_bytes=cfg.max_bytes,
        max_lines=cfg.max_lines,
        max_total_bytes=cfg.max_total_bytes,
        no_line_break=cfg.no_line_break,
        compact=cfg.compact,
        quiet=cfg.quiet,
    )
    # Delete first so the old dump can neither be re-included nor go stale.
    _delete_output_file_if_exists(cfg.output_path)
    rules = _build_rules(cfg)
    rel_paths, entries, omitted_binary = plan_files(cfg, rules)
    tree_text = build_tree(rel_paths, ascii_only=cfg.compact) if rel_paths else "(no files)"
    # Binary/non-text files are omitted from per-file sections entirely;
    # UNREADABLE entries stay (their section shows a read error).
    eligible_entries = [e for e in entries if e.kind != ProbeKind.BINARY]
    cfg.output_path.parent.mkdir(parents=True, exist_ok=True)
    with cfg.output_path.open("w", encoding="utf-8", errors="replace") as out_f:
        em = Emitter(out_f, also_stdout=(not cfg.quiet))
        emit_header(
            cfg=cfg,
            em=em,
            tree_text=tree_text,
            total_in_tree=len(rel_paths),
            emitted_sections=len(eligible_entries),
            omitted_binary=omitted_binary,
        )
        for entry in eligible_entries:
            emit_file_section(cfg, em, entry)
# ----------------------------------- CLI -------------------------------------
def parse_args(argv: Optional[List[str]] = None) -> DumpConfig:
    """Parse CLI arguments, validate the root directory, and build a DumpConfig."""
    parser = argparse.ArgumentParser(prog="textrepo", add_help=True)
    parser.add_argument("repository_root_directory", type=str)
    parser.add_argument("output_file_path", type=str)
    parser.add_argument("--no-gitignore", action="store_true", help="Do NOT read/respect .gitignore.")
    parser.add_argument(
        "--extra-ignore",
        action="append",
        default=[],
        help="Additional ignore pattern (repeatable). Supports '!' negation and trailing '/' for dirs.",
    )
    parser.add_argument(
        "--ignore-file",
        type=str,
        default=None,
        help="Path to an additional ignore file (gitignore-like patterns).",
    )
    parser.add_argument(
        "--include",
        action="append",
        default=None,
        help="Only include files matching these glob(s) (repeatable). Example: --include 'src/**' --include '*.md'",
    )
    parser.add_argument("--max-bytes", type=int, default=12_000, help="Max bytes of content to emit per file (approx).")
    parser.add_argument("--max-lines", type=int, default=250, help="Max lines of content to emit per file.")
    parser.add_argument(
        "--max-total-bytes",
        type=int,
        default=None,
        help="Stop emitting file contents once total output reaches this many bytes (headers still emitted).",
    )
    parser.add_argument("--no-line-break", action="store_true", help="Remove all line breaks from emitted file contents.")
    parser.add_argument("--compact", action="store_true", help="Reduce added characters (short header + ASCII tree).")
    parser.add_argument("--quiet", action="store_true", help="Do not print to stdout (still writes output file).")
    ns = parser.parse_args(argv)

    repo_root = Path(os.path.expanduser(ns.repository_root_directory))
    if not repo_root.exists():
        raise SystemExit(f"Error: {repo_root} does not exist.")
    if not repo_root.is_dir():
        raise SystemExit(f"Error: {repo_root} is not a directory.")

    return DumpConfig(
        root=repo_root,
        output_path=Path(os.path.expanduser(ns.output_file_path)),
        respect_gitignore=not ns.no_gitignore,
        extra_ignores=ns.extra_ignore or [],
        ignore_file=Path(os.path.expanduser(ns.ignore_file)) if ns.ignore_file else None,
        include_globs=ns.include,
        max_bytes=max(512, ns.max_bytes),  # floor keeps per-file sections meaningful
        max_lines=max(1, ns.max_lines),
        max_total_bytes=ns.max_total_bytes,
        no_line_break=ns.no_line_break,
        compact=ns.compact,
        quiet=ns.quiet,
    )
def main(argv: Optional[List[str]] = None) -> int:
    """CLI entry point; returns a process exit code (0 ok, 130 SIGINT, 2 error)."""
    try:
        format_repo(parse_args(argv))
    except KeyboardInterrupt:
        # Conventional exit code for interruption by Ctrl-C.
        return 130
    except Exception as e:
        print(f"Error: {type(e).__name__}: {e}", file=sys.stderr)
        return 2
    return 0
if __name__ == "__main__":
    # Script entry point: exit the process with main()'s return code.
    raise SystemExit(main())
@albertbuchard
Copy link
Author

If you plan on using it often:

chmod +x textrepo.py
sudo mv textrepo.py /usr/local/bin/textrepo

Use it:

textrepo ~/my/repo my-repo-content.txt

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment