chengscott · January 30, 2026 15:20
diff --git a/missed_prefix.py b/missed_prefix.py
 #!/usr/bin/env python3
 import sys
 import re
 import argparse
 from collections import Counter

 # Nginx error log style:
 # ... request: "GET /path?x=y HTTP/1.1"
 RE_ERR = re.compile(r'request:\s*"([A-Z]+)\s+(\S+)\s+HTTP/[^"]+"')

 # Generic access-log style embedded in a line:
 # ..."GET /path?x=y HTTP/1.1"...
 RE_ACC = re.compile(r'"([A-Z]+)\s+(\S+)\s+HTTP/[^"]+"')

 def extract_path(line: str) -> str | None:
    m = RE_ERR.search(line) or RE_ACC.search(line)
    if not m:
        return None
    path = m.group(2)

    # If something like http://host/path sneaks in, keep only the path
    if "://" in path:
        try:
            path = "/" + path.split("://", 1)[1].split("/", 1)[1]
        except Exception:
            return None

    # Drop query/fragment
    path = path.split("?", 1)[0].split("#", 1)[0]

    # Normalize
    if not path.startswith("/"):
        path = "/" + path
    return path

 def gen_prefixes(path: str, max_depth: int):
    """Yield the cumulative prefixes of *path*, shortest first.

    Empty segments (leading, trailing or doubled slashes) are ignored, so
    "/a//b/" yields "/a" then "/a/b". At most *max_depth* prefixes are
    produced; nothing is yielded when *max_depth* is zero or negative or the
    path has no segments.
    """
    parts = list(filter(None, path.split("/")))
    limit = min(max_depth, len(parts))
    prefix = ""
    depth = 0
    while depth < limit:
        # Grow the prefix by one segment per step instead of re-joining.
        prefix = prefix + "/" + parts[depth]
        depth += 1
        yield prefix

 def main():
    """Summarize request-path prefixes read from stdin, grouped by depth.

    Each stdin line is passed to ``extract_path``; every prefix that
    ``gen_prefixes`` produces for the resulting path is counted. The totals
    of lines read and paths extracted are printed first, then, for each depth
    from 1 to ``--max-depth``, up to ``--top`` prefixes whose count is at
    least ``--min-count``, ordered by descending count and then by prefix.
    Depths with nothing to show are skipped.
    """
    ap = argparse.ArgumentParser(
        description="Summarize most common URL path prefixes from fail2ban-regex --print-all-missed output."
    )
    ap.add_argument("--top", type=int, default=20, help="How many prefixes to show per depth.")
    ap.add_argument("--max-depth", type=int, default=4, help="Prefix depth in path segments.")
    ap.add_argument("--min-count", type=int, default=1, help="Only show prefixes with count >= this.")
    args = ap.parse_args()

    # A single byte that is invalid in the stdin encoding would otherwise
    # abort the whole run with UnicodeDecodeError; escape such bytes instead.
    # Stand-in streams without reconfigure() (e.g. io.StringIO) are left alone.
    reconfigure = getattr(sys.stdin, "reconfigure", None)
    if reconfigure is not None:
        reconfigure(errors="backslashreplace")

    # List index == prefix depth; slot 0 only keeps the indexing 1-based.
    counts_by_depth = [Counter() for _ in range(args.max_depth + 1)]
    total_lines = 0
    total_paths = 0

    for line in sys.stdin:
        total_lines += 1
        p = extract_path(line)
        if not p:
            continue
        total_paths += 1
        for pref in gen_prefixes(p, args.max_depth):
            depth = pref.count("/")  # "/a" => 1, "/a/b" => 2
            counts_by_depth[depth][pref] += 1

    print(f"Read {total_lines} lines, extracted {total_paths} request paths.\n")

    for depth in range(1, args.max_depth + 1):
        items = [(c, pref) for pref, c in counts_by_depth[depth].items() if c >= args.min_count]
        # Highest count first; equal counts fall back to alphabetical order.
        items.sort(key=lambda x: (-x[0], x[1]))
        if not items:
            continue
        print(f"=== Top prefixes at depth {depth} ===")
        for c, pref in items[: args.top]:
            print(f"{c:6d}  {pref}")
        print()

 if __name__ == "__main__":
    main()
	#!/usr/bin/env python3
	import sys
	import re
	import argparse
	from collections import Counter

	# Nginx error log style:
	# ... request: "GET /path?x=y HTTP/1.1"
	RE_ERR = re.compile(r'request:\s*"([A-Z]+)\s+(\S+)\s+HTTP/[^"]+"')

	# Generic access-log style embedded in a line:
	# ..."GET /path?x=y HTTP/1.1"...
	RE_ACC = re.compile(r'"([A-Z]+)\s+(\S+)\s+HTTP/[^"]+"')

	def extract_path(line: str) -> str \| None:
	m = RE_ERR.search(line) or RE_ACC.search(line)
	if not m:
	return None
	path = m.group(2)

	# If something like http://host/path sneaks in, keep only the path
	if "://" in path:
	try:
	path = "/" + path.split("://", 1)[1].split("/", 1)[1]
	except Exception:
	return None

	# Drop query/fragment
	path = path.split("?", 1)[0].split("#", 1)[0]

	# Normalize
	if not path.startswith("/"):
	path = "/" + path
	return path

	def gen_prefixes(path: str, max_depth: int):
	    """Yield the cumulative prefixes of *path*, shortest first.

	    Empty segments (leading, trailing or doubled slashes) are ignored.
	    At most *max_depth* prefixes are produced; nothing is yielded when
	    *max_depth* is zero or negative or the path has no segments.
	    """
	    # Break into segments and build /a, /a/b, /a/b/c...
	    segs = [s for s in path.split("/") if s]
	    for d in range(1, min(max_depth, len(segs)) + 1):
	        yield "/" + "/".join(segs[:d])

	def main():
	    """Summarize request-path prefixes read from stdin, grouped by depth.

	    Each stdin line is passed to ``extract_path``; every prefix that
	    ``gen_prefixes`` produces for the resulting path is counted. The totals
	    of lines read and paths extracted are printed first, then, for each
	    depth from 1 to ``--max-depth``, up to ``--top`` prefixes whose count is
	    at least ``--min-count``, ordered by descending count and then by
	    prefix. Depths with nothing to show are skipped.
	    """
	    ap = argparse.ArgumentParser(
	        description="Summarize most common URL path prefixes from fail2ban-regex --print-all-missed output."
	    )
	    ap.add_argument("--top", type=int, default=20, help="How many prefixes to show per depth.")
	    ap.add_argument("--max-depth", type=int, default=4, help="Prefix depth in path segments.")
	    ap.add_argument("--min-count", type=int, default=1, help="Only show prefixes with count >= this.")
	    args = ap.parse_args()

	    # A single byte that is invalid in the stdin encoding would otherwise
	    # abort the whole run with UnicodeDecodeError; escape such bytes instead.
	    # Stand-in streams without reconfigure() (e.g. io.StringIO) are left alone.
	    reconfigure = getattr(sys.stdin, "reconfigure", None)
	    if reconfigure is not None:
	        reconfigure(errors="backslashreplace")

	    # List index == prefix depth; slot 0 only keeps the indexing 1-based.
	    counts_by_depth = [Counter() for _ in range(args.max_depth + 1)]
	    total_lines = 0
	    total_paths = 0

	    for line in sys.stdin:
	        total_lines += 1
	        p = extract_path(line)
	        if not p:
	            continue
	        total_paths += 1
	        for pref in gen_prefixes(p, args.max_depth):
	            depth = pref.count("/")  # "/a" => 1, "/a/b" => 2
	            counts_by_depth[depth][pref] += 1

	    print(f"Read {total_lines} lines, extracted {total_paths} request paths.\n")

	    for depth in range(1, args.max_depth + 1):
	        items = [(c, pref) for pref, c in counts_by_depth[depth].items() if c >= args.min_count]
	        # Highest count first; equal counts fall back to alphabetical order.
	        items.sort(key=lambda x: (-x[0], x[1]))
	        if not items:
	            continue
	        print(f"=== Top prefixes at depth {depth} ===")
	        for c, pref in items[: args.top]:
	            print(f"{c:6d}  {pref}")
	        print()

	if __name__ == "__main__":
	    main()
No results found