regex benchmark
#!/usr/bin/env python3
"""
Benchmark comparison of regex execution times:
- re.match('dog')
- re.search('^dog')
- re.match('^dog')
- str.startswith('dog')
"""
import re
import timeit
import sys
import argparse
from statistics import mean, stdev

# Test strings
test_strings = [
    "dog is here",      # Matches at start
    "the dog is here",  # Doesn't match at start
    "dog",              # Exact match
    "no match here",    # No match
]
# NOTE: each test compiles its pattern inside the timed function. After the
# first call, re.compile() is served from the re module's internal pattern
# cache, so the per-call overhead is small and equal across the regex tests.
def test_match_dog():
    """Test re.match('dog')"""
    pattern = re.compile('dog')
    for s in test_strings:
        pattern.match(s)


def test_search_anchor_dog():
    """Test re.search('^dog')"""
    pattern = re.compile('^dog')
    for s in test_strings:
        pattern.search(s)


def test_match_anchor_dog():
    """Test re.match('^dog')"""
    pattern = re.compile('^dog')
    for s in test_strings:
        pattern.match(s)


def test_startswith_dog():
    """Test str.startswith('dog')"""
    for s in test_strings:
        s.startswith('dog')

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Benchmark regex performance')
    parser.add_argument('--csv', action='store_true', help='Output results in CSV format')
    args = parser.parse_args()

    iterations = 100000
    runs = 5

    if not args.csv:
        print("=" * 70)
        print("Regex Performance Comparison")
        print("=" * 70)
        print(f"\nPython version: {sys.version}")
        print(f"Test strings: {test_strings}")
        print(f"Iterations per run: {iterations:,}")
        print(f"Number of runs: {runs}")
        print("\n" + "=" * 70)

    # Benchmark each pattern
    results = {}
    std_devs = {}
    tests = [
        ("re.match('dog')", test_match_dog),
        ("re.search('^dog')", test_search_anchor_dog),
        ("re.match('^dog')", test_match_anchor_dog),
        ("str.startswith('dog')", test_startswith_dog),
    ]
    for name, func in tests:
        times = []
        for _ in range(runs):
            # Time `iterations` calls of the test function
            elapsed = timeit.timeit(func, number=iterations)
            times.append(elapsed)
        avg_time = mean(times)
        std_dev = stdev(times) if len(times) > 1 else 0
        results[name] = avg_time
        std_devs[name] = std_dev
        if not args.csv:
            print(f"\n{name}")
            print(f"  Average time:  {avg_time:.6f} seconds")
            print(f"  Std deviation: {std_dev:.6f} seconds ({(std_dev/avg_time)*100:.2f}%)")
            print(f"  Time per op:   {(avg_time/iterations)*1e6:.3f} microseconds")
    if args.csv:
        # Output CSV format: test_name,time_microseconds
        for name, _ in tests:
            avg_time = results[name]
            time_per_op_us = (avg_time / iterations) * 1e6
            print(f"{name},{time_per_op_us:.6f}")
    else:
        # Show comparison relative to re.match('dog')
        print("\n" + "=" * 70)
        print("COMPARISON (relative to re.match('dog') baseline)")
        print("=" * 70)

        baseline_name = "re.match('dog')"
        baseline_time = results[baseline_name]
        baseline_std = std_devs[baseline_name]

        for name, time in sorted(results.items(), key=lambda x: x[1]):
            ratio = time / baseline_time
            percent_diff = (ratio - 1) * 100
            # Combine the two runs' standard deviations in quadrature and
            # express the result as a percentage of the baseline time
            time_std = std_devs[name]
            combined_std = ((baseline_std ** 2 + time_std ** 2) ** 0.5) / baseline_time * 100
            if name == baseline_name:
                print(f"\n{name}: BASELINE")
            elif ratio < 1:
                significance = " *" if abs(percent_diff) > 2 * combined_std else " (within noise)"
                print(f"\n{name}: {1/ratio:.2f}x faster ({abs(percent_diff):.1f}% faster){significance}")
            else:
                significance = " *" if abs(percent_diff) > 2 * combined_std else " (within noise)"
                print(f"\n{name}: {ratio:.2f}x slower ({percent_diff:.1f}% slower){significance}")

        print("\n" + "=" * 70)
        print("NOTES:")
        print("- re.match() always matches at the start of the string")
        print("- The '^' anchor in re.match() is redundant (may get optimized)")
        print("- re.search('^dog') has to check the anchor condition")
        print("- str.startswith() is typically faster for literal string matching")
        print("- Differences < ~2-3% may be statistical noise or cache effects")
        print("- '*' indicates difference likely exceeds measurement noise")
        print("=" * 70)
#!/bin/bash
# Run regex benchmark across Python 3.6 through 3.14 and generate CSV

SCRIPT="/tmp/claude/regex_benchmark.py"
CSV_FILE="/tmp/claude/benchmark_results.csv"
TEMP_DIR="/tmp/claude/benchmark_temp"

# Create temp directory for intermediate results
mkdir -p "$TEMP_DIR"
rm -f "$TEMP_DIR"/*.csv 2>/dev/null

echo "Running regex benchmarks across Python versions 3.6-3.14..."
echo "CSV results will be saved to: $CSV_FILE"
echo ""

# Array to track which Python versions are available
declare -a available_versions

# Run benchmark for each Python version
for version in 3.6 3.7 3.8 3.9 3.10 3.11 3.12 3.13 3.14; do
    PYTHON_BIN="/usr/bin/python${version}"
    if [ -x "$PYTHON_BIN" ]; then
        echo "Testing with Python ${version}..."
        # Run benchmark in CSV mode and save to a temp file
        if "$PYTHON_BIN" "$SCRIPT" --csv > "$TEMP_DIR/python${version}.csv" 2>/dev/null; then
            available_versions+=("$version")
        else
            echo "  Warning: Python ${version} failed to run"
            rm -f "$TEMP_DIR/python${version}.csv"
        fi
    else
        echo "Python ${version} not found at $PYTHON_BIN"
    fi
done
echo ""
echo "Generating CSV file..."
# Build CSV header
header="Test"
for version in "${available_versions[@]}"; do
header="${header},Python ${version}"
done
# Write header to CSV
echo "$header" > "$CSV_FILE"
# Get list of test names from first available version
first_version="${available_versions[0]}"
test_file="$TEMP_DIR/python${first_version}.csv"
if [ ! -f "$test_file" ]; then
echo "Error: No benchmark results available"
exit 1
fi
# Read each test and collect results from all versions
while IFS=',' read -r test_name time_value; do
    row="$test_name"
    for version in "${available_versions[@]}"; do
        version_file="$TEMP_DIR/python${version}.csv"
        if [ -f "$version_file" ]; then
            # Extract the time for this test from this version's results.
            # The test names contain regex metacharacters like '.' and '(',
            # so match the first field exactly instead of grepping a pattern.
            value=$(awk -F',' -v t="$test_name" '$1 == t {print $2}' "$version_file")
            row="${row},${value}"
        else
            row="${row},N/A"
        fi
    done
    echo "$row" >> "$CSV_FILE"
done < "$test_file"
# Clean up temp files
rm -rf "$TEMP_DIR"
echo ""
echo "=========================================="
echo "CSV generation complete!"
echo "Results saved to: $CSV_FILE"
echo "=========================================="
echo ""
echo "Preview:"
column -t -s',' "$CSV_FILE" 2>/dev/null || cat "$CSV_FILE"
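
With two interpreters available (for example 3.11 and 3.12, chosen here
purely for illustration), benchmark_results.csv comes out shaped like this;
the <time_us> entries are placeholders, not measured values:

Test,Python 3.11,Python 3.12
re.match('dog'),<time_us>,<time_us>
re.search('^dog'),<time_us>,<time_us>
re.match('^dog'),<time_us>,<time_us>
str.startswith('dog'),<time_us>,<time_us>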