regex benchmark
#!/usr/bin/env python3
"""
Benchmark comparison of regex execution times:
- re.match('dog')
- re.search('^dog')
- re.match('^dog')
- str.startswith('dog')
"""
import re
import timeit
import sys
import argparse
from statistics import mean, stdev

# Test strings
test_strings = [
    "dog is here",      # Matches at start
    "the dog is here",  # Doesn't match at start
    "dog",              # Exact match
    "no match here",    # No match
]
# NOTE: each test compiles its pattern inside the timed function. After the
# first call, re.compile() is served from the re module's internal pattern
# cache, so the per-call overhead is small and equal across the regex tests.
def test_match_dog():
    """Test re.match('dog')"""
    pattern = re.compile('dog')
    for s in test_strings:
        pattern.match(s)


def test_search_anchor_dog():
    """Test re.search('^dog')"""
    pattern = re.compile('^dog')
    for s in test_strings:
        pattern.search(s)


def test_match_anchor_dog():
    """Test re.match('^dog')"""
    pattern = re.compile('^dog')
    for s in test_strings:
        pattern.match(s)


def test_startswith_dog():
    """Test str.startswith('dog')"""
    for s in test_strings:
        s.startswith('dog')

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Benchmark regex performance')
    parser.add_argument('--csv', action='store_true', help='Output results in CSV format')
    args = parser.parse_args()

    iterations = 100000
    runs = 5

    if not args.csv:
        print("=" * 70)
        print("Regex Performance Comparison")
        print("=" * 70)
        print(f"\nPython version: {sys.version}")
        print(f"Test strings: {test_strings}")
        print(f"Iterations per run: {iterations:,}")
        print(f"Number of runs: {runs}")
        print("\n" + "=" * 70)

    # Benchmark each pattern
    results = {}
    std_devs = {}
    tests = [
        ("re.match('dog')", test_match_dog),
        ("re.search('^dog')", test_search_anchor_dog),
        ("re.match('^dog')", test_match_anchor_dog),
        ("str.startswith('dog')", test_startswith_dog),
    ]
    for name, func in tests:
        times = []
        for _ in range(runs):
            # Time `iterations` calls of the test function
            elapsed = timeit.timeit(func, number=iterations)
            times.append(elapsed)
        avg_time = mean(times)
        std_dev = stdev(times) if len(times) > 1 else 0
        results[name] = avg_time
        std_devs[name] = std_dev
        if not args.csv:
            print(f"\n{name}")
            print(f"  Average time:  {avg_time:.6f} seconds")
            print(f"  Std deviation: {std_dev:.6f} seconds ({(std_dev/avg_time)*100:.2f}%)")
            print(f"  Time per op:   {(avg_time/iterations)*1e6:.3f} microseconds")
    if args.csv:
        # Output CSV format: test_name,time_microseconds
        for name, _ in tests:
            avg_time = results[name]
            time_per_op_us = (avg_time / iterations) * 1e6
            print(f"{name},{time_per_op_us:.6f}")
    else:
        # Show comparison relative to re.match('dog')
        print("\n" + "=" * 70)
        print("COMPARISON (relative to re.match('dog') baseline)")
        print("=" * 70)

        baseline_name = "re.match('dog')"
        baseline_time = results[baseline_name]
        baseline_std = std_devs[baseline_name]

        for name, time in sorted(results.items(), key=lambda x: x[1]):
            ratio = time / baseline_time
            percent_diff = (ratio - 1) * 100
            # Combine the two runs' standard deviations in quadrature and
            # express the result as a percentage of the baseline time
            time_std = std_devs[name]
            combined_std = ((baseline_std ** 2 + time_std ** 2) ** 0.5) / baseline_time * 100
            if name == baseline_name:
                print(f"\n{name}: BASELINE")
            elif ratio < 1:
                significance = " *" if abs(percent_diff) > 2 * combined_std else " (within noise)"
                print(f"\n{name}: {1/ratio:.2f}x faster ({abs(percent_diff):.1f}% faster){significance}")
            else:
                significance = " *" if abs(percent_diff) > 2 * combined_std else " (within noise)"
                print(f"\n{name}: {ratio:.2f}x slower ({percent_diff:.1f}% slower){significance}")

        print("\n" + "=" * 70)
        print("NOTES:")
        print("- re.match() always matches at the start of the string")
        print("- The '^' anchor in re.match() is redundant (may get optimized)")
        print("- re.search('^dog') has to check the anchor condition")
        print("- str.startswith() is typically faster for literal string matching")
        print("- Differences < ~2-3% may be statistical noise or cache effects")
        print("- '*' indicates difference likely exceeds measurement noise")
        print("=" * 70)
#!/bin/bash
# Run regex benchmark across Python 3.6 through 3.14 and generate CSV

SCRIPT="/tmp/claude/regex_benchmark.py"
CSV_FILE="/tmp/claude/benchmark_results.csv"
TEMP_DIR="/tmp/claude/benchmark_temp"

# Create temp directory for intermediate results
mkdir -p "$TEMP_DIR"
rm -f "$TEMP_DIR"/*.csv 2>/dev/null

echo "Running regex benchmarks across Python versions 3.6-3.14..."
echo "CSV results will be saved to: $CSV_FILE"
echo ""

# Array to track which Python versions are available
declare -a available_versions

# Run benchmark for each Python version
for version in 3.6 3.7 3.8 3.9 3.10 3.11 3.12 3.13 3.14; do
    PYTHON_BIN="/usr/bin/python${version}"
    if [ -x "$PYTHON_BIN" ]; then
        echo "Testing with Python ${version}..."
        # Run benchmark in CSV mode and save to a temp file
        if "$PYTHON_BIN" "$SCRIPT" --csv > "$TEMP_DIR/python${version}.csv" 2>/dev/null; then
            available_versions+=("$version")
        else
            echo "  Warning: Python ${version} failed to run"
            rm -f "$TEMP_DIR/python${version}.csv"
        fi
    else
        echo "Python ${version} not found at $PYTHON_BIN"
    fi
done
echo ""
echo "Generating CSV file..."
# Build CSV header
header="Test"
for version in "${available_versions[@]}"; do
header="${header},Python ${version}"
done
# Write header to CSV
echo "$header" > "$CSV_FILE"
# Get list of test names from first available version
first_version="${available_versions[0]}"
test_file="$TEMP_DIR/python${first_version}.csv"
if [ ! -f "$test_file" ]; then
echo "Error: No benchmark results available"
exit 1
fi
# Read each test and collect results from all versions
while IFS=',' read -r test_name time_value; do
    row="$test_name"
    for version in "${available_versions[@]}"; do
        version_file="$TEMP_DIR/python${version}.csv"
        if [ -f "$version_file" ]; then
            # Extract the time for this test from this version's results.
            # The test names contain regex metacharacters like '.' and '(',
            # so match the first field exactly instead of grepping a pattern.
            value=$(awk -F',' -v t="$test_name" '$1 == t {print $2}' "$version_file")
            row="${row},${value}"
        else
            row="${row},N/A"
        fi
    done
    echo "$row" >> "$CSV_FILE"
done < "$test_file"
# Clean up temp files
rm -rf "$TEMP_DIR"
echo ""
echo "=========================================="
echo "CSV generation complete!"
echo "Results saved to: $CSV_FILE"
echo "=========================================="
echo ""
echo "Preview:"
column -t -s',' "$CSV_FILE" 2>/dev/null || cat "$CSV_FILE"
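
With two interpreters available (for example 3.11 and 3.12, chosen here
purely for illustration), benchmark_results.csv comes out shaped like this;
the <time_us> entries are placeholders, not measured values:

Test,Python 3.11,Python 3.12
re.match('dog'),<time_us>,<time_us>
re.search('^dog'),<time_us>,<time_us>
re.match('^dog'),<time_us>,<time_us>
str.startswith('dog'),<time_us>,<time_us>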