regex benchmark — compares re.match('dog'), re.search('^dog'), re.match('^dog'), and str.startswith('dog') across Python versions.
#!/usr/bin/env python3
"""
Benchmark comparison of regex execution times:
- re.match('dog')
- re.search('^dog')
- re.match('^dog')
- str.startswith('dog')
"""
import re
import timeit
import sys
import argparse
from statistics import mean, stdev

# Test strings
test_strings = [
    "dog is here",      # Matches at start
    "the dog is here",  # Doesn't match at start
    "dog",              # Exact match
    "no match here",    # No match
]


def test_match_dog():
    """Test re.match('dog')"""
    pattern = re.compile('dog')
    for s in test_strings:
        pattern.match(s)


def test_search_anchor_dog():
    """Test re.search('^dog')"""
    pattern = re.compile('^dog')
    for s in test_strings:
        pattern.search(s)


def test_match_anchor_dog():
    """Test re.match('^dog')"""
    pattern = re.compile('^dog')
    for s in test_strings:
        pattern.match(s)


def test_startswith_dog():
    """Test str.startswith('dog')"""
    for s in test_strings:
        s.startswith('dog')

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Benchmark regex performance')
    parser.add_argument('--csv', action='store_true', help='Output results in CSV format')
    args = parser.parse_args()

    iterations = 100000
    runs = 5

    if not args.csv:
        print("=" * 70)
        print("Regex Performance Comparison")
        print("=" * 70)
        print(f"\nPython version: {sys.version}")
        print(f"Test strings: {test_strings}")
        print(f"Iterations per run: {iterations:,}")
        print(f"Number of runs: {runs}")
        print("\n" + "=" * 70)

    # Benchmark each pattern
    results = {}
    std_devs = {}
    tests = [
        ("re.match('dog')", test_match_dog),
        ("re.search('^dog')", test_search_anchor_dog),
        ("re.match('^dog')", test_match_anchor_dog),
        ("str.startswith('dog')", test_startswith_dog),
    ]
    for name, func in tests:
        times = []
        for _ in range(runs):
            # Time each function several times to estimate run-to-run variance
            time = timeit.timeit(func, number=iterations)
            times.append(time)

        avg_time = mean(times)
        std_dev = stdev(times) if len(times) > 1 else 0

        results[name] = avg_time
        std_devs[name] = std_dev

        if not args.csv:
            print(f"\n{name}")
            print(f"  Average time: {avg_time:.6f} seconds")
            print(f"  Std deviation: {std_dev:.6f} seconds ({(std_dev/avg_time)*100:.2f}%)")
            print(f"  Time per op: {(avg_time/iterations)*1e6:.3f} microseconds")
    if args.csv:
        # Output CSV format: test_name,time_microseconds
        for name, _func in tests:
            avg_time = results[name]
            time_per_op_us = (avg_time / iterations) * 1e6
            print(f"{name},{time_per_op_us:.6f}")
    else:
        # Show comparison relative to re.match('dog')
        print("\n" + "=" * 70)
        print("COMPARISON (relative to re.match('dog') baseline)")
        print("=" * 70)

        baseline_name = "re.match('dog')"
        baseline_time = results[baseline_name]
        baseline_std = std_devs[baseline_name]

        for name, time in sorted(results.items(), key=lambda x: x[1]):
            ratio = time / baseline_time
            percent_diff = (ratio - 1) * 100

            # Calculate combined standard deviation for significance check
            time_std = std_devs[name]
            combined_std = ((baseline_std ** 2 + time_std ** 2) ** 0.5) / baseline_time * 100

            if name == baseline_name:
                print(f"\n{name}: BASELINE")
            elif ratio < 1:
                significance = " *" if abs(percent_diff) > 2 * combined_std else " (within noise)"
                print(f"\n{name}: {1/ratio:.2f}x faster ({abs(percent_diff):.1f}% faster){significance}")
            else:
                significance = " *" if abs(percent_diff) > 2 * combined_std else " (within noise)"
                print(f"\n{name}: {ratio:.2f}x slower ({percent_diff:.1f}% slower){significance}")

        print("\n" + "=" * 70)
        print("NOTES:")
        print("- re.match() always matches at the start of the string")
        print("- The '^' anchor in re.match() is redundant (may get optimized)")
        print("- re.search('^dog') has to check the anchor condition")
        print("- str.startswith() is typically faster for literal string matching")
        print("- Differences < ~2-3% may be statistical noise or cache effects")
        print("- '*' indicates difference likely exceeds measurement noise")
        print("=" * 70)
#!/bin/bash
# Run regex benchmark across Python 3.6 through 3.14 and generate CSV

SCRIPT="/tmp/claude/regex_benchmark.py"
CSV_FILE="/tmp/claude/benchmark_results.csv"
TEMP_DIR="/tmp/claude/benchmark_temp"

# Create temp directory for intermediate results
mkdir -p "$TEMP_DIR"
rm -f "$TEMP_DIR"/*.csv 2>/dev/null

echo "Running regex benchmarks across Python versions 3.6-3.14..."
echo "CSV results will be saved to: $CSV_FILE"
echo ""

# Array to track which Python versions are available
declare -a available_versions

# Run benchmark for each Python version
for version in 3.6 3.7 3.8 3.9 3.10 3.11 3.12 3.13 3.14; do
    PYTHON_BIN="/usr/bin/python${version}"
    if [ -f "$PYTHON_BIN" ]; then
        echo "Testing with Python ${version}..."
        # Run benchmark in CSV mode and save to temp file
        "$PYTHON_BIN" "$SCRIPT" --csv > "$TEMP_DIR/python${version}.csv" 2>/dev/null
        if [ $? -eq 0 ]; then
            available_versions+=("$version")
        else
            echo "  Warning: Python ${version} failed to run"
            rm -f "$TEMP_DIR/python${version}.csv"
        fi
    else
        echo "Python ${version} not found at $PYTHON_BIN"
    fi
done
| echo "" | |
| echo "Generating CSV file..." | |
| # Build CSV header | |
| header="Test" | |
| for version in "${available_versions[@]}"; do | |
| header="${header},Python ${version}" | |
| done | |
| # Write header to CSV | |
| echo "$header" > "$CSV_FILE" | |
| # Get list of test names from first available version | |
| first_version="${available_versions[0]}" | |
| test_file="$TEMP_DIR/python${first_version}.csv" | |
| if [ ! -f "$test_file" ]; then | |
| echo "Error: No benchmark results available" | |
| exit 1 | |
| fi | |
| # Read each test and collect results from all versions | |
| while IFS=',' read -r test_name time_value; do | |
| row="$test_name" | |
| for version in "${available_versions[@]}"; do | |
| version_file="$TEMP_DIR/python${version}.csv" | |
| if [ -f "$version_file" ]; then | |
| # Extract the time for this test from this version's results | |
| value=$(grep "^${test_name}," "$version_file" | cut -d',' -f2) | |
| row="${row},${value}" | |
| else | |
| row="${row},N/A" | |
| fi | |
| done | |
| echo "$row" >> "$CSV_FILE" | |
| done < "$test_file" | |
| # Clean up temp files | |
| rm -rf "$TEMP_DIR" | |
| echo "" | |
| echo "==========================================" | |
| echo "CSV generation complete!" | |
| echo "Results saved to: $CSV_FILE" | |
| echo "==========================================" | |
| echo "" | |
| echo "Preview:" | |
| column -t -s',' "$CSV_FILE" 2>/dev/null || cat "$CSV_FILE" |
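
Once the runner has written /tmp/claude/benchmark_results.csv, the matrix can be post-processed with the standard library alone. A rough sketch follows; it assumes the CSV path and "Test,Python 3.x,..." header layout produced by the script above, and the helper is illustrative rather than part of the gist.

# Hypothetical helper (not part of the gist's scripts): reads the Test x Python-version
# matrix written by the shell runner and reports, per test, the fastest Python version.
# All times in the CSV are microseconds per operation.
import csv

with open("/tmp/claude/benchmark_results.csv", newline="") as fh:
    rows = list(csv.reader(fh))

versions = rows[0][1:]  # header row: Test, Python 3.6, Python 3.7, ...
for row in rows[1:]:
    test, times = row[0], row[1:]
    numeric = [(float(t), v) for t, v in zip(times, versions) if t not in ("", "N/A")]
    if numeric:
        best_time, best_version = min(numeric)
        print(f"{test}: fastest on {best_version} ({best_time:.3f} us/op)")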