Created
February 15, 2026 06:36
-
-
Save serjflint/7d25f853f8391648213cfb6bca4f9143 to your computer and use it in GitHub Desktop.
Benchmarks for yaml in Python
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| """ | |
| YAML to Python Objects Benchmark | |
| ================================ | |
| Installation (ALL REQUIRED) | |
| --------------------------- | |
| pip install rapidyaml ryaml ruamel.yaml pyyaml prettytable | |
| For PyYAML C extension: | |
| Ubuntu/Debian: sudo apt-get install libyaml-dev && pip install --force-reinstall pyyaml | |
| macOS: brew install libyaml && pip install --force-reinstall pyyaml | |
| Usage | |
| ----- | |
| python benchmark.py # Default: 200 files × 20KB | |
| python benchmark.py --files 500 # More files | |
| python benchmark.py --size 50000 # Larger files | |
| """ | |
| import sys | |
| import gc | |
| import time | |
| import tempfile | |
| import shutil | |
| import random | |
| import string | |
| import argparse | |
| from pathlib import Path | |
| from io import StringIO | |
| from typing import Any | |
| import ryaml | |
| import yaml | |
| from ruamel.yaml import YAML | |
| import prettytable | |
# Put this script's own directory on sys.path so that a ryml_fast (Cython)
# build sitting next to the script can be imported.
script_dir = Path(__file__).parent
if str(script_dir) not in sys.path:
    sys.path.insert(0, str(script_dir))

# ryml and ryml_fast are optional. If either import fails, HAS_RYML_FAST is
# False and the ryml-based benchmarks are skipped.
try:
    import ryml
    import ryml_fast
except ImportError as exc:
    HAS_RYML_FAST = False
    print(f"ℹ ryml_fast not available: {exc}")
else:
    HAS_RYML_FAST = True
    print("✓ ryml_fast (Cython) available")
| # ============================================================================ | |
| # ryml → Python Objects Converter | |
| # ============================================================================ | |
def ryml_loads(source: str | bytes | bytearray) -> Any:
    """Parse a YAML document into Python objects using ryml (rapidyaml).

    Args:
        source: YAML text. ``str`` input is encoded as UTF-8 and ``bytes``
            input is copied into a ``bytearray`` before it is handed to ryml.

    Returns:
        Whatever ``_tree_to_python`` builds for node 0 of the parsed tree.
    """
    if isinstance(source, str):
        source = source.encode('utf-8')
    if isinstance(source, bytes):
        source = bytearray(source)
    tree = ryml.parse_in_arena(source)
    # The conversion walks the tree by node id starting at node 0, so no
    # root reference object has to be looked up first.
    return _tree_to_python(tree, 0)
| def _tree_to_python(tree, node_id: int) -> Any: | |
| """Convert ryml tree node to Python objects using node IDs.""" | |
| if tree.is_map(node_id): | |
| result = {} | |
| child_id = tree.first_child(node_id) | |
| while child_id != ryml.NONE: | |
| if tree.has_key(child_id): | |
| key_data = tree.key(child_id) | |
| # Handle both bytes and memoryview objects | |
| if isinstance(key_data, memoryview): | |
| key_data = bytes(key_data) | |
| key = key_data.decode('utf-8') | |
| else: | |
| key = None | |
| if key is not None: | |
| result[_parse_scalar(key)] = _tree_to_python(tree, child_id) | |
| child_id = tree.next_sibling(child_id) | |
| return result | |
| elif tree.is_seq(node_id): | |
| result = [] | |
| child_id = tree.first_child(node_id) | |
| while child_id != ryml.NONE: | |
| result.append(_tree_to_python(tree, child_id)) | |
| child_id = tree.next_sibling(child_id) | |
| return result | |
| elif tree.has_val(node_id): | |
| val_data = tree.val(node_id) | |
| # Handle both bytes and memoryview objects | |
| if isinstance(val_data, memoryview): | |
| val_data = bytes(val_data) | |
| val = val_data.decode('utf-8') | |
| return _parse_scalar(val) | |
| return None | |
| def _parse_scalar(val: str) -> Any: | |
| """Convert YAML scalar string to Python type.""" | |
| if val == '' or val in ('null', 'Null', 'NULL', '~'): | |
| return None | |
| if val in ('true', 'True', 'TRUE', 'yes', 'Yes', 'YES', 'on', 'On', 'ON'): | |
| return True | |
| if val in ('false', 'False', 'FALSE', 'no', 'No', 'NO', 'off', 'Off', 'OFF'): | |
| return False | |
| try: | |
| if val.startswith(('0x', '0X')): | |
| return int(val, 16) | |
| if val.startswith(('0o', '0O')): | |
| return int(val, 8) | |
| return int(val) | |
| except ValueError: | |
| pass | |
| try: | |
| if val in ('.inf', '.Inf', '.INF'): | |
| return float('inf') | |
| if val in ('-.inf', '-.Inf', '-.INF'): | |
| return float('-inf') | |
| if val in ('.nan', '.NaN', '.NAN'): | |
| return float('nan') | |
| return float(val) | |
| except ValueError: | |
| pass | |
| # Strip quotes | |
| if len(val) >= 2 and ((val[0] == '"' and val[-1] == '"') or (val[0] == "'" and val[-1] == "'")): | |
| return val[1:-1] | |
| return val | |
class RymlLoaderReuse:
    """ryml loader that parses every document into one shared tree."""

    def __init__(self):
        # One tree instance, emptied and refilled on every loads() call.
        self.tree = ryml.Tree()

    def loads(self, source: str | bytes | bytearray) -> Any:
        """Parse ``source`` into the shared tree and convert it to Python objects.

        ``str`` input is encoded as UTF-8 and ``bytes`` input is copied into
        a ``bytearray`` before parsing.
        """
        buffer = source.encode('utf-8') if isinstance(source, str) else source
        if isinstance(buffer, bytes):
            buffer = bytearray(buffer)
        shared = self.tree
        shared.clear()
        shared.clear_arena()
        ryml.parse_in_arena(buffer, tree=shared)
        return _tree_to_python(shared, 0)
| # ============================================================================ | |
| # Test File Generation | |
| # ============================================================================ | |
def generate_yaml_content(target_size: int = 20000) -> str:
    """Generate a pseudo-random YAML document of roughly ``target_size`` characters.

    The document starts with a ``metadata`` mapping followed by numbered
    ``section_N`` mappings. Each section holds a few scalars, a ``tags``
    list, a nested ``config`` mapping and an ``items`` list of small
    mappings. Nested lines are indented in two-space steps so that the
    structure really is nested when parsed.

    Sections are appended until the running size reaches ``target_size``.
    A section that would push the size past 120% of the target is dropped
    and generation stops.

    Args:
        target_size: Approximate size of the result, in characters.

    Returns:
        The YAML text, lines joined with ``\\n``.
    """
    alphabet = string.ascii_letters + string.digits

    def random_str(n: int) -> str:
        return ''.join(random.choices(alphabet, k=n))

    lines = [
        "metadata:",
        f"  id: {random_str(32)}",
        "  version: '1.0.0'",
        "  enabled: true",
        "",
    ]
    # +1 per line accounts for the newline added by the final join.
    current_size = sum(len(line) + 1 for line in lines)
    section_num = 0
    while current_size < target_size:
        section_num += 1
        section_lines = [
            f"section_{section_num}:",
            f"  name: '{random_str(20)}'",
            f"  enabled: {random.choice(['true', 'false'])}",
            f"  priority: {random.randint(1, 100)}",
            f"  score: {random.random():.6f}",
            "  tags:",
        ]
        for _ in range(random.randint(3, 6)):
            section_lines.append(f"    - {random_str(10)}")
        section_lines.append("  config:")
        for _ in range(random.randint(4, 8)):
            key = random_str(12)
            val = random.choice([
                f"'{random_str(15)}'",
                str(random.randint(-1000, 1000)),
                f"{random.uniform(-100, 100):.4f}",
                random.choice(['true', 'false']),
                'null',
            ])
            section_lines.append(f"    {key}: {val}")
        section_lines.append("  items:")
        for i in range(random.randint(5, 10)):
            section_lines.extend([
                f"    - id: {i}",
                f"      name: '{random_str(12)}'",
                f"      value: {random.randint(0, 100)}",
            ])
        section_lines.append("")
        section_size = sum(len(line) + 1 for line in section_lines)
        # Do not overshoot the target by more than 20%.
        if current_size + section_size > target_size * 1.2:
            break
        lines.extend(section_lines)
        current_size += section_size
    return '\n'.join(lines)
def create_test_files(directory: Path, num_files: int, avg_size: int) -> list[Path]:
    """Write ``num_files`` generated YAML files into ``directory``.

    Each file's target size is ``avg_size`` scaled by a random factor between
    0.7 and 1.3. Files are named ``test_0000.yaml``, ``test_0001.yaml``, ...

    Returns:
        The paths of the files written, in creation order.
    """
    written = []
    for index in range(num_files):
        scale = random.uniform(0.7, 1.3)
        target = directory / f"test_{index:04d}.yaml"
        target.write_text(generate_yaml_content(int(avg_size * scale)))
        written.append(target)
    return written
| # ============================================================================ | |
| # Benchmark | |
| # ============================================================================ | |
def benchmark(name: str, load_fn, contents: list[str], iterations: int = 3) -> dict:
    """Time ``load_fn`` over every document in ``contents``.

    The first five documents are loaded once untimed as a warm-up. Then the
    whole list is loaded ``iterations`` times with the garbage collector
    disabled, and the fastest pass is reported.

    Args:
        name: Label stored in the result.
        load_fn: Callable taking one document string.
        contents: Documents to load; must not be empty.
        iterations: Number of timed passes; must be at least 1.

    Returns:
        A dict with ``name``, ``time`` (best pass, seconds), ``files``,
        ``bytes`` (UTF-8 size of all documents), ``files_per_sec``,
        ``mb_per_sec`` and ``ms_per_file``.

    Raises:
        ValueError: If ``contents`` is empty or ``iterations`` is below 1.
    """
    if not contents:
        raise ValueError("benchmark() needs at least one document")
    if iterations < 1:
        raise ValueError("iterations must be at least 1")

    total_bytes = sum(len(c.encode('utf-8')) for c in contents)

    # Warmup
    for content in contents[:5]:
        load_fn(content)

    # Timed runs
    gc_was_enabled = gc.isenabled()
    times = []
    for _ in range(iterations):
        gc.collect()
        gc.disable()
        try:
            start = time.perf_counter()
            for content in contents:
                load_fn(content)
            times.append(time.perf_counter() - start)
        finally:
            # Restore the collector even when load_fn raises, and leave it
            # off if the caller had it off.
            if gc_was_enabled:
                gc.enable()

    best = min(times)
    return {
        'name': name,
        'time': best,
        'files': len(contents),
        'bytes': total_bytes,
        'files_per_sec': len(contents) / best,
        'mb_per_sec': (total_bytes / 1e6) / best,
        'ms_per_file': (best * 1000) / len(contents),
    }
def run_benchmark(files: list[Path], iterations: int = 3) -> list[dict]:
    """Load ``files`` into memory and benchmark every available YAML loader.

    File contents are read once up front, as UTF-8, so disk I/O is not part
    of the timings. The ryml-based loaders are included only when
    ``HAS_RYML_FAST`` is true.

    Args:
        files: YAML files to benchmark; must not be empty.
        iterations: Timed passes per loader, forwarded to ``benchmark``.

    Returns:
        One ``benchmark`` result dict per loader, in the order they ran.

    Raises:
        ValueError: If ``files`` is empty.
    """
    if not files:
        raise ValueError("run_benchmark() needs at least one YAML file")

    print("Loading file contents...")
    # YAML is read as UTF-8 instead of whatever the locale default is.
    contents = [f.read_text(encoding='utf-8') for f in files]
    total_bytes = sum(len(c.encode('utf-8')) for c in contents)
    print("\nConfiguration:")
    print(f" Files: {len(files)}")
    print(f" Total: {total_bytes / 1024 / 1024:.2f} MB")
    print(f" Avg size: {total_bytes / len(files) / 1024:.1f} KB")
    print(f" Iterations: {iterations}")
    results = []

    # Prepare loaders
    ruamel_base = YAML(typ='base')
    ruamel_safe = YAML(typ='safe')
    ruamel_rt = YAML(typ='rt')
    loaders = []

    # Add ryml benchmarks if available
    if HAS_RYML_FAST:
        ryml_reuse = RymlLoaderReuse()
        loaders.extend([
            ('ryml parse only', lambda s: ryml.parse_in_arena(bytearray(s.encode('utf-8')))),
            ('ryml → Python', ryml_loads),
            ('ryml → Python (reuse)', ryml_reuse.loads),
            ('ryml_fast (Cython)', ryml_fast.loads),
        ])
    loaders.extend([
        ('ryaml.loads', ryaml.loads),
        ('ruamel.yaml (base)', lambda s: ruamel_base.load(StringIO(s))),
        ('ruamel.yaml (safe)', lambda s: ruamel_safe.load(StringIO(s))),
        ('ruamel.yaml (rt)', lambda s: ruamel_rt.load(StringIO(s))),
        ('pyyaml CSafeLoader', lambda s: yaml.load(s, Loader=yaml.CSafeLoader)),
        ('pyyaml safe_load', yaml.safe_load),
    ])

    print("\n" + "=" * 75)
    print("Running benchmarks...")
    print("=" * 75 + "\n")
    for name, loader in loaders:
        # Print the label first so progress is visible while the loader runs.
        print(f" {name:<30} ", end='', flush=True)
        r = benchmark(name, loader, contents, iterations)
        results.append(r)
        print(f"{r['time']:.3f}s {r['mb_per_sec']:>6.2f} MB/s {r['files_per_sec']:>6.0f} files/s")
    return results
def print_results(results: list[dict]):
    """Print the results table, throughput chart, comparisons and recommendations.

    Results whose name contains ``'parse only'`` are treated as reference
    measurements. They are left out of the ranking table and the
    head-to-head comparisons, but they do appear in the throughput chart,
    marked with ``*``.

    Args:
        results: Result dicts as returned by ``benchmark``.
    """
    parse_only = [r for r in results if 'parse only' in r['name']]
    full = sorted(
        (r for r in results if 'parse only' not in r['name']),
        key=lambda r: r['time'],
    )
    if not full:
        print("\nNo full-load results to report.")
        return
    fastest = full[0]['time']
    # Exact-name lookup: a substring match would let 'ryml → Python' pick up
    # 'ryml → Python (reuse)'.
    by_name = {r['name']: r for r in full}

    print("\n")
    print("=" * 90)
    print("RESULTS: YAML → Python Objects")
    print("=" * 90)
    table = prettytable.PrettyTable()
    table.field_names = ['Library', 'Time (s)', 'MB/s', 'Files/s', 'ms/file', 'vs Fastest']
    table.align['Library'] = 'l'
    for col in table.field_names[1:]:
        table.align[col] = 'r'
    for r in full:
        table.add_row([
            r['name'],
            f"{r['time']:.4f}",
            f"{r['mb_per_sec']:.2f}",
            f"{r['files_per_sec']:.1f}",
            f"{r['ms_per_file']:.3f}",
            f"{r['time'] / fastest:.2f}x",
        ])
    print(table)

    if parse_only:
        p = parse_only[0]
        print(f"\nReference - {p['name']}: {p['time']:.4f}s ({p['mb_per_sec']:.2f} MB/s)")
        ryml_full = by_name.get('ryml → Python')
        if ryml_full is not None:
            overhead = ryml_full['time'] - p['time']
            pct = (overhead / ryml_full['time']) * 100
            print(f" → Python conversion overhead: {overhead:.4f}s ({pct:.1f}% of ryml total)")

    # Bar chart: covers every result, so the parse-only marker and the
    # legend below actually refer to something. A new sorted list is built
    # so that `full` keeps its by-time order.
    print("\n\nThroughput (MB/s) - Higher is Better")
    print("-" * 75)
    chart = sorted(parse_only + full, key=lambda r: -r['mb_per_sec'])
    max_mbps = chart[0]['mb_per_sec']
    for r in chart:
        bar_len = int(40 * r['mb_per_sec'] / max_mbps)
        bar = "█" * bar_len
        note = " *" if 'parse only' in r['name'] else ""
        print(f"{r['name']:<30} {bar} {r['mb_per_sec']:.1f}{note}")
    print("\n * = parse only, no Python object creation")

    # Comparisons
    print("\n\nHead-to-Head")
    print("-" * 75)
    pairs = [
        ('ryaml.loads', 'pyyaml CSafeLoader'),
        ('ryaml.loads', 'ruamel.yaml (safe)'),
        ('ruamel.yaml (safe)', 'ruamel.yaml (rt)'),
        ('pyyaml CSafeLoader', 'pyyaml safe_load'),
    ]
    if HAS_RYML_FAST:
        pairs.insert(0, ('ryml → Python', 'ryaml.loads'))
    for a_name, b_name in pairs:
        a, b = by_name.get(a_name), by_name.get(b_name)
        if a is None or b is None:
            continue
        if a['time'] < b['time']:
            print(f" • {a['name']} is {b['time']/a['time']:.2f}x faster than {b['name']}")
        else:
            print(f" • {b['name']} is {a['time']/b['time']:.2f}x faster than {a['name']}")

    print("\n\nRecommendations")
    print("-" * 75)
    print(f" 🏆 Fastest: {full[0]['name']}")
    print(" ⚖️ Simple & Fast: ryaml.loads")
    print(" 📝 Preserve comments: ruamel.yaml (rt)")
    print(" 🔧 Max compatibility: pyyaml CSafeLoader")
def verify_outputs():
    """Check that every loader parses a small sample document identically.

    The sample covers a string, an int, a bool, a float, a null, a nested
    list and a nested mapping. Each loader's result is compared with
    PyYAML's ``safe_load`` result and a ✓ or ✗ is printed per loader.
    Nothing is returned and a mismatch does not stop the run.
    """
    # The children of `items` and `nested` are indented so they parse as a
    # nested list and mapping.
    test_yaml = """
name: test
count: 42
enabled: true
ratio: 3.14
nothing: null
items:
  - one
  - two
nested:
  key: value
"""
    print("Verifying outputs match...")
    ruamel_safe = YAML(typ='safe')
    results = {
        'ryaml': ryaml.loads(test_yaml),
        'ruamel': ruamel_safe.load(StringIO(test_yaml)),
        'pyyaml': yaml.safe_load(test_yaml),
    }
    if HAS_RYML_FAST:
        results['ryml'] = ryml_loads(test_yaml)
        results['ryml_fast'] = ryml_fast.loads(test_yaml)
    reference = results['pyyaml']
    for name, result in results.items():
        match = "✓" if result == reference else "✗"
        print(f" {match} {name}")
    print()
def main():
    """Command-line entry point: parse options, run the benchmarks, print results.

    With a positional ``path`` the ``*.yaml`` and ``*.yml`` files in that
    directory are benchmarked. Without it, ``--files`` files of about
    ``--size`` bytes are generated in a temporary directory, which is
    removed afterwards unless ``--keep`` is given.

    Invalid options (non-positive ``--files`` / ``--iterations``, a path
    that is not a directory, or a directory without YAML files) end the
    program with an argparse usage error before any work is done.
    """
    parser = argparse.ArgumentParser(description="YAML to Python Objects Benchmark")
    parser.add_argument('path', nargs='?', help='Directory with YAML files')
    parser.add_argument('--files', type=int, default=200)
    parser.add_argument('--size', type=int, default=20000)
    parser.add_argument('--iterations', type=int, default=3)
    parser.add_argument('--seed', type=int, default=42)
    parser.add_argument('--keep', action='store_true')
    args = parser.parse_args()

    # Fail early with a readable message rather than a division-by-zero or
    # min()-of-empty error deep inside the benchmark.
    if args.iterations < 1:
        parser.error("--iterations must be at least 1")
    files = None
    if args.path:
        path = Path(args.path)
        if not path.is_dir():
            parser.error(f"not a directory: {path}")
        files = sorted(path.glob('*.yaml')) + sorted(path.glob('*.yml'))
        if not files:
            parser.error(f"no *.yaml or *.yml files found in {path}")
    elif args.files < 1:
        parser.error("--files must be at least 1")

    random.seed(args.seed)
    print()
    print("=" * 75)
    print("YAML → Python Objects Benchmark")
    print("=" * 75)
    print()
    verify_outputs()

    if files is not None:
        results = run_benchmark(files, args.iterations)
    else:
        temp_dir = Path(tempfile.mkdtemp(prefix="yaml_bench_"))
        print(f"Generating {args.files} files (~{args.size // 1024}KB each)")
        try:
            files = create_test_files(temp_dir, args.files, args.size)
            results = run_benchmark(files, args.iterations)
        finally:
            if args.keep:
                print(f"\nFiles kept: {temp_dir}")
            else:
                shutil.rmtree(temp_dir)
    print_results(results)


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment