Skip to content

Instantly share code, notes, and snippets.

@serjflint
Created February 15, 2026 06:36
Show Gist options
  • Select an option

  • Save serjflint/7d25f853f8391648213cfb6bca4f9143 to your computer and use it in GitHub Desktop.

Select an option

Save serjflint/7d25f853f8391648213cfb6bca4f9143 to your computer and use it in GitHub Desktop.
Benchmarks for yaml in Python
#!/usr/bin/env python3
"""
YAML to Python Objects Benchmark
================================
Installation (ALL REQUIRED)
---------------------------
pip install rapidyaml ryaml ruamel.yaml pyyaml prettytable
For PyYAML C extension:
Ubuntu/Debian: sudo apt-get install libyaml-dev && pip install --force-reinstall pyyaml
macOS: brew install libyaml && pip install --force-reinstall pyyaml
Usage
-----
python benchmark.py # Default: 200 files × 20KB
python benchmark.py --files 500 # More files
python benchmark.py --size 50000 # Larger files
"""
import sys
import gc
import time
import tempfile
import shutil
import random
import string
import argparse
from pathlib import Path
from io import StringIO
from typing import Any
import ryaml
import yaml
from ruamel.yaml import YAML
import prettytable
# Optional accelerators: ryml (rapidyaml) and the Cython ryml_fast module,
# the latter being looked up in the directory that holds this script.
try:
    # Put the script's own directory on sys.path so ryml_fast can be found.
    script_dir = Path(__file__).parent
    if str(script_dir) not in sys.path:
        sys.path.insert(0, str(script_dir))
    import ryml
    import ryml_fast
except ImportError as e:
    HAS_RYML_FAST = False
    print(f"ℹ ryml_fast not available: {e}")
else:
    # Both imports succeeded, so the ryml-based benchmarks can run.
    HAS_RYML_FAST = True
    print("✓ ryml_fast (Cython) available")
# ============================================================================
# ryml → Python Objects Converter
# ============================================================================
def ryml_loads(source: str | bytes | bytearray) -> Any:
    """Parse YAML to Python objects using ryml (rapidyaml).

    Args:
        source: YAML document as text or raw bytes. ``str`` input is encoded
            as UTF-8 and ``bytes`` input is copied into a ``bytearray``
            before being passed to ``ryml.parse_in_arena``.

    Returns:
        The document converted to plain Python objects by
        ``_tree_to_python``.
    """
    if isinstance(source, str):
        source = source.encode('utf-8')
    if isinstance(source, bytes):
        source = bytearray(source)
    tree = ryml.parse_in_arena(source)
    # The conversion walks the tree by node id starting at id 0, so no root
    # reference object has to be looked up here.
    return _tree_to_python(tree, 0)
def _tree_to_python(tree, node_id: int) -> Any:
    """Recursively turn node ``node_id`` of a ryml ``tree`` into Python data.

    Map nodes become dicts (children without a key are skipped), sequence
    nodes become lists, nodes carrying a value are passed through
    ``_parse_scalar``, and any other node yields ``None``.
    """
    if tree.is_map(node_id):
        mapping = {}
        child = tree.first_child(node_id)
        while child != ryml.NONE:
            if tree.has_key(child):
                raw_key = tree.key(child)
                # The key may arrive as a memoryview rather than bytes.
                if isinstance(raw_key, memoryview):
                    raw_key = bytes(raw_key)
                key_text = raw_key.decode('utf-8')
                mapping[_parse_scalar(key_text)] = _tree_to_python(tree, child)
            child = tree.next_sibling(child)
        return mapping

    if tree.is_seq(node_id):
        items = []
        child = tree.first_child(node_id)
        while child != ryml.NONE:
            items.append(_tree_to_python(tree, child))
            child = tree.next_sibling(child)
        return items

    if not tree.has_val(node_id):
        return None

    raw_val = tree.val(node_id)
    # The value may arrive as a memoryview rather than bytes.
    if isinstance(raw_val, memoryview):
        raw_val = bytes(raw_val)
    return _parse_scalar(raw_val.decode('utf-8'))
def _parse_scalar(val: str) -> Any:
    """Convert a YAML scalar string to the matching Python value.

    Resolution order: null, booleans, integers (decimal, ``0x`` hex, ``0o``
    octal), the YAML special floats (``.inf`` / ``.nan``), ordinary floats
    and finally strings, from which one pair of surrounding quotes is
    stripped.

    Args:
        val: Scalar text as read from the document.

    Returns:
        ``None``, a ``bool``, an ``int``, a ``float`` or a ``str``.
    """
    if val == '' or val in ('null', 'Null', 'NULL', '~'):
        return None
    if val in ('true', 'True', 'TRUE', 'yes', 'Yes', 'YES', 'on', 'On', 'ON'):
        return True
    if val in ('false', 'False', 'FALSE', 'no', 'No', 'NO', 'off', 'Off', 'OFF'):
        return False
    try:
        if val.startswith(('0x', '0X')):
            return int(val, 16)
        if val.startswith(('0o', '0O')):
            return int(val, 8)
        return int(val)
    except ValueError:
        pass
    if val in ('.inf', '.Inf', '.INF', '+.inf', '+.Inf', '+.INF'):
        return float('inf')
    if val in ('-.inf', '-.Inf', '-.INF'):
        return float('-inf')
    if val in ('.nan', '.NaN', '.NAN'):
        return float('nan')
    # Python's float() also accepts bare "nan", "inf" and "infinity" (any
    # case, optional sign). YAML only recognises the dotted spellings handled
    # above, so those words must stay plain strings.
    if val.strip().lstrip('+-').lower() not in ('nan', 'inf', 'infinity'):
        try:
            return float(val)
        except ValueError:
            pass
    # Strip quotes
    if len(val) >= 2 and ((val[0] == '"' and val[-1] == '"') or (val[0] == "'" and val[-1] == "'")):
        return val[1:-1]
    return val
class RymlLoaderReuse:
    """ryml-based loader that keeps one ``ryml.Tree`` and re-parses into it."""

    def __init__(self):
        self.tree = ryml.Tree()

    def loads(self, source: str | bytes | bytearray) -> Any:
        """Parse ``source`` into the held tree and convert it to Python objects."""
        buffer = source.encode('utf-8') if isinstance(source, str) else source
        if isinstance(buffer, bytes):
            buffer = bytearray(buffer)
        # Clear the tree and its arena before parsing the next document.
        shared = self.tree
        shared.clear()
        shared.clear_arena()
        ryml.parse_in_arena(buffer, tree=shared)
        return _tree_to_python(shared, 0)
# ============================================================================
# Test File Generation
# ============================================================================
def generate_yaml_content(target_size: int = 20000) -> str:
    """Generate a pseudo-random but realistic-looking YAML document.

    The document starts with a ``metadata`` mapping followed by numbered
    ``section_N`` mappings. Each section holds a few scalars, a ``tags``
    list, a ``config`` mapping and an ``items`` list of small mappings.
    Sections are appended until about ``target_size`` characters have been
    produced; a section that would push the total past 120% of the target
    is dropped and generation stops.

    Args:
        target_size: Approximate size of the document in characters.

    Returns:
        The YAML text. Values are drawn from the global ``random`` state, so
        seed it beforehand for reproducible output.
    """
    alphabet = string.ascii_letters + string.digits

    def random_str(n: int) -> str:
        return ''.join(random.choices(alphabet, k=n))

    # Nested lines are indented two spaces per level; without that
    # indentation the children would not belong to their parent key.
    lines = [
        "metadata:",
        f"  id: {random_str(32)}",
        "  version: '1.0.0'",
        "  enabled: true",
        "",
    ]
    # Each line costs its length plus one for the newline that joins it.
    current_size = sum(len(line) + 1 for line in lines)
    section_num = 0
    while current_size < target_size:
        section_num += 1
        section_lines = [
            f"section_{section_num}:",
            f"  name: '{random_str(20)}'",
            f"  enabled: {random.choice(['true', 'false'])}",
            f"  priority: {random.randint(1, 100)}",
            f"  score: {random.random():.6f}",
            "  tags:",
        ]
        for _ in range(random.randint(3, 6)):
            section_lines.append(f"    - {random_str(10)}")
        section_lines.append("  config:")
        for _ in range(random.randint(4, 8)):
            key = random_str(12)
            val = random.choice([
                f"'{random_str(15)}'",
                str(random.randint(-1000, 1000)),
                f"{random.uniform(-100, 100):.4f}",
                random.choice(['true', 'false']),
                'null',
            ])
            section_lines.append(f"    {key}: {val}")
        section_lines.append("  items:")
        for i in range(random.randint(5, 10)):
            section_lines.extend([
                f"    - id: {i}",
                f"      name: '{random_str(12)}'",
                f"      value: {random.randint(0, 100)}",
            ])
        section_lines.append("")
        section_size = sum(len(line) + 1 for line in section_lines)
        # Stop rather than overshoot the target by more than 20%.
        if current_size + section_size > target_size * 1.2:
            break
        lines.extend(section_lines)
        current_size += section_size
    return '\n'.join(lines)
def create_test_files(directory: Path, num_files: int, avg_size: int) -> list[Path]:
    """Write ``num_files`` generated YAML files into ``directory``.

    Each file targets a size drawn uniformly from 70%-130% of ``avg_size``
    and is named ``test_NNNN.yaml``. Returns the created paths in creation
    order.
    """
    created: list[Path] = []
    for index in range(num_files):
        scale = random.uniform(0.7, 1.3)
        target = directory / f"test_{index:04d}.yaml"
        target.write_text(generate_yaml_content(int(avg_size * scale)))
        created.append(target)
    return created
# ============================================================================
# Benchmark
# ============================================================================
def benchmark(name: str, load_fn, contents: list[str], iterations: int = 3) -> dict:
    """Time ``load_fn`` over every document in ``contents``.

    The first five documents are loaded once as a warmup. Then the whole
    list is loaded ``iterations`` times with the garbage collector disabled,
    and the best (smallest) wall-clock time is reported.

    Args:
        name: Label stored in the result.
        load_fn: Callable taking one YAML string; its return value is ignored.
        contents: YAML documents to load. Must not be empty.
        iterations: Number of timed passes. Must be at least 1.

    Returns:
        A dict with the keys ``name``, ``time`` (best pass, seconds),
        ``files``, ``bytes`` (UTF-8 size of all documents),
        ``files_per_sec``, ``mb_per_sec`` and ``ms_per_file``.

    Raises:
        ValueError: If ``contents`` is empty or ``iterations`` is below 1.
    """
    if not contents:
        raise ValueError("benchmark() needs at least one document to load")
    if iterations < 1:
        raise ValueError("iterations must be at least 1")
    total_bytes = sum(len(c.encode('utf-8')) for c in contents)
    # Warmup
    for content in contents[:5]:
        load_fn(content)
    # Timed runs
    gc_was_enabled = gc.isenabled()
    times = []
    for _ in range(iterations):
        gc.collect()
        gc.disable()
        try:
            start = time.perf_counter()
            for content in contents:
                load_fn(content)
            times.append(time.perf_counter() - start)
        finally:
            # Restore the collector even when load_fn raises, and leave it
            # off if the caller had it off.
            if gc_was_enabled:
                gc.enable()
    best = min(times)
    # A pass faster than the timer resolution would otherwise divide by zero.
    files_per_sec = len(contents) / best if best > 0 else float('inf')
    mb_per_sec = (total_bytes / 1e6) / best if best > 0 else float('inf')
    return {
        'name': name,
        'time': best,
        'files': len(contents),
        'bytes': total_bytes,
        'files_per_sec': files_per_sec,
        'mb_per_sec': mb_per_sec,
        'ms_per_file': (best * 1000) / len(contents),
    }
def run_benchmark(files: list[Path], iterations: int = 3) -> list[dict]:
    """Load ``files`` into memory and benchmark every available loader.

    Args:
        files: Paths of the YAML files to read. Must not be empty.
        iterations: Number of timed passes per loader, forwarded to
            ``benchmark``.

    Returns:
        One ``benchmark`` result dict per loader, in the order the loaders
        were run.

    Raises:
        ValueError: If ``files`` is empty.
    """
    if not files:
        # Fail clearly instead of dividing by zero in the summary below.
        raise ValueError("run_benchmark() needs at least one YAML file")
    print("Loading file contents...")
    # Read as UTF-8 so the decoded text matches the UTF-8 byte counts
    # reported below, whatever the locale's default encoding is.
    contents = [f.read_text(encoding='utf-8') for f in files]
    total_bytes = sum(len(c.encode('utf-8')) for c in contents)
    print("\nConfiguration:")
    print(f" Files: {len(files)}")
    print(f" Total: {total_bytes / 1024 / 1024:.2f} MB")
    print(f" Avg size: {total_bytes / len(files) / 1024:.1f} KB")
    print(f" Iterations: {iterations}")
    results = []
    # Prepare loaders
    ruamel_base = YAML(typ='base')
    ruamel_safe = YAML(typ='safe')
    ruamel_rt = YAML(typ='rt')
    loaders = []
    # Add ryml benchmarks if available
    if HAS_RYML_FAST:
        ryml_reuse = RymlLoaderReuse()
        loaders.extend([
            ('ryml parse only', lambda s: ryml.parse_in_arena(bytearray(s.encode('utf-8')))),
            ('ryml → Python', ryml_loads),
            ('ryml → Python (reuse)', ryml_reuse.loads),
            ('ryml_fast (Cython)', ryml_fast.loads),
        ])
    loaders.extend([
        ('ryaml.loads', ryaml.loads),
        ('ruamel.yaml (base)', lambda s: ruamel_base.load(StringIO(s))),
        ('ruamel.yaml (safe)', lambda s: ruamel_safe.load(StringIO(s))),
        ('ruamel.yaml (rt)', lambda s: ruamel_rt.load(StringIO(s))),
        ('pyyaml CSafeLoader', lambda s: yaml.load(s, Loader=yaml.CSafeLoader)),
        ('pyyaml safe_load', yaml.safe_load),
    ])
    print("\n" + "=" * 75)
    print("Running benchmarks...")
    print("=" * 75 + "\n")
    for name, loader in loaders:
        # Print the label first so progress is visible while the loader runs.
        print(f" {name:<30} ", end='', flush=True)
        r = benchmark(name, loader, contents, iterations)
        results.append(r)
        print(f"{r['time']:.3f}s {r['mb_per_sec']:>6.2f} MB/s {r['files_per_sec']:>6.0f} files/s")
    return results
def print_results(results: list[dict]):
    """Print the results table, throughput chart and head-to-head summary.

    Results whose name contains ``'parse only'`` are treated as reference
    measurements: they are left out of the ranking table but are shown in
    the throughput chart, marked with ``*``.

    Args:
        results: Result dicts as returned by ``benchmark``. When no
            full-conversion result is present, a short notice is printed
            and nothing else.
    """
    parse_only = [r for r in results if 'parse only' in r['name']]
    full = sorted(
        (r for r in results if 'parse only' not in r['name']),
        key=lambda x: x['time'],
    )
    if not full:
        # Nothing to rank; avoid indexing into an empty list below.
        print("No benchmark results to report.")
        return
    fastest = full[0]['time']
    print("\n")
    print("=" * 90)
    print("RESULTS: YAML → Python Objects")
    print("=" * 90)
    table = prettytable.PrettyTable()
    table.field_names = ['Library', 'Time (s)', 'MB/s', 'Files/s', 'ms/file', 'vs Fastest']
    table.align['Library'] = 'l'
    for col in table.field_names[1:]:
        table.align[col] = 'r'
    for r in full:
        table.add_row([
            r['name'],
            f"{r['time']:.4f}",
            f"{r['mb_per_sec']:.2f}",
            f"{r['files_per_sec']:.1f}",
            f"{r['ms_per_file']:.3f}",
            f"{r['time'] / fastest:.2f}x",
        ])
    print(table)
    # Exact-name lookup: a substring match would let 'ryml → Python' pick up
    # 'ryml → Python (reuse)'.
    by_name = {r['name']: r for r in full}
    if parse_only:
        p = parse_only[0]
        print(f"\nReference - {p['name']}: {p['time']:.4f}s ({p['mb_per_sec']:.2f} MB/s)")
        ryml_full = by_name.get('ryml → Python')
        if ryml_full:
            overhead = ryml_full['time'] - p['time']
            pct = (overhead / ryml_full['time']) * 100
            print(f" → Python conversion overhead: {overhead:.4f}s ({pct:.1f}% of ryml total)")
    # Bar chart
    print("\n\nThroughput (MB/s) - Higher is Better")
    print("-" * 75)
    # Parse-only rows are included here so the '*' marker and its legend
    # actually refer to something.
    chart_rows = sorted(full + parse_only, key=lambda x: -x['mb_per_sec'])
    max_mbps = max(r['mb_per_sec'] for r in chart_rows)
    for r in chart_rows:
        bar_len = int(40 * r['mb_per_sec'] / max_mbps)
        bar = "█" * bar_len
        note = " *" if 'parse only' in r['name'] else ""
        print(f"{r['name']:<30} {bar} {r['mb_per_sec']:.1f}{note}")
    if parse_only:
        print("\n * = parse only, no Python object creation")
    # Comparisons
    print("\n\nHead-to-Head")
    print("-" * 75)
    pairs = [
        ('ryaml.loads', 'pyyaml CSafeLoader'),
        ('ryaml.loads', 'ruamel.yaml (safe)'),
        ('ruamel.yaml (safe)', 'ruamel.yaml (rt)'),
        ('pyyaml CSafeLoader', 'pyyaml safe_load'),
    ]
    if HAS_RYML_FAST:
        pairs.insert(0, ('ryml → Python', 'ryaml.loads'))
    for a_name, b_name in pairs:
        a, b = by_name.get(a_name), by_name.get(b_name)
        if a and b:
            if a['time'] < b['time']:
                print(f" • {a['name']} is {b['time']/a['time']:.2f}x faster than {b['name']}")
            else:
                print(f" • {b['name']} is {a['time']/b['time']:.2f}x faster than {a['name']}")
    print("\n\nRecommendations")
    print("-" * 75)
    # full is sorted by time, so its first entry is the fastest loader.
    print(f" 🏆 Fastest: {full[0]['name']}")
    print(" ⚖️ Simple & Fast: ryaml.loads")
    print(" 📝 Preserve comments: ruamel.yaml (rt)")
    print(" 🔧 Max compatibility: pyyaml CSafeLoader")
def verify_outputs():
    """Check that every loader parses a small sample document the same way.

    The sample covers a string, an int, a bool, a float, a null, a list and
    a nested mapping. Each loader's result is compared with PyYAML's
    ``safe_load`` result and a ✓ or ✗ is printed per loader; mismatches are
    reported but do not stop the run.
    """
    # The list entries and the child of `nested` are indented so that the
    # sample really contains nested structures.
    test_yaml = """
name: test
count: 42
enabled: true
ratio: 3.14
nothing: null
items:
  - one
  - two
nested:
  key: value
"""
    print("Verifying outputs match...")
    ruamel_safe = YAML(typ='safe')
    results = {
        'ryaml': ryaml.loads(test_yaml),
        'ruamel': ruamel_safe.load(StringIO(test_yaml)),
        'pyyaml': yaml.safe_load(test_yaml),
    }
    if HAS_RYML_FAST:
        results['ryml'] = ryml_loads(test_yaml)
        results['ryml_fast'] = ryml_fast.loads(test_yaml)
    reference = results['pyyaml']
    for name, result in results.items():
        match = "✓" if result == reference else "✗"
        print(f" {match} {name}")
    print()
def main():
    """Command-line entry point: parse options, run the benchmark, report.

    With a ``path`` argument the ``*.yaml`` and ``*.yml`` files in that
    directory are benchmarked. Otherwise ``--files`` documents of about
    ``--size`` bytes are generated in a temporary directory, which is
    removed afterwards unless ``--keep`` is given.

    Invalid options (non-positive counts, or a directory without YAML
    files) end the program through ``parser.error`` with a usage message.
    """
    parser = argparse.ArgumentParser(description="YAML to Python Objects Benchmark")
    parser.add_argument('path', nargs='?', help='Directory with YAML files')
    parser.add_argument('--files', type=int, default=200)
    parser.add_argument('--size', type=int, default=20000)
    parser.add_argument('--iterations', type=int, default=3)
    parser.add_argument('--seed', type=int, default=42)
    parser.add_argument('--keep', action='store_true')
    args = parser.parse_args()
    # Reject values that would later surface as ZeroDivisionError or an
    # empty-sequence error deep inside the benchmark.
    for option in ('files', 'size', 'iterations'):
        if getattr(args, option) < 1:
            parser.error(f"--{option} must be a positive integer")
    files = None
    if args.path:
        path = Path(args.path)
        files = sorted(path.glob('*.yaml')) + sorted(path.glob('*.yml'))
        if not files:
            parser.error(f"no *.yaml or *.yml files found in {path}")
    random.seed(args.seed)
    print()
    print("=" * 75)
    print("YAML → Python Objects Benchmark")
    print("=" * 75)
    print()
    verify_outputs()
    if files is not None:
        results = run_benchmark(files, args.iterations)
    else:
        temp_dir = Path(tempfile.mkdtemp(prefix="yaml_bench_"))
        print(f"Generating {args.files} files (~{args.size // 1024}KB each)")
        try:
            files = create_test_files(temp_dir, args.files, args.size)
            results = run_benchmark(files, args.iterations)
        finally:
            # Clean up the generated files even if the benchmark fails.
            if args.keep:
                print(f"\nFiles kept: {temp_dir}")
            else:
                shutil.rmtree(temp_dir)
    print_results(results)
# Run the benchmark only when this file is executed as a script.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment