Skip to content

Instantly share code, notes, and snippets.

@aahmed-se
Created February 11, 2026 01:51
Show Gist options
  • Select an option

  • Save aahmed-se/79782ed2e60ca5b870af231dd3568c52 to your computer and use it in GitHub Desktop.

Select an option

Save aahmed-se/79782ed2e60ca5b870af231dd3568c52 to your computer and use it in GitHub Desktop.
#!/bin/bash
# =============================================================================
# 🏎️ NFS STORAGE BENCHMARK β€” Audio File I/O Profile (PyTorch-Realistic)
# =============================================================================
#
# PURPOSE:
# Benchmarks an NFS (or similar networked) filesystem for ML training pipelines
# that consume large numbers of small WAV/FLAC audio files using PyTorch-style
# dataloaders.
#
# MODELLED WORKLOAD:
# - Many small files (~3 MB each)
# - Random file access (shuffle every epoch)
# - Synchronous reads (iodepth=1)
# - Parallelism from worker count, not async I/O
#
# This closely matches PyTorch Dataset.__getitem__ behavior and exposes
# metadata latency (open/stat/close), which is the dominant NFS bottleneck.
#
# MULTI-NODE USAGE:
# Run simultaneously on all nodes to measure aggregate pressure:
# pdsh -w node[01-08] '/path/to/storage_bench.sh /mnt/shared/bench_$(hostname)'
#
# DEPENDENCIES: fio, python3
# =============================================================================
set -euo pipefail
# ── Configuration ────────────────────────────────────────────────────────────
DIR="${1:-/data_vast/bench_$(hostname)}"  # Target directory (override via $1)
JOBS=32            # Parallel workers (match PyTorch num_workers)
RUNTIME=180s       # Duration per test
RAMP=5s            # Warm-up period
NRFILES=64         # Files per worker
FILESIZE_MB=3      # Avg WAV/FLAC size in MB (single source of truth)
FILESIZE="${FILESIZE_MB}m"
# Derived per-worker dataset size (NRFILES Γ— FILESIZE); computed instead of
# hard-coded so it can never drift out of sync with the values above.
TOTAL_SIZE="$((NRFILES * FILESIZE_MB))m"
# ── Preflight Checks ─────────────────────────────────────────────────────────
# Fail fast, with diagnostics on stderr, if required tools are missing.
command -v fio >/dev/null 2>&1 || {
  echo "ERROR: fio not found. Install with: apt install fio (or yum install fio)" >&2
  exit 1
}
command -v python3 >/dev/null 2>&1 || {
  echo "ERROR: python3 not found." >&2
  exit 1
}
mkdir -p "$DIR"
# Remove fio's bench.* data files on every exit path (success, error, Ctrl-C).
trap 'echo ""; echo "Cleaning up test files..."; rm -f "$DIR"/bench.* 2>/dev/null; echo "Done."' EXIT
# ── Header ───────────────────────────────────────────────────────────────────
TOTAL_FILES=$((JOBS * NRFILES))
# Derive the per-file size in MB from FILESIZE (e.g. "3m" -> 3) so the
# footprint estimate stays correct if the configuration changes.
TOTAL_GB=$(python3 -c "print(f'{$TOTAL_FILES * ${FILESIZE%m} / 1024:.1f}')")
echo "========================================================================"
echo " 🏎️ NFS STORAGE BENCHMARK β€” Audio File I/O Profile"
echo " πŸ–₯️ Node: $(hostname)"
echo " πŸ“ Target: $DIR"
echo " βš™οΈ Config: $JOBS workers Γ— $NRFILES files Γ— $FILESIZE each"
echo " πŸ“Š Footprint: $TOTAL_FILES files, ~${TOTAL_GB} GB"
echo " ⏱️ Runtime: $RUNTIME (+ ${RAMP} ramp)"
echo " πŸ“Œ Model: PyTorch-style synchronous I/O (iodepth=1)"
echo "========================================================================"
echo ""
# ── FIO Runner ───────────────────────────────────────────────────────────────
#######################################
# Run one fio job and print a one-line summary.
# Arguments:
#   $1 - label printed before the result
#   $2 - fio rw mode (e.g. randread, randwrite)
#   $3 - block size (e.g. 1m)
#   $@ - extra fio flags (e.g. --direct=1)
# Globals read: RUNTIME RAMP TOTAL_SIZE FILESIZE NRFILES JOBS DIR
# Outputs:
#   Summary line (bandwidth MB/s, K IOPS, p99 latency ms) to stdout.
#######################################
run_fio() {
  local label="$1" rw="$2" bs="$3"
  shift 3
  local extra_args=("$@")
  # Keep all intermediates function-local so repeated calls don't leak state
  # into the global scope.
  local result stats bw iops p99 bw_mb iops_k p99_ms
  printf " %-30s ... " "$label"
  # iodepth=1 models PyTorch Dataset.__getitem__: one outstanding request per
  # worker; parallelism comes from numjobs, not async I/O. fio's stderr is
  # suppressed intentionally — failure is reported via the exit status below.
  result=$(fio --name=bench --group_reporting --time_based=1 \
    --runtime="$RUNTIME" --ramp_time="$RAMP" \
    --size="$TOTAL_SIZE" --filesize="$FILESIZE" \
    --nrfiles="$NRFILES" --file_service_type=random \
    --numjobs="$JOBS" \
    --ioengine=libaio \
    --iodepth=1 \
    --rw="$rw" --bs="$bs" \
    --directory="$DIR" \
    --thread \
    --fallocate=none \
    --allow_file_create=1 \
    "${extra_args[@]}" \
    --output-format=json 2>/dev/null) || {
    echo "❌ FAILED (check mount/permissions at $DIR)"
    return
  }
  # Pick whichever direction actually moved data (read for randread, write for
  # randwrite) and emit "bw_bytes iops p99_ns"; zeros on any parse failure.
  stats=$(echo "$result" | python3 -c "
import sys, json
try:
    j = json.load(sys.stdin)['jobs'][0]
    res = j['read'] if j['read']['bw_bytes'] > 0 else j['write']
    bw = res['bw_bytes']
    iops = res['iops']
    p99 = res['clat_ns']['percentile'].get('99.000000', 0)
    print(f'{bw} {iops} {p99}')
except Exception:
    print('0 0 0')
")
  read -r bw iops p99 <<< "$stats"
  if [ "$bw" = "0" ] && [ "$iops" = "0" ]; then
    echo "⚠️ No data (test too short or stalled mount)"
    return
  fi
  bw_mb=$(python3 -c "print(f'{$bw / 1e6:.0f}')")
  iops_k=$(python3 -c "print(f'{$iops / 1e3:.1f}')")
  p99_ms=$(python3 -c "print(f'{$p99 / 1e6:.1f}')")
  printf "βœ… %6s MB/s | %7s K IOPS | p99: %s ms\n" "$bw_mb" "$iops_k" "$p99_ms"
}
# =============================================================================
# Small-File Write (Direct I/O)
# =============================================================================
# Models preprocessing pipelines writing many small audio files to shared storage.
# --direct=1 requests O_DIRECT so the client page cache is bypassed and the
# storage/network path is measured rather than local RAM.
echo "── Small-File Write (${TOTAL_FILES} Γ— ${FILESIZE}, direct I/O) ───────────────"
run_fio "Rand Write 1M (iodepth=1)" randwrite 1m --direct=1
echo ""
# =============================================================================
# Small-File Read (Direct I/O)
# =============================================================================
# Models shuffled PyTorch dataloader reads with cold page cache.
# This is the result that matters most for training throughput (see summary).
echo "── Small-File Read (${TOTAL_FILES} Γ— ${FILESIZE}, direct I/O) ────────────────"
run_fio "Rand Read 1M (iodepth=1)" randread 1m --direct=1
echo ""
# =============================================================================
# Summary: interpretation guidance for the measured p99 read latency.
# A single here-doc replaces the run of echo statements; output is identical.
cat <<EOF
========================================================================
 βœ… Benchmark Complete

 INTERPRETATION:

 Read p99 latency is the primary signal.

 p99 < 5 ms β†’ Excellent (safe for multi-GPU training)
 5–10 ms β†’ Acceptable but watch scaling
 10–20 ms β†’ Intermittent GPU stalls likely
 >20 ms β†’ Training throughput collapse at scale

 If results are poor, consider:
 - Increasing NFS rsize/wsize (e.g. 1MB)
 - Multiple client IPs / nconnect
 - Packing files into tar/WebDataset shards
========================================================================
EOF
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment