Skip to content

Instantly share code, notes, and snippets.

@aahmed-se
Created February 11, 2026 01:45
Show Gist options
  • Select an option

  • Save aahmed-se/6382da27c06b28cb2bd7e166ce7f194a to your computer and use it in GitHub Desktop.

Select an option

Save aahmed-se/6382da27c06b28cb2bd7e166ce7f194a to your computer and use it in GitHub Desktop.
#!/bin/bash
# =============================================================================
# 🏎️ NFS STORAGE BENCHMARK β€” Audio File I/O Profile
# =============================================================================
#
# PURPOSE:
# Benchmarks an NFS (or similar networked) filesystem for a distributed ML
# training pipeline that consumes large volumes of WAV/FLAC audio files.
#
# WHAT THIS TESTS:
# Creates thousands of 3 MB files (β‰ˆ average WAV size) and measures write
# and read performance while hopping between them randomly β€” exactly what
# a shuffled dataloader does every epoch.
#
# This exercises the NFS metadata path (open/stat/close) which is typically
# the real bottleneck, not raw sequential bandwidth.
#
# MULTI-NODE USAGE:
# Run simultaneously on all nodes to measure aggregate throughput:
# pdsh -w node[01-08] '/path/to/storage_bench.sh /mnt/shared/bench_$(hostname)'
#
# DEPENDENCIES: fio, python3
# =============================================================================
set -euo pipefail
# ── Configuration ────────────────────────────────────────────────────────────
DIR="${1:-/data_vast/bench_$(hostname)}"
JOBS=32          # Parallel threads (match your dataloader num_workers)
RUNTIME=180s     # Duration per test (3 min gives stable p99 numbers)
RAMP=5s          # Warm-up before measurement begins
NRFILES=64       # Number of files per thread
FILESIZE=3m      # Size per file (≈ average WAV: ~3 MB)
TOTAL_SIZE=192m  # I/O cap per thread (NRFILES × FILESIZE)
readonly DIR JOBS RUNTIME RAMP NRFILES FILESIZE TOTAL_SIZE
# ── Preflight Checks ────────────────────────────────────────────────────────
# Diagnostics go to stderr so stdout stays clean for the benchmark report.
command -v fio >/dev/null 2>&1 || {
  echo "ERROR: fio not found. Install with: apt install fio (or yum install fio)" >&2
  exit 1
}
command -v python3 >/dev/null 2>&1 || {
  echo "ERROR: python3 not found." >&2
  exit 1
}
mkdir -p "$DIR"
# Remove fio's bench.* data files on any exit path. "${DIR:?}" aborts the
# expansion if DIR is somehow empty, so the glob can never resolve to /bench.*.
trap 'echo ""; echo "Cleaning up test files..."; rm -f -- "${DIR:?}"/bench.* 2>/dev/null; echo "Done."' EXIT
# ── Header ───────────────────────────────────────────────────────────────────
# Banner summarizing this node's run configuration and total footprint.
TOTAL_FILES=$((JOBS * NRFILES))
TOTAL_GB=$(python3 -c "print(f'{$TOTAL_FILES * 3 / 1024:.1f}')")
printf '%s\n' \
  "========================================================================" \
  " 🏎️ NFS STORAGE BENCHMARK — Audio File I/O Profile" \
  " 🖥️ Node: $(hostname)" \
  " 📁 Target: $DIR" \
  " ⚙️ Config: $JOBS threads × $NRFILES files × $FILESIZE each" \
  " 📊 Footprint: $TOTAL_FILES files, ~${TOTAL_GB} GB total" \
  " ⏱️ Runtime: $RUNTIME per test (+ ${RAMP} ramp)" \
  "========================================================================" \
  ""
# ── FIO Runner ───────────────────────────────────────────────────────────────
# Runs a single fio test against many small files and parses JSON output.
#
# Globals (read): RUNTIME, RAMP, TOTAL_SIZE, FILESIZE, NRFILES, JOBS, DIR
# Args:
#   $1  label — Display name for the test
#   $2  rw    — I/O pattern (randread, randwrite)
#   $3  bs    — Block size (1m = read whole file in ~3 chunks)
#   $4+ extra — Additional fio flags (optional)
# Outputs: one formatted result line on stdout.
# Returns: always 0 — a failed test is reported inline, not fatal, so the
#          remaining tests still run under `set -e`.
run_fio() {
  local label="$1" rw="$2" bs="$3"
  shift 3
  local extra_args=("$@")
  printf " %-28s ... " "$label"
  local result
  result=$(fio --name=bench --group_reporting --time_based=1 \
    --runtime="$RUNTIME" --ramp_time="$RAMP" \
    --size="$TOTAL_SIZE" --filesize="$FILESIZE" \
    --nrfiles="$NRFILES" --file_service_type=random \
    --numjobs="$JOBS" --ioengine=libaio \
    --iodepth=32 --rw="$rw" --bs="$bs" --directory="$DIR" \
    --thread --fallocate=none --allow_file_create=1 \
    "${extra_args[@]}" \
    --output-format=json 2>/dev/null) || {
    echo "❌ FAILED (check mount/permissions at $DIR)"
    return 0
  }
  # Parse raw bandwidth/IOPS/p99 AND pre-format the display values in a
  # single python3 invocation (previously four interpreter spawns per test).
  # A job with read bytes is a read test; otherwise report the write side.
  local stats
  stats=$(python3 -c "
import sys, json
try:
    j = json.load(sys.stdin)['jobs'][0]
    res = j['read'] if j['read']['bw_bytes'] > 0 else j['write']
    bw = res['bw_bytes']
    iops = res['iops']
    p99 = res['clat_ns']['percentile'].get('99.000000', 0)
    print(f'{bw} {iops} {p99} {bw / 1e6:.0f} {iops / 1e3:.1f} {p99 / 1e6:.1f}')
except Exception:
    print('0 0 0 0 0.0 0.0')
" <<< "$result")
  local bw iops p99 bw_mb iops_k p99_ms
  read -r bw iops p99 bw_mb iops_k p99_ms <<< "$stats"
  if [ "$bw" = "0" ] && [ "$iops" = "0" ]; then
    echo "⚠️ No data (test may have been too short or mount is stalled)"
    return 0
  fi
  printf "✅ %6s MB/s | %8s K IOPS | p99: %s ms\n" "$bw_mb" "$iops_k" "$p99_ms"
}
# =============================================================================
# Small-File Write (Direct I/O)
# =============================================================================
# First pass writes the full working set with a random access pattern and
# O_DIRECT — modelling a preprocessing pipeline emitting WAV/FLAC files to
# shared storage. It also doubles as data staging: the read tests below
# need these files to exist.
printf '%s\n' "── Small-File Write (${TOTAL_FILES} × ${FILESIZE}, direct I/O) ─────────────────────"
run_fio "Rand Write 1M (64×3MB)" randwrite 1m --direct=1
printf '\n'
# =============================================================================
# Small-File Read (Direct I/O)
# =============================================================================
# Random hops across the 3 MB files, reading 1 MB chunks — the closest fio
# gets to a shuffled dataloader. O_DIRECT bypasses the page cache so the
# numbers reflect actual NFS/storage throughput, not local RAM.
printf '%s\n' "── Small-File Read (${TOTAL_FILES} × ${FILESIZE}, direct I/O) ──────────────────────"
run_fio "Rand Read 1M (64×3MB)" randread 1m --direct=1
printf '\n'
# =============================================================================
# Closing banner: how to read the numbers above. Static text, so a quoted
# heredoc (no expansion) replaces the run of echo statements.
cat <<'EOF'
========================================================================
 ✅ Benchmark Complete

 INTERPRETATION:

 Write: Preprocessing/data-staging throughput to shared storage.
 Target: enough to not bottleneck your ingest pipeline.

 Read (direct): True NFS throughput for a shuffled dataloader.
 If < 500 MB/s, GPUs will likely be data-starved.
 Tune NFS rsize/wsize (try rsize=1048576).

 If all reads are slow, consider packing files into tar/WebDataset
 shards to reduce NFS metadata overhead (open/stat/close per file).
========================================================================
EOF
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment