#!/bin/bash
# =============================================================================
# 🗄️ NFS STORAGE BENCHMARK – Audio File I/O Profile (PyTorch-Realistic)
# =============================================================================
#
# PURPOSE:
#   Benchmarks an NFS (or similar networked) filesystem for ML training
#   pipelines that consume large numbers of small WAV/FLAC audio files using
#   PyTorch-style dataloaders.
#
# MODELLED WORKLOAD:
#   - Many small files (~3 MB each)
#   - Random file access (shuffle every epoch)
#   - Synchronous reads (iodepth=1)
#   - Parallelism from worker count, not async I/O
#
#   This explicitly exposes metadata latency (open/stat/close), which is the
#   dominant bottleneck on NFS at scale.
#
# MULTI-NODE USAGE:
#   pdsh -w node[01-08] '/path/to/storage_bench.sh /mnt/shared/bench_$(hostname)'
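#
#   To merge identical output from many nodes into one report, pdsh's
#   companion tool dshbak can coalesce it (node names are placeholders):
#     pdsh -w node[01-08] '/path/to/storage_bench.sh /mnt/shared/bench_$(hostname)' | dshbak -c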
#
# DEPENDENCIES: fio, python3
# =============================================================================

set -euo pipefail

# ── Configuration ────────────────────────────────────────────────────────────
DIR="${1:-/data_vast/bench_$(hostname)}"
JOBS=32            # Parallel workers (match PyTorch num_workers)
IODEPTH=1          # ONE outstanding I/O per worker (PyTorch-style sync reads)
RUNTIME=180s       # Duration per test
RAMP=5s            # Warm-up period
NRFILES=64         # Files per worker
FILESIZE=3m        # Avg WAV/FLAC size
TOTAL_SIZE=192m    # NRFILES × FILESIZE

# ── Preflight Checks ─────────────────────────────────────────────────────────
command -v fio >/dev/null 2>&1 || {
  echo "ERROR: fio not found. Install with: apt install fio (or yum install fio)"
  exit 1
}
command -v python3 >/dev/null 2>&1 || {
  echo "ERROR: python3 not found."
  exit 1
}

mkdir -p "$DIR"
trap 'echo ""; echo "Cleaning up test files..."; rm -f "$DIR"/bench.* 2>/dev/null; echo "Done."' EXIT

# ── Header ───────────────────────────────────────────────────────────────────
TOTAL_FILES=$((JOBS * NRFILES))
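# NOTE: the literal 3 below is the per-file size in MB; keep it in sync with
# FILESIZE above if you change the configuration.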
TOTAL_GB=$(python3 -c "print(f'{$TOTAL_FILES * 3 / 1024:.1f}')")

echo "========================================================================"
echo " 🗄️ NFS STORAGE BENCHMARK – Audio File I/O Profile"
echo " 🖥️ Node:      $(hostname)"
echo " 📁 Target:    $DIR"
echo " ⚙️ Config:    $JOBS workers × $NRFILES files × $FILESIZE"
echo " 📊 Footprint: $TOTAL_FILES files, ~${TOTAL_GB} GB"
echo " ⏱️ Runtime:   $RUNTIME (+ ${RAMP} ramp)"
echo " 🔁 I/O Model: iodepth=${IODEPTH} (synchronous per-worker reads)"
echo "========================================================================"
echo ""
# ── FIO Runner ───────────────────────────────────────────────────────────────
run_fio() {
  local label="$1" rw="$2" bs="$3"
  shift 3
  local extra_args=("$@")
  local result stats bw iops p99 bw_mb iops_k p99_ms

  printf " %-30s ... " "$label"
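  # Flag notes: --file_service_type=random draws the next file at random,
  # mimicking a shuffled dataset; --fallocate=none skips preallocation, which
  # some NFS servers handle poorly; with --iodepth=1, libaio submits one I/O
  # at a time, so each worker behaves like a blocking read() loop.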
  result=$(fio --name=bench --group_reporting --time_based=1 \
      --runtime="$RUNTIME" --ramp_time="$RAMP" \
      --size="$TOTAL_SIZE" --filesize="$FILESIZE" \
      --nrfiles="$NRFILES" --file_service_type=random \
      --numjobs="$JOBS" \
      --ioengine=libaio \
      --iodepth="$IODEPTH" \
      --rw="$rw" --bs="$bs" \
      --directory="$DIR" \
      --thread \
      --fallocate=none \
      --allow_file_create=1 \
      "${extra_args[@]}" \
      --output-format=json 2>/dev/null) || {
    echo "✗ FAILED (check mount/permissions at $DIR)"
    return
  }
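  # fio reports read and write stats in separate blocks; keep whichever side
  # saw traffic and extract bandwidth (bytes/s), IOPS, and p99 completion
  # latency (ns) from the JSON.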
  stats=$(echo "$result" | python3 -c "
import sys, json
try:
    j = json.load(sys.stdin)['jobs'][0]
    res = j['read'] if j['read']['bw_bytes'] > 0 else j['write']
    bw = res['bw_bytes']
    iops = res['iops']
    p99 = res['clat_ns']['percentile'].get('99.000000', 0)
    print(f'{bw} {iops} {p99}')
except Exception:
    print('0 0 0')
")
  read -r bw iops p99 <<< "$stats"

  if [ "$bw" = "0" ] && [ "$iops" = "0" ]; then
    echo "⚠️  No data (test too short or stalled mount)"
    return
  fi

  bw_mb=$(python3 -c "print(f'{$bw / 1e6:.0f}')")
  iops_k=$(python3 -c "print(f'{$iops / 1e3:.1f}')")
  p99_ms=$(python3 -c "print(f'{$p99 / 1e6:.1f}')")

  printf "✓ %6s MB/s | %7s K IOPS | p99: %s ms\n" "$bw_mb" "$iops_k" "$p99_ms"
}
# =============================================================================
# Small-File Write (Direct I/O)
# =============================================================================
echo "── Small-File Write (${TOTAL_FILES} × ${FILESIZE}, direct I/O) ───────────────"
run_fio "Rand Write 1M (iodepth=${IODEPTH})" randwrite 1m --direct=1
echo ""

# =============================================================================
# Small-File Read (Direct I/O)
# =============================================================================
echo "── Small-File Read (${TOTAL_FILES} × ${FILESIZE}, direct I/O) ────────────────"
run_fio "Rand Read 1M (iodepth=${IODEPTH})" randread 1m --direct=1
echo ""

# =============================================================================
echo "========================================================================"
echo " ✓ Benchmark Complete"
echo ""
echo " INTERPRETATION:"
echo ""
echo "   Read p99 latency is the primary signal."
echo ""
echo "   p99 < 5 ms  → Excellent (safe for multi-GPU training)"
echo "   5–10 ms     → Acceptable but watch scaling"
echo "   10–20 ms    → Intermittent GPU stalls likely"
echo "   >20 ms      → Training throughput collapse at scale"
echo ""
echo "   If results are poor, consider:"
echo "     - Increasing NFS rsize/wsize (e.g. 1 MB)"
echo "     - Multiple client IPs / nconnect"
echo "     - Packing files into tar/WebDataset shards (sketches below)"
echo "========================================================================"