#!/bin/bash
# =============================================================================
# NFS STORAGE BENCHMARK – Audio File I/O Profile (PyTorch-Realistic)
# =============================================================================
#
# PURPOSE:
#   Benchmarks an NFS (or similar networked) filesystem for ML training pipelines
#   that consume large numbers of small WAV/FLAC audio files using PyTorch-style
#   dataloaders.
#
# MODELLED WORKLOAD:
#   - Many small files (~3 MB each)
#   - Random file access (shuffle every epoch)
#   - Synchronous reads (iodepth=1)
#   - Parallelism from worker count, not async I/O
#
#   This closely matches PyTorch Dataset.__getitem__ behavior and exposes
#   metadata latency (open/stat/close), which is typically the dominant NFS
#   bottleneck for small-file workloads.
#
# MULTI-NODE USAGE:
#   Run simultaneously on all nodes to measure aggregate pressure:
#     pdsh -w node[01-08] '/path/to/storage_bench.sh /mnt/shared/bench_$(hostname)'
#
# DEPENDENCIES: fio, python3
# =============================================================================
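# SINGLE-NODE EXAMPLE (paths are placeholders; point them at your own mount):
#   ./storage_bench.sh /mnt/shared/bench_$(hostname)
#   Sanity-check first that the target really sits on the networked filesystem:
#     findmnt -T /mnt/shared -o TARGET,SOURCE,FSTYPE,OPTIONS
# =============================================================================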
set -euo pipefail
# ── Configuration ────────────────────────────────────────────────────────────
DIR="${1:-/data_vast/bench_$(hostname)}"
JOBS=32          # Parallel workers (match PyTorch num_workers)
RUNTIME=180s     # Duration per test
RAMP=5s          # Warm-up period
NRFILES=64       # Files per worker
FILESIZE=3m      # Avg WAV/FLAC size
TOTAL_SIZE=192m  # NRFILES × FILESIZE
# ── Preflight Checks ─────────────────────────────────────────────────────────
command -v fio >/dev/null 2>&1 || {
  echo "ERROR: fio not found. Install with: apt install fio (or yum install fio)"
  exit 1
}
command -v python3 >/dev/null 2>&1 || {
  echo "ERROR: python3 not found."
  exit 1
}
mkdir -p "$DIR"
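# fio names its data files "<jobname>.<jobnum>.<filenum>" by default, so "bench.*"
# matches everything this benchmark creates; the trap removes them on any exit.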
trap 'echo ""; echo "Cleaning up test files..."; rm -f "$DIR"/bench.* 2>/dev/null; echo "Done."' EXIT
# ── Header ───────────────────────────────────────────────────────────────────
TOTAL_FILES=$((JOBS * NRFILES))
TOTAL_GB=$(python3 -c "print(f'{$TOTAL_FILES * 3 / 1024:.1f}')")  # 3 = FILESIZE in MB
echo "========================================================================"
echo " NFS STORAGE BENCHMARK – Audio File I/O Profile"
echo " Node:      $(hostname)"
echo " Target:    $DIR"
echo " Config:    $JOBS workers × $NRFILES files × $FILESIZE each"
echo " Footprint: $TOTAL_FILES files, ~${TOTAL_GB} GB"
echo " Runtime:   $RUNTIME (+ ${RAMP} ramp)"
echo " Model:     PyTorch-style synchronous I/O (iodepth=1)"
echo "========================================================================"
echo ""
# ── FIO Runner ───────────────────────────────────────────────────────────────
run_fio() {
  local label="$1" rw="$2" bs="$3"
  shift 3
  local extra_args=("$@")
  printf " %-30s ... " "$label"
  result=$(fio --name=bench --group_reporting --time_based=1 \
      --runtime="$RUNTIME" --ramp_time="$RAMP" \
      --size="$TOTAL_SIZE" --filesize="$FILESIZE" \
      --nrfiles="$NRFILES" --file_service_type=random \
      --numjobs="$JOBS" \
      --ioengine=libaio \
      --iodepth=1 \
      --rw="$rw" --bs="$bs" \
      --directory="$DIR" \
      --thread \
      --fallocate=none \
      --allow_file_create=1 \
      "${extra_args[@]}" \
      --output-format=json 2>/dev/null) || {
    echo "✗ FAILED (check mount/permissions at $DIR)"
    return
  }
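  # Parse fio's JSON: use whichever direction actually moved data (read or write)
  # and pull bandwidth (bytes/s), IOPS, and p99 completion latency (nanoseconds).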
  stats=$(echo "$result" | python3 -c "
import sys, json
try:
    j = json.load(sys.stdin)['jobs'][0]
    res = j['read'] if j['read']['bw_bytes'] > 0 else j['write']
    bw = res['bw_bytes']
    iops = res['iops']
    p99 = res['clat_ns']['percentile'].get('99.000000', 0)
    print(f'{bw} {iops} {p99}')
except Exception:
    print('0 0 0')
")
  read -r bw iops p99 <<< "$stats"
  if [ "$bw" = "0" ] && [ "$iops" = "0" ]; then
    echo "⚠ No data (test too short or stalled mount)"
    return
  fi
  bw_mb=$(python3 -c "print(f'{$bw / 1e6:.0f}')")
  iops_k=$(python3 -c "print(f'{$iops / 1e3:.1f}')")
  p99_ms=$(python3 -c "print(f'{$p99 / 1e6:.1f}')")
  printf "✓ %6s MB/s | %7s K IOPS | p99: %s ms\n" "$bw_mb" "$iops_k" "$p99_ms"
}
# =============================================================================
# Small-File Write (Direct I/O)
# =============================================================================
# Models preprocessing pipelines writing many small audio files to shared storage.
echo "── Small-File Write (${TOTAL_FILES} × ${FILESIZE}, direct I/O) ──────────────"
run_fio "Rand Write 1M (iodepth=1)" randwrite 1m --direct=1
echo ""
# =============================================================================
# Small-File Read (Direct I/O)
# =============================================================================
# Models shuffled PyTorch dataloader reads with cold page cache.
echo "── Small-File Read (${TOTAL_FILES} × ${FILESIZE}, direct I/O) ───────────────"
run_fio "Rand Read 1M (iodepth=1)" randread 1m --direct=1
echo ""
# =============================================================================
echo "========================================================================"
echo " ✓ Benchmark Complete"
echo ""
echo " INTERPRETATION:"
echo ""
echo "   Read p99 latency is the primary signal."
echo ""
echo "   p99 < 5 ms  →  Excellent (safe for multi-GPU training)"
echo "   5–10 ms     →  Acceptable but watch scaling"
echo "   10–20 ms    →  Intermittent GPU stalls likely"
echo "   >20 ms      →  Training throughput collapse at scale"
echo ""
echo " If results are poor, consider (commented examples at the end of this script):"
echo "   - Increasing NFS rsize/wsize (e.g. 1 MB)"
echo "   - Multiple client IPs / nconnect"
echo "   - Packing files into tar/WebDataset shards"
echo "========================================================================"