#!/bin/bash
# =============================================================================
# 🗄️ NFS STORAGE BENCHMARK – Audio File I/O Profile
# =============================================================================
#
# PURPOSE:
#   Benchmarks an NFS (or similar networked) filesystem for a distributed ML
#   training pipeline that consumes large volumes of WAV/FLAC audio files.
#
# WHAT THIS TESTS:
#   Creates thousands of 3 MB files (≈ average WAV size) and measures write
#   and read performance while hopping between them randomly, which is
#   exactly what a shuffled dataloader does every epoch.
#
#   This exercises the NFS metadata path (open/stat/close), which is
#   typically the real bottleneck, not raw sequential bandwidth.
#
# MULTI-NODE USAGE:
#   Run simultaneously on all nodes to measure aggregate throughput:
#     pdsh -w node[01-08] '/path/to/storage_bench.sh /mnt/shared/bench_$(hostname)'
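#
#   Without pdsh, a plain ssh loop works as well (a sketch; the node names
#   are placeholders for your cluster):
#     for n in node0{1..8}; do
#       ssh "$n" '/path/to/storage_bench.sh /mnt/shared/bench_$(hostname)' &
#     done; wait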
#
# DEPENDENCIES: fio, python3
# =============================================================================

set -euo pipefail

# ── Configuration ────────────────────────────────────────────────────────────
DIR="${1:-/data_vast/bench_$(hostname)}"
JOBS=32          # Parallel threads (match your dataloader num_workers)
RUNTIME=180s     # Duration per test (3 min gives stable p99 numbers)
RAMP=5s          # Warm-up before measurement begins
NRFILES=64       # Number of files per thread
FILESIZE=3m      # Size per file (≈ average WAV: ~3 MB)
TOTAL_SIZE=192m  # I/O cap per thread (NRFILES × FILESIZE)
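# To model a different audio profile, edit the three size knobs together and
# keep TOTAL_SIZE = NRFILES × FILESIZE. For example (a sketch; the 30 MB
# figure is an illustrative assumption for long FLAC files, not measured):
#   NRFILES=8  FILESIZE=30m  TOTAL_SIZE=240m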
# ── Preflight Checks ─────────────────────────────────────────────────────────
command -v fio >/dev/null 2>&1 || {
  echo "ERROR: fio not found. Install with: apt install fio (or yum install fio)"
  exit 1
}
command -v python3 >/dev/null 2>&1 || {
  echo "ERROR: python3 not found."
  exit 1
}

mkdir -p "$DIR"

# Remove benchmark files on any exit (success, failure, or Ctrl-C).
trap 'echo ""; echo "Cleaning up test files..."; rm -f "$DIR"/bench.* 2>/dev/null; echo "Done."' EXIT
# ── Header ───────────────────────────────────────────────────────────────────
TOTAL_FILES=$((JOBS * NRFILES))
# 3 = file size in MB; keep in sync with FILESIZE above.
TOTAL_GB=$(python3 -c "print(f'{$TOTAL_FILES * 3 / 1024:.1f}')")
echo "========================================================================"
echo " 🗄️ NFS STORAGE BENCHMARK – Audio File I/O Profile"
echo " 🖥️ Node: $(hostname)"
echo " 📁 Target: $DIR"
echo " ⚙️ Config: $JOBS threads × $NRFILES files × $FILESIZE each"
echo " 📊 Footprint: $TOTAL_FILES files, ~${TOTAL_GB} GB total"
echo " ⏱️ Runtime: $RUNTIME per test (+ ${RAMP} ramp)"
echo "========================================================================"
echo ""
# ── FIO Runner ───────────────────────────────────────────────────────────────
# Runs a single fio test against many small files and parses its JSON output.
#
# Args:
#   $1  label – Display name for the test
#   $2  rw    – I/O pattern (randread, randwrite)
#   $3  bs    – Block size (1m = read a whole 3 MB file in ~3 chunks)
#   $4+ extra – Additional fio flags (optional)
run_fio() {
  local label="$1" rw="$2" bs="$3"
  shift 3
  local extra_args=("$@")
  local result stats bw iops p99 bw_mb iops_k p99_ms
  printf " %-28s ... " "$label"
  result=$(fio --name=bench --group_reporting --time_based=1 \
    --runtime="$RUNTIME" --ramp_time="$RAMP" \
    --size="$TOTAL_SIZE" --filesize="$FILESIZE" \
    --nrfiles="$NRFILES" --file_service_type=random \
    --numjobs="$JOBS" --ioengine=libaio \
    --iodepth=32 --rw="$rw" --bs="$bs" --directory="$DIR" \
    --thread --fallocate=none --allow_file_create=1 \
    "${extra_args[@]}" \
    --output-format=json 2>/dev/null) || {
    echo "✗ FAILED (check mount/permissions at $DIR)"
    return
  }
  # Pull bandwidth, IOPS, and p99 completion latency out of fio's JSON;
  # report whichever direction (read or write) actually moved bytes.
  stats=$(echo "$result" | python3 -c "
import sys, json
try:
    j = json.load(sys.stdin)['jobs'][0]
    res = j['read'] if j['read']['bw_bytes'] > 0 else j['write']
    bw = res['bw_bytes']
    iops = res['iops']
    p99 = res['clat_ns']['percentile'].get('99.000000', 0)
    print(f'{bw} {iops} {p99}')
except Exception:
    print('0 0 0')
")
  read -r bw iops p99 <<< "$stats"
  if [ "$bw" = "0" ] && [ "$iops" = "0" ]; then
    echo "⚠️ No data (test may have been too short or the mount is stalled)"
    return
  fi
  bw_mb=$(python3 -c "print(f'{$bw / 1e6:.0f}')")
  iops_k=$(python3 -c "print(f'{$iops / 1e3:.1f}')")
  p99_ms=$(python3 -c "print(f'{$p99 / 1e6:.1f}')")
  printf "✓ %6s MB/s | %8s K IOPS | p99: %s ms\n" "$bw_mb" "$iops_k" "$p99_ms"
}
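
# The helper can express other access patterns too. A sequential-read
# baseline (a sketch, not run by default) would isolate the cost of the
# random file hopping versus pure streaming bandwidth:
#   run_fio "Seq Read 1M (baseline)" read 1m --direct=1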
# =============================================================================
# Small-File Write (Direct I/O)
# =============================================================================
# Writes 2,048 × 3 MB files with a random access pattern and O_DIRECT.
# Simulates a preprocessing pipeline writing out WAV/FLAC files to shared
# storage. Write runs first so the files exist for the read tests.
echo "── Small-File Write (${TOTAL_FILES} × ${FILESIZE}, direct I/O) ─────────────────────"
run_fio "Rand Write 1M (64×3MB)" randwrite 1m --direct=1
echo ""
# =============================================================================
# Small-File Read (Direct I/O)
# =============================================================================
# Randomly hops across the 3 MB files and reads them in 1 MB chunks, which is
# the closest fio gets to a shuffled dataloader reading WAV files. O_DIRECT
# bypasses the page cache to measure actual NFS/storage throughput.
echo "── Small-File Read (${TOTAL_FILES} × ${FILESIZE}, direct I/O) ──────────────────────"
run_fio "Rand Read 1M (64×3MB)" randread 1m --direct=1
echo ""
# =============================================================================
echo "========================================================================"
echo " ✅ Benchmark Complete"
echo ""
echo " INTERPRETATION:"
echo ""
echo " Write: Preprocessing/data-staging throughput to shared storage."
echo "        Target: enough not to bottleneck your ingest pipeline."
echo ""
echo " Read (direct): True NFS throughput for a shuffled dataloader."
echo "        If < 500 MB/s, GPUs will likely be data-starved."
echo "        Tune NFS rsize/wsize (try rsize=1048576)."
echo ""
echo " If all reads are slow, consider packing files into tar/WebDataset"
echo " shards to reduce NFS metadata overhead (open/stat/close per file)."
echo "========================================================================"