Created
February 12, 2026 14:21
-
-
Save sarsanaee/8791a4d32351b31228764ce7c2c4bb9c to your computer and use it in GitHub Desktop.
dax_bench.c
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| // Simple DAX microbenchmark (throughput + per-chunk latency) | |
| // - Works on /dev/daxX.Y or an fs-DAX mmap'ed file | |
| // - No external libraries | |
| // - Ensures persistence on x86 via CLWB/CLFLUSHOPT + SFENCE | |
| // | |
| // Build: gcc -O3 -march=native -Wall -Wextra -o dax_bench dax_bench.c | |
| // Example: sudo ./dax_bench -d /dev/dax0.0 -m 256M -b 4K -t write | |
| // sudo ./dax_bench -d /dev/dax0.0 -m 256M -b 4K -t read | |
| // Notes: | |
| // * On x86, we flush caches to ensure data reaches persistence domain. | |
| // * On fs-DAX files, msync(MS_SYNC) can additionally be issued after each write pass (pass --msync) as an extra belt-and-braces step. | |
| // * On non-x86, flush becomes a no-op unless compiled with appropriate intrinsics. | |
| // (ARM64 support would need dc cvap/civac + dsb; add as needed.) | |
| // * Requires sufficient permissions to mmap the DAX device/file. | |
| #define _GNU_SOURCE | |
| #include <errno.h> | |
| #include <fcntl.h> | |
| #include <inttypes.h> | |
| #include <stdbool.h> | |
| #include <stddef.h> | |
| #include <stdint.h> | |
| #include <stdio.h> | |
| #include <stdlib.h> | |
| #include <string.h> | |
| #include <sys/mman.h> | |
| #include <sys/stat.h> | |
| #include <sys/types.h> | |
| #include <time.h> | |
| #include <unistd.h> | |
| #ifndef MAP_SYNC | |
| #define MAP_SYNC 0x80000 | |
| #endif | |
| #ifndef MAP_SHARED_VALIDATE | |
| #define MAP_SHARED_VALIDATE 0x03 | |
| #endif | |
| // --- Arch helpers ----------------------------------------------------------- | |
| // (terminal paste artifact: root@localhost:~# cat dax_bench.c) | |
| // Simple DAX microbenchmark (throughput + per-chunk latency) | |
| // - Works on /dev/daxX.Y or an fs-DAX mmap'ed file | |
| // - No external libraries | |
| // - Ensures persistence on x86 via CLWB/CLFLUSHOPT + SFENCE | |
| // | |
| // Build: gcc -O3 -march=native -Wall -Wextra -o dax_bench dax_bench.c | |
| // Example: sudo ./dax_bench -d /dev/dax0.0 -m 256M -b 4K -t write | |
| // sudo ./dax_bench -d /dev/dax0.0 -m 256M -b 4K -t read | |
| // Notes: | |
| // * On x86, we flush caches to ensure data reaches persistence domain. | |
| // * On fs-DAX files, msync(MS_SYNC) can additionally be issued after each write pass (pass --msync) as an extra belt-and-braces step. | |
| // * On non-x86, flush becomes a no-op unless compiled with appropriate intrinsics. | |
| // (ARM64 support would need dc cvap/civac + dsb; add as needed.) | |
| // * Requires sufficient permissions to mmap the DAX device/file. | |
| #define _GNU_SOURCE | |
| #include <errno.h> | |
| #include <fcntl.h> | |
| #include <inttypes.h> | |
| #include <stdbool.h> | |
| #include <stddef.h> | |
| #include <stdint.h> | |
| #include <stdio.h> | |
| #include <stdlib.h> | |
| #include <string.h> | |
| #include <sys/mman.h> | |
| #include <sys/stat.h> | |
| #include <sys/types.h> | |
| #include <time.h> | |
| #include <unistd.h> | |
| #ifndef MAP_SYNC | |
| #define MAP_SYNC 0x80000 | |
| #endif | |
| #ifndef MAP_SHARED_VALIDATE | |
| #define MAP_SHARED_VALIDATE 0x03 | |
| #endif | |
| // --- Arch helpers ----------------------------------------------------------- | |
// Issue the fence that follows a batch of cache-line flushes.
// x86: an SFENCE instruction, with a "memory" clobber so the compiler does not
// move memory accesses across it.
// Other architectures: __sync_synchronize() stands in as a placeholder
// (ARM64 would want an explicit dsb here).
static inline void sfence_persist(void)
{
#if !defined(__x86_64__) && !defined(__i386__)
    __sync_synchronize();
#else
    __asm__ __volatile__("sfence" : : : "memory");
#endif
}
// Write back the cache line containing *p using the x86 CLWB instruction.
//
// p: any address inside the line to write back.
//
// The instruction is emitted by mnemonic with a memory operand, so the
// assembler generates the ModRM byte for whichever base register the compiler
// picks. A hand-encoded ".byte" sequence hard-wires the base register and
// silently targets the wrong address if the operand constraint names a
// different one. The "+m" constraint also tells the compiler that the bytes at
// *p are read and written by the asm, so earlier stores to the line cannot be
// sunk below the write-back.
//
// NOTE: CLWB raises #UD on CPUs that lack it (CPUID.(EAX=7,ECX=0):EBX bit 24).
// No runtime detection is done here; the caller must only use this on capable
// hardware. On non-x86 targets this is a no-op.
static inline void clwb(void *p) {
#if defined(__x86_64__) || defined(__i386__)
    __asm__ __volatile__("clwb %0" : "+m"(*(volatile unsigned char *)p));
#else
    (void)p;
#endif
}
// Flush (write back and invalidate) the cache line containing *p using the
// x86 CLFLUSHOPT instruction.
//
// p: any address inside the line to flush.
//
// CLFLUSHOPT is encoded 66 0F AE /7; the byte sequence 66 0F AE 30 is CLWB
// with an [rax] operand, not CLFLUSHOPT. Emitting the mnemonic with a memory
// operand lets the assembler produce the correct opcode extension and base
// register. The "+m" constraint ties the asm to the bytes at *p so the
// compiler cannot move stores to that line past the flush.
//
// NOTE: CLFLUSHOPT raises #UD on CPUs that lack it
// (CPUID.(EAX=7,ECX=0):EBX bit 23). No runtime detection is done here.
// On non-x86 targets this is a no-op.
static inline void clflushopt(void *p) {
#if defined(__x86_64__) || defined(__i386__)
    __asm__ __volatile__("clflushopt %0" : "+m"(*(volatile unsigned char *)p));
#else
    (void)p;
#endif
}
// Flush the cache line containing *p with the baseline x86 CLFLUSH
// instruction, for CPUs without CLFLUSHOPT/CLWB.
//
// p: any address inside the line to flush.
//
// The operand is passed as "+m" (memory read and written by the asm) rather
// than as a bare register: with only a register input and no "memory"
// clobber, the compiler is free to reorder stores to the line across the
// flush, which would defeat its purpose.
// On non-x86 targets this is a no-op.
static inline void clflush_fallback(void *p) {
#if defined(__x86_64__) || defined(__i386__)
    __asm__ __volatile__("clflush %0" : "+m"(*(volatile unsigned char *)p));
#else
    (void)p;
#endif
}
// Flush the single cache line that contains p.
// On x86 this always goes through clflushopt(): CLWB would be preferable but
// there is no CPUID-based feature detection in this file, so one instruction
// is chosen unconditionally. On every other architecture nothing is flushed.
static inline void flush_line(void *p)
{
#if !(defined(__x86_64__) || defined(__i386__))
    (void)p;
#else
    clflushopt(p);
#endif
}
// Cache-line size, in bytes, assumed by the flush loop.
// The value is a fixed 64 on every architecture; it is not queried from the
// hardware (sysfs/cpuid could be used for that if ever needed).
static inline size_t cacheline_size(void)
{
    enum { ASSUMED_LINE_BYTES = 64 };
    return (size_t)ASSUMED_LINE_BYTES;
}
// Flush every cache line overlapping [addr, addr + len) and then fence.
//
// addr: start of the range (need not be line-aligned).
// len:  length of the range in bytes.
//
// The start is rounded down and the end rounded up to a cache-line boundary,
// so partially covered lines at either edge are flushed too. The masking
// relies on cacheline_size() being a power of two.
static void persist_range(void *addr, size_t len)
{
    const size_t step = cacheline_size();
    const uintptr_t origin = (uintptr_t)addr;
    const uintptr_t first = origin & ~(step - 1);
    const uintptr_t limit = (origin + len + step - 1) & ~(step - 1);

    uintptr_t cursor = first;
    while (cursor < limit) {
        flush_line((void *)cursor);
        cursor += step;
    }

    // One fence for the whole batch of flushes.
    sfence_persist();
}
| // --- Timing helpers --------------------------------------------------------- | |
// Current CLOCK_MONOTONIC_RAW time as fractional seconds.
// The return value of clock_gettime() is not checked.
static inline double now_sec(void)
{
    struct timespec stamp;

    clock_gettime(CLOCK_MONOTONIC_RAW, &stamp);
    return (double)stamp.tv_sec + (double)stamp.tv_nsec * 1e-9;
}
| // --- Parsing helpers -------------------------------------------------------- | |
// Parse a human-readable byte count such as "4096", "4K", "1.5mb" or "2G".
//
// s: NUL-terminated text; a decimal number (fractions allowed) optionally
//    followed by a case-insensitive suffix: b, k/kb, m/mb, g/gb
//    (powers of 1024).
//
// Returns the size in bytes, truncated toward zero.
// On any malformed input the function prints a message to stderr and calls
// exit(1); it never returns an error value.
//
// Negative, NaN and out-of-range values are rejected explicitly: converting
// such a double to size_t is undefined behaviour in C, so they must not reach
// the final cast.
static size_t parse_size(const char *s) {
    char *end = NULL;
    double v = strtod(s, &end);
    if (end == s) { fprintf(stderr, "Invalid size: %s\n", s); exit(1); }

    // !(v >= 0) is true for negative numbers and for NaN.
    if (!(v >= 0.0)) { fprintf(stderr, "Invalid size: %s\n", s); exit(1); }

    size_t mul = 1;
    if (*end) {
        if (strcasecmp(end, "b") == 0) mul = 1;
        else if (strcasecmp(end, "k") == 0 || strcasecmp(end, "kb") == 0) mul = 1024ULL;
        else if (strcasecmp(end, "m") == 0 || strcasecmp(end, "mb") == 0) mul = 1024ULL*1024ULL;
        else if (strcasecmp(end, "g") == 0 || strcasecmp(end, "gb") == 0) mul = 1024ULL*1024ULL*1024ULL;
        else { fprintf(stderr, "Unknown size suffix: %s\n", end); exit(1); }
    }

    double scaled = v * (double)mul;
    // (double)SIZE_MAX may round up to 2^N, so ">=" keeps the cast in range;
    // this also catches +infinity.
    if (scaled >= (double)SIZE_MAX) { fprintf(stderr, "Size out of range: %s\n", s); exit(1); }
    return (size_t)scaled;
}
// Allocate `size` bytes aligned to `align` via posix_memalign().
// Returns the new buffer (release it with free()), or NULL when
// posix_memalign() reports any error.
static void *xaligned_alloc(size_t align, size_t size)
{
    void *mem = NULL;
    int rc = posix_memalign(&mem, align, size);

    return (rc == 0) ? mem : NULL;
}
| // --- Benchmark -------------------------------------------------------------- | |
// Direction of the benchmarked copy.
typedef enum {
    OP_WRITE,   // scratch buffer -> mapping, followed by a flush of the block
    OP_READ     // mapping -> scratch buffer
} op_t;

// Benchmark configuration, filled in by main() from the command line.
typedef struct {
    const char *path;   // -d: DAX device or file that gets opened and mmap'ed
    size_t map_len;     // -m: number of bytes to map
    size_t block;       // -b: bytes copied per timed operation
    op_t op;            // -t: write or read
    bool randomize;     // --random: visit blocks in shuffled order
    bool msync_after;   // --msync: msync(MS_SYNC) after each write pass (useful for fs-DAX)
    size_t iters;       // --iters: number of passes over the mapping
} cfg_t;
// Print the command-line synopsis and the default option values to stderr.
// prog: program name to show in the synopsis (normally argv[0]).
static void usage(const char *prog)
{
    fprintf(stderr,
            "Usage: %s -d <dax-path> [-m <map size>] [-b <block>] [-t write|read] [--iters N] [--msync] [--random]\n",
            prog);
    fputs("Defaults: -m 1G -b 4K -t write --iters 1\n", stderr);
}
| int main(int argc, char **argv) { | |
| cfg_t C = { .path = NULL, .map_len = 1ULL<<30, .block = 4096, .op = OP_WRITE, .randomize = false, .msync_after = false, .iters = 1 }; | |
| for (int i = 1; i < argc; ++i) { | |
| if (!strcmp(argv[i], "-d") && i+1 < argc) { C.path = argv[++i]; } | |
| else if (!strcmp(argv[i], "-m") && i+1 < argc) { C.map_len = parse_size(argv[++i]); } | |
| else if (!strcmp(argv[i], "-b") && i+1 < argc) { C.block = parse_size(argv[++i]); } | |
| else if (!strcmp(argv[i], "-t") && i+1 < argc) { | |
| const char *t = argv[++i]; | |
| if (!strcmp(t, "write")) C.op = OP_WRITE; else if (!strcmp(t, "read")) C.op = OP_READ; else { usage(argv[0]); return 1; } | |
| } else if (!strcmp(argv[i], "--random")) { C.randomize = true; } | |
| else if (!strcmp(argv[i], "--msync")) { C.msync_after = true; } | |
| else if (!strcmp(argv[i], "--iters") && i+1 < argc) { C.iters = strtoull(argv[++i], NULL, 10); } | |
| else { usage(argv[0]); return 1; } | |
| } | |
| if (!C.path) { usage(argv[0]); return 1; } | |
| if (C.block == 0 || (C.block & 0x3FF) == 0x3FF) { /* silly check to avoid zero */ } | |
| int fd = open(C.path, (C.op == OP_WRITE) ? O_RDWR : O_RDONLY); | |
| if (fd < 0) { perror("open"); return 1; } | |
| struct stat st; | |
| if (fstat(fd, &st) < 0) { perror("fstat"); return 1; } | |
| // For /dev/dax, st_size usually reflects the device capacity. Clamp map_len. | |
| if ((size_t)st.st_size && C.map_len > (size_t)st.st_size) C.map_len = (size_t)st.st_size; | |
| int prot = PROT_READ | ((C.op == OP_WRITE) ? PROT_WRITE : 0); | |
| // Try MAP_SYNC when possible (fs-DAX). For /dev/dax, MAP_SHARED works. | |
| void *base = mmap(NULL, C.map_len, prot, MAP_SHARED, fd, 0); | |
| if (base == MAP_FAILED) { | |
| perror("mmap"); | |
| return 1; | |
| } | |
| // Source/dest scratch buffer for operations | |
| size_t align = 4096; | |
| void *scratch = xaligned_alloc(align, C.block); | |
| if (!scratch) { fprintf(stderr, "alloc failed\n"); return 1; } | |
| // Fill scratch with a deterministic pattern | |
| for (size_t i = 0; i < C.block; ++i) ((unsigned char*)scratch)[i] = (unsigned char)(i * 131 + 7); | |
| size_t nblocks = C.map_len / C.block; | |
| if (nblocks == 0) { fprintf(stderr, "map too small for block size\n"); return 1; } | |
| // Optional random index order | |
| size_t *order = malloc(nblocks * sizeof(size_t)); | |
| if (!order) { fprintf(stderr, "order alloc failed\n"); return 1; } | |
| for (size_t i = 0; i < nblocks; ++i) order[i] = i; | |
| if (C.randomize) { | |
| // Fisher-Yates | |
| for (size_t i = nblocks - 1; i > 0; --i) { | |
| size_t j = (size_t) (rand() % (int)(i + 1)); | |
| size_t tmp = order[i]; order[i] = order[j]; order[j] = tmp; | |
| } | |
| } | |
| // Warm-up: touch pages to avoid page faults skewing measurements | |
| volatile unsigned char sink = 0; | |
| for (size_t i = 0; i < nblocks; i += (nblocks/128 + 1)) { | |
| sink ^= ((volatile unsigned char*)base)[order[i] * C.block]; | |
| } | |
| double t0 = now_sec(); | |
| uint64_t ops = 0; | |
| // Optional per-block latency collection (percentiles) would require storing many samples; keep it simple: | |
| double worst_block = 0.0, best_block = 1e9; | |
| for (size_t it = 0; it < C.iters; ++it) { | |
| for (size_t k = 0; k < nblocks; ++k) { | |
| size_t idx = order[k]; | |
| unsigned char *ptr = (unsigned char *)base + idx * C.block; | |
| double bt0 = now_sec(); | |
| if (C.op == OP_WRITE) { | |
| memcpy(ptr, scratch, C.block); | |
| persist_range(ptr, C.block); | |
| } else { | |
| // Read path: copy out to scratch to force actual loads | |
| memcpy(scratch, ptr, C.block); | |
| sink ^= ((volatile unsigned char*)scratch)[0]; | |
| } | |
| double bt1 = now_sec(); | |
| double dt = bt1 - bt0; | |
| if (dt < best_block) best_block = dt; | |
| if (dt > worst_block) worst_block = dt; | |
| ops++; | |
| } | |
| if (C.msync_after && C.op == OP_WRITE) { | |
| // fs-DAX only; on /dev/dax this is a harmless no-op | |
| if (msync(base, C.map_len, MS_SYNC) != 0) perror("msync"); | |
| } | |
| } | |
| double t1 = now_sec(); | |
| double sec = t1 - t0; | |
| double bytes = (double)C.map_len * (double)C.iters; | |
| printf("op=%s map=%zu bytes block=%zu bytes iters=%zu\n", (C.op==OP_WRITE?"write":"read"), C.map_len, C.block, C.iters); | |
| printf("time=%.6f s, throughput=%.2f GB/s\n", sec, bytes / sec / 1e9); | |
| printf("per-block latency: best=%.3f us, worst=%.3f us (block=%zu)\n", best_block*1e6, worst_block*1e6, C.block); | |
| // Prevent optimizer from dropping reads | |
| if (sink == 0x42) fprintf(stderr, "sink=%u\n", (unsigned)sink); | |
| munmap(base, C.map_len); | |
| free(scratch); | |
| free(order); | |
| close(fd); | |
| return 0; | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment