Skip to content

Instantly share code, notes, and snippets.

@sarsanaee
Created February 12, 2026 14:21
Show Gist options
  • Select an option

  • Save sarsanaee/8791a4d32351b31228764ce7c2c4bb9c to your computer and use it in GitHub Desktop.

Select an option

Save sarsanaee/8791a4d32351b31228764ce7c2c4bb9c to your computer and use it in GitHub Desktop.
dax_bench.c
// Simple DAX microbenchmark (throughput + per-chunk latency)
// - Works on /dev/daxX.Y or an fs-DAX mmap'ed file
// - No external libraries
// - Ensures persistence on x86 via CLWB/CLFLUSHOPT + SFENCE
//
// Build: gcc -O3 -march=native -Wall -Wextra -o dax_bench dax-write-read-microbench.c
// Example: sudo ./dax_bench -d /dev/dax0.0 -m 256M -b 4K -t write
// sudo ./dax_bench -d /dev/dax0.0 -m 256M -b 4K -t read
// Notes:
// * On x86, we flush caches to ensure data reaches persistence domain.
// * On fs-DAX files, msync(MS_SYNC) is also issued as an extra belt-and-braces step.
// * On non-x86, flush becomes a no-op unless compiled with appropriate intrinsics.
// (ARM64 support would need dc cvap/civac + dsb; add as needed.)
// * Requires sufficient permissions to mmap the DAX device/file.
#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <inttypes.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <time.h>
#include <unistd.h>
#ifndef MAP_SYNC
#define MAP_SYNC 0x80000
#endif
#ifndef MAP_SHARED_VALIDATE
#define MAP_SHARED_VALIDATE 0x03
#endif
// --- Arch helpers -----------------------------------------------------------
// dax_bench.c — full listing follows (terminal capture of `cat dax_bench.c`)
// Simple DAX microbenchmark (throughput + per-chunk latency)
// - Works on /dev/daxX.Y or an fs-DAX mmap'ed file
// - No external libraries
// - Ensures persistence on x86 via CLWB/CLFLUSHOPT + SFENCE
//
// Build: gcc -O3 -march=native -Wall -Wextra -o dax_bench dax-write-read-microbench.c
// Example: sudo ./dax_bench -d /dev/dax0.0 -m 256M -b 4K -t write
// sudo ./dax_bench -d /dev/dax0.0 -m 256M -b 4K -t read
// Notes:
// * On x86, we flush caches to ensure data reaches persistence domain.
// * On fs-DAX files, msync(MS_SYNC) is also issued as an extra belt-and-braces step.
// * On non-x86, flush becomes a no-op unless compiled with appropriate intrinsics.
// (ARM64 support would need dc cvap/civac + dsb; add as needed.)
// * Requires sufficient permissions to mmap the DAX device/file.
#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <inttypes.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <time.h>
#include <unistd.h>
#ifndef MAP_SYNC
#define MAP_SYNC 0x80000
#endif
#ifndef MAP_SHARED_VALIDATE
#define MAP_SHARED_VALIDATE 0x03
#endif
// --- Arch helpers -----------------------------------------------------------
// Store fence: orders all earlier stores (and cache-line flushes) before any
// later stores, which is what commits flushed data to the persistence domain.
static inline void sfence_persist(void) {
#if defined(__x86_64__) || defined(__i386__)
  __asm__ __volatile__("sfence" ::: "memory");
#else
  // Best-effort full barrier on other architectures; ARM64 would want
  // `dsb ish` (plus dc cvap for persistence) instead.
  __sync_synchronize();
#endif
}
// Write back (without evicting) the cache line containing *p.
//
// CLWB is 66 0F AE /6.  The original `.byte 0x66,0x0f,0xae,0x30` hard-coded
// ModRM = [rax] while binding the pointer to RDI via the "D" constraint, so
// it flushed whatever address happened to be in RAX — not *p.  We instead
// spell CLWB as a 0x66-prefixed XSAVEOPT (same opcode /6), the PMDK trick
// that works on assemblers without CLWB support, and pass the address as a
// "+m" memory operand so the assembler emits the correct ModRM and the
// compiler knows the store to *p must complete before the write-back.
static inline void clwb(void *p) {
#if defined(__x86_64__) || defined(__i386__)
  __asm__ __volatile__(".byte 0x66; xsaveopt %0"
                       : "+m"(*(volatile char *)p));
#else
  (void)p;
#endif
}
// Flush (and evict) the cache line containing *p using CLFLUSHOPT.
//
// CLFLUSHOPT is 66 0F AE /7, i.e. a 0x66-prefixed CLFLUSH.  The original
// byte sequence 66 0F AE 30 actually encoded CLWB (/6), and its ModRM
// addressed [rax] while the pointer was passed in RDI — so it neither
// executed the intended instruction nor touched the intended line.  Letting
// the assembler build the ModRM from a "+m" operand fixes both, and the
// memory operand also stops the compiler from reordering the preceding
// store past the flush.  On CPUs without CLFLUSHOPT the 0x66 prefix is
// ignored and this decodes as plain CLFLUSH, which is still correct.
static inline void clflushopt(void *p) {
#if defined(__x86_64__) || defined(__i386__)
  __asm__ __volatile__(".byte 0x66; clflush %0"
                       : "+m"(*(volatile char *)p));
#else
  (void)p;
#endif
}
// Flush the cache line containing *p with plain CLFLUSH (SSE2-era, available
// everywhere CLFLUSHOPT is not).
//
// The original used a bare "r" input with no memory operand or clobber, so
// the compiler was free to sink the store being persisted *past* the flush.
// A "+m" operand tells it the pointed-to line is read/written here.
static inline void clflush_fallback(void *p) {
#if defined(__x86_64__) || defined(__i386__)
  __asm__ __volatile__("clflush %0" : "+m"(*(volatile char *)p));
#else
  (void)p;
#endif
}
// Flush one cache line toward the persistence domain.
// CLWB would be the ideal choice (it leaves the line cached), but emitting it
// unconditionally risks #UD on CPUs that lack it; CLFLUSHOPT degrades to a
// plain CLFLUSH on older parts, so without a CPUID dispatch it is the safe
// single choice.
static inline void flush_line(void *p) {
#if defined(__x86_64__) || defined(__i386__)
  clflushopt(p);
#else
  (void)p;
#endif
}
// Cache-line granularity used when walking a range of flushes.
// 64 bytes is universal on current x86 and the common default elsewhere;
// both branches of the original returned the same constant, so the #if is
// folded away.  Query CPUID/sysfs at runtime if exactness ever matters.
static inline size_t cacheline_size(void) {
  return 64;
}
// Flush every cache line overlapping [addr, addr+len), then fence once so
// all the write-backs are ordered before subsequent stores.
static void persist_range(void *addr, size_t len) {
  const size_t line = cacheline_size();
  uintptr_t cur = (uintptr_t)addr & ~(uintptr_t)(line - 1);
  const uintptr_t stop =
      ((uintptr_t)addr + len + line - 1) & ~(uintptr_t)(line - 1);
  while (cur < stop) {
    flush_line((void *)cur);
    cur += line;
  }
  sfence_persist();
}
// --- Timing helpers ---------------------------------------------------------
// Monotonic wall-clock time in seconds.  CLOCK_MONOTONIC_RAW (Linux) is
// immune to NTP slewing, which keeps short benchmark intervals honest.
static inline double now_sec(void) {
  struct timespec t;
  clock_gettime(CLOCK_MONOTONIC_RAW, &t);
  return (double)t.tv_sec + (double)t.tv_nsec / 1e9;
}
// --- Parsing helpers --------------------------------------------------------
// Parse a human-readable size such as "4096", "4K", "256M", or "1.5G".
// Accepted suffixes (case-insensitive): b, k/kb, m/mb, g/gb; no suffix means
// bytes.  Exits with a diagnostic on malformed input, an unknown suffix, a
// negative value, or a result that overflows size_t — the original silently
// wrapped "-1G" through the double->size_t cast into a huge positive size.
static size_t parse_size(const char *s) {
  char *end = NULL;
  double v = strtod(s, &end);
  if (end == s) { fprintf(stderr, "Invalid size: %s\n", s); exit(1); }
  if (v < 0.0) { fprintf(stderr, "Size must be non-negative: %s\n", s); exit(1); }
  size_t mul = 1;
  if (*end) {
    if (strcasecmp(end, "b") == 0) mul = 1;
    else if (strcasecmp(end, "k") == 0 || strcasecmp(end, "kb") == 0) mul = 1024ULL;
    else if (strcasecmp(end, "m") == 0 || strcasecmp(end, "mb") == 0) mul = 1024ULL * 1024ULL;
    else if (strcasecmp(end, "g") == 0 || strcasecmp(end, "gb") == 0) mul = 1024ULL * 1024ULL * 1024ULL;
    else { fprintf(stderr, "Unknown size suffix: %s\n", end); exit(1); }
  }
  double bytes = v * (double)mul;
  // Converting a double that exceeds SIZE_MAX to size_t is UB; reject it.
  if (bytes > (double)SIZE_MAX) { fprintf(stderr, "Size too large: %s\n", s); exit(1); }
  return (size_t)bytes;
}
// Allocate `size` bytes with `align`-byte alignment (align must be a power
// of two and a multiple of sizeof(void *)).  Returns NULL on failure; the
// result is released with free().
static void *xaligned_alloc(size_t align, size_t size) {
  void *mem;
  int rc = posix_memalign(&mem, align, size);
  return rc == 0 ? mem : NULL;
}
// --- Benchmark --------------------------------------------------------------
// Benchmark operation selector (-t write | -t read).
typedef enum { OP_WRITE, OP_READ } op_t;
// Run configuration, populated from the command line in main().
typedef struct {
const char *path; // DAX device (/dev/daxX.Y) or fs-DAX file to mmap (-d)
size_t map_len; // bytes to mmap (-m); clamped to st_size in main()
size_t block; // per-operation chunk size in bytes (-b)
op_t op; // OP_WRITE or OP_READ (-t)
bool randomize; // --random: Fisher-Yates shuffle of block order
bool msync_after; // useful for fs-DAX
size_t iters; // number of passes over the mapping
} cfg_t;
// Print command-line help to stderr.
static void usage(const char *prog) {
  fprintf(stderr,
          "Usage: %s -d <dax-path> [-m <map size>] [-b <block>]"
          " [-t write|read] [--iters N] [--msync] [--random]\n"
          "Defaults: -m 1G -b 4K -t write --iters 1\n",
          prog);
}
int main(int argc, char **argv) {
cfg_t C = { .path = NULL, .map_len = 1ULL<<30, .block = 4096, .op = OP_WRITE, .randomize = false, .msync_after = false, .iters = 1 };
for (int i = 1; i < argc; ++i) {
if (!strcmp(argv[i], "-d") && i+1 < argc) { C.path = argv[++i]; }
else if (!strcmp(argv[i], "-m") && i+1 < argc) { C.map_len = parse_size(argv[++i]); }
else if (!strcmp(argv[i], "-b") && i+1 < argc) { C.block = parse_size(argv[++i]); }
else if (!strcmp(argv[i], "-t") && i+1 < argc) {
const char *t = argv[++i];
if (!strcmp(t, "write")) C.op = OP_WRITE; else if (!strcmp(t, "read")) C.op = OP_READ; else { usage(argv[0]); return 1; }
} else if (!strcmp(argv[i], "--random")) { C.randomize = true; }
else if (!strcmp(argv[i], "--msync")) { C.msync_after = true; }
else if (!strcmp(argv[i], "--iters") && i+1 < argc) { C.iters = strtoull(argv[++i], NULL, 10); }
else { usage(argv[0]); return 1; }
}
if (!C.path) { usage(argv[0]); return 1; }
if (C.block == 0 || (C.block & 0x3FF) == 0x3FF) { /* silly check to avoid zero */ }
int fd = open(C.path, (C.op == OP_WRITE) ? O_RDWR : O_RDONLY);
if (fd < 0) { perror("open"); return 1; }
struct stat st;
if (fstat(fd, &st) < 0) { perror("fstat"); return 1; }
// For /dev/dax, st_size usually reflects the device capacity. Clamp map_len.
if ((size_t)st.st_size && C.map_len > (size_t)st.st_size) C.map_len = (size_t)st.st_size;
int prot = PROT_READ | ((C.op == OP_WRITE) ? PROT_WRITE : 0);
// Try MAP_SYNC when possible (fs-DAX). For /dev/dax, MAP_SHARED works.
void *base = mmap(NULL, C.map_len, prot, MAP_SHARED, fd, 0);
if (base == MAP_FAILED) {
perror("mmap");
return 1;
}
// Source/dest scratch buffer for operations
size_t align = 4096;
void *scratch = xaligned_alloc(align, C.block);
if (!scratch) { fprintf(stderr, "alloc failed\n"); return 1; }
// Fill scratch with a deterministic pattern
for (size_t i = 0; i < C.block; ++i) ((unsigned char*)scratch)[i] = (unsigned char)(i * 131 + 7);
size_t nblocks = C.map_len / C.block;
if (nblocks == 0) { fprintf(stderr, "map too small for block size\n"); return 1; }
// Optional random index order
size_t *order = malloc(nblocks * sizeof(size_t));
if (!order) { fprintf(stderr, "order alloc failed\n"); return 1; }
for (size_t i = 0; i < nblocks; ++i) order[i] = i;
if (C.randomize) {
// Fisher-Yates
for (size_t i = nblocks - 1; i > 0; --i) {
size_t j = (size_t) (rand() % (int)(i + 1));
size_t tmp = order[i]; order[i] = order[j]; order[j] = tmp;
}
}
// Warm-up: touch pages to avoid page faults skewing measurements
volatile unsigned char sink = 0;
for (size_t i = 0; i < nblocks; i += (nblocks/128 + 1)) {
sink ^= ((volatile unsigned char*)base)[order[i] * C.block];
}
double t0 = now_sec();
uint64_t ops = 0;
// Optional per-block latency collection (percentiles) would require storing many samples; keep it simple:
double worst_block = 0.0, best_block = 1e9;
for (size_t it = 0; it < C.iters; ++it) {
for (size_t k = 0; k < nblocks; ++k) {
size_t idx = order[k];
unsigned char *ptr = (unsigned char *)base + idx * C.block;
double bt0 = now_sec();
if (C.op == OP_WRITE) {
memcpy(ptr, scratch, C.block);
persist_range(ptr, C.block);
} else {
// Read path: copy out to scratch to force actual loads
memcpy(scratch, ptr, C.block);
sink ^= ((volatile unsigned char*)scratch)[0];
}
double bt1 = now_sec();
double dt = bt1 - bt0;
if (dt < best_block) best_block = dt;
if (dt > worst_block) worst_block = dt;
ops++;
}
if (C.msync_after && C.op == OP_WRITE) {
// fs-DAX only; on /dev/dax this is a harmless no-op
if (msync(base, C.map_len, MS_SYNC) != 0) perror("msync");
}
}
double t1 = now_sec();
double sec = t1 - t0;
double bytes = (double)C.map_len * (double)C.iters;
printf("op=%s map=%zu bytes block=%zu bytes iters=%zu\n", (C.op==OP_WRITE?"write":"read"), C.map_len, C.block, C.iters);
printf("time=%.6f s, throughput=%.2f GB/s\n", sec, bytes / sec / 1e9);
printf("per-block latency: best=%.3f us, worst=%.3f us (block=%zu)\n", best_block*1e6, worst_block*1e6, C.block);
// Prevent optimizer from dropping reads
if (sink == 0x42) fprintf(stderr, "sink=%u\n", (unsigned)sink);
munmap(base, C.map_len);
free(scratch);
free(order);
close(fd);
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment