Skip to content

Instantly share code, notes, and snippets.

@sarsanaee
Created February 12, 2026 14:21
Show Gist options
  • Select an option

  • Save sarsanaee/8791a4d32351b31228764ce7c2c4bb9c to your computer and use it in GitHub Desktop.

Select an option

Save sarsanaee/8791a4d32351b31228764ce7c2c4bb9c to your computer and use it in GitHub Desktop.
dax_bench.c
// Simple DAX microbenchmark (throughput + per-chunk latency)
// - Works on /dev/daxX.Y or an fs-DAX mmap'ed file
// - No external libraries
// - Ensures persistence on x86 via CLWB/CLFLUSHOPT + SFENCE
//
// Build: gcc -O3 -march=native -Wall -Wextra -o dax_bench dax-write-read-microbench.c
// Example: sudo ./dax_bench -d /dev/dax0.0 -m 256M -b 4K -t write
// sudo ./dax_bench -d /dev/dax0.0 -m 256M -b 4K -t read
// Notes:
// * On x86, we flush caches to ensure data reaches persistence domain.
// * On fs-DAX files, msync(MS_SYNC) is also issued as an extra belt-and-braces step.
// * On non-x86, flush becomes a no-op unless compiled with appropriate intrinsics.
// (ARM64 support would need dc cvap/civac + dsb; add as needed.)
// * Requires sufficient permissions to mmap the DAX device/file.
#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <inttypes.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <time.h>
#include <unistd.h>
#ifndef MAP_SYNC
#define MAP_SYNC 0x80000
#endif
#ifndef MAP_SHARED_VALIDATE
#define MAP_SHARED_VALIDATE 0x03
#endif
// --- Arch helpers -----------------------------------------------------------
// dax_bench.c — full listing follows (terminal capture of `cat dax_bench.c`)
// Simple DAX microbenchmark (throughput + per-chunk latency)
// - Works on /dev/daxX.Y or an fs-DAX mmap'ed file
// - No external libraries
// - Ensures persistence on x86 via CLWB/CLFLUSHOPT + SFENCE
//
// Build: gcc -O3 -march=native -Wall -Wextra -o dax_bench dax-write-read-microbench.c
// Example: sudo ./dax_bench -d /dev/dax0.0 -m 256M -b 4K -t write
// sudo ./dax_bench -d /dev/dax0.0 -m 256M -b 4K -t read
// Notes:
// * On x86, we flush caches to ensure data reaches persistence domain.
// * On fs-DAX files, msync(MS_SYNC) is also issued as an extra belt-and-braces step.
// * On non-x86, flush becomes a no-op unless compiled with appropriate intrinsics.
// (ARM64 support would need dc cvap/civac + dsb; add as needed.)
// * Requires sufficient permissions to mmap the DAX device/file.
#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <inttypes.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <time.h>
#include <unistd.h>
#ifndef MAP_SYNC
#define MAP_SYNC 0x80000
#endif
#ifndef MAP_SHARED_VALIDATE
#define MAP_SHARED_VALIDATE 0x03
#endif
// --- Arch helpers -----------------------------------------------------------
// Store fence: orders all earlier stores (and cache-line flushes) before any
// later stores, which is what commits flushed data to the persistence domain.
static inline void sfence_persist(void) {
#if defined(__x86_64__) || defined(__i386__)
  __asm__ __volatile__("sfence" ::: "memory");
#else
  // Best-effort full barrier on other architectures; ARM64 would want
  // `dsb ish` (plus dc cvap for persistence) instead.
  __sync_synchronize();
#endif
}
// Write back (without evicting) the cache line containing *p.
//
// CLWB is 66 0F AE /6.  The original `.byte 0x66,0x0f,0xae,0x30` hard-coded
// ModRM = [rax] while binding the pointer to RDI via the "D" constraint, so
// it flushed whatever address happened to be in RAX — not *p.  We instead
// spell CLWB as a 0x66-prefixed XSAVEOPT (same opcode /6), the PMDK trick
// that works on assemblers without CLWB support, and pass the address as a
// "+m" memory operand so the assembler emits the correct ModRM and the
// compiler knows the store to *p must complete before the write-back.
static inline void clwb(void *p) {
#if defined(__x86_64__) || defined(__i386__)
  __asm__ __volatile__(".byte 0x66; xsaveopt %0"
                       : "+m"(*(volatile char *)p));
#else
  (void)p;
#endif
}
// Flush (and evict) the cache line containing *p using CLFLUSHOPT.
//
// CLFLUSHOPT is 66 0F AE /7, i.e. a 0x66-prefixed CLFLUSH.  The original
// byte sequence 66 0F AE 30 actually encoded CLWB (/6), and its ModRM
// addressed [rax] while the pointer was passed in RDI — so it neither
// executed the intended instruction nor touched the intended line.  Letting
// the assembler build the ModRM from a "+m" operand fixes both, and the
// memory operand also stops the compiler from reordering the preceding
// store past the flush.  On CPUs without CLFLUSHOPT the 0x66 prefix is
// ignored and this decodes as plain CLFLUSH, which is still correct.
static inline void clflushopt(void *p) {
#if defined(__x86_64__) || defined(__i386__)
  __asm__ __volatile__(".byte 0x66; clflush %0"
                       : "+m"(*(volatile char *)p));
#else
  (void)p;
#endif
}
// Flush the cache line containing *p with plain CLFLUSH (SSE2-era, available
// everywhere CLFLUSHOPT is not).
//
// The original used a bare "r" input with no memory operand or clobber, so
// the compiler was free to sink the store being persisted *past* the flush.
// A "+m" operand tells it the pointed-to line is read/written here.
static inline void clflush_fallback(void *p) {
#if defined(__x86_64__) || defined(__i386__)
  __asm__ __volatile__("clflush %0" : "+m"(*(volatile char *)p));
#else
  (void)p;
#endif
}
// Flush one cache line toward the persistence domain.
// CLWB would be the ideal choice (it leaves the line cached), but emitting it
// unconditionally risks #UD on CPUs that lack it; CLFLUSHOPT degrades to a
// plain CLFLUSH on older parts, so without a CPUID dispatch it is the safe
// single choice.
static inline void flush_line(void *p) {
#if defined(__x86_64__) || defined(__i386__)
  clflushopt(p);
#else
  (void)p;
#endif
}
// Cache-line granularity used when walking a range of flushes.
// 64 bytes is universal on current x86 and the common default elsewhere;
// both branches of the original returned the same constant, so the #if is
// folded away.  Query CPUID/sysfs at runtime if exactness ever matters.
static inline size_t cacheline_size(void) {
  return 64;
}
// Flush every cache line overlapping [addr, addr+len), then fence once so
// all the write-backs are ordered before subsequent stores.
static void persist_range(void *addr, size_t len) {
  const size_t line = cacheline_size();
  uintptr_t cur = (uintptr_t)addr & ~(uintptr_t)(line - 1);
  const uintptr_t stop =
      ((uintptr_t)addr + len + line - 1) & ~(uintptr_t)(line - 1);
  while (cur < stop) {
    flush_line((void *)cur);
    cur += line;
  }
  sfence_persist();
}
// --- Timing helpers ---------------------------------------------------------
// Monotonic wall-clock time in seconds.  CLOCK_MONOTONIC_RAW (Linux) is
// immune to NTP slewing, which keeps short benchmark intervals honest.
static inline double now_sec(void) {
  struct timespec t;
  clock_gettime(CLOCK_MONOTONIC_RAW, &t);
  return (double)t.tv_sec + (double)t.tv_nsec / 1e9;
}
// --- Parsing helpers --------------------------------------------------------
// Parse a human-readable size such as "4096", "4K", "256M", or "1.5G".
// Accepted suffixes (case-insensitive): b, k/kb, m/mb, g/gb; no suffix means
// bytes.  Exits with a diagnostic on malformed input, an unknown suffix, a
// negative value, or a result that overflows size_t — the original silently
// wrapped "-1G" through the double->size_t cast into a huge positive size.
static size_t parse_size(const char *s) {
  char *end = NULL;
  double v = strtod(s, &end);
  if (end == s) { fprintf(stderr, "Invalid size: %s\n", s); exit(1); }
  if (v < 0.0) { fprintf(stderr, "Size must be non-negative: %s\n", s); exit(1); }
  size_t mul = 1;
  if (*end) {
    if (strcasecmp(end, "b") == 0) mul = 1;
    else if (strcasecmp(end, "k") == 0 || strcasecmp(end, "kb") == 0) mul = 1024ULL;
    else if (strcasecmp(end, "m") == 0 || strcasecmp(end, "mb") == 0) mul = 1024ULL * 1024ULL;
    else if (strcasecmp(end, "g") == 0 || strcasecmp(end, "gb") == 0) mul = 1024ULL * 1024ULL * 1024ULL;
    else { fprintf(stderr, "Unknown size suffix: %s\n", end); exit(1); }
  }
  double bytes = v * (double)mul;
  // Converting a double that exceeds SIZE_MAX to size_t is UB; reject it.
  if (bytes > (double)SIZE_MAX) { fprintf(stderr, "Size too large: %s\n", s); exit(1); }
  return (size_t)bytes;
}
// Allocate `size` bytes with `align`-byte alignment (align must be a power
// of two and a multiple of sizeof(void *)).  Returns NULL on failure; the
// result is released with free().
static void *xaligned_alloc(size_t align, size_t size) {
  void *mem;
  int rc = posix_memalign(&mem, align, size);
  return rc == 0 ? mem : NULL;
}
// --- Benchmark --------------------------------------------------------------
// Benchmark operation selector (-t write | -t read).
typedef enum { OP_WRITE, OP_READ } op_t;
// Run configuration, populated from the command line in main().
typedef struct {
const char *path; // DAX device (/dev/daxX.Y) or fs-DAX file to mmap (-d)
size_t map_len; // bytes to mmap (-m); clamped to st_size in main()
size_t block; // per-operation chunk size in bytes (-b)
op_t op; // OP_WRITE or OP_READ (-t)
bool randomize; // --random: Fisher-Yates shuffle of block order
bool msync_after; // useful for fs-DAX
size_t iters; // number of passes over the mapping
} cfg_t;
// Print command-line help to stderr.
static void usage(const char *prog) {
  fprintf(stderr,
          "Usage: %s -d <dax-path> [-m <map size>] [-b <block>]"
          " [-t write|read] [--iters N] [--msync] [--random]\n"
          "Defaults: -m 1G -b 4K -t write --iters 1\n",
          prog);
}
int main(int argc, char **argv) {
cfg_t C = { .path = NULL, .map_len = 1ULL<<30, .block = 4096, .op = OP_WRITE, .randomize = false, .msync_after = false, .iters = 1 };
for (int i = 1; i < argc; ++i) {
if (!strcmp(argv[i], "-d") && i+1 < argc) { C.path = argv[++i]; }
else if (!strcmp(argv[i], "-m") && i+1 < argc) { C.map_len = parse_size(argv[++i]); }
else if (!strcmp(argv[i], "-b") && i+1 < argc) { C.block = parse_size(argv[++i]); }
else if (!strcmp(argv[i], "-t") && i+1 < argc) {
const char *t = argv[++i];
if (!strcmp(t, "write")) C.op = OP_WRITE; else if (!strcmp(t, "read")) C.op = OP_READ; else { usage(argv[0]); return 1; }
} else if (!strcmp(argv[i], "--random")) { C.randomize = true; }
else if (!strcmp(argv[i], "--msync")) { C.msync_after = true; }
else if (!strcmp(argv[i], "--iters") && i+1 < argc) { C.iters = strtoull(argv[++i], NULL, 10); }
else { usage(argv[0]); return 1; }
}
if (!C.path) { usage(argv[0]); return 1; }
if (C.block == 0 || (C.block & 0x3FF) == 0x3FF) { /* silly check to avoid zero */ }
int fd = open(C.path, (C.op == OP_WRITE) ? O_RDWR : O_RDONLY);
if (fd < 0) { perror("open"); return 1; }
struct stat st;
if (fstat(fd, &st) < 0) { perror("fstat"); return 1; }
// For /dev/dax, st_size usually reflects the device capacity. Clamp map_len.
if ((size_t)st.st_size && C.map_len > (size_t)st.st_size) C.map_len = (size_t)st.st_size;
int prot = PROT_READ | ((C.op == OP_WRITE) ? PROT_WRITE : 0);
// Try MAP_SYNC when possible (fs-DAX). For /dev/dax, MAP_SHARED works.
void *base = mmap(NULL, C.map_len, prot, MAP_SHARED, fd, 0);
if (base == MAP_FAILED) {
perror("mmap");
return 1;
}
// Source/dest scratch buffer for operations
size_t align = 4096;
void *scratch = xaligned_alloc(align, C.block);
if (!scratch) { fprintf(stderr, "alloc failed\n"); return 1; }
// Fill scratch with a deterministic pattern
for (size_t i = 0; i < C.block; ++i) ((unsigned char*)scratch)[i] = (unsigned char)(i * 131 + 7);
size_t nblocks = C.map_len / C.block;
if (nblocks == 0) { fprintf(stderr, "map too small for block size\n"); return 1; }
// Optional random index order
size_t *order = malloc(nblocks * sizeof(size_t));
if (!order) { fprintf(stderr, "order alloc failed\n"); return 1; }
for (size_t i = 0; i < nblocks; ++i) order[i] = i;
if (C.randomize) {
// Fisher-Yates
for (size_t i = nblocks - 1; i > 0; --i) {
size_t j = (size_t) (rand() % (int)(i + 1));
size_t tmp = order[i]; order[i] = order[j]; order[j] = tmp;
}
}
// Warm-up: touch pages to avoid page faults skewing measurements
volatile unsigned char sink = 0;
for (size_t i = 0; i < nblocks; i += (nblocks/128 + 1)) {
sink ^= ((volatile unsigned char*)base)[order[i] * C.block];
}
double t0 = now_sec();
uint64_t ops = 0;
// Optional per-block latency collection (percentiles) would require storing many samples; keep it simple:
double worst_block = 0.0, best_block = 1e9;
for (size_t it = 0; it < C.iters; ++it) {
for (size_t k = 0; k < nblocks; ++k) {
size_t idx = order[k];
unsigned char *ptr = (unsigned char *)base + idx * C.block;
double bt0 = now_sec();
if (C.op == OP_WRITE) {
memcpy(ptr, scratch, C.block);
persist_range(ptr, C.block);
} else {
// Read path: copy out to scratch to force actual loads
memcpy(scratch, ptr, C.block);
sink ^= ((volatile unsigned char*)scratch)[0];
}
double bt1 = now_sec();
double dt = bt1 - bt0;
if (dt < best_block) best_block = dt;
if (dt > worst_block) worst_block = dt;
ops++;
}
if (C.msync_after && C.op == OP_WRITE) {
// fs-DAX only; on /dev/dax this is a harmless no-op
if (msync(base, C.map_len, MS_SYNC) != 0) perror("msync");
}
}
double t1 = now_sec();
double sec = t1 - t0;
double bytes = (double)C.map_len * (double)C.iters;
printf("op=%s map=%zu bytes block=%zu bytes iters=%zu\n", (C.op==OP_WRITE?"write":"read"), C.map_len, C.block, C.iters);
printf("time=%.6f s, throughput=%.2f GB/s\n", sec, bytes / sec / 1e9);
printf("per-block latency: best=%.3f us, worst=%.3f us (block=%zu)\n", best_block*1e6, worst_block*1e6, C.block);
// Prevent optimizer from dropping reads
if (sink == 0x42) fprintf(stderr, "sink=%u\n", (unsigned)sink);
munmap(base, C.map_len);
free(scratch);
free(order);
close(fd);
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment