Skip to content

Instantly share code, notes, and snippets.

@youkaichao
Last active December 24, 2025 15:25
Show Gist options
  • Select an option

  • Save youkaichao/49ad7c952e9d9e0f0ba22e3fab1b884e to your computer and use it in GitHub Desktop.

Select an option

Save youkaichao/49ad7c952e9d9e0f0ba22e3fab1b884e to your computer and use it in GitHub Desktop.
a cupti example
// cupti_driver_stream_trace_preload.c
// LD_PRELOAD interposer that:
// - intercepts cudaMalloc
// - on first successful cudaMalloc, enables CUPTI Driver API callbacks for:
// * cuStreamCreate (log at API EXIT)
// * cuStreamSynchronize (log at API ENTER)
// compile: c++ -O2 -fPIC -shared cupti_driver_stream_trace_preload.c -o cupti_driver_stream_load.so -I/usr/local/cuda-12.8/include/ -L/usr/local/cuda-12.8/lib64/ -lcupti -lcuda
// important: only intercepts cudaMalloc, a runtime API. cannot intercept driver APIs, as the library depends on driver APIs.
// run: LD_PRELOAD=/data/youkaichao/vllm/cupti_driver_stream_load.so python test.py
#include <cuda.h>
#include <cupti.h>
#include <cupti_callbacks.h>
#include <cupti_driver_cbid.h>
#include <atomic>
#include <cstdio>
#include <cstdlib>
#include <dlfcn.h>
#include <unistd.h> // getpid
#include <pthread.h> // pthread_once
#define CU_CHECK(call) \
do { \
CUresult _e = (call); \
if (_e != CUDA_SUCCESS) { \
const char* s = nullptr; \
cuGetErrorString(_e, &s); \
std::fprintf(stderr, "pid=%d CUDA Driver error %s:%d: %s\n", \
(int)getpid(), __FILE__, __LINE__, s ? s : "(unknown)"); \
std::abort(); \
} \
} while (0)
#define CUPTI_CHECK(call) \
do { \
CUptiResult _e = (call); \
if (_e != CUPTI_SUCCESS) { \
const char* s = nullptr; \
cuptiGetResultString(_e, &s); \
std::fprintf(stderr, "pid=%d CUPTI error %s:%d: %s\n", \
(int)getpid(), __FILE__, __LINE__, s ? s : "(unknown)"); \
std::abort(); \
} \
} while (0)
static CUpti_SubscriberHandle g_subscriber = nullptr;
static std::atomic<bool> g_enabled{false};
static pthread_once_t g_enable_once_control = PTHREAD_ONCE_INIT;
static void CUPTIAPI driverCallback(void* /*userdata*/,
CUpti_CallbackDomain domain,
CUpti_CallbackId cbid,
const void* cbInfo) {
if (domain != CUPTI_CB_DOMAIN_DRIVER_API) return;
const CUpti_CallbackData* cbd = (const CUpti_CallbackData*)cbInfo;
if (!cbd) return;
const int pid = (int)getpid();
// cuStreamCreate(CUstream* phStream, unsigned int Flags)
if (cbid == CUPTI_DRIVER_TRACE_CBID_cuStreamCreate) {
const cuStreamCreate_params* p =
(const cuStreamCreate_params*)cbd->functionParams;
if (cbd->callbackSite == CUPTI_API_EXIT && p && p->phStream) {
CUstream s = *(p->phStream);
unsigned int flags = p->Flags;
std::fprintf(stderr,
"pid=%d [CUPTI][DRV] cuStreamCreate -> stream=%p flags=0x%x\n",
pid, (void*)s, flags);
}
return;
}
// cuStreamSynchronize(CUstream hStream)
if (cbid == CUPTI_DRIVER_TRACE_CBID_cuStreamSynchronize) {
const cuStreamSynchronize_params* p =
(const cuStreamSynchronize_params*)cbd->functionParams;
if (cbd->callbackSite == CUPTI_API_ENTER && p) {
std::fprintf(stderr,
"pid=%d [CUPTI][DRV] cuStreamSynchronize(stream=%p)\n",
pid, (void*)p->hStream);
}
return;
}
}
static void enable_cupti_impl() {
// pthread_once guarantees this runs once per process.
if (g_enabled.load(std::memory_order_acquire)) return;
CUPTI_CHECK(cuptiSubscribe(&g_subscriber,
(CUpti_CallbackFunc)driverCallback,
nullptr));
CUPTI_CHECK(cuptiEnableCallback(1, g_subscriber, CUPTI_CB_DOMAIN_DRIVER_API,
CUPTI_DRIVER_TRACE_CBID_cuStreamCreate));
CUPTI_CHECK(cuptiEnableCallback(1, g_subscriber, CUPTI_CB_DOMAIN_DRIVER_API,
CUPTI_DRIVER_TRACE_CBID_cuStreamSynchronize));
g_enabled.store(true, std::memory_order_release);
}
static void enable_cupti_once() {
pthread_once(&g_enable_once_control, enable_cupti_impl);
}
static void disable_cupti() {
if (!g_enabled.load(std::memory_order_acquire)) return;
// Best effort teardown; keep behavior simple.
CUPTI_CHECK(cuptiEnableCallback(0, g_subscriber, CUPTI_CB_DOMAIN_DRIVER_API,
CUPTI_DRIVER_TRACE_CBID_cuStreamCreate));
CUPTI_CHECK(cuptiEnableCallback(0, g_subscriber, CUPTI_CB_DOMAIN_DRIVER_API,
CUPTI_DRIVER_TRACE_CBID_cuStreamSynchronize));
CUPTI_CHECK(cuptiUnsubscribe(g_subscriber));
g_subscriber = nullptr;
g_enabled.store(false, std::memory_order_release);
}
// Optional: clean up at process exit (won't always run on abnormal termination).
__attribute__((destructor)) static void on_unload() {
// Avoid aborting during teardown; you can comment this out if you prefer.
if (g_enabled.load(std::memory_order_acquire)) {
// If CUPTI calls fail here, it's usually harmless; you can ignore failures
// by replacing CUPTI_CHECK with best-effort logic if desired.
disable_cupti();
}
}
// ---------------- cuInit interposer ----------------
using cudaMalloc_fn = cudaError_t (*)(void** /*devPtr*/, size_t /*size*/);
extern "C" cudaError_t cudaMalloc(void** devPtr, size_t size) {
static cudaMalloc_fn real_cudaMalloc = nullptr;
static bool first_time = true;
if (first_time) {
first_time = false;
enable_cupti_once();
}
if (!real_cudaMalloc) {
real_cudaMalloc = (cudaMalloc_fn)dlsym(RTLD_NEXT, "cudaMalloc");
if (!real_cudaMalloc) {
std::fprintf(stderr,
"pid=%d [CUPTI][DRV] dlsym(RTLD_NEXT, cudaMalloc) failed: %s\n",
(int)getpid(), dlerror());
// Best-effort: return a generic runtime error.
return cudaErrorUnknown;
}
}
// Call the real cudaMalloc first (this will trigger CUDA runtime init if needed).
cudaError_t res = real_cudaMalloc(devPtr, size);
return res;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment