Last active
December 24, 2025 15:25
-
-
Save youkaichao/49ad7c952e9d9e0f0ba22e3fab1b884e to your computer and use it in GitHub Desktop.
a cupti example
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| // cupti_driver_stream_trace_preload.c | |
| // LD_PRELOAD interposer that: | |
| // - intercepts cudaMalloc | |
| // - on first successful cudaMalloc, enables CUPTI Driver API callbacks for: | |
| // * cuStreamCreate (log at API EXIT) | |
| // * cuStreamSynchronize (log at API ENTER) | |
| // compile: c++ -O2 -fPIC -shared cupti_driver_stream_trace_preload.c -o cupti_driver_stream_load.so -I/usr/local/cuda-12.8/include/ -L/usr/local/cuda-12.8/lib64/ -lcupti -lcuda | |
| // important: only intercepts cudaMalloc, a runtime API. cannot intercept driver APIs, as the library depends on driver APIs. | |
| // run: LD_PRELOAD=/data/youkaichao/vllm/cupti_driver_stream_load.so python test.py | |
| #include <cuda.h> | |
| #include <cupti.h> | |
| #include <cupti_callbacks.h> | |
| #include <cupti_driver_cbid.h> | |
| #include <atomic> | |
| #include <cstdio> | |
| #include <cstdlib> | |
| #include <dlfcn.h> | |
| #include <unistd.h> // getpid | |
| #include <pthread.h> // pthread_once | |
| #define CU_CHECK(call) \ | |
| do { \ | |
| CUresult _e = (call); \ | |
| if (_e != CUDA_SUCCESS) { \ | |
| const char* s = nullptr; \ | |
| cuGetErrorString(_e, &s); \ | |
| std::fprintf(stderr, "pid=%d CUDA Driver error %s:%d: %s\n", \ | |
| (int)getpid(), __FILE__, __LINE__, s ? s : "(unknown)"); \ | |
| std::abort(); \ | |
| } \ | |
| } while (0) | |
| #define CUPTI_CHECK(call) \ | |
| do { \ | |
| CUptiResult _e = (call); \ | |
| if (_e != CUPTI_SUCCESS) { \ | |
| const char* s = nullptr; \ | |
| cuptiGetResultString(_e, &s); \ | |
| std::fprintf(stderr, "pid=%d CUPTI error %s:%d: %s\n", \ | |
| (int)getpid(), __FILE__, __LINE__, s ? s : "(unknown)"); \ | |
| std::abort(); \ | |
| } \ | |
| } while (0) | |
| static CUpti_SubscriberHandle g_subscriber = nullptr; | |
| static std::atomic<bool> g_enabled{false}; | |
| static pthread_once_t g_enable_once_control = PTHREAD_ONCE_INIT; | |
| static void CUPTIAPI driverCallback(void* /*userdata*/, | |
| CUpti_CallbackDomain domain, | |
| CUpti_CallbackId cbid, | |
| const void* cbInfo) { | |
| if (domain != CUPTI_CB_DOMAIN_DRIVER_API) return; | |
| const CUpti_CallbackData* cbd = (const CUpti_CallbackData*)cbInfo; | |
| if (!cbd) return; | |
| const int pid = (int)getpid(); | |
| // cuStreamCreate(CUstream* phStream, unsigned int Flags) | |
| if (cbid == CUPTI_DRIVER_TRACE_CBID_cuStreamCreate) { | |
| const cuStreamCreate_params* p = | |
| (const cuStreamCreate_params*)cbd->functionParams; | |
| if (cbd->callbackSite == CUPTI_API_EXIT && p && p->phStream) { | |
| CUstream s = *(p->phStream); | |
| unsigned int flags = p->Flags; | |
| std::fprintf(stderr, | |
| "pid=%d [CUPTI][DRV] cuStreamCreate -> stream=%p flags=0x%x\n", | |
| pid, (void*)s, flags); | |
| } | |
| return; | |
| } | |
| // cuStreamSynchronize(CUstream hStream) | |
| if (cbid == CUPTI_DRIVER_TRACE_CBID_cuStreamSynchronize) { | |
| const cuStreamSynchronize_params* p = | |
| (const cuStreamSynchronize_params*)cbd->functionParams; | |
| if (cbd->callbackSite == CUPTI_API_ENTER && p) { | |
| std::fprintf(stderr, | |
| "pid=%d [CUPTI][DRV] cuStreamSynchronize(stream=%p)\n", | |
| pid, (void*)p->hStream); | |
| } | |
| return; | |
| } | |
| } | |
| static void enable_cupti_impl() { | |
| // pthread_once guarantees this runs once per process. | |
| if (g_enabled.load(std::memory_order_acquire)) return; | |
| CUPTI_CHECK(cuptiSubscribe(&g_subscriber, | |
| (CUpti_CallbackFunc)driverCallback, | |
| nullptr)); | |
| CUPTI_CHECK(cuptiEnableCallback(1, g_subscriber, CUPTI_CB_DOMAIN_DRIVER_API, | |
| CUPTI_DRIVER_TRACE_CBID_cuStreamCreate)); | |
| CUPTI_CHECK(cuptiEnableCallback(1, g_subscriber, CUPTI_CB_DOMAIN_DRIVER_API, | |
| CUPTI_DRIVER_TRACE_CBID_cuStreamSynchronize)); | |
| g_enabled.store(true, std::memory_order_release); | |
| } | |
| static void enable_cupti_once() { | |
| pthread_once(&g_enable_once_control, enable_cupti_impl); | |
| } | |
| static void disable_cupti() { | |
| if (!g_enabled.load(std::memory_order_acquire)) return; | |
| // Best effort teardown; keep behavior simple. | |
| CUPTI_CHECK(cuptiEnableCallback(0, g_subscriber, CUPTI_CB_DOMAIN_DRIVER_API, | |
| CUPTI_DRIVER_TRACE_CBID_cuStreamCreate)); | |
| CUPTI_CHECK(cuptiEnableCallback(0, g_subscriber, CUPTI_CB_DOMAIN_DRIVER_API, | |
| CUPTI_DRIVER_TRACE_CBID_cuStreamSynchronize)); | |
| CUPTI_CHECK(cuptiUnsubscribe(g_subscriber)); | |
| g_subscriber = nullptr; | |
| g_enabled.store(false, std::memory_order_release); | |
| } | |
| // Optional: clean up at process exit (won't always run on abnormal termination). | |
| __attribute__((destructor)) static void on_unload() { | |
| // Avoid aborting during teardown; you can comment this out if you prefer. | |
| if (g_enabled.load(std::memory_order_acquire)) { | |
| // If CUPTI calls fail here, it's usually harmless; you can ignore failures | |
| // by replacing CUPTI_CHECK with best-effort logic if desired. | |
| disable_cupti(); | |
| } | |
| } | |
| // ---------------- cuInit interposer ---------------- | |
| using cudaMalloc_fn = cudaError_t (*)(void** /*devPtr*/, size_t /*size*/); | |
| extern "C" cudaError_t cudaMalloc(void** devPtr, size_t size) { | |
| static cudaMalloc_fn real_cudaMalloc = nullptr; | |
| static bool first_time = true; | |
| if (first_time) { | |
| first_time = false; | |
| enable_cupti_once(); | |
| } | |
| if (!real_cudaMalloc) { | |
| real_cudaMalloc = (cudaMalloc_fn)dlsym(RTLD_NEXT, "cudaMalloc"); | |
| if (!real_cudaMalloc) { | |
| std::fprintf(stderr, | |
| "pid=%d [CUPTI][DRV] dlsym(RTLD_NEXT, cudaMalloc) failed: %s\n", | |
| (int)getpid(), dlerror()); | |
| // Best-effort: return a generic runtime error. | |
| return cudaErrorUnknown; | |
| } | |
| } | |
| // Call the real cudaMalloc first (this will trigger CUDA runtime init if needed). | |
| cudaError_t res = real_cudaMalloc(devPtr, size); | |
| return res; | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment