Gist: youkaichao/067b4138dd03f47e2558cb41e731a8b0
Created November 6, 2025 17:20
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <vector>
#include <cuda_runtime.h>

#define cudaCheck(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true) {
    if (code != cudaSuccess) {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

// Kernel: each thread increments its element `repeat` times.
__global__ void kernel1(int64_t *data, int64_t repeat) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    for (int64_t i = 0; i < repeat; i++) {
        data[idx] += 1;
    }
}

int main() {
    const int dataSize = 1024;
    int64_t *h_data = new int64_t[dataSize]; // Host data
    int64_t *d_data;                         // Device data

    // Initialize host data
    for (int i = 0; i < dataSize; i++) {
        h_data[i] = 0;
    }

    // Allocate memory on the device
    cudaCheck(cudaMalloc((void**)&d_data, dataSize * sizeof(int64_t)));

    // Transfer data from host to device
    cudaCheck(cudaMemcpy(d_data, h_data, dataSize * sizeof(int64_t), cudaMemcpyHostToDevice));

    // Define grid and block dimensions (the grid exactly covers dataSize threads,
    // so the kernel needs no bounds check)
    dim3 blockDim(256);
    dim3 gridDim((dataSize + blockDim.x - 1) / blockDim.x);

    // Create stream
    cudaStream_t stream;
    cudaCheck(cudaStreamCreate(&stream));

    const int64_t repeat = 1000;
    const int numInstances = 128;
    const int kernelsPerGraph = 128;

    // Get initial memory usage
    size_t freeMemBefore, totalMem;
    cudaCheck(cudaMemGetInfo(&freeMemBefore, &totalMem));
    size_t usedMemBefore = totalMem - freeMemBefore;
    double usedMemBeforeMB = usedMemBefore / (1024.0 * 1024.0);
    std::cout << "Initial device memory usage: " << usedMemBeforeMB << " MB" << std::endl;

    // Capture a single graph containing kernelsPerGraph kernel launches
    std::cout << "Capturing graph with " << kernelsPerGraph << " kernel executions..." << std::endl;
    cudaCheck(cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal));
    for (int kernelIter = 0; kernelIter < kernelsPerGraph; kernelIter++) {
        kernel1<<<gridDim, blockDim, 0, stream>>>(d_data, repeat);
    }

    // End stream capture
    cudaGraph_t graph;
    cudaCheck(cudaStreamEndCapture(stream, &graph));

    // Get memory usage after creating the graph
    size_t freeMemAfterGraph, totalMemAfterGraph;
    cudaCheck(cudaMemGetInfo(&freeMemAfterGraph, &totalMemAfterGraph));
    size_t usedMemAfterGraph = totalMemAfterGraph - freeMemAfterGraph;
    double usedMemAfterGraphMB = usedMemAfterGraph / (1024.0 * 1024.0);
    double graphMemoryMB = (usedMemAfterGraph - usedMemBefore) / (1024.0 * 1024.0);
    std::cout << "\nAfter creating graph:" << std::endl;
    std::cout << "  Total device memory usage: " << usedMemAfterGraphMB << " MB" << std::endl;
    std::cout << "  Memory used by graph: " << graphMemoryMB << " MB" << std::endl;

    // Instantiate the graph numInstances times
    std::vector<cudaGraphExec_t> instances;
    std::cout << "\nInstantiating graph " << numInstances << " times..." << std::endl;
    for (int instanceIdx = 0; instanceIdx < numInstances; instanceIdx++) {
        cudaGraphExec_t instance;
        cudaCheck(cudaGraphInstantiate(&instance, graph, NULL, NULL, 0));
        instances.push_back(instance);

        // Print memory usage periodically
        if ((instanceIdx + 1) % 16 == 0 || instanceIdx == 0) {
            size_t freeMem, totalMemNow;
            cudaCheck(cudaMemGetInfo(&freeMem, &totalMemNow));
            size_t usedMem = totalMemNow - freeMem;
            double usedMemMB = usedMem / (1024.0 * 1024.0);
            double instancesMemoryMB = (usedMem - usedMemAfterGraph) / (1024.0 * 1024.0);
            std::cout << "  Instantiated " << (instanceIdx + 1) << " instances. "
                      << "Total memory: " << usedMemMB << " MB. "
                      << "Memory for instances: " << instancesMemoryMB << " MB" << std::endl;
        }
    }

    // Get final memory usage after instantiation
    size_t freeMemAfterInstances, totalMemAfterInstances;
    cudaCheck(cudaMemGetInfo(&freeMemAfterInstances, &totalMemAfterInstances));
    size_t usedMemAfterInstances = totalMemAfterInstances - freeMemAfterInstances;
    double usedMemAfterInstancesMB = usedMemAfterInstances / (1024.0 * 1024.0);
    double instanceMemoryMB = (usedMemAfterInstances - usedMemAfterGraph) / (1024.0 * 1024.0);
    std::cout << "\nFinal memory usage after instantiating " << numInstances << " instances:" << std::endl;
    std::cout << "  Total device memory usage: " << usedMemAfterInstancesMB << " MB" << std::endl;
    std::cout << "  Memory used by graph: " << graphMemoryMB << " MB" << std::endl;
    std::cout << "  Memory used by graph instances: " << instanceMemoryMB << " MB" << std::endl;
    std::cout << "  Average memory per instance: " << (instanceMemoryMB / numInstances) << " MB" << std::endl;
    std::cout << "  Total memory (graph + instances): "
              << ((usedMemAfterInstances - usedMemBefore) / (1024.0 * 1024.0)) << " MB" << std::endl;

    // Cleanup all instances
    std::cout << "\nCleaning up instances..." << std::endl;
    for (auto instance : instances) {
        cudaCheck(cudaGraphExecDestroy(instance));
    }
    instances.clear();

    // Cleanup graph
    std::cout << "Cleaning up graph..." << std::endl;
    cudaCheck(cudaGraphDestroy(graph));

    // Final memory check
    size_t freeMemFinal, totalMemFinal;
    cudaCheck(cudaMemGetInfo(&freeMemFinal, &totalMemFinal));
    size_t usedMemFinal = totalMemFinal - freeMemFinal;
    double usedMemFinalMB = usedMemFinal / (1024.0 * 1024.0);
    std::cout << "\nFinal device memory usage after cleanup: " << usedMemFinalMB << " MB" << std::endl;

    // Cleanup
    cudaCheck(cudaFree(d_data));
    delete[] h_data;
    cudaCheck(cudaStreamDestroy(stream));
    return 0;
}
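Note that the program only measures instantiation overhead and never launches the graphs. If one also wanted to run them, a minimal sketch (not part of the original gist) placed in main() before the cleanup section, reusing the `instances`, `stream`, and `cudaCheck` defined above, could look like this:

    // Sketch: launch every instantiated graph once and wait for completion.
    // Assumes it runs before the instances are destroyed.
    for (auto instance : instances) {
        cudaCheck(cudaGraphLaunch(instance, stream));
    }
    cudaCheck(cudaStreamSynchronize(stream));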
Author comment:
Each graph contains 128 kernel nodes, and the graph is instantiated 128 times.

Compiling and running with nvcc test.cu -o test && ./test:

With Driver Version 570.133.20, CUDA Version 12.8: memory per kernel node per graph instance is about 11 KiB.
With Driver Version 580.82.07, CUDA Version 13.0: memory per kernel node per graph instance is about 2 KiB.
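The per-node figures presumably come from dividing the "Memory used by graph instances" line of the program output by the total number of kernel nodes across all instances (128 × 128 = 16384). A back-of-envelope check, using an illustrative 176 MiB reading (a hypothetical value for this example, not a recorded measurement):

    // Hypothetical arithmetic check; instanceMemoryMiB is illustrative only.
    double instanceMemoryMiB = 176.0;          // "Memory used by graph instances"
    int totalKernelNodes = 128 * 128;          // numInstances * kernelsPerGraph
    double perNodeKiB = instanceMemoryMiB * 1024.0 / totalKernelNodes;  // = 11 KiB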