Skip to content

Instantly share code, notes, and snippets.

@youkaichao
Created November 6, 2025 17:20
Show Gist options
  • Select an option

  • Save youkaichao/067b4138dd03f47e2558cb41e731a8b0 to your computer and use it in GitHub Desktop.

Select an option

Save youkaichao/067b4138dd03f47e2558cb41e731a8b0 to your computer and use it in GitHub Desktop.
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <vector>
#include <cuda_runtime.h>
// Check the return code of a CUDA runtime API call and abort with file/line
// context on failure. Wrapped in do { } while (0) so the macro expands safely
// as a single statement inside unbraced if/else bodies.
#define cudaCheck(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)

// Report a failed CUDA runtime call and optionally terminate the process.
//   code  - result of a CUDA runtime API call
//   file  - call-site source file (from __FILE__)
//   line  - call-site line number (from __LINE__)
//   abort - when true (default), exit with the error code after printing
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true) {
    if (code != cudaSuccess) {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}
// Kernel function to perform computation
__global__ void kernel1(int64_t *data, int64_t repeat) {
int idx = threadIdx.x + blockIdx.x * blockDim.x;
for (size_t i = 0; i < repeat; i++) {
data[idx] += 1;
}
}
// Convert a byte count to mebibytes for reporting.
static double toMB(size_t bytes) { return bytes / (1024.0 * 1024.0); }

// Query the device memory currently in use, in bytes (total - free).
static size_t usedDeviceMemory() {
    size_t freeMem = 0, totalMem = 0;
    cudaCheck(cudaMemGetInfo(&freeMem, &totalMem));
    return totalMem - freeMem;
}

// Measure the device-memory cost of CUDA graphs: capture one graph holding
// `kernelsPerGraph` kernel launches, instantiate it `numInstances` times,
// and print the memory deltas at each stage (graph creation, per-batch
// instantiation, final totals, post-cleanup).
int main() {
    const int dataSize = 1024;
    int64_t *h_data = new int64_t[dataSize]; // Host data
    int64_t *d_data;                         // Device data
    // Initialize host data
    for (int i = 0; i < dataSize; i++) {
        h_data[i] = 0;
    }
    // Allocate device memory and upload the zeroed buffer
    cudaCheck(cudaMalloc((void**)&d_data, dataSize * sizeof(int64_t)));
    cudaCheck(cudaMemcpy(d_data, h_data, dataSize * sizeof(int64_t), cudaMemcpyHostToDevice));
    // Grid covers the array exactly (kernel1 has no bounds guard)
    dim3 blockDim(256);
    dim3 gridDim((dataSize + blockDim.x - 1) / blockDim.x);
    // Non-default stream required for stream capture
    cudaStream_t stream;
    cudaCheck(cudaStreamCreate(&stream));
    const int64_t repeat = 1000;
    const int numInstances = 128;
    const int kernelsPerGraph = 128;

    // Baseline memory usage before any graph work
    size_t usedMemBefore = usedDeviceMemory();
    std::cout << "Initial device memory usage: " << toMB(usedMemBefore) << " MB" << std::endl;

    // Capture a single graph containing kernelsPerGraph kernel nodes
    std::cout << "Capturing graph with " << kernelsPerGraph << " kernel executions..." << std::endl;
    cudaCheck(cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal));
    for (int kernelIter = 0; kernelIter < kernelsPerGraph; kernelIter++) {
        kernel1<<<gridDim, blockDim, 0, stream>>>(d_data, repeat);
    }
    // Kernel launches return no status directly; surface any launch-config
    // error (bad dims, capture failure) before ending the capture.
    cudaCheck(cudaGetLastError());
    cudaGraph_t graph;
    cudaCheck(cudaStreamEndCapture(stream, &graph));

    // Memory cost of the graph object itself
    size_t usedMemAfterGraph = usedDeviceMemory();
    double graphMemoryMB = toMB(usedMemAfterGraph - usedMemBefore);
    std::cout << "\nAfter creating graph:" << std::endl;
    std::cout << " Total device memory usage: " << toMB(usedMemAfterGraph) << " MB" << std::endl;
    std::cout << " Memory used by graph: " << graphMemoryMB << " MB" << std::endl;

    // Instantiate the same graph numInstances times, reporting the growth
    // every 16 instances (and after the first).
    std::vector<cudaGraphExec_t> instances;
    instances.reserve(numInstances);
    std::cout << "\nInstantiating graph " << numInstances << " times..." << std::endl;
    for (int instanceIdx = 0; instanceIdx < numInstances; instanceIdx++) {
        cudaGraphExec_t instance;
        cudaCheck(cudaGraphInstantiate(&instance, graph, NULL, NULL, 0));
        instances.push_back(instance);
        if ((instanceIdx + 1) % 16 == 0 || instanceIdx == 0) {
            size_t usedMem = usedDeviceMemory();
            std::cout << " Instantiated " << (instanceIdx + 1) << " instances. "
                      << "Total memory: " << toMB(usedMem) << " MB. "
                      << "Memory for instances: " << toMB(usedMem - usedMemAfterGraph) << " MB" << std::endl;
        }
    }

    // Final accounting after all instantiations
    size_t usedMemAfterInstances = usedDeviceMemory();
    double instanceMemoryMB = toMB(usedMemAfterInstances - usedMemAfterGraph);
    std::cout << "\nFinal memory usage after instantiating " << numInstances << " instances:" << std::endl;
    std::cout << " Total device memory usage: " << toMB(usedMemAfterInstances) << " MB" << std::endl;
    std::cout << " Memory used by graph: " << graphMemoryMB << " MB" << std::endl;
    std::cout << " Memory used by graph instances: " << instanceMemoryMB << " MB" << std::endl;
    std::cout << " Average memory per instance: " << (instanceMemoryMB / numInstances) << " MB" << std::endl;
    std::cout << " Total memory (graph + instances): "
              << toMB(usedMemAfterInstances - usedMemBefore) << " MB" << std::endl;

    // Destroy every executable instance, then the graph template
    std::cout << "\nCleaning up instances..." << std::endl;
    for (auto instance : instances) {
        cudaCheck(cudaGraphExecDestroy(instance));
    }
    instances.clear();
    std::cout << "Cleaning up graph..." << std::endl;
    cudaCheck(cudaGraphDestroy(graph));

    // Confirm the memory was actually released
    std::cout << "\nFinal device memory usage after cleanup: " << toMB(usedDeviceMemory()) << " MB" << std::endl;

    // Release remaining resources
    cudaCheck(cudaFree(d_data));
    delete[] h_data;
    cudaCheck(cudaStreamDestroy(stream));
    return 0;
}
@youkaichao
Copy link
Author

the graph contains 128 kernel nodes, and the graph itself is instantiated 128 times.

running with nvcc test.cu -o test && ./test :

with Driver Version: 570.133.20 CUDA Version: 12.8 :

Instantiating graph 128 times...
  Instantiated 1 instances. Total memory: 129950 MB. Memory for instances: 0 MB
  Instantiated 16 instances. Total memory: 129972 MB. Memory for instances: 22 MB
  Instantiated 32 instances. Total memory: 129994 MB. Memory for instances: 44 MB
  Instantiated 48 instances. Total memory: 130016 MB. Memory for instances: 66 MB
  Instantiated 64 instances. Total memory: 130036 MB. Memory for instances: 86 MB
  Instantiated 80 instances. Total memory: 130060 MB. Memory for instances: 110 MB
  Instantiated 96 instances. Total memory: 130080 MB. Memory for instances: 130 MB
  Instantiated 112 instances. Total memory: 130104 MB. Memory for instances: 154 MB
  Instantiated 128 instances. Total memory: 130124 MB. Memory for instances: 174 MB

The memory overhead per kernel node per graph instance is about 11 KiB (174 MB / (128 instances × 128 kernel nodes)).

with Driver Version: 580.82.07 CUDA Version: 13.0

Instantiating graph 128 times...
  Instantiated 1 instances. Total memory: 532.812 MB. Memory for instances: 0 MB
  Instantiated 16 instances. Total memory: 536.812 MB. Memory for instances: 4 MB
  Instantiated 32 instances. Total memory: 544.812 MB. Memory for instances: 12 MB
  Instantiated 48 instances. Total memory: 550.812 MB. Memory for instances: 18 MB
  Instantiated 64 instances. Total memory: 556.812 MB. Memory for instances: 24 MB
  Instantiated 80 instances. Total memory: 562.812 MB. Memory for instances: 30 MB
  Instantiated 96 instances. Total memory: 568.812 MB. Memory for instances: 36 MB
  Instantiated 112 instances. Total memory: 574.812 MB. Memory for instances: 42 MB
  Instantiated 128 instances. Total memory: 582.812 MB. Memory for instances: 50 MB

The memory overhead per kernel node per graph instance is about 3 KiB (50 MB / (128 instances × 128 kernel nodes)).

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment