Skip to content

Instantly share code, notes, and snippets.

@lakshayg
Created January 28, 2026 19:17
Show Gist options
  • Select an option

  • Save lakshayg/488677fbcd027516692b019191ad12d0 to your computer and use it in GitHub Desktop.

Select an option

Save lakshayg/488677fbcd027516692b019191ad12d0 to your computer and use it in GitHub Desktop.
#include <cstdio>
#include <vector>
#include <cuda_runtime.h>
#include <cusolverDn.h>
#include <limits>
#define CHECK_CUDA(call) \
do { \
cudaError_t err = call; \
if (err != cudaSuccess) { \
printf("CUDA error %s:%d: %s\n", \
__FILE__, __LINE__, cudaGetErrorString(err)); \
std::exit(EXIT_FAILURE); \
} \
} while (0)
#define CHECK_CUSOLVER(call) \
do { \
cusolverStatus_t status = call; \
if (status != CUSOLVER_STATUS_SUCCESS) { \
printf("cuSOLVER error %s:%d\n", \
__FILE__, __LINE__); \
std::exit(EXIT_FAILURE); \
} \
} while (0)
int main() {
// Matrix size
const int m = 3;
const int n = 3;
const int lda = m;
// Host matrix (column-major)
double h_A[lda * n] = {
2.0, std::numeric_limits<double>::quiet_NaN(), 1.0,
4.0, std::numeric_limits<double>::quiet_NaN(), 0.0,
-2.0, std::numeric_limits<double>::quiet_NaN(), 2.0
};
// Device memory
double* d_A = nullptr;
int* d_Ipiv = nullptr;
int* d_info = nullptr;
CHECK_CUDA(cudaMalloc(&d_A, sizeof(double) * lda * n));
CHECK_CUDA(cudaMalloc(&d_Ipiv, sizeof(int) * std::min(m, n)));
CHECK_CUDA(cudaMalloc(&d_info, sizeof(int)));
CHECK_CUDA(cudaMemcpy(d_A, h_A,
sizeof(double) * lda * n,
cudaMemcpyHostToDevice));
// cuSOLVER handle
cusolverDnHandle_t handle;
CHECK_CUSOLVER(cusolverDnCreate(&handle));
// Query workspace size
int lwork = 0;
CHECK_CUSOLVER(
cusolverDnDgetrf_bufferSize(
handle, m, n, d_A, lda, &lwork));
double* d_work = nullptr;
CHECK_CUDA(cudaMalloc(&d_work, sizeof(double) * lwork));
// LU factorization
CHECK_CUSOLVER(
cusolverDnDgetrf(
handle,
m, n,
d_A, lda,
d_work,
d_Ipiv,
d_info));
// Copy results back
int h_info = 0;
CHECK_CUDA(cudaMemcpy(h_A, d_A,
sizeof(double) * lda * n,
cudaMemcpyDeviceToHost));
CHECK_CUDA(cudaMemcpy(&h_info, d_info,
sizeof(int),
cudaMemcpyDeviceToHost));
if (h_info != 0) {
printf("LU factorization failed: info = %d\n", h_info);
return EXIT_FAILURE;
}
// Print LU matrix
printf("LU (combined L and U):\n");
for (int i = 0; i < m; ++i) {
for (int j = 0; j < n; ++j) {
printf("%8.4f ", h_A[j * lda + i]);
}
printf("\n");
}
// Cleanup
cudaFree(d_A);
cudaFree(d_Ipiv);
cudaFree(d_info);
cudaFree(d_work);
cusolverDnDestroy(handle);
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment