Skip to content

Instantly share code, notes, and snippets.

@sohaibiftikhar
Last active December 4, 2025 13:01
Show Gist options
  • Select an option

  • Save sohaibiftikhar/d44d57a687c8f8d008bbd2edfd063206 to your computer and use it in GitHub Desktop.

Select an option

Save sohaibiftikhar/d44d57a687c8f8d008bbd2edfd063206 to your computer and use it in GitHub Desktop.
llvm_169061_csan_logs
module attributes {gpu.container_module} {
// External allocator declaration. @main calls it with a byte count and stores
// the returned pointer into both pointer fields of each buffer descriptor.
llvm.func @malloc(i64) -> !llvm.ptr
// Host entry point of the reproducer.
//
// Steps, in order:
//   1. malloc four f64 buffers and build a rank-2 descriptor for each
//      (struct fields follow the MLIR memref-descriptor layout: allocated
//      pointer, aligned pointer, offset, sizes[2], strides[2]):
//        buffer 1 (%12): sizes [8, 4], strides [4, 1], 32 elements
//        buffer 2 (%20): sizes [4, 8], strides [8, 1], 32 elements
//        buffer 3 (%30): sizes [8, 8], strides [8, 1], 64 elements
//        buffer 4 (%38): sizes [8, 8], strides [8, 1], 64 elements
//   2. Fill buffers 1 and 2 with 1.0 and buffer 3 with 3.14.
//      Buffer 4 is never written by the host.
//   3. Spill each descriptor to the stack and pass it to
//      @mgpuMemHostRegisterMemRef.
//   4. Launch @main_kernel::@main_kernel with 1 block of 32 threads.
//   5. Print buffer 4 through @printMemrefF64.
//
// NOTE(review): malloc results are not null-checked and the buffers are never
// freed or unregistered in this function.
// NOTE(review): nothing in this function requests a particular alignment for
// the buffers; they get whatever malloc returns.
llvm.func @main() {
  // ---- Constants -----------------------------------------------------------
  %0 = llvm.mlir.constant(2 : index) : i64 // Rank = 2
  %1 = llvm.mlir.poison : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
  %2 = llvm.mlir.zero : !llvm.ptr
  %3 = llvm.mlir.constant(1.000000e+00 : f64) : f64
  %4 = llvm.mlir.constant(3.140000e+00 : f64) : f64
  %5 = llvm.mlir.constant(0 : index) : i64
  %6 = llvm.mlir.constant(8 : index) : i64
  %7 = llvm.mlir.constant(4 : index) : i64
  %8 = llvm.mlir.constant(1 : index) : i64
  %9 = llvm.mlir.constant(32 : index) : i64

  // ---- Buffer 1: 8x4 f64 ---------------------------------------------------
  // Byte size of 32 f64 values, computed by indexing a null pointer and
  // converting the resulting address to an integer.
  %10 = llvm.getelementptr %2[32] : (!llvm.ptr) -> !llvm.ptr, f64
  %11 = llvm.ptrtoint %10 : !llvm.ptr to i64
  // Plain malloc, kept exactly as in the original reproducer.
  %12 = llvm.call @malloc(%11) : (i64) -> !llvm.ptr
  %13 = llvm.insertvalue %12, %1[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
  %14 = llvm.insertvalue %12, %13[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
  %15 = llvm.insertvalue %5, %14[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
  %16 = llvm.insertvalue %6, %15[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
  %17 = llvm.insertvalue %7, %16[3, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
  %18 = llvm.insertvalue %7, %17[4, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
  %19 = llvm.insertvalue %8, %18[4, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>

  // ---- Buffer 2: 4x8 f64 (same byte size as buffer 1) ----------------------
  %20 = llvm.call @malloc(%11) : (i64) -> !llvm.ptr
  %21 = llvm.insertvalue %20, %1[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
  %22 = llvm.insertvalue %20, %21[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
  %23 = llvm.insertvalue %5, %22[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
  %24 = llvm.insertvalue %7, %23[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
  %25 = llvm.insertvalue %6, %24[3, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
  %26 = llvm.insertvalue %6, %25[4, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
  %27 = llvm.insertvalue %8, %26[4, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>

  // ---- Buffer 3: 8x8 f64 ---------------------------------------------------
  // Byte size of 64 f64 values, same null-pointer idiom as above.
  %28 = llvm.getelementptr %2[64] : (!llvm.ptr) -> !llvm.ptr, f64
  %29 = llvm.ptrtoint %28 : !llvm.ptr to i64
  %30 = llvm.call @malloc(%29) : (i64) -> !llvm.ptr
  %31 = llvm.insertvalue %30, %1[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
  %32 = llvm.insertvalue %30, %31[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
  %33 = llvm.insertvalue %5, %32[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
  %34 = llvm.insertvalue %6, %33[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
  %35 = llvm.insertvalue %6, %34[3, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
  %36 = llvm.insertvalue %6, %35[4, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
  %37 = llvm.insertvalue %8, %36[4, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>

  // ---- Buffer 4: 8x8 f64 (same byte size as buffer 3; kernel output) -------
  %38 = llvm.call @malloc(%29) : (i64) -> !llvm.ptr
  %39 = llvm.insertvalue %38, %1[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
  %40 = llvm.insertvalue %38, %39[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
  %41 = llvm.insertvalue %5, %40[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
  %42 = llvm.insertvalue %6, %41[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
  %43 = llvm.insertvalue %6, %42[3, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
  %44 = llvm.insertvalue %6, %43[4, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
  %45 = llvm.insertvalue %8, %44[4, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>

  // ---- Fill buffers 1 and 2 with 1.0 ---------------------------------------
  // Outer loop: %46 runs over [0, 8). Inner loop: %48 runs over [0, 4).
  llvm.br ^bb1(%5 : i64)
^bb1(%46: i64): // 2 preds: ^bb0, ^bb4
  %47 = llvm.icmp "slt" %46, %6 : i64
  llvm.cond_br %47, ^bb2(%5 : i64), ^bb5(%5 : i64)
^bb2(%48: i64): // 2 preds: ^bb1, ^bb3
  %49 = llvm.icmp "slt" %48, %7 : i64
  llvm.cond_br %49, ^bb3, ^bb4
^bb3: // pred: ^bb2
  // buffer1[%46 * 4 + %48] = 1.0
  %50 = llvm.mul %46, %7 overflow<nsw, nuw> : i64
  %51 = llvm.add %50, %48 overflow<nsw, nuw> : i64
  %52 = llvm.getelementptr inbounds|nuw %12[%51] : (!llvm.ptr, i64) -> !llvm.ptr, f64
  llvm.store %3, %52 : f64, !llvm.ptr
  // buffer2[%48 * 8 + %46] = 1.0
  %53 = llvm.mul %48, %6 overflow<nsw, nuw> : i64
  %54 = llvm.add %53, %46 overflow<nsw, nuw> : i64
  %55 = llvm.getelementptr inbounds|nuw %20[%54] : (!llvm.ptr, i64) -> !llvm.ptr, f64
  llvm.store %3, %55 : f64, !llvm.ptr
  %56 = llvm.add %48, %8 : i64
  llvm.br ^bb2(%56 : i64)
^bb4: // pred: ^bb2
  %57 = llvm.add %46, %8 : i64
  llvm.br ^bb1(%57 : i64)

  // ---- Fill buffer 3 with 3.14 ---------------------------------------------
  // Outer loop: %58 runs over [0, 8). Inner loop: %60 runs over [0, 8).
^bb5(%58: i64): // 2 preds: ^bb1, ^bb8
  %59 = llvm.icmp "slt" %58, %6 : i64
  llvm.cond_br %59, ^bb6(%5 : i64), ^bb9
^bb6(%60: i64): // 2 preds: ^bb5, ^bb7
  %61 = llvm.icmp "slt" %60, %6 : i64
  llvm.cond_br %61, ^bb7, ^bb8
^bb7: // pred: ^bb6
  // buffer3[%58 * 8 + %60] = 3.14
  %62 = llvm.mul %58, %6 overflow<nsw, nuw> : i64
  %63 = llvm.add %62, %60 overflow<nsw, nuw> : i64
  %64 = llvm.getelementptr inbounds|nuw %30[%63] : (!llvm.ptr, i64) -> !llvm.ptr, f64
  llvm.store %4, %64 : f64, !llvm.ptr
  %65 = llvm.add %60, %8 : i64
  llvm.br ^bb6(%65 : i64)
^bb8: // pred: ^bb6
  %66 = llvm.add %58, %8 : i64
  llvm.br ^bb5(%66 : i64)

  // ---- Register, launch, print ---------------------------------------------
^bb9: // pred: ^bb5
  // Spill each descriptor to a stack slot so it can be passed by pointer.
  %67 = llvm.alloca %8 x !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> : (i64) -> !llvm.ptr
  llvm.store %19, %67 : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>, !llvm.ptr
  %68 = llvm.alloca %8 x !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> : (i64) -> !llvm.ptr
  llvm.store %27, %68 : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>, !llvm.ptr
  %69 = llvm.alloca %8 x !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> : (i64) -> !llvm.ptr
  llvm.store %37, %69 : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>, !llvm.ptr
  %70 = llvm.alloca %8 x !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> : (i64) -> !llvm.ptr
  llvm.store %45, %70 : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>, !llvm.ptr
  // Byte size of one f64 element (null-pointer idiom again).
  %71 = llvm.getelementptr %2[1] : (!llvm.ptr) -> !llvm.ptr, f64
  %72 = llvm.ptrtoint %71 : !llvm.ptr to i64
  // Each call passes: the rank (%0 = 2), a pointer to the stack copy of the
  // descriptor, and the element size in bytes (%72).
  // An earlier revision defined an unused `16 : index` constant here with a
  // comment suggesting it enforced alignment; it had no users and no effect,
  // so it has been removed.
  // NOTE(review): these calls return nothing, so a failed registration cannot
  // be detected from this function.
  llvm.call @mgpuMemHostRegisterMemRef(%0, %67, %72) : (i64, !llvm.ptr, i64) -> ()
  llvm.call @mgpuMemHostRegisterMemRef(%0, %68, %72) : (i64, !llvm.ptr, i64) -> ()
  llvm.call @mgpuMemHostRegisterMemRef(%0, %69, %72) : (i64, !llvm.ptr, i64) -> ()
  llvm.call @mgpuMemHostRegisterMemRef(%0, %70, %72) : (i64, !llvm.ptr, i64) -> ()
  // Launch with grid (1, 1, 1) and block (32, 1, 1). The 29 operands are, in
  // order: the 7 expanded descriptor fields of buffer 1, one extra index
  // (%5 = 0), then the 7 expanded descriptor fields of each of buffers 2, 3
  // and 4. The raw malloc pointers are passed unchanged.
  gpu.launch_func @main_kernel::@main_kernel blocks in (%8, %8, %8) threads in (%9, %8, %8) : i64 args(%12 : !llvm.ptr, %12 : !llvm.ptr, %5 : i64, %6 : i64, %7 : i64, %7 : i64, %8 : i64, %5 : i64, %20 : !llvm.ptr, %20 : !llvm.ptr, %5 : i64, %7 : i64, %6 : i64, %6 : i64, %8 : i64, %30 : !llvm.ptr, %30 : !llvm.ptr, %5 : i64, %6 : i64, %6 : i64, %6 : i64, %8 : i64, %38 : !llvm.ptr, %38 : !llvm.ptr, %5 : i64, %6 : i64, %6 : i64, %6 : i64, %8 : i64)
  // Print buffer 4 (rank 2) from its stack descriptor.
  llvm.call @printMemrefF64(%0, %70) : (i64, !llvm.ptr) -> ()
  llvm.return
}
// Precompiled device code for @main_kernel: PTX targeting sm_80, embedded as
// an escaped string (\0A = newline, \09 = tab).
//
// What the PTX does, per the text below:
//   - declares 29 parameters (main_kernel_param_0 .. main_kernel_param_28)
//     and `.maxntid 32, 1, 1`;
//   - reads param_7 and uses it to offset the base pointers:
//     param_1 + param_7 * 40, and param_9 / param_16 / param_23 + param_7 * 72;
//   - wmma.load.a from the param_1 address with stride operand 4;
//   - wmma.load.b from the param_9 address and wmma.load.c from the param_16
//     address, both with stride operand 8;
//   - wmma.mma m8n8k4 on f64 fragments;
//   - wmma.store.d to the param_23 address with stride operand 8.
//
// The assembly string must stay on a single physical line: a raw line break
// inside the literal (previously present after `%r2, `) is rejected by the
// MLIR lexer.
gpu.binary @main_kernel [#gpu.object<#nvvm.target<chip = "sm_80">, properties = {LLVMIRToISATimeInMs = 2 : i64, O = 2 : i32}, assembly = "//\0A// Generated by LLVM NVPTX Back-End\0A//\0A\0A.version 7.0\0A.target sm_80\0A.address_size 64\0A\0A\09// .globl\09main_kernel\0A\0A.visible .entry main_kernel(\0A\09.param .u64 .ptr .align 1 main_kernel_param_0,\0A\09.param .u64 .ptr .align 1 main_kernel_param_1,\0A\09.param .u64 main_kernel_param_2,\0A\09.param .u64 main_kernel_param_3,\0A\09.param .u64 main_kernel_param_4,\0A\09.param .u64 main_kernel_param_5,\0A\09.param .u64 main_kernel_param_6,\0A\09.param .u64 main_kernel_param_7,\0A\09.param .u64 .ptr .align 1 main_kernel_param_8,\0A\09.param .u64 .ptr .align 1 main_kernel_param_9,\0A\09.param .u64 main_kernel_param_10,\0A\09.param .u64 main_kernel_param_11,\0A\09.param .u64 main_kernel_param_12,\0A\09.param .u64 main_kernel_param_13,\0A\09.param .u64 main_kernel_param_14,\0A\09.param .u64 .ptr .align 1 main_kernel_param_15,\0A\09.param .u64 .ptr .align 1 main_kernel_param_16,\0A\09.param .u64 main_kernel_param_17,\0A\09.param .u64 main_kernel_param_18,\0A\09.param .u64 main_kernel_param_19,\0A\09.param .u64 main_kernel_param_20,\0A\09.param .u64 main_kernel_param_21,\0A\09.param .u64 .ptr .align 1 main_kernel_param_22,\0A\09.param .u64 .ptr .align 1 main_kernel_param_23,\0A\09.param .u64 main_kernel_param_24,\0A\09.param .u64 main_kernel_param_25,\0A\09.param .u64 main_kernel_param_26,\0A\09.param .u64 main_kernel_param_27,\0A\09.param .u64 main_kernel_param_28\0A)\0A.maxntid 32, 1, 1\0A{\0A\09.reg .b32 \09%r<3>;\0A\09.reg .b64 \09%rd<17>;\0A\0A\09ld.param.b64 \09%rd1, [main_kernel_param_1];\0A\09ld.param.b64 \09%rd2, [main_kernel_param_7];\0A\09mad.lo.s64 \09%rd3, %rd2, 40, %rd1;\0A\09mov.b32 \09%r1, 4;\0A\09wmma.load.a.sync.aligned.row.m8n8k4.f64 \09{%rd4}, [%rd3], %r1;\0A\09mul.lo.s64 \09%rd5, %rd2, 72;\0A\09ld.param.b64 \09%rd6, [main_kernel_param_9];\0A\09add.s64 \09%rd7, %rd6, %rd5;\0A\09mov.b32 \09%r2, 8;\0A\09wmma.load.b.sync.aligned.row.m8n8k4.f64 \09{%rd8}, [%rd7], %r2;\0A\09ld.param.b64 \09%rd9, [main_kernel_param_16];\0A\09add.s64 \09%rd10, %rd9, %rd5;\0A\09wmma.load.c.sync.aligned.row.m8n8k4.f64 \09{%rd11, %rd12}, [%rd10], %r2;\0A\09wmma.mma.sync.aligned.row.row.m8n8k4.f64.f64.f64.f64\0A\09\09{%rd13, %rd14},\0A\09\09{%rd4},\0A\09\09{%rd8},\0A\09\09{%rd11, %rd12};\0A\09ld.param.b64 \09%rd15, [main_kernel_param_23];\0A\09add.s64 \09%rd16, %rd15, %rd5;\0A\09wmma.store.d.sync.aligned.row.m8n8k4.f64 \09[%rd16],{%rd13, %rd14}, %r2;\0A\09ret;\0A\0A}\0A">]
// External printing helper. @main calls it with the rank constant (2) and a
// pointer to a stack copy of the fourth buffer's descriptor struct.
llvm.func @printMemrefF64(i64, !llvm.ptr) attributes {sym_visibility = "private"}
// External host-registration helper. @main calls it once per buffer with the
// rank constant (2), a pointer to a stack copy of that buffer's descriptor
// struct, and the byte size of one f64 element.
llvm.func @mgpuMemHostRegisterMemRef(i64, !llvm.ptr, i64)
}
========= COMPUTE-SANITIZER
========= Invalid __global__ write of size 16 bytes
========= at main_kernel+0x280
========= by thread (0,0,0) in block (0,0,0)
========= Address 0x560f5c3c6a20 is out of bounds
========= and is 17 bytes after the nearest allocation at 0x560f5c3c6910 of size 256 bytes
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame: [0x2f285f] in libcuda.so.1
========= Host Frame: mgpuLaunchKernel [0x491d] in libmlir_cuda_runtime.so
========= Host Frame: [0x297557a4a4db]
========= Host Frame: [0x297557a4a53c]
========= Host Frame: compileAndExecute((anonymous namespace)::Options&, mlir::Operation*, llvm::StringRef, (anonymous namespace)::CompileAndExecuteConfig, void**, std::unique_ptr<llvm::TargetMachine, std::default_delete<llvm::TargetMachine> >) [0x602aab9] in mlir-runner
========= Host Frame: compileAndExecuteVoidFunction((anonymous namespace)::Options&, mlir::Operation*, llvm::StringRef, (anonymous namespace)::CompileAndExecuteConfig, std::unique_ptr<llvm::TargetMachine, std::default_delete<llvm::TargetMachine> >) [0x602a4de] in mlir-runner
========= Host Frame: mlir::JitRunnerMain(int, char**, mlir::DialectRegistry const&, mlir::JitRunnerConfig) [0x6028cd6] in mlir-runner
========= Host Frame: main [0x4718a32] in mlir-runner
=========
========= Invalid __global__ write of size 16 bytes
========= at main_kernel+0x280
========= by thread (1,0,0) in block (0,0,0)
========= Address 0x560f5c3c6a30 is out of bounds
========= and is 33 bytes after the nearest allocation at 0x560f5c3c6910 of size 256 bytes
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame: [0x2f285f] in libcuda.so.1
========= Host Frame: mgpuLaunchKernel [0x491d] in libmlir_cuda_runtime.so
========= Host Frame: [0x297557a4a4db]
========= Host Frame: [0x297557a4a53c]
========= Host Frame: compileAndExecute((anonymous namespace)::Options&, mlir::Operation*, llvm::StringRef, (anonymous namespace)::CompileAndExecuteConfig, void**, std::unique_ptr<llvm::TargetMachine, std::default_delete<llvm::TargetMachine> >) [0x602aab9] in mlir-runner
========= Host Frame: compileAndExecuteVoidFunction((anonymous namespace)::Options&, mlir::Operation*, llvm::StringRef, (anonymous namespace)::CompileAndExecuteConfig, std::unique_ptr<llvm::TargetMachine, std::default_delete<llvm::TargetMachine> >) [0x602a4de] in mlir-runner
========= Host Frame: mlir::JitRunnerMain(int, char**, mlir::DialectRegistry const&, mlir::JitRunnerConfig) [0x6028cd6] in mlir-runner
========= Host Frame: main [0x4718a32] in mlir-runner
=========
========= Invalid __global__ write of size 16 bytes
========= at main_kernel+0x280
========= by thread (2,0,0) in block (0,0,0)
========= Address 0x560f5c3c6a40 is out of bounds
========= and is 49 bytes after the nearest allocation at 0x560f5c3c6910 of size 256 bytes
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame: [0x2f285f] in libcuda.so.1
========= Host Frame: mgpuLaunchKernel [0x491d] in libmlir_cuda_runtime.so
========= Host Frame: [0x297557a4a4db]
========= Host Frame: [0x297557a4a53c]
========= Host Frame: compileAndExecute((anonymous namespace)::Options&, mlir::Operation*, llvm::StringRef, (anonymous namespace)::CompileAndExecuteConfig, void**, std::unique_ptr<llvm::TargetMachine, std::default_delete<llvm::TargetMachine> >) [0x602aab9] in mlir-runner
========= Host Frame: compileAndExecuteVoidFunction((anonymous namespace)::Options&, mlir::Operation*, llvm::StringRef, (anonymous namespace)::CompileAndExecuteConfig, std::unique_ptr<llvm::TargetMachine, std::default_delete<llvm::TargetMachine> >) [0x602a4de] in mlir-runner
========= Host Frame: mlir::JitRunnerMain(int, char**, mlir::DialectRegistry const&, mlir::JitRunnerConfig) [0x6028cd6] in mlir-runner
========= Host Frame: main [0x4718a32] in mlir-runner
=========
========= Invalid __global__ write of size 16 bytes
========= at main_kernel+0x280
========= by thread (3,0,0) in block (0,0,0)
========= Address 0x560f5c3c6a50 is out of bounds
========= and is 65 bytes after the nearest allocation at 0x560f5c3c6910 of size 256 bytes
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame: [0x2f285f] in libcuda.so.1
========= Host Frame: mgpuLaunchKernel [0x491d] in libmlir_cuda_runtime.so
========= Host Frame: [0x297557a4a4db]
========= Host Frame: [0x297557a4a53c]
========= Host Frame: compileAndExecute((anonymous namespace)::Options&, mlir::Operation*, llvm::StringRef, (anonymous namespace)::CompileAndExecuteConfig, void**, std::unique_ptr<llvm::TargetMachine, std::default_delete<llvm::TargetMachine> >) [0x602aab9] in mlir-runner
========= Host Frame: compileAndExecuteVoidFunction((anonymous namespace)::Options&, mlir::Operation*, llvm::StringRef, (anonymous namespace)::CompileAndExecuteConfig, std::unique_ptr<llvm::TargetMachine, std::default_delete<llvm::TargetMachine> >) [0x602a4de] in mlir-runner
========= Host Frame: mlir::JitRunnerMain(int, char**, mlir::DialectRegistry const&, mlir::JitRunnerConfig) [0x6028cd6] in mlir-runner
========= Host Frame: main [0x4718a32] in mlir-runner
=========
========= Invalid __global__ write of size 16 bytes
========= at main_kernel+0x280
========= by thread (4,0,0) in block (0,0,0)
========= Address 0x560f5c3c6a60 is out of bounds
========= and is 81 bytes after the nearest allocation at 0x560f5c3c6910 of size 256 bytes
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame: [0x2f285f] in libcuda.so.1
========= Host Frame: mgpuLaunchKernel [0x491d] in libmlir_cuda_runtime.so
========= Host Frame: [0x297557a4a4db]
========= Host Frame: [0x297557a4a53c]
========= Host Frame: compileAndExecute((anonymous namespace)::Options&, mlir::Operation*, llvm::StringRef, (anonymous namespace)::CompileAndExecuteConfig, void**, std::unique_ptr<llvm::TargetMachine, std::default_delete<llvm::TargetMachine> >) [0x602aab9] in mlir-runner
========= Host Frame: compileAndExecuteVoidFunction((anonymous namespace)::Options&, mlir::Operation*, llvm::StringRef, (anonymous namespace)::CompileAndExecuteConfig, std::unique_ptr<llvm::TargetMachine, std::default_delete<llvm::TargetMachine> >) [0x602a4de] in mlir-runner
========= Host Frame: mlir::JitRunnerMain(int, char**, mlir::DialectRegistry const&, mlir::JitRunnerConfig) [0x6028cd6] in mlir-runner
========= Host Frame: main [0x4718a32] in mlir-runner
=========
========= Invalid __global__ write of size 16 bytes
========= at main_kernel+0x280
========= by thread (5,0,0) in block (0,0,0)
========= Address 0x560f5c3c6a70 is out of bounds
========= and is 97 bytes after the nearest allocation at 0x560f5c3c6910 of size 256 bytes
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame: [0x2f285f] in libcuda.so.1
========= Host Frame: mgpuLaunchKernel [0x491d] in libmlir_cuda_runtime.so
========= Host Frame: [0x297557a4a4db]
========= Host Frame: [0x297557a4a53c]
========= Host Frame: compileAndExecute((anonymous namespace)::Options&, mlir::Operation*, llvm::StringRef, (anonymous namespace)::CompileAndExecuteConfig, void**, std::unique_ptr<llvm::TargetMachine, std::default_delete<llvm::TargetMachine> >) [0x602aab9] in mlir-runner
========= Host Frame: compileAndExecuteVoidFunction((anonymous namespace)::Options&, mlir::Operation*, llvm::StringRef, (anonymous namespace)::CompileAndExecuteConfig, std::unique_ptr<llvm::TargetMachine, std::default_delete<llvm::TargetMachine> >) [0x602a4de] in mlir-runner
========= Host Frame: mlir::JitRunnerMain(int, char**, mlir::DialectRegistry const&, mlir::JitRunnerConfig) [0x6028cd6] in mlir-runner
========= Host Frame: main [0x4718a32] in mlir-runner
=========
========= Invalid __global__ write of size 16 bytes
========= at main_kernel+0x280
========= by thread (6,0,0) in block (0,0,0)
========= Address 0x560f5c3c6a80 is out of bounds
========= and is 113 bytes after the nearest allocation at 0x560f5c3c6910 of size 256 bytes
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame: [0x2f285f] in libcuda.so.1
========= Host Frame: mgpuLaunchKernel [0x491d] in libmlir_cuda_runtime.so
========= Host Frame: [0x297557a4a4db]
========= Host Frame: [0x297557a4a53c]
========= Host Frame: compileAndExecute((anonymous namespace)::Options&, mlir::Operation*, llvm::StringRef, (anonymous namespace)::CompileAndExecuteConfig, void**, std::unique_ptr<llvm::TargetMachine, std::default_delete<llvm::TargetMachine> >) [0x602aab9] in mlir-runner
========= Host Frame: compileAndExecuteVoidFunction((anonymous namespace)::Options&, mlir::Operation*, llvm::StringRef, (anonymous namespace)::CompileAndExecuteConfig, std::unique_ptr<llvm::TargetMachine, std::default_delete<llvm::TargetMachine> >) [0x602a4de] in mlir-runner
========= Host Frame: mlir::JitRunnerMain(int, char**, mlir::DialectRegistry const&, mlir::JitRunnerConfig) [0x6028cd6] in mlir-runner
========= Host Frame: main [0x4718a32] in mlir-runner
=========
========= Invalid __global__ write of size 16 bytes
========= at main_kernel+0x280
========= by thread (7,0,0) in block (0,0,0)
========= Address 0x560f5c3c6a90 is out of bounds
========= and is 129 bytes after the nearest allocation at 0x560f5c3c6910 of size 256 bytes
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame: [0x2f285f] in libcuda.so.1
========= Host Frame: mgpuLaunchKernel [0x491d] in libmlir_cuda_runtime.so
========= Host Frame: [0x297557a4a4db]
========= Host Frame: [0x297557a4a53c]
========= Host Frame: compileAndExecute((anonymous namespace)::Options&, mlir::Operation*, llvm::StringRef, (anonymous namespace)::CompileAndExecuteConfig, void**, std::unique_ptr<llvm::TargetMachine, std::default_delete<llvm::TargetMachine> >) [0x602aab9] in mlir-runner
========= Host Frame: compileAndExecuteVoidFunction((anonymous namespace)::Options&, mlir::Operation*, llvm::StringRef, (anonymous namespace)::CompileAndExecuteConfig, std::unique_ptr<llvm::TargetMachine, std::default_delete<llvm::TargetMachine> >) [0x602a4de] in mlir-runner
========= Host Frame: mlir::JitRunnerMain(int, char**, mlir::DialectRegistry const&, mlir::JitRunnerConfig) [0x6028cd6] in mlir-runner
========= Host Frame: main [0x4718a32] in mlir-runner
=========
========= Invalid __global__ write of size 16 bytes
========= at main_kernel+0x280
========= by thread (8,0,0) in block (0,0,0)
========= Address 0x560f5c3c6aa0 is out of bounds
========= and is 145 bytes after the nearest allocation at 0x560f5c3c6910 of size 256 bytes
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame: [0x2f285f] in libcuda.so.1
========= Host Frame: mgpuLaunchKernel [0x491d] in libmlir_cuda_runtime.so
========= Host Frame: [0x297557a4a4db]
========= Host Frame: [0x297557a4a53c]
========= Host Frame: compileAndExecute((anonymous namespace)::Options&, mlir::Operation*, llvm::StringRef, (anonymous namespace)::CompileAndExecuteConfig, void**, std::unique_ptr<llvm::TargetMachine, std::default_delete<llvm::TargetMachine> >) [0x602aab9] in mlir-runner
========= Host Frame: compileAndExecuteVoidFunction((anonymous namespace)::Options&, mlir::Operation*, llvm::StringRef, (anonymous namespace)::CompileAndExecuteConfig, std::unique_ptr<llvm::TargetMachine, std::default_delete<llvm::TargetMachine> >) [0x602a4de] in mlir-runner
========= Host Frame: mlir::JitRunnerMain(int, char**, mlir::DialectRegistry const&, mlir::JitRunnerConfig) [0x6028cd6] in mlir-runner
========= Host Frame: main [0x4718a32] in mlir-runner
=========
========= Invalid __global__ write of size 16 bytes
========= at main_kernel+0x280
========= by thread (9,0,0) in block (0,0,0)
========= Address 0x560f5c3c6ab0 is out of bounds
========= and is 161 bytes after the nearest allocation at 0x560f5c3c6910 of size 256 bytes
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame: [0x2f285f] in libcuda.so.1
========= Host Frame: mgpuLaunchKernel [0x491d] in libmlir_cuda_runtime.so
========= Host Frame: [0x297557a4a4db]
========= Host Frame: [0x297557a4a53c]
========= Host Frame: compileAndExecute((anonymous namespace)::Options&, mlir::Operation*, llvm::StringRef, (anonymous namespace)::CompileAndExecuteConfig, void**, std::unique_ptr<llvm::TargetMachine, std::default_delete<llvm::TargetMachine> >) [0x602aab9] in mlir-runner
========= Host Frame: compileAndExecuteVoidFunction((anonymous namespace)::Options&, mlir::Operation*, llvm::StringRef, (anonymous namespace)::CompileAndExecuteConfig, std::unique_ptr<llvm::TargetMachine, std::default_delete<llvm::TargetMachine> >) [0x602a4de] in mlir-runner
========= Host Frame: mlir::JitRunnerMain(int, char**, mlir::DialectRegistry const&, mlir::JitRunnerConfig) [0x6028cd6] in mlir-runner
========= Host Frame: main [0x4718a32] in mlir-runner
=========
========= Invalid __global__ write of size 16 bytes
========= at main_kernel+0x280
========= by thread (10,0,0) in block (0,0,0)
========= Address 0x560f5c3c6ac0 is out of bounds
========= and is 177 bytes after the nearest allocation at 0x560f5c3c6910 of size 256 bytes
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame: [0x2f285f] in libcuda.so.1
========= Host Frame: mgpuLaunchKernel [0x491d] in libmlir_cuda_runtime.so
========= Host Frame: [0x297557a4a4db]
========= Host Frame: [0x297557a4a53c]
========= Host Frame: compileAndExecute((anonymous namespace)::Options&, mlir::Operation*, llvm::StringRef, (anonymous namespace)::CompileAndExecuteConfig, void**, std::unique_ptr<llvm::TargetMachine, std::default_delete<llvm::TargetMachine> >) [0x602aab9] in mlir-runner
========= Host Frame: compileAndExecuteVoidFunction((anonymous namespace)::Options&, mlir::Operation*, llvm::StringRef, (anonymous namespace)::CompileAndExecuteConfig, std::unique_ptr<llvm::TargetMachine, std::default_delete<llvm::TargetMachine> >) [0x602a4de] in mlir-runner
========= Host Frame: mlir::JitRunnerMain(int, char**, mlir::DialectRegistry const&, mlir::JitRunnerConfig) [0x6028cd6] in mlir-runner
========= Host Frame: main [0x4718a32] in mlir-runner
=========
========= Invalid __global__ write of size 16 bytes
========= at main_kernel+0x280
========= by thread (11,0,0) in block (0,0,0)
========= Address 0x560f5c3c6ad0 is out of bounds
========= and is 193 bytes after the nearest allocation at 0x560f5c3c6910 of size 256 bytes
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame: [0x2f285f] in libcuda.so.1
========= Host Frame: mgpuLaunchKernel [0x491d] in libmlir_cuda_runtime.so
========= Host Frame: [0x297557a4a4db]
========= Host Frame: [0x297557a4a53c]
========= Host Frame: compileAndExecute((anonymous namespace)::Options&, mlir::Operation*, llvm::StringRef, (anonymous namespace)::CompileAndExecuteConfig, void**, std::unique_ptr<llvm::TargetMachine, std::default_delete<llvm::TargetMachine> >) [0x602aab9] in mlir-runner
========= Host Frame: compileAndExecuteVoidFunction((anonymous namespace)::Options&, mlir::Operation*, llvm::StringRef, (anonymous namespace)::CompileAndExecuteConfig, std::unique_ptr<llvm::TargetMachine, std::default_delete<llvm::TargetMachine> >) [0x602a4de] in mlir-runner
========= Host Frame: mlir::JitRunnerMain(int, char**, mlir::DialectRegistry const&, mlir::JitRunnerConfig) [0x6028cd6] in mlir-runner
========= Host Frame: main [0x4718a32] in mlir-runner
=========
========= Invalid __global__ write of size 16 bytes
========= at main_kernel+0x280
========= by thread (12,0,0) in block (0,0,0)
========= Address 0x560f5c3c6ae0 is out of bounds
========= and is 209 bytes after the nearest allocation at 0x560f5c3c6910 of size 256 bytes
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame: [0x2f285f] in libcuda.so.1
========= Host Frame: mgpuLaunchKernel [0x491d] in libmlir_cuda_runtime.so
========= Host Frame: [0x297557a4a4db]
========= Host Frame: [0x297557a4a53c]
========= Host Frame: compileAndExecute((anonymous namespace)::Options&, mlir::Operation*, llvm::StringRef, (anonymous namespace)::CompileAndExecuteConfig, void**, std::unique_ptr<llvm::TargetMachine, std::default_delete<llvm::TargetMachine> >) [0x602aab9] in mlir-runner
========= Host Frame: compileAndExecuteVoidFunction((anonymous namespace)::Options&, mlir::Operation*, llvm::StringRef, (anonymous namespace)::CompileAndExecuteConfig, std::unique_ptr<llvm::TargetMachine, std::default_delete<llvm::TargetMachine> >) [0x602a4de] in mlir-runner
========= Host Frame: mlir::JitRunnerMain(int, char**, mlir::DialectRegistry const&, mlir::JitRunnerConfig) [0x6028cd6] in mlir-runner
========= Host Frame: main [0x4718a32] in mlir-runner
=========
========= Invalid __global__ write of size 16 bytes
========= at main_kernel+0x280
========= by thread (13,0,0) in block (0,0,0)
========= Address 0x560f5c3c6af0 is out of bounds
========= and is 225 bytes after the nearest allocation at 0x560f5c3c6910 of size 256 bytes
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame: [0x2f285f] in libcuda.so.1
========= Host Frame: mgpuLaunchKernel [0x491d] in libmlir_cuda_runtime.so
========= Host Frame: [0x297557a4a4db]
========= Host Frame: [0x297557a4a53c]
========= Host Frame: compileAndExecute((anonymous namespace)::Options&, mlir::Operation*, llvm::StringRef, (anonymous namespace)::CompileAndExecuteConfig, void**, std::unique_ptr<llvm::TargetMachine, std::default_delete<llvm::TargetMachine> >) [0x602aab9] in mlir-runner
========= Host Frame: compileAndExecuteVoidFunction((anonymous namespace)::Options&, mlir::Operation*, llvm::StringRef, (anonymous namespace)::CompileAndExecuteConfig, std::unique_ptr<llvm::TargetMachine, std::default_delete<llvm::TargetMachine> >) [0x602a4de] in mlir-runner
========= Host Frame: mlir::JitRunnerMain(int, char**, mlir::DialectRegistry const&, mlir::JitRunnerConfig) [0x6028cd6] in mlir-runner
========= Host Frame: main [0x4718a32] in mlir-runner
=========
========= Invalid __global__ write of size 16 bytes
========= at main_kernel+0x280
========= by thread (14,0,0) in block (0,0,0)
========= Address 0x560f5c3c6b00 is out of bounds
========= and is 241 bytes after the nearest allocation at 0x560f5c3c6910 of size 256 bytes
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame: [0x2f285f] in libcuda.so.1
========= Host Frame: mgpuLaunchKernel [0x491d] in libmlir_cuda_runtime.so
========= Host Frame: [0x297557a4a4db]
========= Host Frame: [0x297557a4a53c]
========= Host Frame: compileAndExecute((anonymous namespace)::Options&, mlir::Operation*, llvm::StringRef, (anonymous namespace)::CompileAndExecuteConfig, void**, std::unique_ptr<llvm::TargetMachine, std::default_delete<llvm::TargetMachine> >) [0x602aab9] in mlir-runner
========= Host Frame: compileAndExecuteVoidFunction((anonymous namespace)::Options&, mlir::Operation*, llvm::StringRef, (anonymous namespace)::CompileAndExecuteConfig, std::unique_ptr<llvm::TargetMachine, std::default_delete<llvm::TargetMachine> >) [0x602a4de] in mlir-runner
========= Host Frame: mlir::JitRunnerMain(int, char**, mlir::DialectRegistry const&, mlir::JitRunnerConfig) [0x6028cd6] in mlir-runner
========= Host Frame: main [0x4718a32] in mlir-runner
=========
========= Invalid __global__ write of size 16 bytes
========= at main_kernel+0x280
========= by thread (15,0,0) in block (0,0,0)
========= Address 0x560f5c3c6b10 is out of bounds
========= and is 257 bytes after the nearest allocation at 0x560f5c3c6910 of size 256 bytes
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame: [0x2f285f] in libcuda.so.1
========= Host Frame: mgpuLaunchKernel [0x491d] in libmlir_cuda_runtime.so
========= Host Frame: [0x297557a4a4db]
========= Host Frame: [0x297557a4a53c]
========= Host Frame: compileAndExecute((anonymous namespace)::Options&, mlir::Operation*, llvm::StringRef, (anonymous namespace)::CompileAndExecuteConfig, void**, std::unique_ptr<llvm::TargetMachine, std::default_delete<llvm::TargetMachine> >) [0x602aab9] in mlir-runner
========= Host Frame: compileAndExecuteVoidFunction((anonymous namespace)::Options&, mlir::Operation*, llvm::StringRef, (anonymous namespace)::CompileAndExecuteConfig, std::unique_ptr<llvm::TargetMachine, std::default_delete<llvm::TargetMachine> >) [0x602a4de] in mlir-runner
========= Host Frame: mlir::JitRunnerMain(int, char**, mlir::DialectRegistry const&, mlir::JitRunnerConfig) [0x6028cd6] in mlir-runner
========= Host Frame: main [0x4718a32] in mlir-runner
=========
========= Invalid __global__ write of size 16 bytes
========= at main_kernel+0x280
========= by thread (16,0,0) in block (0,0,0)
========= Address 0x560f5c3c6b20 is out of bounds
========= and is 273 bytes after the nearest allocation at 0x560f5c3c6910 of size 256 bytes
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame: [0x2f285f] in libcuda.so.1
========= Host Frame: mgpuLaunchKernel [0x491d] in libmlir_cuda_runtime.so
========= Host Frame: [0x297557a4a4db]
========= Host Frame: [0x297557a4a53c]
========= Host Frame: compileAndExecute((anonymous namespace)::Options&, mlir::Operation*, llvm::StringRef, (anonymous namespace)::CompileAndExecuteConfig, void**, std::unique_ptr<llvm::TargetMachine, std::default_delete<llvm::TargetMachine> >) [0x602aab9] in mlir-runner
========= Host Frame: compileAndExecuteVoidFunction((anonymous namespace)::Options&, mlir::Operation*, llvm::StringRef, (anonymous namespace)::CompileAndExecuteConfig, std::unique_ptr<llvm::TargetMachine, std::default_delete<llvm::TargetMachine> >) [0x602a4de] in mlir-runner
========= Host Frame: mlir::JitRunnerMain(int, char**, mlir::DialectRegistry const&, mlir::JitRunnerConfig) [0x6028cd6] in mlir-runner
========= Host Frame: main [0x4718a32] in mlir-runner
=========
========= Invalid __global__ write of size 16 bytes
========= at main_kernel+0x280
========= by thread (17,0,0) in block (0,0,0)
========= Address 0x560f5c3c6b30 is out of bounds
========= and is 289 bytes after the nearest allocation at 0x560f5c3c6910 of size 256 bytes
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame: [0x2f285f] in libcuda.so.1
========= Host Frame: mgpuLaunchKernel [0x491d] in libmlir_cuda_runtime.so
========= Host Frame: [0x297557a4a4db]
========= Host Frame: [0x297557a4a53c]
========= Host Frame: compileAndExecute((anonymous namespace)::Options&, mlir::Operation*, llvm::StringRef, (anonymous namespace)::CompileAndExecuteConfig, void**, std::unique_ptr<llvm::TargetMachine, std::default_delete<llvm::TargetMachine> >) [0x602aab9] in mlir-runner
========= Host Frame: compileAndExecuteVoidFunction((anonymous namespace)::Options&, mlir::Operation*, llvm::StringRef, (anonymous namespace)::CompileAndExecuteConfig, std::unique_ptr<llvm::TargetMachine, std::default_delete<llvm::TargetMachine> >) [0x602a4de] in mlir-runner
========= Host Frame: mlir::JitRunnerMain(int, char**, mlir::DialectRegistry const&, mlir::JitRunnerConfig) [0x6028cd6] in mlir-runner
========= Host Frame: main [0x4718a32] in mlir-runner
=========
========= Invalid __global__ write of size 16 bytes
========= at main_kernel+0x280
========= by thread (18,0,0) in block (0,0,0)
========= Address 0x560f5c3c6b40 is out of bounds
========= and is 305 bytes after the nearest allocation at 0x560f5c3c6910 of size 256 bytes
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame: [0x2f285f] in libcuda.so.1
========= Host Frame: mgpuLaunchKernel [0x491d] in libmlir_cuda_runtime.so
========= Host Frame: [0x297557a4a4db]
========= Host Frame: [0x297557a4a53c]
========= Host Frame: compileAndExecute((anonymous namespace)::Options&, mlir::Operation*, llvm::StringRef, (anonymous namespace)::CompileAndExecuteConfig, void**, std::unique_ptr<llvm::TargetMachine, std::default_delete<llvm::TargetMachine> >) [0x602aab9] in mlir-runner
========= Host Frame: compileAndExecuteVoidFunction((anonymous namespace)::Options&, mlir::Operation*, llvm::StringRef, (anonymous namespace)::CompileAndExecuteConfig, std::unique_ptr<llvm::TargetMachine, std::default_delete<llvm::TargetMachine> >) [0x602a4de] in mlir-runner
========= Host Frame: mlir::JitRunnerMain(int, char**, mlir::DialectRegistry const&, mlir::JitRunnerConfig) [0x6028cd6] in mlir-runner
========= Host Frame: main [0x4718a32] in mlir-runner
=========
========= Invalid __global__ write of size 16 bytes
========= at main_kernel+0x280
========= by thread (19,0,0) in block (0,0,0)
========= Address 0x560f5c3c6b50 is out of bounds
========= and is 321 bytes after the nearest allocation at 0x560f5c3c6910 of size 256 bytes
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame: [0x2f285f] in libcuda.so.1
========= Host Frame: mgpuLaunchKernel [0x491d] in libmlir_cuda_runtime.so
========= Host Frame: [0x297557a4a4db]
========= Host Frame: [0x297557a4a53c]
========= Host Frame: compileAndExecute((anonymous namespace)::Options&, mlir::Operation*, llvm::StringRef, (anonymous namespace)::CompileAndExecuteConfig, void**, std::unique_ptr<llvm::TargetMachine, std::default_delete<llvm::TargetMachine> >) [0x602aab9] in mlir-runner
========= Host Frame: compileAndExecuteVoidFunction((anonymous namespace)::Options&, mlir::Operation*, llvm::StringRef, (anonymous namespace)::CompileAndExecuteConfig, std::unique_ptr<llvm::TargetMachine, std::default_delete<llvm::TargetMachine> >) [0x602a4de] in mlir-runner
========= Host Frame: mlir::JitRunnerMain(int, char**, mlir::DialectRegistry const&, mlir::JitRunnerConfig) [0x6028cd6] in mlir-runner
========= Host Frame: main [0x4718a32] in mlir-runner
=========
========= Invalid __global__ write of size 16 bytes
========= at main_kernel+0x280
========= by thread (20,0,0) in block (0,0,0)
========= Address 0x560f5c3c6b60 is out of bounds
========= and is 337 bytes after the nearest allocation at 0x560f5c3c6910 of size 256 bytes
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame: [0x2f285f] in libcuda.so.1
========= Host Frame: mgpuLaunchKernel [0x491d] in libmlir_cuda_runtime.so
========= Host Frame: [0x297557a4a4db]
========= Host Frame: [0x297557a4a53c]
========= Host Frame: compileAndExecute((anonymous namespace)::Options&, mlir::Operation*, llvm::StringRef, (anonymous namespace)::CompileAndExecuteConfig, void**, std::unique_ptr<llvm::TargetMachine, std::default_delete<llvm::TargetMachine> >) [0x602aab9] in mlir-runner
========= Host Frame: compileAndExecuteVoidFunction((anonymous namespace)::Options&, mlir::Operation*, llvm::StringRef, (anonymous namespace)::CompileAndExecuteConfig, std::unique_ptr<llvm::TargetMachine, std::default_delete<llvm::TargetMachine> >) [0x602a4de] in mlir-runner
========= Host Frame: mlir::JitRunnerMain(int, char**, mlir::DialectRegistry const&, mlir::JitRunnerConfig) [0x6028cd6] in mlir-runner
========= Host Frame: main [0x4718a32] in mlir-runner
=========
========= Invalid __global__ write of size 16 bytes
========= at main_kernel+0x280
========= by thread (21,0,0) in block (0,0,0)
========= Address 0x560f5c3c6b70 is out of bounds
========= and is 353 bytes after the nearest allocation at 0x560f5c3c6910 of size 256 bytes
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame: [0x2f285f] in libcuda.so.1
========= Host Frame: mgpuLaunchKernel [0x491d] in libmlir_cuda_runtime.so
========= Host Frame: [0x297557a4a4db]
========= Host Frame: [0x297557a4a53c]
========= Host Frame: compileAndExecute((anonymous namespace)::Options&, mlir::Operation*, llvm::StringRef, (anonymous namespace)::CompileAndExecuteConfig, void**, std::unique_ptr<llvm::TargetMachine, std::default_delete<llvm::TargetMachine> >) [0x602aab9] in mlir-runner
========= Host Frame: compileAndExecuteVoidFunction((anonymous namespace)::Options&, mlir::Operation*, llvm::StringRef, (anonymous namespace)::CompileAndExecuteConfig, std::unique_ptr<llvm::TargetMachine, std::default_delete<llvm::TargetMachine> >) [0x602a4de] in mlir-runner
========= Host Frame: mlir::JitRunnerMain(int, char**, mlir::DialectRegistry const&, mlir::JitRunnerConfig) [0x6028cd6] in mlir-runner
========= Host Frame: main [0x4718a32] in mlir-runner
=========
========= Invalid __global__ write of size 16 bytes
========= at main_kernel+0x280
========= by thread (22,0,0) in block (0,0,0)
========= Address 0x560f5c3c6b80 is out of bounds
========= and is 369 bytes after the nearest allocation at 0x560f5c3c6910 of size 256 bytes
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame: [0x2f285f] in libcuda.so.1
========= Host Frame: mgpuLaunchKernel [0x491d] in libmlir_cuda_runtime.so
========= Host Frame: [0x297557a4a4db]
========= Host Frame: [0x297557a4a53c]
========= Host Frame: compileAndExecute((anonymous namespace)::Options&, mlir::Operation*, llvm::StringRef, (anonymous namespace)::CompileAndExecuteConfig, void**, std::unique_ptr<llvm::TargetMachine, std::default_delete<llvm::TargetMachine> >) [0x602aab9] in mlir-runner
========= Host Frame: compileAndExecuteVoidFunction((anonymous namespace)::Options&, mlir::Operation*, llvm::StringRef, (anonymous namespace)::CompileAndExecuteConfig, std::unique_ptr<llvm::TargetMachine, std::default_delete<llvm::TargetMachine> >) [0x602a4de] in mlir-runner
========= Host Frame: mlir::JitRunnerMain(int, char**, mlir::DialectRegistry const&, mlir::JitRunnerConfig) [0x6028cd6] in mlir-runner
========= Host Frame: main [0x4718a32] in mlir-runner
=========
========= Invalid __global__ write of size 16 bytes
========= at main_kernel+0x280
========= by thread (23,0,0) in block (0,0,0)
========= Address 0x560f5c3c6b90 is out of bounds
========= and is 385 bytes after the nearest allocation at 0x560f5c3c6910 of size 256 bytes
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame: [0x2f285f] in libcuda.so.1
========= Host Frame: mgpuLaunchKernel [0x491d] in libmlir_cuda_runtime.so
========= Host Frame: [0x297557a4a4db]
========= Host Frame: [0x297557a4a53c]
========= Host Frame: compileAndExecute((anonymous namespace)::Options&, mlir::Operation*, llvm::StringRef, (anonymous namespace)::CompileAndExecuteConfig, void**, std::unique_ptr<llvm::TargetMachine, std::default_delete<llvm::TargetMachine> >) [0x602aab9] in mlir-runner
========= Host Frame: compileAndExecuteVoidFunction((anonymous namespace)::Options&, mlir::Operation*, llvm::StringRef, (anonymous namespace)::CompileAndExecuteConfig, std::unique_ptr<llvm::TargetMachine, std::default_delete<llvm::TargetMachine> >) [0x602a4de] in mlir-runner
========= Host Frame: mlir::JitRunnerMain(int, char**, mlir::DialectRegistry const&, mlir::JitRunnerConfig) [0x6028cd6] in mlir-runner
========= Host Frame: main [0x4718a32] in mlir-runner
=========
========= Invalid __global__ write of size 16 bytes
========= at main_kernel+0x280
========= by thread (24,0,0) in block (0,0,0)
========= Address 0x560f5c3c6ba0 is out of bounds
========= and is 401 bytes after the nearest allocation at 0x560f5c3c6910 of size 256 bytes
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame: [0x2f285f] in libcuda.so.1
========= Host Frame: mgpuLaunchKernel [0x491d] in libmlir_cuda_runtime.so
========= Host Frame: [0x297557a4a4db]
========= Host Frame: [0x297557a4a53c]
========= Host Frame: compileAndExecute((anonymous namespace)::Options&, mlir::Operation*, llvm::StringRef, (anonymous namespace)::CompileAndExecuteConfig, void**, std::unique_ptr<llvm::TargetMachine, std::default_delete<llvm::TargetMachine> >) [0x602aab9] in mlir-runner
========= Host Frame: compileAndExecuteVoidFunction((anonymous namespace)::Options&, mlir::Operation*, llvm::StringRef, (anonymous namespace)::CompileAndExecuteConfig, std::unique_ptr<llvm::TargetMachine, std::default_delete<llvm::TargetMachine> >) [0x602a4de] in mlir-runner
========= Host Frame: mlir::JitRunnerMain(int, char**, mlir::DialectRegistry const&, mlir::JitRunnerConfig) [0x6028cd6] in mlir-runner
========= Host Frame: main [0x4718a32] in mlir-runner
=========
========= Invalid __global__ write of size 16 bytes
========= at main_kernel+0x280
========= by thread (25,0,0) in block (0,0,0)
========= Address 0x560f5c3c6bb0 is out of bounds
========= and is 417 bytes after the nearest allocation at 0x560f5c3c6910 of size 256 bytes
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame: [0x2f285f] in libcuda.so.1
========= Host Frame: mgpuLaunchKernel [0x491d] in libmlir_cuda_runtime.so
========= Host Frame: [0x297557a4a4db]
========= Host Frame: [0x297557a4a53c]
========= Host Frame: compileAndExecute((anonymous namespace)::Options&, mlir::Operation*, llvm::StringRef, (anonymous namespace)::CompileAndExecuteConfig, void**, std::unique_ptr<llvm::TargetMachine, std::default_delete<llvm::TargetMachine> >) [0x602aab9] in mlir-runner
========= Host Frame: compileAndExecuteVoidFunction((anonymous namespace)::Options&, mlir::Operation*, llvm::StringRef, (anonymous namespace)::CompileAndExecuteConfig, std::unique_ptr<llvm::TargetMachine, std::default_delete<llvm::TargetMachine> >) [0x602a4de] in mlir-runner
========= Host Frame: mlir::JitRunnerMain(int, char**, mlir::DialectRegistry const&, mlir::JitRunnerConfig) [0x6028cd6] in mlir-runner
========= Host Frame: main [0x4718a32] in mlir-runner
=========
========= Invalid __global__ write of size 16 bytes
========= at main_kernel+0x280
========= by thread (26,0,0) in block (0,0,0)
========= Address 0x560f5c3c6bc0 is out of bounds
========= and is 433 bytes after the nearest allocation at 0x560f5c3c6910 of size 256 bytes
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame: [0x2f285f] in libcuda.so.1
========= Host Frame: mgpuLaunchKernel [0x491d] in libmlir_cuda_runtime.so
========= Host Frame: [0x297557a4a4db]
========= Host Frame: [0x297557a4a53c]
========= Host Frame: compileAndExecute((anonymous namespace)::Options&, mlir::Operation*, llvm::StringRef, (anonymous namespace)::CompileAndExecuteConfig, void**, std::unique_ptr<llvm::TargetMachine, std::default_delete<llvm::TargetMachine> >) [0x602aab9] in mlir-runner
========= Host Frame: compileAndExecuteVoidFunction((anonymous namespace)::Options&, mlir::Operation*, llvm::StringRef, (anonymous namespace)::CompileAndExecuteConfig, std::unique_ptr<llvm::TargetMachine, std::default_delete<llvm::TargetMachine> >) [0x602a4de] in mlir-runner
========= Host Frame: mlir::JitRunnerMain(int, char**, mlir::DialectRegistry const&, mlir::JitRunnerConfig) [0x6028cd6] in mlir-runner
========= Host Frame: main [0x4718a32] in mlir-runner
=========
========= Invalid __global__ write of size 16 bytes
========= at main_kernel+0x280
========= by thread (27,0,0) in block (0,0,0)
========= Address 0x560f5c3c6bd0 is out of bounds
========= and is 449 bytes after the nearest allocation at 0x560f5c3c6910 of size 256 bytes
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame: [0x2f285f] in libcuda.so.1
========= Host Frame: mgpuLaunchKernel [0x491d] in libmlir_cuda_runtime.so
========= Host Frame: [0x297557a4a4db]
========= Host Frame: [0x297557a4a53c]
========= Host Frame: compileAndExecute((anonymous namespace)::Options&, mlir::Operation*, llvm::StringRef, (anonymous namespace)::CompileAndExecuteConfig, void**, std::unique_ptr<llvm::TargetMachine, std::default_delete<llvm::TargetMachine> >) [0x602aab9] in mlir-runner
========= Host Frame: compileAndExecuteVoidFunction((anonymous namespace)::Options&, mlir::Operation*, llvm::StringRef, (anonymous namespace)::CompileAndExecuteConfig, std::unique_ptr<llvm::TargetMachine, std::default_delete<llvm::TargetMachine> >) [0x602a4de] in mlir-runner
========= Host Frame: mlir::JitRunnerMain(int, char**, mlir::DialectRegistry const&, mlir::JitRunnerConfig) [0x6028cd6] in mlir-runner
========= Host Frame: main [0x4718a32] in mlir-runner
=========
========= Invalid __global__ write of size 16 bytes
========= at main_kernel+0x280
========= by thread (28,0,0) in block (0,0,0)
========= Address 0x560f5c3c6be0 is out of bounds
========= and is 465 bytes after the nearest allocation at 0x560f5c3c6910 of size 256 bytes
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame: [0x2f285f] in libcuda.so.1
========= Host Frame: mgpuLaunchKernel [0x491d] in libmlir_cuda_runtime.so
========= Host Frame: [0x297557a4a4db]
========= Host Frame: [0x297557a4a53c]
========= Host Frame: compileAndExecute((anonymous namespace)::Options&, mlir::Operation*, llvm::StringRef, (anonymous namespace)::CompileAndExecuteConfig, void**, std::unique_ptr<llvm::TargetMachine, std::default_delete<llvm::TargetMachine> >) [0x602aab9] in mlir-runner
========= Host Frame: compileAndExecuteVoidFunction((anonymous namespace)::Options&, mlir::Operation*, llvm::StringRef, (anonymous namespace)::CompileAndExecuteConfig, std::unique_ptr<llvm::TargetMachine, std::default_delete<llvm::TargetMachine> >) [0x602a4de] in mlir-runner
========= Host Frame: mlir::JitRunnerMain(int, char**, mlir::DialectRegistry const&, mlir::JitRunnerConfig) [0x6028cd6] in mlir-runner
========= Host Frame: main [0x4718a32] in mlir-runner
=========
========= Invalid __global__ write of size 16 bytes
========= at main_kernel+0x280
========= by thread (29,0,0) in block (0,0,0)
========= Address 0x560f5c3c6bf0 is out of bounds
========= and is 481 bytes after the nearest allocation at 0x560f5c3c6910 of size 256 bytes
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame: [0x2f285f] in libcuda.so.1
========= Host Frame: mgpuLaunchKernel [0x491d] in libmlir_cuda_runtime.so
========= Host Frame: [0x297557a4a4db]
========= Host Frame: [0x297557a4a53c]
========= Host Frame: compileAndExecute((anonymous namespace)::Options&, mlir::Operation*, llvm::StringRef, (anonymous namespace)::CompileAndExecuteConfig, void**, std::unique_ptr<llvm::TargetMachine, std::default_delete<llvm::TargetMachine> >) [0x602aab9] in mlir-runner
========= Host Frame: compileAndExecuteVoidFunction((anonymous namespace)::Options&, mlir::Operation*, llvm::StringRef, (anonymous namespace)::CompileAndExecuteConfig, std::unique_ptr<llvm::TargetMachine, std::default_delete<llvm::TargetMachine> >) [0x602a4de] in mlir-runner
========= Host Frame: mlir::JitRunnerMain(int, char**, mlir::DialectRegistry const&, mlir::JitRunnerConfig) [0x6028cd6] in mlir-runner
========= Host Frame: main [0x4718a32] in mlir-runner
=========
========= Invalid __global__ write of size 16 bytes
========= at main_kernel+0x280
========= by thread (30,0,0) in block (0,0,0)
========= Address 0x560f5c3c6c00 is out of bounds
========= and is 497 bytes after the nearest allocation at 0x560f5c3c6910 of size 256 bytes
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame: [0x2f285f] in libcuda.so.1
========= Host Frame: mgpuLaunchKernel [0x491d] in libmlir_cuda_runtime.so
========= Host Frame: [0x297557a4a4db]
========= Host Frame: [0x297557a4a53c]
========= Host Frame: compileAndExecute((anonymous namespace)::Options&, mlir::Operation*, llvm::StringRef, (anonymous namespace)::CompileAndExecuteConfig, void**, std::unique_ptr<llvm::TargetMachine, std::default_delete<llvm::TargetMachine> >) [0x602aab9] in mlir-runner
========= Host Frame: compileAndExecuteVoidFunction((anonymous namespace)::Options&, mlir::Operation*, llvm::StringRef, (anonymous namespace)::CompileAndExecuteConfig, std::unique_ptr<llvm::TargetMachine, std::default_delete<llvm::TargetMachine> >) [0x602a4de] in mlir-runner
========= Host Frame: mlir::JitRunnerMain(int, char**, mlir::DialectRegistry const&, mlir::JitRunnerConfig) [0x6028cd6] in mlir-runner
========= Host Frame: main [0x4718a32] in mlir-runner
=========
========= Invalid __global__ write of size 16 bytes
========= at main_kernel+0x280
========= by thread (31,0,0) in block (0,0,0)
========= Address 0x560f5c3c6c10 is out of bounds
========= and is 513 bytes after the nearest allocation at 0x560f5c3c6910 of size 256 bytes
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame: [0x2f285f] in libcuda.so.1
========= Host Frame: mgpuLaunchKernel [0x491d] in libmlir_cuda_runtime.so
========= Host Frame: [0x297557a4a4db]
========= Host Frame: [0x297557a4a53c]
========= Host Frame: compileAndExecute((anonymous namespace)::Options&, mlir::Operation*, llvm::StringRef, (anonymous namespace)::CompileAndExecuteConfig, void**, std::unique_ptr<llvm::TargetMachine, std::default_delete<llvm::TargetMachine> >) [0x602aab9] in mlir-runner
========= Host Frame: compileAndExecuteVoidFunction((anonymous namespace)::Options&, mlir::Operation*, llvm::StringRef, (anonymous namespace)::CompileAndExecuteConfig, std::unique_ptr<llvm::TargetMachine, std::default_delete<llvm::TargetMachine> >) [0x602a4de] in mlir-runner
========= Host Frame: mlir::JitRunnerMain(int, char**, mlir::DialectRegistry const&, mlir::JitRunnerConfig) [0x6028cd6] in mlir-runner
========= Host Frame: main [0x4718a32] in mlir-runner
=========
========= Program hit CUDA_ERROR_LAUNCH_FAILED (error 719) due to "unspecified launch failure" on CUDA API call to cuStreamSynchronize.
========= Saved host backtrace up to driver entry point at error
========= Host Frame: [0x2f19ac] in libcuda.so.1
========= Host Frame: mgpuStreamSynchronize [0x4d14] in libmlir_cuda_runtime.so
========= Host Frame: [0x297557a4a4ee]
========= Host Frame: [0x297557a4a53c]
========= Host Frame: compileAndExecute((anonymous namespace)::Options&, mlir::Operation*, llvm::StringRef, (anonymous namespace)::CompileAndExecuteConfig, void**, std::unique_ptr<llvm::TargetMachine, std::default_delete<llvm::TargetMachine> >) [0x602aab9] in mlir-runner
========= Host Frame: compileAndExecuteVoidFunction((anonymous namespace)::Options&, mlir::Operation*, llvm::StringRef, (anonymous namespace)::CompileAndExecuteConfig, std::unique_ptr<llvm::TargetMachine, std::default_delete<llvm::TargetMachine> >) [0x602a4de] in mlir-runner
========= Host Frame: mlir::JitRunnerMain(int, char**, mlir::DialectRegistry const&, mlir::JitRunnerConfig) [0x6028cd6] in mlir-runner
========= Host Frame: main [0x4718a32] in mlir-runner
=========
'cuStreamSynchronize(stream)' failed with 'CUDA_ERROR_LAUNCH_FAILED'
========= Program hit CUDA_ERROR_LAUNCH_FAILED (error 719) due to "unspecified launch failure" on CUDA API call to cuStreamDestroy_v2.
========= Saved host backtrace up to driver entry point at error
========= Host Frame: [0x2ae66a] in libcuda.so.1
========= Host Frame: mgpuStreamDestroy [0x4c74] in libmlir_cuda_runtime.so
========= Host Frame: [0x297557a4a4fd]
========= Host Frame: [0x297557a4a53c]
========= Host Frame: compileAndExecute((anonymous namespace)::Options&, mlir::Operation*, llvm::StringRef, (anonymous namespace)::CompileAndExecuteConfig, void**, std::unique_ptr<llvm::TargetMachine, std::default_delete<llvm::TargetMachine> >) [0x602aab9] in mlir-runner
========= Host Frame: compileAndExecuteVoidFunction((anonymous namespace)::Options&, mlir::Operation*, llvm::StringRef, (anonymous namespace)::CompileAndExecuteConfig, std::unique_ptr<llvm::TargetMachine, std::default_delete<llvm::TargetMachine> >) [0x602a4de] in mlir-runner
========= Host Frame: mlir::JitRunnerMain(int, char**, mlir::DialectRegistry const&, mlir::JitRunnerConfig) [0x6028cd6] in mlir-runner
========= Host Frame: main [0x4718a32] in mlir-runner
=========
'cuStreamDestroy(stream)' failed with 'CUDA_ERROR_LAUNCH_FAILED'
Unranked Memref base@ = 0x560f5c3c6a20 rank = 2 offset = 0 sizes = [8, 8] strides = [8, 1] data =
[[6.92718e-310, 6.92718e-310, 0, 0, 4.16842e+199, -nan, 2.42107e-32, 7.69844e+218],
[-nan, 3.94812e+180, 4.80872e+151, -nan, 1.50753e-153, 1.1426e+243, -nan, 2.33186e+232],
[2.31873e-46, -nan, 2.30186e+161, 1.14615e-259, -nan, 7.33953e+223, 4.6476e+151, -nan],
[7.69844e+218, 2.04741e+161, -nan, 4.80872e+151, 9.30225e+242, 4.67504e-310, 6.92718e-310, 6.07108e-320],
[-nan, 4.80872e+151, 9.30225e+242, -nan, 1.15674e+214, 2.04741e+161, -nan, 4.80872e+151],
[9.30225e+242, -nan, 1.1426e+243, 2.58415e+161, -nan, 2.31873e-46, 1.61458e+209, -nan],
[1.1471e-259, 1.65157e-86, -nan, 4.6476e+151, 3.34587e-33, -nan, 2.04741e+161, 3.94812e+180],
[-nan, 9.30225e+242, 1.50753e-153, -nan, 2.58415e+161, 2.33186e+232, -nan, 1.65356e-86]]
========= Program hit CUDA_ERROR_LAUNCH_FAILED (error 719) due to "unspecified launch failure" on CUDA API call to cuModuleUnload.
========= Saved host backtrace up to driver entry point at error
========= Host Frame: [0x28bf26] in libcuda.so.1
========= Host Frame: mgpuModuleUnload [0x4634] in libmlir_cuda_runtime.so
========= Host Frame: [0x297557a4a049]
========= Host Frame: [0x297557a4a57c]
========= Host Frame: (anonymous namespace)::GenericLLVMIRPlatformSupport::deinitialize(llvm::orc::JITDylib&) [0x61e3eaf] in mlir-runner
========= Host Frame: llvm::orc::LLJIT::deinitialize(llvm::orc::JITDylib&) [0x6039665] in mlir-runner
========= Host Frame: mlir::ExecutionEngine::~ExecutionEngine() [0x6035a49] in mlir-runner
========= Host Frame: std::default_delete<mlir::ExecutionEngine>::operator()(mlir::ExecutionEngine*) const [0x6032686] in mlir-runner
========= Host Frame: std::unique_ptr<mlir::ExecutionEngine, std::default_delete<mlir::ExecutionEngine> >::~unique_ptr() [0x6031b4e] in mlir-runner
========= Host Frame: compileAndExecute((anonymous namespace)::Options&, mlir::Operation*, llvm::StringRef, (anonymous namespace)::CompileAndExecuteConfig, void**, std::unique_ptr<llvm::TargetMachine, std::default_delete<llvm::TargetMachine> >) [0x602ab06] in mlir-runner
========= Host Frame: compileAndExecuteVoidFunction((anonymous namespace)::Options&, mlir::Operation*, llvm::StringRef, (anonymous namespace)::CompileAndExecuteConfig, std::unique_ptr<llvm::TargetMachine, std::default_delete<llvm::TargetMachine> >) [0x602a4de] in mlir-runner
========= Host Frame: mlir::JitRunnerMain(int, char**, mlir::DialectRegistry const&, mlir::JitRunnerConfig) [0x6028cd6] in mlir-runner
========= Host Frame: main [0x4718a32] in mlir-runner
=========
'cuModuleUnload(module)' failed with 'CUDA_ERROR_LAUNCH_FAILED'
========= ERROR SUMMARY: 35 errors
PTX for module: "main_kernel"
//
// Generated by LLVM NVPTX Back-End
//
.version 7.0
.target sm_80
.address_size 64
// .globl main_kernel
// main_kernel: one warp-level m8n8k4 f64 WMMA multiply-accumulate, D = A * B + C.
//
// Of the 29 parameters, only five are read by the body:
//   param_1  - base pointer for the A fragment load
//   param_7  - 64-bit scalar that is scaled into a byte offset for all four operands
//   param_9  - base pointer for the B fragment load
//   param_16 - base pointer for the C fragment load
//   param_23 - base pointer for the D fragment store
// NOTE(review): the 7/1/7/7/7 parameter grouping looks like four expanded memref
// descriptors (allocated ptr, aligned ptr, offset, 2 sizes, 2 strides) plus one
// index argument - confirm against the MLIR that produced this kernel.
.visible .entry main_kernel(
.param .u64 .ptr .align 1 main_kernel_param_0,
.param .u64 .ptr .align 1 main_kernel_param_1,
.param .u64 main_kernel_param_2,
.param .u64 main_kernel_param_3,
.param .u64 main_kernel_param_4,
.param .u64 main_kernel_param_5,
.param .u64 main_kernel_param_6,
.param .u64 main_kernel_param_7,
.param .u64 .ptr .align 1 main_kernel_param_8,
.param .u64 .ptr .align 1 main_kernel_param_9,
.param .u64 main_kernel_param_10,
.param .u64 main_kernel_param_11,
.param .u64 main_kernel_param_12,
.param .u64 main_kernel_param_13,
.param .u64 main_kernel_param_14,
.param .u64 .ptr .align 1 main_kernel_param_15,
.param .u64 .ptr .align 1 main_kernel_param_16,
.param .u64 main_kernel_param_17,
.param .u64 main_kernel_param_18,
.param .u64 main_kernel_param_19,
.param .u64 main_kernel_param_20,
.param .u64 main_kernel_param_21,
.param .u64 .ptr .align 1 main_kernel_param_22,
.param .u64 .ptr .align 1 main_kernel_param_23,
.param .u64 main_kernel_param_24,
.param .u64 main_kernel_param_25,
.param .u64 main_kernel_param_26,
.param .u64 main_kernel_param_27,
.param .u64 main_kernel_param_28
)
// Launch bound: at most 32 x 1 x 1 threads per block.
.maxntid 32, 1, 1
{
.reg .b32 %r<3>;
.reg .b64 %rd<17>;
// A operand: address = param_1 + param_7 * 40 bytes, loaded with stride operand 4.
// NOTE(review): 40 = (4 + 1) * 8, which would match indexing [i, i] into an f64
// buffer with leading dimension 4 - confirm against the source IR.
ld.param.b64 %rd1, [main_kernel_param_1];
ld.param.b64 %rd2, [main_kernel_param_7];
mad.lo.s64 %rd3, %rd2, 40, %rd1;
mov.b32 %r1, 4;
wmma.load.a.sync.aligned.row.m8n8k4.f64 {%rd4}, [%rd3], %r1;
// Shared byte offset for B, C and D: %rd5 = param_7 * 72.
// NOTE(review): 72 = (8 + 1) * 8, which would match indexing [i, i] with leading
// dimension 8 - confirm against the source IR.
mul.lo.s64 %rd5, %rd2, 72;
// B operand: address = param_9 + %rd5, loaded with stride operand 8.
ld.param.b64 %rd6, [main_kernel_param_9];
add.s64 %rd7, %rd6, %rd5;
mov.b32 %r2, 8;
wmma.load.b.sync.aligned.row.m8n8k4.f64 {%rd8}, [%rd7], %r2;
// C operand (accumulator input): address = param_16 + %rd5, stride operand 8.
// Each thread receives two 64-bit fragment registers.
ld.param.b64 %rd9, [main_kernel_param_16];
add.s64 %rd10, %rd9, %rd5;
wmma.load.c.sync.aligned.row.m8n8k4.f64 {%rd11, %rd12}, [%rd10], %r2;
// {%rd13, %rd14} = A fragment * B fragment + C fragment, all f64.
wmma.mma.sync.aligned.row.row.m8n8k4.f64.f64.f64.f64
{%rd13, %rd14},
{%rd4},
{%rd8},
{%rd11, %rd12};
// D operand: address = param_23 + %rd5, stored with stride operand 8.
// This is the only store instruction in the kernel.
// NOTE(review): it is presumably the access reported as
// "Invalid __global__ write of size 16 bytes" at main_kernel+0x280 - confirm
// against the SASS disassembly.
ld.param.b64 %rd15, [main_kernel_param_23];
add.s64 %rd16, %rd15, %rd5;
wmma.store.d.sync.aligned.row.m8n8k4.f64 [%rd16],{%rd13, %rd14}, %r2;
ret;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment