Old run: January 28, 2026 (Top of Main scaled GEMM before swizzle)
New run: February 19, 2026 (Top of Main scaled GEMM after swizzle)
Hardware: MI355, profiled with rocprofv3
| ; Reproducer for MemSDNode::getMemOperand() assertion (multi-MMO). See AMDGPU-multi-MMO-fix.md. | |
| ; To trigger: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 -O3 repro-amdgcn-multi-mmo-e2e-matmul.ll -o - | |
| ; | |
| ; To reproduce the .optimized.ll from the .linked.ll, run: | |
| ; opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 --passes='verify,memprof-remove-attributes,annotation2metadata,forceattrs,inferattrs,coro-early,function<eager-inv>(ee-instrument<>,lower-expect,simplifycfg<bonus-inst-threshold=1;no-forward-switch-cond;no-switch-range-to-icmp;no-switch-to-arithmetic;no-switch-to-lookup;keep-loops;no-hoist-common-insts;no-hoist-loads-stores-with-cond-faulting;no-sink-common-insts;speculate-blocks;simplify-cond-branch;no-speculate-unpredictables>,sroa<modify-cfg>,early-cse<>),openmp-opt,amdgpu-printf-runtime-binding,ipsccp,called-value-propagation,globalopt,function<eager-inv>(mem2reg,instcombine<max-iterations=1;no-verify-fixpoint>,amdgpu-usenative,amdgpu-simplifylib,amdgpu-uniform-intrinsic-combine,simplifycfg<bonus-inst |
| shape,name,throughput (TFLOPS),stddev,AI,IREE-AI,created_date,benchmark,instrument,version,SQ_LDS_BANK_CONFLICT | |
| 500_512_256_16,scaled_matmul,15.134461058830322,0,201.76345992808237,331.6867762571868,2026-01-28,mxfp4 gemm,rocprofv3,,0.0 | |
| 512_512_256_16,scaled_matmul,22.748767457627117,0,202.2716049382716,334.3673469387755,2026-01-28,mxfp4 gemm,rocprofv3,,0.0 | |
| 1000_512_256_16,scaled_matmul,30.624299065420562,0,213.18898662364023,400.1172218423366,2026-01-28,mxfp4 gemm,rocprofv3,,0.0 | |
| 1024_512_256_16,scaled_matmul,45.497534915254235,0,213.47231270358307,402.06134969325154,2026-01-28,mxfp4 gemm,rocprofv3,,0.0 | |
| 8100_512_256_16,scaled_matmul,227.3289866709491,0,224.32378524751437,488.4482035200848,2026-01-28,mxfp4 gemm,rocprofv3,,0.0 | |
| 8192_512_256_16,scaled_matmul,305.0402909090909,0,224.3423192126658,488.618825722274,2026-01-28,mxfp4 gemm,rocprofv3,,0.0 | |
| 16300_512_256_16,scaled_matmul,434.24260162601627,0,225.1570269673266,496.20994503881474,2026-01-28,mxfp4 gemm,rocprofv3,,0.0 | |
| 16384_512_256_16,scaled_matmul,502.8941275 |
| shape | Feature_Throughput (TFLOPS) | Top_of_main_Throughput (TFLOPS) | improvement_ratio | improvement_percent | |
|---|---|---|---|---|---|
| 1000_1024_8192_512 | 96.044057 | 86.408117 | 1.111517 | 11.151661 | |
| 1000_16384_26624_1664 | 1409.736175 | 1366.244197 | 1.031833 | 3.183324 | |
| 1000_16384_8192_512 | 1377.701308 | 1247.127461 | 1.104700 | 10.469968 | |
| 1000_512_256_16 | 30.624299 | 28.066809 | 1.091121 | 9.112150 | |
| 1000_53248_8192_512 | 1043.042983 | 984.096396 | 1.059899 | 5.989920 | |
| 1024_1024_8192_512 | 167.149430 | 160.287635 | 1.042809 | 4.280926 | |
| 1024_16384_26624_1664 | 2414.689992 | 2327.495265 | 1.037463 | 3.746290 | |
| 1024_16384_8192_512 | 2381.916318 | 2284.148917 | 1.042803 | 4.280255 | |
| 1024_512_256_16 | 45.497535 | 41.943040 | 1.084746 | 8.474576 |
| //<dim> 8192 </dim> | |
| //<dim> 8192 </dim> | |
| //<dim> 256 </dim> | |
| //<dim> 32 </dim> | |
| //<input>tensor<8192x256x32xi8></input> | |
| //<input>tensor<8192x256xi8></input> | |
| //<input>tensor<8192x256x32xi8></input> | |
| //<input>tensor<8192x256xi8></input> | |
| !lhs = f4E2M1FN |
| /home/kdrewnia/fp4-benchmark/output/1024_1024_32_32_sample_scaled_MFMA.mlir:43:8: error: failed to legalize operation 'linalg.generic' that was explicitly marked illegal | |
| %D = linalg.generic { | |
| ^ | |
| /home/kdrewnia/fp4-benchmark/output/1024_1024_32_32_sample_scaled_MFMA.mlir:31:1: note: called from | |
| func.func @scaled_matmul(%lhs : !A, %lhs_scales : !A_s, %rhs : !B, %rhs_scales : !B_s) -> !C_size { | |
| ^ | |
| /home/kdrewnia/fp4-benchmark/output/1024_1024_32_32_sample_scaled_MFMA.mlir:43:8: note: see current operation: | |
| %19 = "linalg.generic"(%11, %12, %13, %14, %18) <{indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2)>, affine_map<(d0, d1, d2, d3) -> (d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>, #linalg.iterator_type<reduction>], operandSegmentSizes = array<i32: 4, 1>}> ({ | |
| ^bb0(%arg4: f4E2M1FN, % |
| // <input> tensor<1024x1024xi8> </input> | |
| // <input> tensor<1024x1024xi8> </input> | |
| // <input> tensor<1024x32xi8> </input> | |
| // <input> tensor<1024x32xi8> </input> | |
| #executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {target_arch = "gfx950", ukernels = "none"}> | |
| #map = affine_map<(d0) -> (d0)> | |
| #map1 = affine_map<(d0, d1) -> (d0, d1)> | |
| #map2 = affine_map<(d0, d1) -> (d0)> | |
| #map3 = affine_map<()[s0] -> ((s0 ceildiv 256) * 256)> | |
| #map4 = affine_map<()[s0] -> ((s0 ceildiv 32) * 32)> |
| func.func @scaled_matmul_1x1() -> tensor<1x1xf32> { | |
| %lhs = util.unfoldable_constant dense<1.0> : tensor<1x1x32xf32> | |
| %rhs = util.unfoldable_constant dense<1.0> : tensor<1x1x32xf32> | |
| %lhs_scales = util.unfoldable_constant dense<126> : tensor<1x1xi8> | |
| %rhs_scales = util.unfoldable_constant dense<126> : tensor<1x1xi8> | |
| %A_scales = arith.bitcast %lhs_scales : tensor<1x1xi8> to tensor<1x1xf8E8M0FNU> | |
| %B_scales = arith.bitcast %rhs_scales : tensor<1x1xi8> to tensor<1x1xf8E8M0FNU> | |
| %lhs_splat = util.unfoldable_constant dense<1.0> : tensor<1x1x32xf8E8M0FNU> | |
| %rhs_splat = util.unfoldable_constant dense<1.0> : tensor<1x1x32xf8E8M0FNU> | |
| %A = arith.scaling_truncf %lhs, %lhs_splat : tensor<1x1x32xf32>, tensor<1x1x32xf8E8M0FNU> to tensor<1x1x32xf4E2M1FN> |
| func.func @scaled_matmul_1x1() { | |
| %lhs = util.unfoldable_constant dense<1.0> : tensor<1x1x32xf32> | |
| %rhs = util.unfoldable_constant dense<1.0> : tensor<1x1x32xf32> | |
| %lhs_scales = util.unfoldable_constant dense<126> : tensor<1x1xi8> | |
| %rhs_scales = util.unfoldable_constant dense<126> : tensor<1x1xi8> | |
| %A_scales = arith.bitcast %lhs_scales : tensor<1x1xi8> to tensor<1x1xf8E8M0FNU> | |
| %B_scales = arith.bitcast %rhs_scales : tensor<1x1xi8> to tensor<1x1xf8E8M0FNU> | |
| %lhs_splat = util.unfoldable_constant dense<1.0> : tensor<1x1x32xf8E8M0FNU> | |
| %rhs_splat = util.unfoldable_constant dense<1.0> : tensor<1x1x32xf8E8M0FNU> | |
| %A = arith.scaling_truncf %lhs, %lhs_splat : tensor<1x1x32xf32>, tensor<1x1x32xf8E8M0FNU> to tensor<1x1x32xf4E2M1FN> |