Muzammiluddin Syed Muzammiluddin-Syed-ECE

MXFP4 GEMM Benchmark Comparison After Enabling XOR Swizzle

Old run: January 28, 2026 (Top of Main scaled GEMM before swizzle)

New run: February 19, 2026 (Top of Main scaled GEMM after swizzle)

Hardware: MI355, profiled with rocprofv3

shape	Feature_Throughput (TFLOPS)	Top_of_main_Throughput (TFLOPS)	improvement_ratio	improvement_percent
1000_1024_8192_512	96.044057	86.408117	1.111517	11.151661
1000_16384_26624_1664	1409.736175	1366.244197	1.031833	3.183324
1000_16384_8192_512	1377.701308	1247.127461	1.104700	10.469968
1000_512_256_16	30.624299	28.066809	1.091121	9.112150
1000_53248_8192_512	1043.042983	984.096396	1.059899	5.989920
1024_1024_8192_512	167.149430	160.287635	1.042809	4.280926
1024_16384_26624_1664	2414.689992	2327.495265	1.037463	3.746290
1024_16384_8192_512	2381.916318	2284.148917	1.042803	4.280255
1024_512_256_16	45.497535	41.943040	1.084746	8.474576

Step 1.

Download the Visual Studio Installer and select the "Desktop development with C++" workload (which installs MSVC, Clang, and CMake), along with a minimal set of additional options.

	; Reproducer for MemSDNode::getMemOperand() assertion (multi-MMO). See AMDGPU-multi-MMO-fix.md.
	; To trigger: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 -O3 repro-amdgcn-multi-mmo-e2e-matmul.ll -o -
	;
	; To reproduce the .optimized.ll from the .linked.ll, run:
	; opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 --passes='verify,memprof-remove-attributes,annotation2metadata,forceattrs,inferattrs,coro-early,function<eager-inv>(ee-instrument<>,lower-expect,simplifycfg<bonus-inst-threshold=1;no-forward-switch-cond;no-switch-range-to-icmp;no-switch-to-arithmetic;no-switch-to-lookup;keep-loops;no-hoist-common-insts;no-hoist-loads-stores-with-cond-faulting;no-sink-common-insts;speculate-blocks;simplify-cond-branch;no-speculate-unpredictables>,sroa<modify-cfg>,early-cse<>),openmp-opt,amdgpu-printf-runtime-binding,ipsccp,called-value-propagation,globalopt,function<eager-inv>(mem2reg,instcombine<max-iterations=1;no-verify-fixpoint>,amdgpu-usenative,amdgpu-simplifylib,amdgpu-uniform-intrinsic-combine,simplifycfg<bonus-inst

	shape,name,throughput (TFLOPS),stddev,AI,IREE-AI,created_date,benchmark,instrument,version,SQ_LDS_BANK_CONFLICT
	500_512_256_16,scaled_matmul,15.134461058830322,0,201.76345992808237,331.6867762571868,2026-01-28,mxfp4 gemm,rocprofv3,,0.0
	512_512_256_16,scaled_matmul,22.748767457627117,0,202.2716049382716,334.3673469387755,2026-01-28,mxfp4 gemm,rocprofv3,,0.0
	1000_512_256_16,scaled_matmul,30.624299065420562,0,213.18898662364023,400.1172218423366,2026-01-28,mxfp4 gemm,rocprofv3,,0.0
	1024_512_256_16,scaled_matmul,45.497534915254235,0,213.47231270358307,402.06134969325154,2026-01-28,mxfp4 gemm,rocprofv3,,0.0
	8100_512_256_16,scaled_matmul,227.3289866709491,0,224.32378524751437,488.4482035200848,2026-01-28,mxfp4 gemm,rocprofv3,,0.0
	8192_512_256_16,scaled_matmul,305.0402909090909,0,224.3423192126658,488.618825722274,2026-01-28,mxfp4 gemm,rocprofv3,,0.0
	16300_512_256_16,scaled_matmul,434.24260162601627,0,225.1570269673266,496.20994503881474,2026-01-28,mxfp4 gemm,rocprofv3,,0.0
	16384_512_256_16,scaled_matmul,502.8941275


	/home/kdrewnia/fp4-benchmark/output/1024_1024_32_32_sample_scaled_MFMA.mlir:43:8: error: failed to legalize operation 'linalg.generic' that was explicitly marked illegal
	%D = linalg.generic {
	^
	/home/kdrewnia/fp4-benchmark/output/1024_1024_32_32_sample_scaled_MFMA.mlir:31:1: note: called from
	func.func @scaled_matmul(%lhs : !A, %lhs_scales : !A_s, %rhs : !B, %rhs_scales : !B_s) -> !C_size {
	^
	/home/kdrewnia/fp4-benchmark/output/1024_1024_32_32_sample_scaled_MFMA.mlir:43:8: note: see current operation:
	%19 = "linalg.generic"(%11, %12, %13, %14, %18) <{indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2)>, affine_map<(d0, d1, d2, d3) -> (d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>, #linalg.iterator_type<reduction>], operandSegmentSizes = array<i32: 4, 1>}> ({
	^bb0(%arg4: f4E2M1FN, %

	// <input> tensor<1024x1024xi8> </input>
	// <input> tensor<1024x1024xi8> </input>
	// <input> tensor<1024x32xi8> </input>
	// <input> tensor<1024x32xi8> </input>
	#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {target_arch = "gfx950", ukernels = "none"}>
	#map = affine_map<(d0) -> (d0)>
	#map1 = affine_map<(d0, d1) -> (d0, d1)>
	#map2 = affine_map<(d0, d1) -> (d0)>
	#map3 = affine_map<()[s0] -> ((s0 ceildiv 256) * 256)>
	#map4 = affine_map<()[s0] -> ((s0 ceildiv 32) * 32)>

	func.func @scaled_matmul_1x1() -> tensor<1x1xf32> {
	%lhs = util.unfoldable_constant dense<1.0> : tensor<1x1x32xf32>
	%rhs = util.unfoldable_constant dense<1.0> : tensor<1x1x32xf32>
	%lhs_scales = util.unfoldable_constant dense<126> : tensor<1x1xi8>
	%rhs_scales = util.unfoldable_constant dense<126> : tensor<1x1xi8>
	%A_scales = arith.bitcast %lhs_scales : tensor<1x1xi8> to tensor<1x1xf8E8M0FNU>
	%B_scales = arith.bitcast %rhs_scales : tensor<1x1xi8> to tensor<1x1xf8E8M0FNU>
	%lhs_splat = util.unfoldable_constant dense<1.0> : tensor<1x1x32xf8E8M0FNU>
	%rhs_splat = util.unfoldable_constant dense<1.0> : tensor<1x1x32xf8E8M0FNU>
	%A = arith.scaling_truncf %lhs, %lhs_splat : tensor<1x1x32xf32>, tensor<1x1x32xf8E8M0FNU> to tensor<1x1x32xf4E2M1FN>

	func.func @scaled_matmul_1x1() {
	%lhs = util.unfoldable_constant dense<1.0> : tensor<1x1x32xf32>
	%rhs = util.unfoldable_constant dense<1.0> : tensor<1x1x32xf32>
	%lhs_scales = util.unfoldable_constant dense<126> : tensor<1x1xi8>
	%rhs_scales = util.unfoldable_constant dense<126> : tensor<1x1xi8>
	%A_scales = arith.bitcast %lhs_scales : tensor<1x1xi8> to tensor<1x1xf8E8M0FNU>
	%B_scales = arith.bitcast %rhs_scales : tensor<1x1xi8> to tensor<1x1xf8E8M0FNU>
	%lhs_splat = util.unfoldable_constant dense<1.0> : tensor<1x1x32xf8E8M0FNU>
	%rhs_splat = util.unfoldable_constant dense<1.0> : tensor<1x1x32xf8E8M0FNU>
	%A = arith.scaling_truncf %lhs, %lhs_splat : tensor<1x1x32xf32>, tensor<1x1x32xf8E8M0FNU> to tensor<1x1x32xf4E2M1FN>