Skip to content

Instantly share code, notes, and snippets.

@DeedleFake
Last active February 16, 2026 21:47
Show Gist options
  • Select an option

  • Save DeedleFake/e02324020dfce6686eb6cc45e252fa08 to your computer and use it in GitHub Desktop.

Select an option

Save DeedleFake/e02324020dfce6686eb6cc45e252fa08 to your computer and use it in GitHub Desktop.
Simple experimentation and benchmarking of Go 1.26's SIMD support.

Go 1.26 SIMD Benchmarks

goos: linux
goarch: amd64
pkg: test
cpu: AMD Ryzen 9 3900X 12-Core Processor            
       │ nosimd.txt  │              simd.txt               │             tensor.txt              │
       │   sec/op    │   sec/op     vs base                │   sec/op     vs base                │
Add-24   316.2n ± 1%   434.1n ± 1%  +37.32% (p=0.000 n=10)   184.9n ± 1%  -41.52% (p=0.000 n=10)

       │ nosimd.txt │            simd.txt            │           tensor.txt           │
       │    B/op    │    B/op     vs base            │    B/op     vs base            │
Add-24   0.000 ± 0%   0.000 ± 0%  ~ (p=1.000 n=10) ¹   0.000 ± 0%  ~ (p=1.000 n=10) ¹
¹ all samples are equal

       │ nosimd.txt │            simd.txt            │           tensor.txt           │
       │ allocs/op  │ allocs/op   vs base            │ allocs/op   vs base            │
Add-24   0.000 ± 0%   0.000 ± 0%  ~ (p=1.000 n=10) ¹   0.000 ± 0%  ~ (p=1.000 n=10) ¹
¹ all samples are equal
module test
go 1.26.0
goos: linux
goarch: amd64
pkg: test
cpu: AMD Ryzen 9 3900X 12-Core Processor
BenchmarkAdd-24 3779433 315.9 ns/op 0 B/op 0 allocs/op
BenchmarkAdd-24 3815323 315.3 ns/op 0 B/op 0 allocs/op
BenchmarkAdd-24 3771746 319.1 ns/op 0 B/op 0 allocs/op
BenchmarkAdd-24 3776943 316.6 ns/op 0 B/op 0 allocs/op
BenchmarkAdd-24 3822039 315.8 ns/op 0 B/op 0 allocs/op
BenchmarkAdd-24 3776506 317.8 ns/op 0 B/op 0 allocs/op
BenchmarkAdd-24 3823285 314.4 ns/op 0 B/op 0 allocs/op
BenchmarkAdd-24 3817754 315.3 ns/op 0 B/op 0 allocs/op
BenchmarkAdd-24 3766824 316.4 ns/op 0 B/op 0 allocs/op
BenchmarkAdd-24 3765676 318.0 ns/op 0 B/op 0 allocs/op
PASS
ok test 12.005s
goos: linux
goarch: amd64
pkg: test
cpu: AMD Ryzen 9 3900X 12-Core Processor
BenchmarkAdd-24 2749945 435.4 ns/op 0 B/op 0 allocs/op
BenchmarkAdd-24 2748529 438.4 ns/op 0 B/op 0 allocs/op
BenchmarkAdd-24 2774643 432.3 ns/op 0 B/op 0 allocs/op
BenchmarkAdd-24 2756433 433.8 ns/op 0 B/op 0 allocs/op
BenchmarkAdd-24 2765024 432.6 ns/op 0 B/op 0 allocs/op
BenchmarkAdd-24 2747886 436.3 ns/op 0 B/op 0 allocs/op
BenchmarkAdd-24 2764462 433.3 ns/op 0 B/op 0 allocs/op
BenchmarkAdd-24 2778027 434.5 ns/op 0 B/op 0 allocs/op
BenchmarkAdd-24 2758598 436.8 ns/op 0 B/op 0 allocs/op
BenchmarkAdd-24 2771936 433.3 ns/op 0 B/op 0 allocs/op
PASS
ok test 12.011s
goos: linux
goarch: amd64
pkg: test
cpu: AMD Ryzen 9 3900X 12-Core Processor
BenchmarkAdd-24 6450811 185.0 ns/op 0 B/op 0 allocs/op
BenchmarkAdd-24 6462780 185.2 ns/op 0 B/op 0 allocs/op
BenchmarkAdd-24 6475417 185.8 ns/op 0 B/op 0 allocs/op
BenchmarkAdd-24 6490584 184.3 ns/op 0 B/op 0 allocs/op
BenchmarkAdd-24 6493693 185.0 ns/op 0 B/op 0 allocs/op
BenchmarkAdd-24 6478975 185.1 ns/op 0 B/op 0 allocs/op
BenchmarkAdd-24 6501334 184.8 ns/op 0 B/op 0 allocs/op
BenchmarkAdd-24 6528841 183.9 ns/op 0 B/op 0 allocs/op
BenchmarkAdd-24 6509636 183.9 ns/op 0 B/op 0 allocs/op
BenchmarkAdd-24 6541306 183.8 ns/op 0 B/op 0 allocs/op
PASS
ok test 12.000s
package test_test
import (
"math/rand/v2"
"simd/archsimd"
"testing"
)
//func BenchmarkAdd(b *testing.B) {
// v1 := generateRandom()
// v2 := generateRandom()
//
// b.ResetTimer()
// for b.Loop() {
// add(v1, v2)
// }
//}
//func BenchmarkAdd(b *testing.B) {
// v1 := generateRandom()
// v2 := generateRandom()
//
// b.ResetTimer()
// for b.Loop() {
// addSIMD(v1, v2)
// }
//}
// BenchmarkAdd measures addTensor on two operands that have already been
// packed into Uint64x4 vectors by toTensor, so the packing cost is paid once
// during setup rather than inside the timed loop.
//
// addTensor writes its result back into v1, so v1's contents change from one
// iteration to the next; only the cost of the addition is of interest here.
func BenchmarkAdd(b *testing.B) {
	v1 := toTensor(generateRandom())
	v2 := toTensor(generateRandom())

	// No explicit b.ResetTimer is needed: b.Loop keeps everything that runs
	// before its first call out of the measurement.
	for b.Loop() {
		addTensor(v1, v2)
	}
}
func generateRandom() []uint64 {
v := make([]uint64, 1000)
for i := range v {
v[i] = rand.Uint64()
}
return v
}
func toTensor(v []uint64) []archsimd.Uint64x4 {
s := make([]archsimd.Uint64x4, 0, (len(v)+4-1)/4)
for i := 0; i < len(v); i += 4 {
s = append(s, archsimd.LoadUint64x4SlicePart(v[i:]))
}
return s
}
func addTensor(v1, v2 []archsimd.Uint64x4) []archsimd.Uint64x4 {
if len(v1) != len(v2) {
panic("len(v1) != len(v2)")
}
for i, s1 := range v1 {
v1[i] = s1.Add(v2[i])
}
return v1
}
func addSIMD(v1, v2 []uint64) []uint64 {
if len(v1) != len(v2) {
panic("len(v1) != len(v2)")
}
for i := 0; i < len(v1); i += 4 {
s1 := archsimd.LoadUint64x4SlicePart(v1[i:])
s2 := archsimd.LoadUint64x4SlicePart(v2[i:])
r := s1.Add(s2)
r.StoreSlicePart(v1[i:])
}
return v1
}
// add is the scalar baseline: it adds each element of v2 to the element of
// v1 at the same index, in place, and returns v1. It panics if the two
// slices differ in length.
func add(v1, v2 []uint64) []uint64 {
	if len(v1) != len(v2) {
		panic("len(v1) != len(v2)")
	}

	for i, x := range v2 {
		v1[i] += x
	}
	return v1
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment