Skip to content

Instantly share code, notes, and snippets.

@jakubtomsu
Last active January 27, 2026 20:57
Show Gist options
  • Select an option

  • Save jakubtomsu/d25210b55037858c3ed35fe00182f92a to your computer and use it in GitHub Desktop.

Select an option

Save jakubtomsu/d25210b55037858c3ed35fe00182f92a to your computer and use it in GitHub Desktop.
~11x faster exp(-x)
package foo
import "base:intrinsics"
import "core:math"
import "core:fmt"
import "core:time"
// https://www.desmos.com/calculator/i17pexccum
a: f32 = 1.0
b: f32 = 1.0/4.0
c: f32 = 1.0/6.0
// Used as a baseline for the error comparisons
taylor_nexp :: proc(x: f32) -> f32 {
x2 := x * x
return 1.0 / (1.0 + a * x + c * x * x * x)
}
// NOTE: the approximate reciprocal along with the reordered equation helps by like additional ~10%, nothing crazy.
// Feel free to use one of the simpler variants:
// return 1.0 / (1.0 + A * x + C * x * x * x)
// return 1.0 / (1.0 + x * (A + C * x * x))
approx_nexp :: proc(x: f32) -> (result: f32) {
A :: 1.1566406
C :: 0.53652346
when ODIN_ARCH == .amd64 {
result = intrinsics.simd_extract(rcpss(1.0 + x * (A + C * x * x)), 0)
} else {
result = 1.0 / (1.0 + x * (A + C * x * x))
}
// There's no built-in fast reciprocal intrinsic in odin yet.
// use _mm_rcp_ss from xmmintrin.h in C.
@(default_calling_convention = "none")
foreign _ {
@(link_name="llvm.x86.sse.rcp.ss")
rcpss :: proc(x: #simd[4]f32) -> #simd[4]f32 ---
}
return result
}
native_exp :: proc(x: f32) -> f32 {
return math.exp_f32(-x)
}
// Discrete integral over the absolute difference.
error_area :: proc(start: f32 = 0.0, end: f32 = 5.0, step: f32 = 0.001) -> (error: f32) {
x := start
for x < end {
error += step * abs(taylor_nexp(x) - native_exp(x)) / (1 + x)
x += step
}
return error
}
// Sensible ranges for the parameters to brute force in.
STEPS :: 1024
A_RANGE :: [2]f32{0.9, 1.2}
B_RANGE :: [2]f32{0.0, 0.1}
C_RANGE :: [2]f32{0.1, 1}
main :: proc() {
fmt.println("Running")
// Parameter optimization search
if false {
fmt.printfln("Taylor Error: %g (a=%g, c=%g)", error_area(), a, c)
best_error := max(f32)
best_a: f32
best_b: f32
best_c: f32
for ai in 0..<STEPS {
if ai % (STEPS / 4) == 0 {
fmt.println(ai)
}
for ci in 0..<STEPS {
a = A_RANGE[0] + A_RANGE[1] * f32(ai) / f32(STEPS)
// b = B_RANGE[0] + B_RANGE[1] * f32(bi) / f32(STEPS)
c = C_RANGE[0] + C_RANGE[1] * f32(ci) / f32(STEPS)
err := error_area()
if err < best_error {
best_error = err
best_a = a
best_b = b
best_c = c
}
}
}
fmt.println("Done")
fmt.printfln("Brute Force Error: %g (a=%g, c=%g)", best_error, best_a, best_c)
}
NUM :: 1000_000
STEPS :: 100_000
STEP :: f32(NUM) / STEPS
approx_dur: f64
for _ in 0..<10 {
start := time.tick_now()
sum: f32
for i in 0..<NUM {
sum += STEP * approx_nexp(f32(i) * STEP)
}
dur := time.duration_microseconds(time.tick_since(start))
fmt.printfln("Approx Duration: %.6f us, sum: %.6f", dur, sum)
approx_dur += dur
}
precise_dur: f64
for _ in 0..<10 {
start := time.tick_now()
sum: f32
for i in 0..<NUM {
sum += STEP * native_exp(f32(i) * STEP)
}
dur := time.duration_microseconds(time.tick_since(start))
fmt.printfln("Precise Duration: %.6f us, sum: %.6f", dur, sum)
precise_dur += dur
}
fmt.printfln("Total Precise Duration: %.6f us", precise_dur)
fmt.printfln("Total Approx Duration: %.6f us", approx_dur)
fmt.printfln("Approx is %.6fx faster", precise_dur / approx_dur)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment