Last active
January 27, 2026 20:57
-
-
Save jakubtomsu/d25210b55037858c3ed35fe00182f92a to your computer and use it in GitHub Desktop.
~11x faster exp(-x)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| package foo | |
| import "base:intrinsics" | |
| import "core:math" | |
| import "core:fmt" | |
| import "core:time" | |
| // https://www.desmos.com/calculator/i17pexccum | |
| a: f32 = 1.0 | |
| b: f32 = 1.0/4.0 | |
| c: f32 = 1.0/6.0 | |
| // Used as a baseline for the error comparisons | |
| taylor_nexp :: proc(x: f32) -> f32 { | |
| x2 := x * x | |
| return 1.0 / (1.0 + a * x + c * x * x * x) | |
| } | |
| // NOTE: the approximate reciprocal along with the reordered equation helps by like additional ~10%, nothing crazy. | |
| // Feel free to use one of the simpler variants: | |
| // return 1.0 / (1.0 + A * x + C * x * x * x) | |
| // return 1.0 / (1.0 + x * (A + C * x * x)) | |
| approx_nexp :: proc(x: f32) -> (result: f32) { | |
| A :: 1.1566406 | |
| C :: 0.53652346 | |
| when ODIN_ARCH == .amd64 { | |
| result = intrinsics.simd_extract(rcpss(1.0 + x * (A + C * x * x)), 0) | |
| } else { | |
| result = 1.0 / (1.0 + x * (A + C * x * x)) | |
| } | |
| // There's no built-in fast reciprocal intrinsic in odin yet. | |
| // use _mm_rcp_ss from xmmintrin.h in C. | |
| @(default_calling_convention = "none") | |
| foreign _ { | |
| @(link_name="llvm.x86.sse.rcp.ss") | |
| rcpss :: proc(x: #simd[4]f32) -> #simd[4]f32 --- | |
| } | |
| return result | |
| } | |
| native_exp :: proc(x: f32) -> f32 { | |
| return math.exp_f32(-x) | |
| } | |
| // Discrete integral over the absolute difference. | |
| error_area :: proc(start: f32 = 0.0, end: f32 = 5.0, step: f32 = 0.001) -> (error: f32) { | |
| x := start | |
| for x < end { | |
| error += step * abs(taylor_nexp(x) - native_exp(x)) / (1 + x) | |
| x += step | |
| } | |
| return error | |
| } | |
| // Sensible ranges for the parameters to brute force in. | |
| STEPS :: 1024 | |
| A_RANGE :: [2]f32{0.9, 1.2} | |
| B_RANGE :: [2]f32{0.0, 0.1} | |
| C_RANGE :: [2]f32{0.1, 1} | |
| main :: proc() { | |
| fmt.println("Running") | |
| // Parameter optimization search | |
| if false { | |
| fmt.printfln("Taylor Error: %g (a=%g, c=%g)", error_area(), a, c) | |
| best_error := max(f32) | |
| best_a: f32 | |
| best_b: f32 | |
| best_c: f32 | |
| for ai in 0..<STEPS { | |
| if ai % (STEPS / 4) == 0 { | |
| fmt.println(ai) | |
| } | |
| for ci in 0..<STEPS { | |
| a = A_RANGE[0] + A_RANGE[1] * f32(ai) / f32(STEPS) | |
| // b = B_RANGE[0] + B_RANGE[1] * f32(bi) / f32(STEPS) | |
| c = C_RANGE[0] + C_RANGE[1] * f32(ci) / f32(STEPS) | |
| err := error_area() | |
| if err < best_error { | |
| best_error = err | |
| best_a = a | |
| best_b = b | |
| best_c = c | |
| } | |
| } | |
| } | |
| fmt.println("Done") | |
| fmt.printfln("Brute Force Error: %g (a=%g, c=%g)", best_error, best_a, best_c) | |
| } | |
| NUM :: 1000_000 | |
| STEPS :: 100_000 | |
| STEP :: f32(NUM) / STEPS | |
| approx_dur: f64 | |
| for _ in 0..<10 { | |
| start := time.tick_now() | |
| sum: f32 | |
| for i in 0..<NUM { | |
| sum += STEP * approx_nexp(f32(i) * STEP) | |
| } | |
| dur := time.duration_microseconds(time.tick_since(start)) | |
| fmt.printfln("Approx Duration: %.6f us, sum: %.6f", dur, sum) | |
| approx_dur += dur | |
| } | |
| precise_dur: f64 | |
| for _ in 0..<10 { | |
| start := time.tick_now() | |
| sum: f32 | |
| for i in 0..<NUM { | |
| sum += STEP * native_exp(f32(i) * STEP) | |
| } | |
| dur := time.duration_microseconds(time.tick_since(start)) | |
| fmt.printfln("Precise Duration: %.6f us, sum: %.6f", dur, sum) | |
| precise_dur += dur | |
| } | |
| fmt.printfln("Total Precise Duration: %.6f us", precise_dur) | |
| fmt.printfln("Total Approx Duration: %.6f us", approx_dur) | |
| fmt.printfln("Approx is %.6fx faster", precise_dur / approx_dur) | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment