jakubtomsu · January 27, 2026 20:57
diff --git a/fast_negative_exp.odin b/fast_negative_exp.odin
 package foo

 import "base:intrinsics"
 import "core:math"
 import "core:fmt"
 import "core:time"

 // https://www.desmos.com/calculator/i17pexccum

 a: f32 = 1.0
 b: f32 = 1.0/4.0
 c: f32 = 1.0/6.0

 // Used as a baseline for the error comparisons
 taylor_nexp :: proc(x: f32) -> f32 {
    x2 := x * x
    return 1.0 / (1.0 + a * x + c * x * x * x)
 }

 // NOTE: the approximate reciprocal along with the reordered equation helps by like additional ~10%, nothing crazy.
 // Feel free to use one of the simpler variants:
 // return 1.0 / (1.0 + A * x + C * x * x * x)
 // return 1.0 / (1.0 + x * (A + C * x * x))
 approx_nexp :: proc(x: f32) -> (result: f32) {
    A :: 1.1566406
    C :: 0.53652346

    when ODIN_ARCH == .amd64 {
        result = intrinsics.simd_extract(rcpss(1.0 + x * (A + C * x * x)), 0)
    } else {
        result = 1.0 / (1.0 + x * (A + C * x * x))
    }

    // There's no built-in fast reciprocal intrinsic in odin yet.
    // use _mm_rcp_ss from xmmintrin.h in C.
    @(default_calling_convention = "none")
    foreign _ {
        @(link_name="llvm.x86.sse.rcp.ss")
        rcpss :: proc(x: #simd[4]f32) -> #simd[4]f32 ---
    }

    return result
 }


 native_exp :: proc(x: f32) -> f32 {
    return math.exp_f32(-x)
 }

 // Discrete integral over the absolute difference.
 error_area :: proc(start: f32 = 0.0, end: f32 = 5.0, step: f32 = 0.001) -> (error: f32) {
    x := start
    for x < end {
        error += step * abs(taylor_nexp(x) - native_exp(x)) / (1 + x)
        x += step
    }
    return error
 }

 // Sensible ranges for the parameters to brute force in.

 STEPS :: 1024

 A_RANGE :: [2]f32{0.9, 1.2}
 B_RANGE :: [2]f32{0.0,  0.1}
 C_RANGE :: [2]f32{0.1, 1}

 main :: proc() {
    fmt.println("Running")

    // Parameter optimization search
    if false {
        fmt.printfln("Taylor Error: %g (a=%g, c=%g)", error_area(), a, c)

        best_error := max(f32)
        best_a: f32
        best_b: f32
        best_c: f32

        for ai in 0..<STEPS {
            if ai % (STEPS / 4) == 0 {
                fmt.println(ai)
            }
            for ci in 0..<STEPS {
                a = A_RANGE[0] + A_RANGE[1] * f32(ai) / f32(STEPS)
                // b = B_RANGE[0] + B_RANGE[1] * f32(bi) / f32(STEPS)
                c = C_RANGE[0] + C_RANGE[1] * f32(ci) / f32(STEPS)

                err := error_area()

                if err < best_error {
                    best_error = err
                    best_a = a
                    best_b = b
                    best_c = c
                }
            }
        }

        fmt.println("Done")

        fmt.printfln("Brute Force Error: %g (a=%g, c=%g)", best_error, best_a, best_c)
    }

    NUM :: 1000_000
    STEPS :: 100_000
    STEP :: f32(NUM) / STEPS

    approx_dur: f64
    for _ in 0..<10 {
        start := time.tick_now()

        sum: f32
        for i in 0..<NUM {
            sum += STEP * approx_nexp(f32(i) * STEP)
        }

        dur := time.duration_microseconds(time.tick_since(start))
        fmt.printfln("Approx Duration: %.6f us, sum: %.6f", dur, sum)
        approx_dur += dur
    }

    precise_dur: f64
    for _ in 0..<10 {
        start := time.tick_now()

        sum: f32
        for i in 0..<NUM {
            sum += STEP * native_exp(f32(i) * STEP)
        }

        dur := time.duration_microseconds(time.tick_since(start))
        fmt.printfln("Precise Duration: %.6f us, sum: %.6f", dur, sum)
        precise_dur += dur
    }

    fmt.printfln("Total Precise Duration: %.6f us", precise_dur)
    fmt.printfln("Total Approx  Duration: %.6f us", approx_dur)
    fmt.printfln("Approx is %.6fx faster", precise_dur / approx_dur)
 }
	package foo

	import "base:intrinsics"
	import "core:math"
	import "core:fmt"
	import "core:time"

	// https://www.desmos.com/calculator/i17pexccum

	a: f32 = 1.0
	b: f32 = 1.0/4.0
	c: f32 = 1.0/6.0

	// Used as a baseline for the error comparisons
	taylor_nexp :: proc(x: f32) -> f32 {
	x2 := x * x
	return 1.0 / (1.0 + a * x + c * x * x * x)
	}

	// NOTE: the approximate reciprocal along with the reordered equation helps by like additional ~10%, nothing crazy.
	// Feel free to use one of the simpler variants:
	// return 1.0 / (1.0 + A * x + C * x * x * x)
	// return 1.0 / (1.0 + x * (A + C * x * x))
	approx_nexp :: proc(x: f32) -> (result: f32) {
	A :: 1.1566406
	C :: 0.53652346

	when ODIN_ARCH == .amd64 {
	result = intrinsics.simd_extract(rcpss(1.0 + x * (A + C * x * x)), 0)
	} else {
	result = 1.0 / (1.0 + x * (A + C * x * x))
	}

	// There's no built-in fast reciprocal intrinsic in odin yet.
	// use _mm_rcp_ss from xmmintrin.h in C.
	@(default_calling_convention = "none")
	foreign _ {
	@(link_name="llvm.x86.sse.rcp.ss")
	rcpss :: proc(x: #simd[4]f32) -> #simd[4]f32 ---
	}

	return result
	}


	native_exp :: proc(x: f32) -> f32 {
	return math.exp_f32(-x)
	}

	// Discrete integral over the absolute difference.
	error_area :: proc(start: f32 = 0.0, end: f32 = 5.0, step: f32 = 0.001) -> (error: f32) {
	x := start
	for x < end {
	error += step * abs(taylor_nexp(x) - native_exp(x)) / (1 + x)
	x += step
	}
	return error
	}

	// Sensible ranges for the parameters to brute force in.

	STEPS :: 1024

	A_RANGE :: [2]f32{0.9, 1.2}
	B_RANGE :: [2]f32{0.0, 0.1}
	C_RANGE :: [2]f32{0.1, 1}

	main :: proc() {
	fmt.println("Running")

	// Parameter optimization search
	if false {
	fmt.printfln("Taylor Error: %g (a=%g, c=%g)", error_area(), a, c)

	best_error := max(f32)
	best_a: f32
	best_b: f32
	best_c: f32

	for ai in 0..<STEPS {
	if ai % (STEPS / 4) == 0 {
	fmt.println(ai)
	}
	for ci in 0..<STEPS {
	a = A_RANGE[0] + A_RANGE[1] * f32(ai) / f32(STEPS)
	// b = B_RANGE[0] + B_RANGE[1] * f32(bi) / f32(STEPS)
	c = C_RANGE[0] + C_RANGE[1] * f32(ci) / f32(STEPS)

	err := error_area()

	if err < best_error {
	best_error = err
	best_a = a
	best_b = b
	best_c = c
	}
	}
	}

	fmt.println("Done")

	fmt.printfln("Brute Force Error: %g (a=%g, c=%g)", best_error, best_a, best_c)
	}

	NUM :: 1000_000
	STEPS :: 100_000
	STEP :: f32(NUM) / STEPS

	approx_dur: f64
	for _ in 0..<10 {
	start := time.tick_now()

	sum: f32
	for i in 0..<NUM {
	sum += STEP * approx_nexp(f32(i) * STEP)
	}

	dur := time.duration_microseconds(time.tick_since(start))
	fmt.printfln("Approx Duration: %.6f us, sum: %.6f", dur, sum)
	approx_dur += dur
	}

	precise_dur: f64
	for _ in 0..<10 {
	start := time.tick_now()

	sum: f32
	for i in 0..<NUM {
	sum += STEP * native_exp(f32(i) * STEP)
	}

	dur := time.duration_microseconds(time.tick_since(start))
	fmt.printfln("Precise Duration: %.6f us, sum: %.6f", dur, sum)
	precise_dur += dur
	}

	fmt.printfln("Total Precise Duration: %.6f us", precise_dur)
	fmt.printfln("Total Approx Duration: %.6f us", approx_dur)
	fmt.printfln("Approx is %.6fx faster", precise_dur / approx_dur)
	}
No results found