Last active
December 17, 2025 23:48
-
-
Save thoughtpolice/849234dfd4300ac73f56d56a8f4f29d8 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| float sum_avx512(const float* arr, size_t n) { | |
| __m512 acc = _mm512_setzero_ps(); | |
| size_t i = 0; | |
| for (; i + 16 <= n; i += 16) { | |
| __m512 v = _mm512_loadu_ps(&arr[i]); | |
| acc = _mm512_add_ps(acc, v); | |
| } | |
| float result = _mm512_reduce_add_ps(acc); | |
| // tail | |
| for (; i < n; i++) { | |
| result += arr[i]; | |
| } | |
| return result; | |
| } | |
| float sum_avx512_masked(const float* arr, size_t n) { | |
| __m512 acc = _mm512_setzero_ps(); | |
| size_t i = 0; | |
| for (; i + 16 <= n; i += 16) { | |
| acc = _mm512_add_ps(acc, _mm512_loadu_ps(&arr[i])); | |
| } | |
| if (i < n) { | |
| __mmask16 mask = (1U << (n - i)) - 1; | |
| acc = _mm512_add_ps(acc, _mm512_maskz_loadu_ps(mask, &arr[i])); | |
| } | |
| return _mm512_reduce_add_ps(acc); | |
| } |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #include <riscv_vector.h> | |
/*
 * Sum the n floats in arr using the RISC-V Vector extension (e32, LMUL = 1).
 *
 * The running total is kept in element 0 of a vector value. Each pass asks
 * vsetvl how many 32-bit elements to process (vl), loads that many, and
 * folds them together with the running total using the unordered sum
 * reduction, so no separate tail loop is needed.
 * Returns 0.0f when n is 0.
 */
float sum_rvv(const float* arr, size_t n) {
    vfloat32m1_t total = __riscv_vfmv_s_f_f32m1(0.0f, 1);
    const float* cursor = arr;
    size_t remaining = n;

    while (remaining > 0) {
        const size_t vl = __riscv_vsetvl_e32m1(remaining);
        const vfloat32m1_t chunk = __riscv_vle32_v_f32m1(cursor, vl);
        /* element 0 of the result = sum(chunk[0..vl)) + total[0] */
        total = __riscv_vfredusum_vs_f32m1_f32m1(chunk, total, vl);
        cursor += vl;
        remaining -= vl;
    }
    return __riscv_vfmv_f_s_f32m1_f32(total);
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #include <arm_sve.h> | |
/*
 * Sum the n floats in arr using Arm SVE predicated vectors.
 *
 * Each pass builds a predicate that is active for the lanes whose index is
 * still below n, loads those lanes, and adds them into the accumulator with
 * a merging add so that inactive lanes keep their previous partial sums.
 * The final partial vector is therefore handled by the same loop body.
 * All lanes are added together once at the end.
 * Returns 0.0f when n is 0.
 */
float sum_sve2(const float* arr, size_t n) {
    const size_t step = svcntw();   /* number of 32-bit lanes per vector */
    svfloat32_t lanes = svdup_f32(0.0f);
    size_t base = 0;

    while (base < n) {
        const svbool_t active = svwhilelt_b32((uint64_t)base, (uint64_t)n);
        const svfloat32_t chunk = svld1_f32(active, arr + base);
        lanes = svadd_f32_m(active, lanes, chunk);
        base += step;
    }
    return svaddv_f32(svptrue_b32(), lanes);
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment