Skip to content

Instantly share code, notes, and snippets.

@thoughtpolice
Last active December 17, 2025 23:48
Show Gist options
  • Select an option

  • Save thoughtpolice/849234dfd4300ac73f56d56a8f4f29d8 to your computer and use it in GitHub Desktop.

Select an option

Save thoughtpolice/849234dfd4300ac73f56d56a8f4f29d8 to your computer and use it in GitHub Desktop.
float sum_avx512(const float* arr, size_t n) {
__m512 acc = _mm512_setzero_ps();
size_t i = 0;
for (; i + 16 <= n; i += 16) {
__m512 v = _mm512_loadu_ps(&arr[i]);
acc = _mm512_add_ps(acc, v);
}
float result = _mm512_reduce_add_ps(acc);
// tail
for (; i < n; i++) {
result += arr[i];
}
return result;
}
float sum_avx512_masked(const float* arr, size_t n) {
__m512 acc = _mm512_setzero_ps();
size_t i = 0;
for (; i + 16 <= n; i += 16) {
acc = _mm512_add_ps(acc, _mm512_loadu_ps(&arr[i]));
}
if (i < n) {
__mmask16 mask = (1U << (n - i)) - 1;
acc = _mm512_add_ps(acc, _mm512_maskz_loadu_ps(mask, &arr[i]));
}
return _mm512_reduce_add_ps(acc);
}
#include <riscv_vector.h>
// Sum the first n floats of arr using the RISC-V Vector extension (LMUL = 1).
//
// The array is processed in strips: each pass asks vsetvl how many 32-bit
// elements to handle (at most the number still remaining), loads that many,
// and folds them together with the running total via vfredusum. The running
// total lives in element 0 of a vector register, seeded with 0.0f.
//
// arr: pointer to at least n readable floats.
// n:   element count; n == 0 skips the loop and returns 0.0f.
// Returns the single-precision sum of arr[0..n-1].
float sum_rvv(const float* arr, size_t n) {
    // Element 0 holds the scalar running total.
    vfloat32m1_t total = __riscv_vfmv_s_f_f32m1(0.0f, 1);
    size_t done = 0;

    while (done < n) {
        const size_t vl = __riscv_vsetvl_e32m1(n - done);
        vfloat32m1_t strip = __riscv_vle32_v_f32m1(arr + done, vl);
        // total[0] = total[0] + sum(strip[0..vl-1])
        total = __riscv_vfredusum_vs_f32m1_f32m1(strip, total, vl);
        done += vl;
    }

    return __riscv_vfmv_f_s_f32m1_f32(total);
}
#include <arm_sve.h>
// Sum the first n floats of arr using Arm SVE predicated vectors.
//
// Each pass builds a predicate that is active for lanes whose index
// (pos + lane) is below n, loads under that predicate, and adds into the
// accumulator with merging semantics so inactive lanes keep their previous
// accumulator value. The step is svcntw(), the number of 32-bit lanes per
// vector. A final all-lanes horizontal add yields the scalar.
//
// arr: pointer to at least n readable floats.
// n:   element count; n == 0 skips the loop and returns 0.0f.
// Returns the single-precision sum of arr[0..n-1].
float sum_sve2(const float* arr, size_t n) {
    svfloat32_t lane_sums = svdup_f32(0.0f);
    size_t pos = 0;

    while (pos < n) {
        svbool_t active = svwhilelt_b32((uint64_t)pos, (uint64_t)n);
        svfloat32_t chunk = svld1_f32(active, arr + pos);
        lane_sums = svadd_f32_m(active, lane_sums, chunk);
        pos += svcntw();
    }

    return svaddv_f32(svptrue_b32(), lane_sums);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment