@nmoinvaz
Created December 9, 2025 20:34
crc32_armv8_pmull_single_lane
Z_INTERNAL Z_TARGET_PMULL uint32_t crc32_armv8_pmull_single_lane(uint32_t crc, const uint8_t *buf, size_t len) {
    uint32_t crc0 = ~crc;

    /* 1. Align to an 8-byte boundary (scalar) */
    for (; len && ((uintptr_t)buf & 7); --len) {
        crc0 = __crc32b(crc0, *buf++);
    }

    /* 2. Align to a 16-byte boundary (8-byte scalar CRC) */
    if (((uintptr_t)buf & 8) && len >= 8) {
        crc0 = __crc32d(crc0, *(const uint64_t *)buf);
        buf += 8;
        len -= 8;
    }

    /* 3a. Medium buffer path: 2-way PMULL folding (32 bytes/iter) */
    if (len >= 32) {
        uint64x2_t x0 = vld1q_u64((const uint64_t *)buf);
        uint64x2_t x1 = vld1q_u64((const uint64_t *)(buf + 16));
        uint64x2_t k;
        /*
         * Constants for the 256-bit fold (stride 32 bytes).
         * k = { x^(256+64) mod P, x^256 mod P }
         */
        { static const uint64_t ALIGNED_(16) k_[] = {0xf1da05aa, 0x81256527}; k = vld1q_u64(k_); }
        /* Mix the current CRC into the first vector (x0) */
        x0 = veorq_u64((uint64x2_t){crc0, 0}, x0);
        buf += 32;
        len -= 32;

        /* Fold 32 bytes at a time using 2 interleaved accumulators */
        while (len >= 32) {
            /*
             * Logic:
             *   x0_new = (x0 * k) ^ next_chunk_0
             *   x1_new = (x1 * k) ^ next_chunk_1
             */
            /* Calculate the lo parts */
            uint64x2_t t0 = clmul_lo_e(x0, k, vld1q_u64((const uint64_t *)buf));
            uint64x2_t t1 = clmul_lo_e(x1, k, vld1q_u64((const uint64_t *)(buf + 16)));
            /* Calculate the hi parts and XOR them with the lo parts */
            x0 = clmul_hi_e(x0, k, t0);
            x1 = clmul_hi_e(x1, k, t1);
            buf += 32;
            len -= 32;
        }

        /*
         * Fold the 2 accumulators into 1: fold x0 forward by 128 bits (16 bytes)
         * so it overlaps with x1, reusing the 128-bit fold constants.
         */
        { static const uint64_t ALIGNED_(16) k_[] = {0xae689191, 0xccaa009e}; k = vld1q_u64(k_); }
        /* x0 = (x0 * k_128) ^ x1 */
        uint64x2_t t0 = clmul_lo_e(x0, k, x1);
        x0 = clmul_hi_e(x0, k, t0);

        /*
         * x0 now holds the combined 128-bit state; reduce it to 32 bits by
         * draining the vector with the scalar CRC instructions.
         */
        crc0 = __crc32d(0, vgetq_lane_u64(x0, 0));
        crc0 = __crc32d(crc0, vgetq_lane_u64(x0, 1));
    }

    /* 3b. Small buffer / tail path: 1-way PMULL folding (16 bytes/iter) */
    if (len >= 16) {
        uint64x2_t x0 = vld1q_u64((const uint64_t *)buf);
        uint64x2_t k;
        /*
         * Constants for the 128-bit fold (stride 16 bytes).
         * These match the fold constants used by the multi-lane code.
         * k = { x^(128+64) mod P, x^128 mod P } (bit-reflected)
         */
        { static const uint64_t ALIGNED_(16) k_[] = {0xae689191, 0xccaa009e}; k = vld1q_u64(k_); }
        /* Mix the current CRC into the first vector */
        x0 = veorq_u64((uint64x2_t){crc0, 0}, x0);
        buf += 16;
        len -= 16;

        /* Fold 16 bytes at a time */
        while (len >= 16) {
            uint64x2_t next = vld1q_u64((const uint64_t *)buf);
            /*
             * Logic: x0_new = (x0 * k) ^ next,
             * split into low and high parts using the clmul helpers.
             */
            /* t0 = (x0_lo * k_lo) ^ next */
            uint64x2_t t0 = clmul_lo_e(x0, k, next);
            /* x0 = (x0_hi * k_hi) ^ t0 */
            x0 = clmul_hi_e(x0, k, t0);
            buf += 16;
            len -= 16;
        }

        /*
         * Final reduction: 128-bit -> 32-bit. With only one accumulator (x0),
         * the "reduce 2 vectors to 1" step is skipped and the vector is drained
         * directly with the scalar CRC instructions.
         */
        crc0 = __crc32d(0, vgetq_lane_u64(x0, 0));
        crc0 = __crc32d(crc0, vgetq_lane_u64(x0, 1));
    }

    /* 4. Process remaining 8-byte chunks (scalar) */
    for (; len >= 8; buf += 8, len -= 8)
        crc0 = __crc32d(crc0, *(const uint64_t *)buf);

    /* 5. Process remaining trailing bytes (scalar) */
    for (; len; --len)
        crc0 = __crc32b(crc0, *buf++);

    return ~crc0;
}
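
The listing relies on zlib-ng internals that are not included in the gist: the Z_INTERNAL, Z_TARGET_PMULL, and ALIGNED_ macros, and the clmul_lo_e / clmul_hi_e helpers, which carryless-multiply one pair of 64-bit lanes and XOR the result with a third operand. The stand-ins below are a minimal sketch for illustration, not the exact zlib-ng definitions; in a real translation unit they would appear above the function, and the target attribute (or compiling with -march=armv8-a+crc+crypto) is an assumption about the toolchain.

#include <stddef.h>
#include <stdint.h>
#include <arm_acle.h>
#include <arm_neon.h>

/* Illustrative stand-ins for the zlib-ng macros used above. */
#ifndef Z_INTERNAL
#  define Z_INTERNAL
#endif
#ifndef Z_TARGET_PMULL
#  define Z_TARGET_PMULL __attribute__((target("+crc+crypto")))
#endif
#ifndef ALIGNED_
#  define ALIGNED_(x) __attribute__((aligned(x)))
#endif

/* (a_lo clmul b_lo) ^ c : carryless multiply of the low 64-bit lanes, then XOR. */
static inline Z_TARGET_PMULL uint64x2_t clmul_lo_e(uint64x2_t a, uint64x2_t b, uint64x2_t c) {
    return veorq_u64(vreinterpretq_u64_p128(
        vmull_p64((poly64_t)vgetq_lane_u64(a, 0), (poly64_t)vgetq_lane_u64(b, 0))), c);
}

/* (a_hi clmul b_hi) ^ c : carryless multiply of the high 64-bit lanes, then XOR. */
static inline Z_TARGET_PMULL uint64x2_t clmul_hi_e(uint64x2_t a, uint64x2_t b, uint64x2_t c) {
    return veorq_u64(vreinterpretq_u64_p128(
        vmull_high_p64(vreinterpretq_p64_u64(a), vreinterpretq_p64_u64(b))), c);
}

The hi variant uses vmull_high_p64 so the upper lanes can be multiplied without an extra lane extract, which is the usual reason the fold is split into lo/hi helper calls.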
@nmoinvaz (Author)
I saw at least a 50% improvement compared to the standard crc32_armv8 on an M3. But the M3 has multiple PMULL execution units, so the gain may be smaller on cores that have fewer.
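
For anyone reproducing that comparison, a simple timing loop over a large buffer is usually enough. The harness below is a hypothetical sketch: crc32_armv8 is used as a stand-in name for the baseline implementation being compared against, and both functions are assumed to be compiled and linked in separately.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

/* Hypothetical prototypes; crc32_armv8 stands in for the baseline implementation. */
uint32_t crc32_armv8(uint32_t crc, const uint8_t *buf, size_t len);
uint32_t crc32_armv8_pmull_single_lane(uint32_t crc, const uint8_t *buf, size_t len);

/* Time `iters` calls of `fn` over `buf` and return elapsed seconds. */
static double bench(uint32_t (*fn)(uint32_t, const uint8_t *, size_t),
                    const uint8_t *buf, size_t len, int iters) {
    struct timespec t0, t1;
    volatile uint32_t crc = 0;
    clock_gettime(CLOCK_MONOTONIC, &t0);
    for (int i = 0; i < iters; i++)
        crc = fn(crc, buf, len);
    clock_gettime(CLOCK_MONOTONIC, &t1);
    (void)crc;
    return (double)(t1.tv_sec - t0.tv_sec) + (double)(t1.tv_nsec - t0.tv_nsec) * 1e-9;
}

int main(void) {
    size_t len = 1u << 20;   /* 1 MiB buffer */
    int iters = 2000;
    uint8_t *buf = malloc(len);
    if (!buf) return 1;
    for (size_t i = 0; i < len; i++)
        buf[i] = (uint8_t)(i * 31u);
    double a = bench(crc32_armv8, buf, len, iters);
    double b = bench(crc32_armv8_pmull_single_lane, buf, len, iters);
    printf("baseline: %.0f MB/s, pmull: %.0f MB/s\n",
           (double)iters * len / a / 1e6, (double)iters * len / b / 1e6);
    free(buf);
    return 0;
}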

@nmoinvaz (Author)
If it doesn't work well, consider removing the len >= 32 case and trying only the len >= 16 case.
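
For that experiment, a stripped-down variant keeping only the 16-byte (1-accumulator) fold plus the scalar tails could look like the sketch below. The name crc32_armv8_pmull_16only is illustrative, and it assumes the same helpers, macros, and fold constants as the listing above.

Z_INTERNAL Z_TARGET_PMULL uint32_t crc32_armv8_pmull_16only(uint32_t crc, const uint8_t *buf, size_t len) {
    uint32_t crc0 = ~crc;
    /* Align to an 8-byte boundary (scalar). */
    for (; len && ((uintptr_t)buf & 7); --len)
        crc0 = __crc32b(crc0, *buf++);
    /* Bump to a 16-byte boundary with one 8-byte scalar CRC. */
    if (((uintptr_t)buf & 8) && len >= 8) {
        crc0 = __crc32d(crc0, *(const uint64_t *)buf);
        buf += 8;
        len -= 8;
    }
    if (len >= 16) {
        uint64x2_t x0 = vld1q_u64((const uint64_t *)buf);
        uint64x2_t k;
        /* 128-bit fold constants: { x^(128+64) mod P, x^128 mod P } (bit-reflected). */
        { static const uint64_t ALIGNED_(16) k_[] = {0xae689191, 0xccaa009e}; k = vld1q_u64(k_); }
        x0 = veorq_u64((uint64x2_t){crc0, 0}, x0);
        buf += 16;
        len -= 16;
        /* Fold 16 bytes at a time with a single accumulator. */
        while (len >= 16) {
            uint64x2_t next = vld1q_u64((const uint64_t *)buf);
            uint64x2_t t0 = clmul_lo_e(x0, k, next);
            x0 = clmul_hi_e(x0, k, t0);
            buf += 16;
            len -= 16;
        }
        /* Reduce the 128-bit state to 32 bits with the scalar CRC instructions. */
        crc0 = __crc32d(0, vgetq_lane_u64(x0, 0));
        crc0 = __crc32d(crc0, vgetq_lane_u64(x0, 1));
    }
    for (; len >= 8; buf += 8, len -= 8)
        crc0 = __crc32d(crc0, *(const uint64_t *)buf);
    for (; len; --len)
        crc0 = __crc32b(crc0, *buf++);
    return ~crc0;
}

The only change relative to the original is dropping the len >= 32 block; the alignment steps, the 16-byte fold, and the final reduction are unchanged.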
