@nmoinvaz
Created December 29, 2025 00:13
// CRC-32 (IEEE 802.3): polynomial 0x04C11DB7, reflected 0xEDB88320.
// This is the polynomial implemented by the ARMv8 __crc32* instructions and by zlib-ng's crc32().
// The PMULL folding follows the reflected-CRC scheme from the Intel PCLMULQDQ whitepaper.
Z_INTERNAL Z_TARGET_CRC uint32_t crc32_armv8_pmull(uint32_t crc, const uint8_t *buf, size_t len) {
    uint32_t c = ~crc;
    // PMULL folding constants for shifting the CRC state by one 16-byte block (128 bits).
    // Bit-reflected constants for polynomial 0xEDB88320, as used in the Intel whitepaper
    // and the Linux kernel crc32-pclmul code: k1 = x^(128+32) mod P, k2 = x^(128-32) mod P.
    const uint64_t k1 = 0x1751997d0ULL;  // multiplies the low 64-bit lane
    const uint64_t k2 = 0x0ccaa009eULL;  // multiplies the high 64-bit lane
    // Align the buffer to 16 bytes using the scalar CRC32 instructions.
    while (len && ((uintptr_t)buf & 0xF)) {
        c = __crc32b(c, *buf++);
        len--;
    }
    if (len >= 16) {
        // Load the first 16 bytes and XOR the running CRC into the low 32 bits:
        // CRC32(c, data) == CRC32(0, data with c XORed into its first 4 bytes).
        // Lane/byte order assumes little-endian AArch64.
        uint64x2_t crc128 = vreinterpretq_u64_u8(vld1q_u8(buf));
        crc128 = veorq_u64(crc128, vsetq_lane_u64((uint64_t)c, vdupq_n_u64(0), 0));
        buf += 16;
        len -= 16;
        // Unrolled: fold four 16-byte blocks (64 bytes) per iteration.
        // Each fold shifts the accumulator forward by 128 bits and XORs in the next block:
        //   crc128 = clmul(crc128.lo, k1) ^ clmul(crc128.hi, k2) ^ block
        while (len >= 64) {
            uint64x2_t b1 = vreinterpretq_u64_u8(vld1q_u8(buf));
            uint64x2_t b2 = vreinterpretq_u64_u8(vld1q_u8(buf + 16));
            uint64x2_t b3 = vreinterpretq_u64_u8(vld1q_u8(buf + 32));
            uint64x2_t b4 = vreinterpretq_u64_u8(vld1q_u8(buf + 48));
            uint64x2_t t1, t2;
            t1 = vreinterpretq_u64_p128(vmull_p64((poly64_t)vgetq_lane_u64(crc128, 0), (poly64_t)k1));
            t2 = vreinterpretq_u64_p128(vmull_p64((poly64_t)vgetq_lane_u64(crc128, 1), (poly64_t)k2));
            crc128 = veorq_u64(veorq_u64(t1, t2), b1);
            t1 = vreinterpretq_u64_p128(vmull_p64((poly64_t)vgetq_lane_u64(crc128, 0), (poly64_t)k1));
            t2 = vreinterpretq_u64_p128(vmull_p64((poly64_t)vgetq_lane_u64(crc128, 1), (poly64_t)k2));
            crc128 = veorq_u64(veorq_u64(t1, t2), b2);
            t1 = vreinterpretq_u64_p128(vmull_p64((poly64_t)vgetq_lane_u64(crc128, 0), (poly64_t)k1));
            t2 = vreinterpretq_u64_p128(vmull_p64((poly64_t)vgetq_lane_u64(crc128, 1), (poly64_t)k2));
            crc128 = veorq_u64(veorq_u64(t1, t2), b3);
            t1 = vreinterpretq_u64_p128(vmull_p64((poly64_t)vgetq_lane_u64(crc128, 0), (poly64_t)k1));
            t2 = vreinterpretq_u64_p128(vmull_p64((poly64_t)vgetq_lane_u64(crc128, 1), (poly64_t)k2));
            crc128 = veorq_u64(veorq_u64(t1, t2), b4);
            buf += 64;
            len -= 64;
        }
        // Fold any remaining whole 16-byte blocks.
        while (len >= 16) {
            uint64x2_t block = vreinterpretq_u64_u8(vld1q_u8(buf));
            uint64x2_t t1 = vreinterpretq_u64_p128(vmull_p64((poly64_t)vgetq_lane_u64(crc128, 0), (poly64_t)k1));
            uint64x2_t t2 = vreinterpretq_u64_p128(vmull_p64((poly64_t)vgetq_lane_u64(crc128, 1), (poly64_t)k2));
            crc128 = veorq_u64(veorq_u64(t1, t2), block);
            buf += 16;
            len -= 16;
        }
        // Reduce the 128-bit accumulator to the 32-bit CRC register state.
        // The accumulator stands in for all of the data folded so far, so its CRC
        // (computed here with the scalar CRC32 instructions rather than a Barrett
        // reduction) is the register value to continue from.
        c = __crc32d(0, vgetq_lane_u64(crc128, 0));
        c = __crc32d(c, vgetq_lane_u64(crc128, 1));
    }
    // Process remaining bytes
    while (len >= 8) {
        uint64_t v;
        memcpy(&v, buf, 8);
        c = __crc32d(c, v);
        buf += 8;
        len -= 8;
    }
    if (len >= 4) {
        uint32_t v;
        memcpy(&v, buf, 4);
        c = __crc32w(c, v);
        buf += 4;
        len -= 4;
    }
    if (len >= 2) {
        uint16_t v;
        memcpy(&v, buf, 2);
        c = __crc32h(c, v);
        buf += 2;
        len -= 2;
    }
    if (len) {
        c = __crc32b(c, *buf);
    }
    return ~c;
}
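
// A minimal detection sketch, not part of the original gist: crc32_armv8_pmull()
// needs both the CRC32 and PMULL (crypto) extensions, so a caller would normally
// gate dispatch on a runtime check. getauxval()/HWCAP_* are Linux-specific; the
// helper name crc32_pmull_supported is hypothetical. Other platforms need their
// own detection (e.g. sysctl on macOS, IsProcessorFeaturePresent on Windows).
#if defined(__linux__) && defined(__aarch64__)
#include <sys/auxv.h>
#include <asm/hwcap.h>

static int crc32_pmull_supported(void) {
    unsigned long hwcap = getauxval(AT_HWCAP);
    return (hwcap & HWCAP_CRC32) != 0 && (hwcap & HWCAP_PMULL) != 0;
}
#endif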
#endif
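
// A self-test sketch, not part of the original gist, assuming this file is compiled
// for AArch64 with the crc and crypto features enabled. The guard macro
// CRC32_PMULL_SELFTEST is hypothetical. It checks the standard CRC-32 check value
// (CRC-32 of "123456789" is 0xCBF43926) and cross-checks the PMULL folding path
// against the same function driven one byte at a time, which only ever uses the
// scalar __crc32b path.
#ifdef CRC32_PMULL_SELFTEST
#include <stdio.h>
#include <stddef.h>
#include <stdint.h>

int main(void) {
    static const uint8_t check[] = "123456789";
    uint32_t small = crc32_armv8_pmull(0, check, 9);

    // Larger buffer to exercise the 64-byte and 16-byte folding loops.
    uint8_t big[1000];
    for (size_t i = 0; i < sizeof(big); i++)
        big[i] = (uint8_t)(i * 31 + 7);
    uint32_t folded = crc32_armv8_pmull(0, big, sizeof(big));
    uint32_t bytewise = 0;
    for (size_t i = 0; i < sizeof(big); i++)
        bytewise = crc32_armv8_pmull(bytewise, big + i, 1);

    printf("check=%08x (expect cbf43926) folded=%08x bytewise=%08x\n",
           (unsigned)small, (unsigned)folded, (unsigned)bytewise);
    return (small == 0xCBF43926u && folded == bytewise) ? 0 : 1;
}
#endif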