Created
December 29, 2025 00:13
-
-
Save nmoinvaz/6a419e8f5c95928a26b524b83c4d9dba to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
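| // Polynomial for CRC-32 (IEEE 802.3): 0x04C11DB7 (bit-reflected: 0xEDB88320) | |
| // Note: 0x1EDC6F41 (bit-reflected: 0x82F63B78) is the CRC-32C (Castagnoli) polynomial, not IEEE 802.3 | |
| // We use the IEEE 802.3 polynomial, which is what the __crc32b/h/w/d instructions implement, for compatibility with zlib-ng | |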
| Z_INTERNAL Z_TARGET_CRC uint32_t crc32_armv8_pmull(uint32_t crc, const uint8_t *buf, size_t len) { | |
| uint32_t c = ~crc; | |
| // Constants for PMULL folding (from Intel whitepaper) | |
| const uint64x2_t k1 = {0x0154442bd4ULL, 0x00000001ULL}; | |
| const uint64x2_t k2 = {0xc6e41596ULL, 0x00000001ULL}; | |
| // Barrett constant for reduction (see whitepaper) | |
| const uint64x2_t barrett = {0x1db710641ULL, 0x1ULL}; | |
| // Align buffer to 16 bytes | |
| while (len && ((uintptr_t)buf & 0xF)) { | |
| c = __crc32b(c, *buf++); | |
| len--; | |
| } | |
| uint64x2_t crc128 = vdupq_n_u64(0); | |
| if (len >= 16) { | |
| // Load initial 16 bytes and xor with crc | |
| uint8x16_t block0 = vld1q_u8(buf); | |
| uint64x2_t block64_0 = vreinterpretq_u64_u8(block0); | |
| // Insert crc into lower 32 bits | |
| block64_0 = veorq_u64(block64_0, vsetq_lane_u64((uint64_t)c, vdupq_n_u64(0), 0)); | |
| crc128 = block64_0; | |
| buf += 16; | |
| len -= 16; | |
| // Unroll: process 4x16B = 64B per loop | |
| while (len >= 64) { | |
| uint8x16_t block1 = vld1q_u8(buf); | |
| uint8x16_t block2 = vld1q_u8(buf + 16); | |
| uint8x16_t block3 = vld1q_u8(buf + 32); | |
| uint8x16_t block4 = vld1q_u8(buf + 48); | |
| uint64x2_t b1 = vreinterpretq_u64_u8(block1); | |
| uint64x2_t b2 = vreinterpretq_u64_u8(block2); | |
| uint64x2_t b3 = vreinterpretq_u64_u8(block3); | |
| uint64x2_t b4 = vreinterpretq_u64_u8(block4); | |
| // Fold each block into crc128 | |
| crc128 = veorq_u64(crc128, b1); | |
| uint64x2_t t1 = vmull_p64(vgetq_lane_u64(crc128, 0), vgetq_lane_u64(k1, 0)); | |
| uint64x2_t t2 = vmull_p64(vgetq_lane_u64(crc128, 1), vgetq_lane_u64(k2, 0)); | |
| crc128 = veorq_u64(t1, t2); | |
| crc128 = veorq_u64(crc128, b2); | |
| t1 = vmull_p64(vgetq_lane_u64(crc128, 0), vgetq_lane_u64(k1, 0)); | |
| t2 = vmull_p64(vgetq_lane_u64(crc128, 1), vgetq_lane_u64(k2, 0)); | |
| crc128 = veorq_u64(t1, t2); | |
| crc128 = veorq_u64(crc128, b3); | |
| t1 = vmull_p64(vgetq_lane_u64(crc128, 0), vgetq_lane_u64(k1, 0)); | |
| t2 = vmull_p64(vgetq_lane_u64(crc128, 1), vgetq_lane_u64(k2, 0)); | |
| crc128 = veorq_u64(t1, t2); | |
| crc128 = veorq_u64(crc128, b4); | |
| t1 = vmull_p64(vgetq_lane_u64(crc128, 0), vgetq_lane_u64(k1, 0)); | |
| t2 = vmull_p64(vgetq_lane_u64(crc128, 1), vgetq_lane_u64(k2, 0)); | |
| crc128 = veorq_u64(t1, t2); | |
| buf += 64; | |
| len -= 64; | |
| } | |
| // Handle remaining 16B blocks | |
| while (len >= 16) { | |
| uint8x16_t block = vld1q_u8(buf); | |
| uint64x2_t block64 = vreinterpretq_u64_u8(block); | |
| crc128 = veorq_u64(crc128, block64); | |
| uint64x2_t t1 = vmull_p64(vgetq_lane_u64(crc128, 0), vgetq_lane_u64(k1, 0)); | |
| uint64x2_t t2 = vmull_p64(vgetq_lane_u64(crc128, 1), vgetq_lane_u64(k2, 0)); | |
| crc128 = veorq_u64(t1, t2); | |
| buf += 16; | |
| len -= 16; | |
| } | |
| // Reduce 128-bit to 64-bit | |
| uint64_t crc64 = vgetq_lane_u64(crc128, 0) ^ vgetq_lane_u64(crc128, 1); | |
| // Barrett reduction to 32-bit | |
| uint64_t q = (crc64 >> 32) * 0x1db710641ULL; | |
| c = (uint32_t)(crc64 ^ q); | |
| } | |
| // Process remaining bytes | |
| while (len >= 8) { | |
| uint64_t v; | |
| memcpy(&v, buf, 8); | |
| c = __crc32d(c, v); | |
| buf += 8; | |
| len -= 8; | |
| } | |
| if (len >= 4) { | |
| uint32_t v; | |
| memcpy(&v, buf, 4); | |
| c = __crc32w(c, v); | |
| buf += 4; | |
| len -= 4; | |
| } | |
| if (len >= 2) { | |
| uint16_t v; | |
| memcpy(&v, buf, 2); | |
| c = __crc32h(c, v); | |
| buf += 2; | |
| len -= 2; | |
| } | |
| if (len) { | |
| c = __crc32b(c, *buf); | |
| } | |
| return ~c; | |
| } | |
| #endif |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment