Created December 9, 2025 20:34
crc32_armv8_pmull_single_lane
Z_INTERNAL Z_TARGET_PMULL uint32_t crc32_armv8_pmull_single_lane(uint32_t crc, const uint8_t *buf, size_t len) {
    uint32_t crc0 = ~crc;
    /* 1. Align to an 8-byte boundary (scalar, one byte at a time) */
    for (; len && ((uintptr_t)buf & 7); --len) {
        crc0 = __crc32b(crc0, *buf++);
    }
    /* 2. Align to a 16-byte boundary (one 8-byte scalar CRC) */
    if (((uintptr_t)buf & 8) && len >= 8) {
        crc0 = __crc32d(crc0, *(const uint64_t *)buf);
        buf += 8;
        len -= 8;
    }
    /* 3a. Medium buffer path: 2-way PMULL folding (32 bytes/iter) */
    if (len >= 32) {
        uint64x2_t x0 = vld1q_u64((const uint64_t *)buf);
        uint64x2_t x1 = vld1q_u64((const uint64_t *)(buf + 16));
        uint64x2_t k;
        /*
         * Constants for the 256-bit fold (stride 32 bytes).
         * k = { x^(256+64) mod P, x^256 mod P }
         */
        { static const uint64_t ALIGNED_(16) k_[] = {0xf1da05aa, 0x81256527}; k = vld1q_u64(k_); }
        /* Mix the current CRC into the first vector (x0) */
        x0 = veorq_u64((uint64x2_t){crc0, 0}, x0);
        buf += 32;
        len -= 32;
        /* Fold 32 bytes at a time using 2 interleaved accumulators */
        while (len >= 32) {
            /*
             * Logic:
             *   x0_new = (x0 * k) ^ next_chunk_0
             *   x1_new = (x1 * k) ^ next_chunk_1
             */
            /* Multiply the low halves and XOR in the next chunks */
            uint64x2_t t0 = clmul_lo_e(x0, k, vld1q_u64((const uint64_t *)buf));
            uint64x2_t t1 = clmul_lo_e(x1, k, vld1q_u64((const uint64_t *)(buf + 16)));
            /* Multiply the high halves and XOR with the low results */
            x0 = clmul_hi_e(x0, k, t0);
            x1 = clmul_hi_e(x1, k, t1);
            buf += 32;
            len -= 32;
        }
        /*
         * Fold the 2 accumulators into 1: fold x0 forward by 128 bits
         * (16 bytes) so that it overlaps with x1. The 128-bit fold
         * constants are reused here.
         */
        { static const uint64_t ALIGNED_(16) k_[] = {0xae689191, 0xccaa009e}; k = vld1q_u64(k_); }
        /* x0 = (x0 * k_128) ^ x1 */
        uint64x2_t t0 = clmul_lo_e(x0, k, x1);
        x0 = clmul_hi_e(x0, k, t0);
        /*
         * x0 now holds the combined 128-bit result. Reduce 128-bit -> 32-bit
         * with the scalar CRC instruction; the accumulator starts at 0
         * because the CRC state was already XORed into the vector above.
         */
        crc0 = __crc32d(0, vgetq_lane_u64(x0, 0));
        crc0 = __crc32d(crc0, vgetq_lane_u64(x0, 1));
    }
    /* 3b. Small buffer path: 1-way PMULL folding (16 bytes/iter) */
    if (len >= 16) {
        uint64x2_t x0 = vld1q_u64((const uint64_t *)buf);
        uint64x2_t k;
        /*
         * Constants for the 128-bit fold (stride 16 bytes). These match the
         * reduction constants from the multi-lane code.
         * k = { x^(128+64) mod P, x^128 mod P } (bit-reflected)
         */
        { static const uint64_t ALIGNED_(16) k_[] = {0xae689191, 0xccaa009e}; k = vld1q_u64(k_); }
        /* Mix the current CRC into the vector */
        x0 = veorq_u64((uint64x2_t){crc0, 0}, x0);
        buf += 16;
        len -= 16;
        /* Fold 16 bytes at a time */
        while (len >= 16) {
            uint64x2_t next = vld1q_u64((const uint64_t *)buf);
            /*
             * Logic: x0_new = (x0 * k) ^ next,
             * split into low and high halves via the clmul helpers.
             */
            /* t0 = (x0_lo * k_lo) ^ next */
            uint64x2_t t0 = clmul_lo_e(x0, k, next);
            /* x0 = (x0_hi * k_hi) ^ t0 */
            x0 = clmul_hi_e(x0, k, t0);
            buf += 16;
            len -= 16;
        }
        /*
         * Final reduction: 128-bit -> 32-bit. With only one vector (x0),
         * the "reduce 2 vectors to 1" step is skipped and the vector is
         * drained directly with scalar CRC instructions.
         */
        crc0 = __crc32d(0, vgetq_lane_u64(x0, 0));
        crc0 = __crc32d(crc0, vgetq_lane_u64(x0, 1));
    }
    /* 4. Process remaining 8-byte chunks (scalar) */
    for (; len >= 8; buf += 8, len -= 8)
        crc0 = __crc32d(crc0, *(const uint64_t *)buf);
    /* 5. Process remaining bytes (scalar) */
    for (; len; --len)
        crc0 = __crc32b(crc0, *buf++);
    return ~crc0;
}
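The function relies on clmul_lo_e, clmul_hi_e, ALIGNED_, Z_INTERNAL and Z_TARGET_PMULL, none of which are defined in this gist. For a standalone build, the definitions would need to precede the function above; a minimal sketch in the style of corsix/fast-crc32's fused PMULL+EOR helpers might look like the following. The macro stubs are assumptions for illustration, not zlib-ng's actual definitions.

#include <arm_acle.h>   /* __crc32b / __crc32d */
#include <arm_neon.h>   /* vld1q_u64, veorq_u64, vgetq_lane_u64 */

/* Stubs for the zlib-ng internals used above (assumptions) */
#define Z_INTERNAL
#define Z_TARGET_PMULL __attribute__((target("+crc+crypto")))
#define ALIGNED_(x)    __attribute__((aligned(x)))

/* Fused carry-less multiply + XOR, as in corsix/fast-crc32:
 *   clmul_lo_e(a, b, c) = (a.lo clmul b.lo) ^ c
 *   clmul_hi_e(a, b, c) = (a.hi clmul b.hi) ^ c
 * Inline asm keeps the PMULL and EOR adjacent so they can fuse. */
static inline uint64x2_t clmul_lo_e(uint64x2_t a, uint64x2_t b, uint64x2_t c) {
    uint64x2_t r;
    __asm__("pmull %0.1q, %2.1d, %3.1d\n"
            "eor %0.16b, %0.16b, %1.16b\n"
            : "=w"(r), "+w"(c) : "w"(a), "w"(b));
    return r;
}

static inline uint64x2_t clmul_hi_e(uint64x2_t a, uint64x2_t b, uint64x2_t c) {
    uint64x2_t r;
    __asm__("pmull2 %0.1q, %2.2d, %3.2d\n"
            "eor %0.16b, %0.16b, %1.16b\n"
            : "=w"(r), "+w"(c) : "w"(a), "w"(b));
    return r;
}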
Author
If it doesn't work well, consider removing the len >= 32 case and trying only the len >= 16 case.
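Either way, a quick sanity check helps when experimenting with the paths: compare against a bitwise CRC-32 reference (reflected polynomial 0xEDB88320, the one zlib uses) across lengths and alignments. A minimal sketch, assuming the gist function and its helpers are compiled in the same file:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Bitwise reference CRC-32 (reflected, polynomial 0xEDB88320) */
static uint32_t crc32_bitwise(uint32_t crc, const uint8_t *buf, size_t len) {
    crc = ~crc;
    while (len--) {
        crc ^= *buf++;
        for (int k = 0; k < 8; k++)
            crc = (crc >> 1) ^ (0xEDB88320u & (0u - (crc & 1)));
    }
    return ~crc;
}

int main(void) {
    static uint8_t data[4096];
    for (size_t i = 0; i < sizeof(data); i++)
        data[i] = (uint8_t)((i * 2654435761u) >> 24);
    /* Sweep offsets and lengths to exercise the alignment,
       32-byte, 16-byte and scalar tail paths. */
    for (size_t off = 0; off < 16; off++) {
        for (size_t len = 0; len <= 512; len++) {
            uint32_t want = crc32_bitwise(0, data + off, len);
            uint32_t got  = crc32_armv8_pmull_single_lane(0, data + off, len);
            if (want != got) {
                printf("mismatch off=%zu len=%zu: %08x != %08x\n",
                       off, len, (unsigned)want, (unsigned)got);
                return 1;
            }
        }
    }
    puts("ok");
    return 0;
}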
I saw at least a 50% improvement compared to the standard crc32_armv8 on M3. But it has multiple PMULL execution units...
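For context, a scalar-only baseline in the spirit of that comparison might look like the sketch below (an illustration, not zlib-ng's actual crc32_armv8; same includes as above):

/* Scalar CRC32 baseline: byte-align, then 8 bytes per __crc32d */
static uint32_t crc32_armv8_scalar(uint32_t crc, const uint8_t *buf, size_t len) {
    uint32_t c = ~crc;
    for (; len && ((uintptr_t)buf & 7); --len)
        c = __crc32b(c, *buf++);
    for (; len >= 8; buf += 8, len -= 8)
        c = __crc32d(c, *(const uint64_t *)buf);
    for (; len; --len)
        c = __crc32b(c, *buf++);
    return ~c;
}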