@nmoinvaz
Created December 9, 2025 20:34
crc32_armv8_pmull_single_lane
Z_INTERNAL Z_TARGET_PMULL uint32_t crc32_armv8_pmull_single_lane(uint32_t crc, const uint8_t *buf, size_t len) {
    uint32_t crc0 = ~crc;

    /* 1. Align to an 8-byte boundary (scalar) */
    for (; len && ((uintptr_t)buf & 7); --len) {
        crc0 = __crc32b(crc0, *buf++);
    }

    /* 2. Align to a 16-byte boundary (8-byte scalar CRC) */
    if (((uintptr_t)buf & 8) && len >= 8) {
        crc0 = __crc32d(crc0, *(const uint64_t *)buf);
        buf += 8;
        len -= 8;
    }

    /* 3a. Medium buffer path: 2-way PMULL folding (32 bytes/iter) */
    if (len >= 32) {
        uint64x2_t x0 = vld1q_u64((const uint64_t *)buf);
        uint64x2_t x1 = vld1q_u64((const uint64_t *)(buf + 16));
        uint64x2_t k;
        /*
         * Constants for the 256-bit fold (stride 32 bytes).
         * k = { x^(256+64) mod P, x^256 mod P }
         */
        { static const uint64_t ALIGNED_(16) k_[] = {0xf1da05aa, 0x81256527}; k = vld1q_u64(k_); }
        /* Mix the current CRC into the first vector (x0) */
        x0 = veorq_u64((uint64x2_t){crc0, 0}, x0);
        buf += 32;
        len -= 32;

        /* Fold 32 bytes at a time using 2 interleaved accumulators */
        while (len >= 32) {
            /*
             * Logic:
             *   x0_new = (x0 * k) ^ next_chunk_0
             *   x1_new = (x1 * k) ^ next_chunk_1
             */
            /* Calculate the lo parts */
            uint64x2_t t0 = clmul_lo_e(x0, k, vld1q_u64((const uint64_t *)buf));
            uint64x2_t t1 = clmul_lo_e(x1, k, vld1q_u64((const uint64_t *)(buf + 16)));
            /* Calculate the hi parts and XOR them with the lo parts */
            x0 = clmul_hi_e(x0, k, t0);
            x1 = clmul_hi_e(x1, k, t1);
            buf += 32;
            len -= 32;
        }

        /*
         * Fold the 2 accumulators into 1: fold x0 forward by 128 bits (16 bytes)
         * so it overlaps with x1, reusing the 128-bit fold constants.
         */
        { static const uint64_t ALIGNED_(16) k_[] = {0xae689191, 0xccaa009e}; k = vld1q_u64(k_); }
        /* x0 = (x0 * k_128) ^ x1 */
        uint64x2_t t0 = clmul_lo_e(x0, k, x1);
        x0 = clmul_hi_e(x0, k, t0);

        /*
         * x0 now holds the combined 128-bit state; reduce it to 32 bits by
         * draining the vector with the scalar CRC instructions.
         */
        crc0 = __crc32d(0, vgetq_lane_u64(x0, 0));
        crc0 = __crc32d(crc0, vgetq_lane_u64(x0, 1));
    }

    /* 3b. Small buffer / tail path: 1-way PMULL folding (16 bytes/iter) */
    if (len >= 16) {
        uint64x2_t x0 = vld1q_u64((const uint64_t *)buf);
        uint64x2_t k;
        /*
         * Constants for the 128-bit fold (stride 16 bytes).
         * These match the fold constants used by the multi-lane code.
         * k = { x^(128+64) mod P, x^128 mod P } (bit-reflected)
         */
        { static const uint64_t ALIGNED_(16) k_[] = {0xae689191, 0xccaa009e}; k = vld1q_u64(k_); }
        /* Mix the current CRC into the first vector */
        x0 = veorq_u64((uint64x2_t){crc0, 0}, x0);
        buf += 16;
        len -= 16;

        /* Fold 16 bytes at a time */
        while (len >= 16) {
            uint64x2_t next = vld1q_u64((const uint64_t *)buf);
            /*
             * Logic: x0_new = (x0 * k) ^ next,
             * split into low and high parts using the clmul helpers.
             */
            /* t0 = (x0_lo * k_lo) ^ next */
            uint64x2_t t0 = clmul_lo_e(x0, k, next);
            /* x0 = (x0_hi * k_hi) ^ t0 */
            x0 = clmul_hi_e(x0, k, t0);
            buf += 16;
            len -= 16;
        }

        /*
         * Final reduction: 128-bit -> 32-bit. With only one accumulator (x0),
         * the "reduce 2 vectors to 1" step is skipped and the vector is drained
         * directly with the scalar CRC instructions.
         */
        crc0 = __crc32d(0, vgetq_lane_u64(x0, 0));
        crc0 = __crc32d(crc0, vgetq_lane_u64(x0, 1));
    }

    /* 4. Process remaining 8-byte chunks (scalar) */
    for (; len >= 8; buf += 8, len -= 8)
        crc0 = __crc32d(crc0, *(const uint64_t *)buf);

    /* 5. Process remaining trailing bytes (scalar) */
    for (; len; --len)
        crc0 = __crc32b(crc0, *buf++);

    return ~crc0;
}
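
The listing relies on zlib-ng internals that are not included in the gist: the Z_INTERNAL, Z_TARGET_PMULL, and ALIGNED_ macros, and the clmul_lo_e / clmul_hi_e helpers, which carryless-multiply one pair of 64-bit lanes and XOR the result with a third operand. The stand-ins below are a minimal sketch for illustration, not the exact zlib-ng definitions; in a real translation unit they would appear above the function, and the target attribute (or compiling with -march=armv8-a+crc+crypto) is an assumption about the toolchain.

#include <stddef.h>
#include <stdint.h>
#include <arm_acle.h>
#include <arm_neon.h>

/* Illustrative stand-ins for the zlib-ng macros used above. */
#ifndef Z_INTERNAL
#  define Z_INTERNAL
#endif
#ifndef Z_TARGET_PMULL
#  define Z_TARGET_PMULL __attribute__((target("+crc+crypto")))
#endif
#ifndef ALIGNED_
#  define ALIGNED_(x) __attribute__((aligned(x)))
#endif

/* (a_lo clmul b_lo) ^ c : carryless multiply of the low 64-bit lanes, then XOR. */
static inline Z_TARGET_PMULL uint64x2_t clmul_lo_e(uint64x2_t a, uint64x2_t b, uint64x2_t c) {
    return veorq_u64(vreinterpretq_u64_p128(
        vmull_p64((poly64_t)vgetq_lane_u64(a, 0), (poly64_t)vgetq_lane_u64(b, 0))), c);
}

/* (a_hi clmul b_hi) ^ c : carryless multiply of the high 64-bit lanes, then XOR. */
static inline Z_TARGET_PMULL uint64x2_t clmul_hi_e(uint64x2_t a, uint64x2_t b, uint64x2_t c) {
    return veorq_u64(vreinterpretq_u64_p128(
        vmull_high_p64(vreinterpretq_p64_u64(a), vreinterpretq_p64_u64(b))), c);
}

The hi variant uses vmull_high_p64 so the upper lanes can be multiplied without an extra lane extract, which is the usual reason the fold is split into lo/hi helper calls.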
@nmoinvaz (Author)
I saw at least a 50% improvement compared to the standard crc32_armv8 on an M3. But the M3 has multiple PMULL execution units, so the gain may be smaller on cores that have fewer.
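
For anyone reproducing that comparison, a simple timing loop over a large buffer is usually enough. The harness below is a hypothetical sketch: crc32_armv8 is used as a stand-in name for the baseline implementation being compared against, and both functions are assumed to be compiled and linked in separately.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

/* Hypothetical prototypes; crc32_armv8 stands in for the baseline implementation. */
uint32_t crc32_armv8(uint32_t crc, const uint8_t *buf, size_t len);
uint32_t crc32_armv8_pmull_single_lane(uint32_t crc, const uint8_t *buf, size_t len);

/* Time `iters` calls of `fn` over `buf` and return elapsed seconds. */
static double bench(uint32_t (*fn)(uint32_t, const uint8_t *, size_t),
                    const uint8_t *buf, size_t len, int iters) {
    struct timespec t0, t1;
    volatile uint32_t crc = 0;
    clock_gettime(CLOCK_MONOTONIC, &t0);
    for (int i = 0; i < iters; i++)
        crc = fn(crc, buf, len);
    clock_gettime(CLOCK_MONOTONIC, &t1);
    (void)crc;
    return (double)(t1.tv_sec - t0.tv_sec) + (double)(t1.tv_nsec - t0.tv_nsec) * 1e-9;
}

int main(void) {
    size_t len = 1u << 20;   /* 1 MiB buffer */
    int iters = 2000;
    uint8_t *buf = malloc(len);
    if (!buf) return 1;
    for (size_t i = 0; i < len; i++)
        buf[i] = (uint8_t)(i * 31u);
    double a = bench(crc32_armv8, buf, len, iters);
    double b = bench(crc32_armv8_pmull_single_lane, buf, len, iters);
    printf("baseline: %.0f MB/s, pmull: %.0f MB/s\n",
           (double)iters * len / a / 1e6, (double)iters * len / b / 1e6);
    free(buf);
    return 0;
}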

@nmoinvaz (Author)
If it doesn't work well, consider removing the len >= 32 case and trying only the len >= 16 case.
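
For that experiment, a stripped-down variant keeping only the 16-byte (1-accumulator) fold plus the scalar tails could look like the sketch below. The name crc32_armv8_pmull_16only is illustrative, and it assumes the same helpers, macros, and fold constants as the listing above.

Z_INTERNAL Z_TARGET_PMULL uint32_t crc32_armv8_pmull_16only(uint32_t crc, const uint8_t *buf, size_t len) {
    uint32_t crc0 = ~crc;
    /* Align to an 8-byte boundary (scalar). */
    for (; len && ((uintptr_t)buf & 7); --len)
        crc0 = __crc32b(crc0, *buf++);
    /* Bump to a 16-byte boundary with one 8-byte scalar CRC. */
    if (((uintptr_t)buf & 8) && len >= 8) {
        crc0 = __crc32d(crc0, *(const uint64_t *)buf);
        buf += 8;
        len -= 8;
    }
    if (len >= 16) {
        uint64x2_t x0 = vld1q_u64((const uint64_t *)buf);
        uint64x2_t k;
        /* 128-bit fold constants: { x^(128+64) mod P, x^128 mod P } (bit-reflected). */
        { static const uint64_t ALIGNED_(16) k_[] = {0xae689191, 0xccaa009e}; k = vld1q_u64(k_); }
        x0 = veorq_u64((uint64x2_t){crc0, 0}, x0);
        buf += 16;
        len -= 16;
        /* Fold 16 bytes at a time with a single accumulator. */
        while (len >= 16) {
            uint64x2_t next = vld1q_u64((const uint64_t *)buf);
            uint64x2_t t0 = clmul_lo_e(x0, k, next);
            x0 = clmul_hi_e(x0, k, t0);
            buf += 16;
            len -= 16;
        }
        /* Reduce the 128-bit state to 32 bits with the scalar CRC instructions. */
        crc0 = __crc32d(0, vgetq_lane_u64(x0, 0));
        crc0 = __crc32d(crc0, vgetq_lane_u64(x0, 1));
    }
    for (; len >= 8; buf += 8, len -= 8)
        crc0 = __crc32d(crc0, *(const uint64_t *)buf);
    for (; len; --len)
        crc0 = __crc32b(crc0, *buf++);
    return ~crc0;
}

The only change relative to the original is dropping the len >= 32 block; the alignment steps, the 16-byte fold, and the final reduction are unchanged.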
