#include <ranges>
#include <span>
#include <algorithm>
#include <cstdint>
#include <cstring>
#include <limits>
#include <vector>
#include <array>
#include <bit>
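// WASM SIMD registers are 128-bit, so 16-byte alignment suffices under
// Emscripten; native builds align hot buffers to a 64-byte cache line.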
#if defined(__EMSCRIPTEN__) || defined(__EMSCRIPTEN_BUILD__)
#define attribute_alignas alignas(16)
#else
#define attribute_alignas alignas(64)
#endif
#if defined(_MSC_VER)
#define attribute_force_inline __forceinline
#define attribute_restrict __restrict
#elif defined(__clang__) || defined(__GNUC__)
#define attribute_force_inline __attribute__((always_inline)) inline
#define attribute_restrict __restrict__
#else
#define attribute_force_inline inline
#define attribute_restrict
#endif
#if defined(__EMSCRIPTEN__) || defined(__clang__) || defined(__GNUC__)
#define attribute_hot __attribute__((hot))
#define attribute_pure __attribute__((pure))
#define attribute_const __attribute__((const))
#else
#define attribute_hot
#define attribute_pure
#define attribute_const
#endif
namespace blake3 {
namespace detail {
// --- Concepts ---
// Ensures the input is a contiguous range of 1-byte elements.
// Accepts: std::string, std::string_view, std::vector<uint8_t>,
// std::vector<char>, std::span, std::array<uint8_t>, etc.
template <typename T>
concept ByteContiguousRange = std::ranges::contiguous_range<T> &&
(sizeof(std::ranges::range_value_t<T>) == 1);
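// A few illustrative sanity probes for the concept (example types only,
// not part of the API):
static_assert(ByteContiguousRange<std::vector<uint8_t>>);
static_assert(ByteContiguousRange<std::array<char, 4>>);
static_assert(!ByteContiguousRange<std::vector<uint32_t>>);  // 4-byte elements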
// --- Constants & Configuration ---
// 32-bit words per chaining value (8 words = 256 bits)
constexpr size_t k_chaining_value_ints = 8UL;
// Bytes per compression block
constexpr size_t k_block_len_bytes = 64UL;
// Bytes per chunk (16 blocks)
constexpr size_t k_chunk_len_bytes = 1024UL;
// Bytes per 32-bit word
constexpr size_t k_bytes_per_word = sizeof(uint32_t);
// Bits per byte
constexpr size_t k_bits_per_byte = std::numeric_limits<uint8_t>::digits;
// Bytes per chaining value (half of a 64-byte block)
constexpr size_t k_half_block_bytes = 32UL;
enum RotationConstants : uint8_t {
ROT_7 = 7U,
ROT_8 = 8U,
ROT_12 = 12U,
ROT_16 = 16U,
};
// Bitwise constants
enum BitwiseConstants : uint32_t {
SHIFT_8 = 8U,
SHIFT_16 = 16U,
SHIFT_24 = 24U,
SHIFT_32 = 32U,
MASK_64_LOW_32 = 0xFFFFFFFFU,
};
constexpr std::array<uint32_t, k_chaining_value_ints> k_initial_vector = {
0x6A09E667U, 0xBB67AE85U, 0x3C6EF372U, 0xA54FF53AU,
0x510E527FU, 0x9B05688CU, 0x1F83D9ABU, 0x5BE0CD19U};
// Message permutation schedule
constexpr size_t k_block_words_count = 16UL;
constexpr size_t k_msg_schedule_rounds = 7UL;
constexpr std::array<uint8_t, k_msg_schedule_rounds * k_block_words_count>
k_msg_schedule_byte_offsets = {
// Round 0: {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15} * 4
0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60,
// Round 1: {2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8} * 4
8, 24, 12, 40, 28, 0, 16, 52, 4, 44, 48, 20, 36, 56, 60, 32,
// Round 2: {3, 4, 10, 12, 13, 2, 7, 14, 6, 5, 9, 0, 11, 15, 8, 1} * 4
12, 16, 40, 48, 52, 8, 28, 56, 24, 20, 36, 0, 44, 60, 32, 4,
// Round 3: {10, 7, 12, 9, 14, 3, 13, 15, 4, 0, 11, 2, 5, 8, 1, 6} * 4
40, 28, 48, 36, 56, 12, 52, 60, 16, 0, 44, 8, 20, 32, 4, 24,
// Round 4: {12, 13, 9, 11, 15, 10, 14, 8, 7, 2, 5, 3, 0, 1, 6, 4} * 4
48, 52, 36, 44, 60, 40, 56, 32, 28, 8, 20, 12, 0, 4, 24, 16,
// Round 5: {9, 14, 11, 5, 8, 12, 15, 1, 13, 3, 0, 10, 2, 6, 4, 7} * 4
36, 56, 44, 20, 32, 48, 60, 4, 52, 12, 0, 40, 8, 24, 16, 28,
// Round 6: {11, 15, 5, 0, 1, 9, 8, 6, 14, 10, 2, 12, 3, 4, 7, 13} * 4
44, 60, 20, 0, 4, 36, 32, 24, 56, 40, 8, 48, 12, 16, 28, 52};
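// Derivation note: round r+1's word order is round r's order pushed
// through the fixed BLAKE3 permutation
// {2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8}; the indices
// are pre-multiplied by 4 so the hot loop indexes bytes directly.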
enum Flags : uint8_t {
FLAG_CHUNK_INIT = 0U,
FLAG_CHUNK_START = 1U << 0U,
FLAG_CHUNK_END = 1U << 1U,
FLAG_PARENT = 1U << 2U,
FLAG_ROOT = 1U << 3U,
};
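// Flag usage mirrors the BLAKE3 model: a chunk's first block carries
// FLAG_CHUNK_START and its last FLAG_CHUNK_END (both for a single-block
// chunk), interior tree nodes carry FLAG_PARENT, and whichever
// compression produces final output also carries FLAG_ROOT.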
// --- Load / Store Utilities ---
template <typename T>
constexpr auto as_uint8_span(std::span<T> span) -> std::span<const uint8_t> {
auto bytes = std::as_bytes(span);
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-reinterpret-cast)
return {reinterpret_cast<const uint8_t*>(bytes.data()), bytes.size()};
}
attribute_force_inline constexpr auto load_le32(
uint8_t const* const attribute_restrict src) noexcept -> uint32_t {
if consteval {
// Compile-time path (manual shift)
// NOLINTBEGIN(cppcoreguidelines-pro-bounds-pointer-arithmetic)
return static_cast<uint32_t>(src[0]) |
(static_cast<uint32_t>(src[1]) << BitwiseConstants::SHIFT_8) |
(static_cast<uint32_t>(src[2]) << BitwiseConstants::SHIFT_16) |
(static_cast<uint32_t>(src[3]) << BitwiseConstants::SHIFT_24);
// NOLINTEND(cppcoreguidelines-pro-bounds-pointer-arithmetic)
} else {
// Runtime path (safe memcpy)
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
uint32_t word;
// Compiles to a single unaligned load instruction on x86/WASM
std::memcpy(&word, src, sizeof(word));
if constexpr (std::endian::native == std::endian::little) {
return word;  // Little-endian host (WASM is always LE): nothing to swap
} else {
#if __cpp_lib_byteswap >= 202110L
return std::byteswap(word);
#else
// Fallback for C++20 without <bit> byteswap support
return ((word & 0xFF000000U) >> BitwiseConstants::SHIFT_24) |
((word & 0x00FF0000U) >> BitwiseConstants::SHIFT_8) |
((word & 0x0000FF00U) << BitwiseConstants::SHIFT_8) |
((word & 0x000000FFU) << BitwiseConstants::SHIFT_24);
#endif
}
}
}
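// Illustrative compile-time probe of the little-endian contract:
// bytes {0x01, 0x02, 0x03, 0x04} must decode to 0x04030201.
static_assert([] {
std::array<uint8_t, k_bytes_per_word> probe{0x01, 0x02, 0x03, 0x04};
return load_le32(probe.data()) == 0x04030201U;
}());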
constexpr auto store_word_le(std::span<uint8_t> destination,
uint32_t const word) noexcept -> void {
if consteval {
destination[0] = static_cast<uint8_t>(word);
destination[1] = static_cast<uint8_t>(word >> BitwiseConstants::SHIFT_8);
destination[2] = static_cast<uint8_t>(word >> BitwiseConstants::SHIFT_16);
destination[3] = static_cast<uint8_t>(word >> BitwiseConstants::SHIFT_24);
} else {
if constexpr (std::endian::native == std::endian::little) {
std::memcpy(destination.data(), &word, sizeof(word));
} else {
destination[0] = static_cast<uint8_t>(word);
destination[1] = static_cast<uint8_t>(word >> BitwiseConstants::SHIFT_8);
destination[2] = static_cast<uint8_t>(word >> BitwiseConstants::SHIFT_16);
destination[3] = static_cast<uint8_t>(word >> BitwiseConstants::SHIFT_24);
}
}
}
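// Illustrative round-trip check: store_word_le followed by load_le32 is
// the identity on any 32-bit word.
static_assert([] {
std::array<uint8_t, k_bytes_per_word> buffer{};
store_word_le(buffer, 0xA1B2C3D4U);
return load_le32(buffer.data()) == 0xA1B2C3D4U;
}());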
// --- Core Math ---
// NOLINTBEGIN(cppcoreguidelines-pro-bounds-constant-array-index,
// cppcoreguidelines-avoid-magic-numbers)
// Helper for the "G" function logic to ensure strictly inlineable arithmetic
#define MIX(a, b, c, d, x, y) \
do { \
state[a] = state[a] + state[b] + (x); \
state[d] = std::rotr(state[d] ^ state[a], RotationConstants::ROT_16); \
state[c] = state[c] + state[d]; \
state[b] = std::rotr(state[b] ^ state[c], RotationConstants::ROT_12); \
state[a] = state[a] + state[b] + (y); \
state[d] = std::rotr(state[d] ^ state[a], RotationConstants::ROT_8); \
state[c] = state[c] + state[d]; \
state[b] = std::rotr(state[b] ^ state[c], RotationConstants::ROT_7); \
} while (false)
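// MIX is the BLAKE3 (BLAKE2s-style) quarter-round "G": two
// add-xor-rotate passes over one column or diagonal (a, b, c, d) with
// message words x and y, using the rotation schedule 16, 12, 8, 7.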
// Helper to load message words via load_le32.
// A macro forces inlining of the offset calculation.
#define MSG(offset_idx) \
load_le32(message_bytes + k_msg_schedule_byte_offsets[base + offset_idx])
template <size_t RoundIndex>
attribute_force_inline constexpr void round_function(
std::array<uint32_t, k_block_words_count>& state,
uint8_t const* attribute_restrict message_bytes) noexcept {
constexpr size_t base = RoundIndex * k_block_words_count;
// Load all 16 message words into local registers immediately. This lets
// the CPU "fire and forget" 16 independent loads before the heavy
// arithmetic below creates dependency stalls.
auto message_0 = MSG(0);
auto message_1 = MSG(1);
auto message_2 = MSG(2);
auto message_3 = MSG(3);
auto message_4 = MSG(4);
auto message_5 = MSG(5);
auto message_6 = MSG(6);
auto message_7 = MSG(7);
auto message_8 = MSG(8);
auto message_9 = MSG(9);
auto message_10 = MSG(10);
auto message_11 = MSG(11);
auto message_12 = MSG(12);
auto message_13 = MSG(13);
auto message_14 = MSG(14);
auto message_15 = MSG(15);
// Column Step: the compiler can see that these four MIX calls are
// independent and vectorize them into 4-wide SIMD instructions.
MIX(0, 4, 8, 12, message_0, message_1);
MIX(1, 5, 9, 13, message_2, message_3);
MIX(2, 6, 10, 14, message_4, message_5);
MIX(3, 7, 11, 15, message_6, message_7);
// Diagonal Step
MIX(0, 5, 10, 15, message_8, message_9);
MIX(1, 6, 11, 12, message_10, message_11);
MIX(2, 7, 8, 13, message_12, message_13);
MIX(3, 4, 9, 14, message_14, message_15);
}
#undef MSG
#undef MIX
// NOLINTEND(cppcoreguidelines-pro-bounds-constant-array-index,
// cppcoreguidelines-avoid-magic-numbers)
// NOLINTBEGIN(cppcoreguidelines-pro-type-member-init, hicpp-member-init,
// cppcoreguidelines-pro-bounds-constant-array-index,
// cppcoreguidelines-pro-type-reinterpret-cast,
// cppcoreguidelines-avoid-magic-numbers)
// Template parameter 'IsFullBlock' allows the compiler to
// completely delete the padding logic in the 'true' case.
template <bool IsFullBlock>
attribute_hot constexpr auto compress_block(
std::array<uint32_t, k_chaining_value_ints> const& chaining_value,
std::span<uint8_t const> block, uint64_t const counter, Flags const flags,
std::span<uint32_t, k_block_words_count> destination) noexcept -> void {
// NOLINTBEGIN(cppcoreguidelines-pro-type-member-init)
attribute_alignas std::array<uint32_t, k_block_words_count> state;
// We only initialize the padding buffer if we are in the "Safe Path".
// In the "Fast Path" (IsFullBlock=true), this variable is optimized away.
attribute_alignas std::array<uint8_t, k_block_len_bytes> padded_block_storage;
// Default to zero-copy: point directly to input data
uint8_t const* message_ptr = block.data();
// --- 1. Initialize State ---
if consteval {
std::copy_n(chaining_value.begin(), k_chaining_value_ints, state.begin());
} else {
// Vectorized initialization of the first eight state words
std::memcpy(state.data(), chaining_value.data(), sizeof(chaining_value));
}
// Use constants directly for IV to avoid memory load latency
state[8] = k_initial_vector[0];
state[9] = k_initial_vector[1];
state[10] = k_initial_vector[2];
state[11] = k_initial_vector[3];
// Immediately follow with dynamic parameters
state[12] = static_cast<uint32_t>(counter);
state[13] = static_cast<uint32_t>(counter >> 32);
state[14] = static_cast<uint32_t>(block.size());
state[15] = static_cast<uint32_t>(flags);
// --- 2. Handle Padding (Compile-Time Switch) ---
if constexpr (!IsFullBlock) {
if (block.size() < k_block_len_bytes) {
if consteval {
std::ranges::copy(block, padded_block_storage.begin());
std::ranges::fill(std::span(padded_block_storage).subspan(block.size()),
uint8_t{});
} else {
std::memcpy(padded_block_storage.data(), block.data(), block.size());
std::memset(padded_block_storage.data() + block.size(), uint8_t{},
k_block_len_bytes - block.size());
}
message_ptr = padded_block_storage.data();
}
}
// --- 3. Rounds ---
// Fully unrolled rounds; each round's message permutation is baked into
// the byte-offset table above.
round_function<0>(state, message_ptr);
round_function<1>(state, message_ptr);
round_function<2>(state, message_ptr);
round_function<3>(state, message_ptr);
round_function<4>(state, message_ptr);
round_function<5>(state, message_ptr);
round_function<6>(state, message_ptr);
// --- 4. Finalize ---
#pragma clang loop unroll(full)
for (size_t idx{}; idx < 8U; ++idx) {
state[idx] ^= state[idx + 8U];
state[idx + 8] ^= chaining_value[idx];
}
if consteval {
std::ranges::copy(state, destination.begin());
} else {
std::memcpy(destination.data(), state.data(), sizeof(state));
}
// NOLINTEND(cppcoreguidelines-pro-type-member-init)
}
// NOLINTEND(cppcoreguidelines-pro-type-member-init, hicpp-member-init,
// cppcoreguidelines-pro-bounds-constant-array-index,
// cppcoreguidelines-pro-type-reinterpret-cast,
// cppcoreguidelines-avoid-magic-numbers)
// --- Output Class ---
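// output_t captures the inputs of the final (root) compression so the
// XOF can replay it: output block i is compress(cv, block, i,
// flags | FLAG_ROOT), truncated to however many bytes remain.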
class output_t {
attribute_alignas std::array<uint32_t, k_chaining_value_ints>
_input_chaining_value{};
attribute_alignas std::array<uint8_t, k_chunk_len_bytes> _block_storage{};
size_t _block_len_bytes{};
uint8_t _flags{};
public:
constexpr output_t(
std::array<uint32_t, k_chaining_value_ints> const& chaining_value,
std::span<uint8_t const> block_bytes, uint8_t flags)
: _input_chaining_value{chaining_value},
_block_len_bytes(block_bytes.size()),
_flags(flags) {
std::ranges::copy(block_bytes, this->_block_storage.begin());
}
constexpr auto fill_destination(std::span<uint8_t> destination) const noexcept
-> void {
size_t const current_len = destination.size();
size_t output_block_counter{};
size_t offset_pos{};
// OPTIMIZATION: Reusable, aligned scratch buffer for compressor output.
// This avoids reconstructing the array object on every loop iteration.
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init, hicpp-member-init)
attribute_alignas std::array<uint32_t, k_block_words_count>
compress_scratch;
while (offset_pos < current_len) {
std::span<uint8_t const> const block_view{this->_block_storage.data(),
this->_block_len_bytes};
compress_block<false>(
this->_input_chaining_value, block_view, output_block_counter++,
Flags(this->_flags | Flags::FLAG_ROOT), compress_scratch);
size_t const remaining_bytes = current_len - offset_pos;
size_t const chunk_size = (remaining_bytes < k_block_len_bytes)
? remaining_bytes
: k_block_len_bytes;
// NOLINTBEGIN(cppcoreguidelines-pro-bounds-constant-array-index,
// cppcoreguidelines-pro-bounds-pointer-arithmetic)
if consteval {
// Keep the shift logic for Compile-Time (constexpr)
#pragma clang loop unroll(full)
for (size_t index{}; index < chunk_size; ++index) {
auto const word_index = index / k_bytes_per_word;
auto const byte_shift =
k_bits_per_byte * (index & (k_bytes_per_word - 1UL));
auto const& word = compress_scratch[word_index];
destination[offset_pos + index] =
static_cast<uint8_t>(word >> byte_shift);
}
} else {
// Runtime Optimization:
// Since WASM is Little Endian, the uint32 array is already
// laid out as the correct byte sequence.
if constexpr (std::endian::native == std::endian::little) {
std::memcpy(destination.data() + offset_pos, compress_scratch.data(),
chunk_size);
} else {
// Fallback for Big Endian architectures (rare)
for (size_t index{}; index < chunk_size; ++index) {
auto const word_index = index / k_bytes_per_word;
auto const byte_shift =
k_bits_per_byte * (index & (k_bytes_per_word - 1UL));
destination[offset_pos + index] = static_cast<uint8_t>(
compress_scratch[word_index] >> byte_shift);
}
}
}
// NOLINTEND(cppcoreguidelines-pro-bounds-constant-array-index,
// cppcoreguidelines-pro-bounds-pointer-arithmetic)
offset_pos += chunk_size;
}
}
template <size_t N>
[[nodiscard]] constexpr auto extract_array() const noexcept
-> std::array<uint8_t, N> {
std::array<uint8_t, N> result_array{};
fill_destination(result_array);
return result_array;
}
};
// --- Chunk State Class ---
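// chunk_state_t incrementally absorbs up to 1024 bytes, compressing one
// 64-byte block at a time; a chunk's final block is held in _buffer so
// it can carry FLAG_CHUNK_END when the output is requested.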
class chunk_state_t {
attribute_alignas std::array<uint32_t, k_chaining_value_ints>
_chaining_value{};
attribute_alignas std::array<uint8_t, k_chunk_len_bytes> _buffer{};
uint64_t _chunk_counter{};
size_t _buffer_len_bytes{};
uint8_t _blocks_compressed_count{};
Flags _flags{};
public:
constexpr chunk_state_t(
std::array<uint32_t, k_chaining_value_ints> const& key,
uint64_t chunk_counter, Flags flags = Flags::FLAG_CHUNK_INIT)
: _chaining_value{key}, _chunk_counter(chunk_counter), _flags(flags) {}
[[nodiscard]] constexpr auto total_len() const noexcept -> size_t {
return (static_cast<size_t>(this->_blocks_compressed_count) *
k_block_len_bytes) +
this->_buffer_len_bytes;
}
[[nodiscard]] constexpr auto get_start_flag() const noexcept -> uint8_t {
return this->_blocks_compressed_count == uint8_t{} ? Flags::FLAG_CHUNK_START
: uint8_t{};
}
template <ByteContiguousRange Range>
constexpr auto update(Range const& input_data) noexcept -> void {
// --- Setup ---
auto const* data_ptr = std::ranges::data(input_data);
size_t input_len = std::ranges::size(input_data);
// For a ByteContiguousRange, size() equals the byte length because
// sizeof(T) == 1; 'offset' tracks how many bytes have been consumed.
size_t offset = 0;
// OPTIMIZATION: Reusable, aligned scratch buffer.
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init, hicpp-member-init)
attribute_alignas std::array<uint32_t, k_block_words_count>
compress_scratch;
// --- 1. Fill Buffer (Partial Data) ---
if (this->_buffer_len_bytes > 0) {
size_t const needed = k_block_len_bytes - this->_buffer_len_bytes;
size_t const take = (needed < input_len) ? needed : input_len;
auto dest_span =
std::span{this->_buffer}.subspan(this->_buffer_len_bytes, take);
if consteval {
// Safe for char/uint8_t mismatch: std::copy_n casts elements
// individually
std::copy_n(data_ptr + offset, take, dest_span.begin());
} else {
// Runtime fast path
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-reinterpret-cast)
auto const* byte_ptr = reinterpret_cast<uint8_t const*>(data_ptr);
// NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
std::memcpy(dest_span.data(), byte_ptr + offset, take);
}
this->_buffer_len_bytes += take;
offset += take;
input_len -= take;
// Flush the full buffer only when more input follows: a chunk's final
// block must stay buffered so it can carry FLAG_CHUNK_END later.
if (this->_buffer_len_bytes == k_block_len_bytes && input_len > 0) {
std::span<uint8_t const> const block_view{this->_buffer.data(),
k_block_len_bytes};
compress_block<true>(
this->_chaining_value, block_view, this->_chunk_counter,
Flags(this->_flags | get_start_flag()), compress_scratch);
if consteval {
std::copy_n(compress_scratch.begin(), k_chaining_value_ints,
this->_chaining_value.begin());
} else {
std::memcpy(this->_chaining_value.data(), compress_scratch.data(),
k_chaining_value_ints * k_bytes_per_word);
}
this->_blocks_compressed_count++;
this->_buffer_len_bytes = 0UL;
}
}
// --- 2. Fast Path (Full Blocks) ---
// Strictly '>' so the last full block is held back in the buffer below;
// it may be the chunk's final block and must be able to carry
// FLAG_CHUNK_END.
while (input_len > k_block_len_bytes) [[likely]] {
if consteval {
// CONSTEXPR PATH
// We cannot reinterpret_cast. We must check types.
using ValT = std::ranges::range_value_t<Range>;
if constexpr (std::is_same_v<ValT, uint8_t>) {
// Input is ALREADY uint8_t. Zero-copy is safe.
auto const* ptr = data_ptr + offset;
compress_block<true>(this->_chaining_value, {ptr, k_block_len_bytes},
this->_chunk_counter,
Flags(this->_flags | get_start_flag()),
compress_scratch);
std::copy_n(compress_scratch.begin(), k_chaining_value_ints,
this->_chaining_value.begin());
} else {
// Input is char/signed char. We CANNOT cast pointer to uint8_t*.
// We MUST copy to a temporary buffer first to type-pun safely.
// Note: We use _buffer as a temporary scratchpad since it's empty
// here (loop condition).
std::copy_n(data_ptr + offset, k_block_len_bytes,
this->_buffer.begin());
compress_block<true>(
this->_chaining_value, {this->_buffer.data(), k_block_len_bytes},
this->_chunk_counter, Flags(this->_flags | get_start_flag()),
compress_scratch);
std::copy_n(compress_scratch.begin(), k_chaining_value_ints,
this->_chaining_value.begin());
}
} else {
// RUNTIME PATH (Zero-Copy for ALL types via reinterpret_cast)
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-reinterpret-cast)
auto const* byte_ptr = reinterpret_cast<uint8_t const*>(data_ptr);
compress_block<true>(
// NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
this->_chaining_value, {byte_ptr + offset, k_block_len_bytes},
this->_chunk_counter, Flags(this->_flags | get_start_flag()),
compress_scratch);
std::memcpy(this->_chaining_value.data(), compress_scratch.data(),
k_chaining_value_ints * k_bytes_per_word);
}
this->_blocks_compressed_count++;
offset += k_block_len_bytes;
input_len -= k_block_len_bytes;
}
// --- 3. Buffer the Tail (Partial or Held-Back Block) ---
if (input_len > 0) {
if consteval {
std::copy_n(data_ptr + offset, input_len, this->_buffer.begin());
} else {
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-reinterpret-cast)
auto const* byte_ptr = reinterpret_cast<uint8_t const*>(data_ptr);
// NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
std::memcpy(this->_buffer.data(), byte_ptr + offset, input_len);
}
this->_buffer_len_bytes = input_len;
}
}
[[nodiscard]] constexpr auto output_chaining_value() const noexcept
-> std::array<uint32_t, k_block_words_count> {
auto const current_flags =
Flags(static_cast<unsigned>(this->_flags) |
static_cast<unsigned>(get_start_flag()) |
static_cast<unsigned>(Flags::FLAG_CHUNK_END));
std::span<uint8_t const> block_view{_buffer.data(),
this->_buffer_len_bytes};
// OPTIMIZATION: aligned scratch buffer for the compressor output.
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init, hicpp-member-init)
attribute_alignas std::array<uint32_t, k_block_words_count>
compress_scratch;
compress_block<false>(this->_chaining_value, block_view,
this->_chunk_counter, current_flags,
compress_scratch);
return compress_scratch;
}
struct root_props_t {
attribute_alignas std::array<uint32_t, k_chaining_value_ints>
chaining_values{};
std::span<uint8_t const> block;
uint64_t counter{};
uint8_t flags{};
};
[[nodiscard]] constexpr auto get_root_props() const noexcept -> root_props_t {
return {.chaining_values = this->_chaining_value,
.block = std::span<uint8_t const>{_buffer.data(),
this->_buffer_len_bytes},
.counter = this->_chunk_counter,
.flags = static_cast<uint8_t>(
static_cast<unsigned>(_flags) |
static_cast<unsigned>(get_start_flag()) |
static_cast<unsigned>(Flags::FLAG_CHUNK_END))};
}
};
// --- Hasher Class ---
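// hasher_t maintains the BLAKE3 binary tree with a subtree stack: each
// finished chunk's chaining value merges with stack entries once per
// trailing one-bit of the chunk count so far, exactly like carry
// propagation in a binary counter.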
class hasher_t {
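// 2^64 max input bytes / 2^10 bytes per chunk = at most 2^54 chunks, so
// the merge stack never holds more than 54 pending chaining values.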
static constexpr size_t k_max_stack_depth = 54;
attribute_alignas std::array<uint32_t, k_chaining_value_ints> _key{};
attribute_alignas
std::array<std::array<uint32_t, k_chaining_value_ints>, k_max_stack_depth>
_chaining_value_stack{};
chunk_state_t _chunk_state;
uint8_t _stack_len{};
uint64_t _total_chunks_count{};
constexpr void push_stack(
// NOLINTBEGIN(cppcoreguidelines-pro-bounds-constant-array-index,
// cppcoreguidelines-pro-bounds-pointer-arithmetic)
std::array<uint32_t, k_block_words_count> const& chunk_output) {
uint64_t current_chunks = this->_total_chunks_count;
attribute_alignas std::array<uint32_t, k_chaining_value_ints>
right_chaining_value;
// OPTIMIZATION: Reusable, aligned scratch buffer for compressor output.
// This avoids reconstructing the array object on every loop iteration.
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init, hicpp-member-init)
attribute_alignas std::array<uint32_t, k_block_words_count>
compress_scratch;
if consteval {
std::copy_n(chunk_output.begin(), k_chaining_value_ints,
right_chaining_value.begin());
} else {
std::memcpy(right_chaining_value.data(), chunk_output.data(),
k_chaining_value_ints * k_bytes_per_word);
}
while ((current_chunks & 1U) != 0U) {
auto const& left_chaining_value =
this->_chaining_value_stack[--this->_stack_len];
attribute_alignas std::array<uint8_t, k_block_len_bytes> parent_block;
if consteval {
std::span<uint8_t> const block_span{parent_block};
for (size_t idx{}; idx < k_chaining_value_ints; ++idx) {
store_word_le(
block_span.subspan(k_bytes_per_word * idx, k_bytes_per_word),
left_chaining_value[idx]);
store_word_le(
block_span.subspan(k_half_block_bytes + (k_bytes_per_word * idx),
k_bytes_per_word),
right_chaining_value[idx]);
}
} else {
std::memcpy(parent_block.data(), left_chaining_value.data(),
k_half_block_bytes);
std::memcpy(parent_block.data() + k_half_block_bytes,
right_chaining_value.data(), k_half_block_bytes);
}
compress_block<true>(this->_key, parent_block, 0U, Flags::FLAG_PARENT,
compress_scratch);
if consteval {
std::copy_n(compress_scratch.begin(), k_chaining_value_ints,
right_chaining_value.begin());
} else {
std::memcpy(right_chaining_value.data(), compress_scratch.data(),
k_chaining_value_ints * k_bytes_per_word);
}
current_chunks >>= 1U;
}
this->_chaining_value_stack[this->_stack_len++] = right_chaining_value;
this->_total_chunks_count++;
// NOLINTEND(cppcoreguidelines-pro-bounds-constant-array-index,
// cppcoreguidelines-pro-bounds-pointer-arithmetic)
}
public:
constexpr hasher_t()
: _key{k_initial_vector},
_chunk_state(k_initial_vector, 0U, Flags::FLAG_CHUNK_INIT) {}
// Generic update accepting any ByteContiguousRange
template <ByteContiguousRange Range>
constexpr void update(Range const& input_data) noexcept {
size_t taken_bytes{};
auto const* data_ptr = std::ranges::data(input_data);
size_t remaining_len = std::ranges::size(input_data);
while (remaining_len > 0) {
if (this->_chunk_state.total_len() == k_chunk_len_bytes) {
auto const chunk_chaining_value =
this->_chunk_state.output_chaining_value();
push_stack(chunk_chaining_value);
// push_stack() has already counted the finished chunk, so the completed
// count is exactly the next chunk's counter.
this->_chunk_state = chunk_state_t(
this->_key, this->_total_chunks_count, Flags::FLAG_CHUNK_INIT);
}
size_t const needed = k_chunk_len_bytes - this->_chunk_state.total_len();
size_t const can_take = (needed < remaining_len) ? needed : remaining_len;
// Create a span subview for the specific chunk
// We rely on std::span deduction guide to handle the ptr + len
std::span const chunk_view(data_ptr + taken_bytes, can_take);
this->_chunk_state.update(chunk_view);
taken_bytes += can_take;
remaining_len -= can_take;
}
}
// NOLINTBEGIN(cppcoreguidelines-avoid-non-const-global-variables,
// cppcoreguidelines-pro-bounds-constant-array-index)
[[nodiscard]] constexpr auto finalize() const noexcept -> output_t {
// 1. Single-chunk case: the entire input fit in the first chunk
if (this->_total_chunks_count == 0UL) {
auto const props = this->_chunk_state.get_root_props();
return {props.chaining_values, props.block, props.flags};
}
// Prepare the common buffers
attribute_alignas std::array<uint8_t, k_block_len_bytes> parent_block;
// OPTIMIZATION: Reusable scratch buffer.
attribute_alignas std::array<uint32_t, k_block_words_count>
compress_scratch;
// Initialize the "Right Child" directly into the parent block
// (bytes 32-63), extracting the chaining value from the current chunk
// state.
auto const current_output = this->_chunk_state.output_chaining_value();
if consteval {
// Direct write to the second half of the block
std::span<uint8_t> block_span{parent_block};
for (size_t idx{}; idx < k_chaining_value_ints; ++idx) {
store_word_le(
block_span.subspan(k_half_block_bytes + (k_bytes_per_word * idx),
k_bytes_per_word),
current_output[idx]);
}
} else {
std::memcpy(parent_block.data() + k_half_block_bytes,
current_output.data(), k_half_block_bytes);
}
// Reduce the Stack
size_t current_stack_idx = this->_stack_len;
while (current_stack_idx > 0) {
// A. Pop the "Left Child" from the stack
auto const& left_chaining_value =
this->_chaining_value_stack[--current_stack_idx];
// B. Copy Left Child into parent_block (Bytes 0-31)
if consteval {
std::span<uint8_t> block_span{parent_block};
for (size_t idx{}; idx < k_chaining_value_ints; ++idx) {
store_word_le(
block_span.subspan(k_bytes_per_word * idx, k_bytes_per_word),
left_chaining_value[idx]);
}
} else {
std::memcpy(parent_block.data(), left_chaining_value.data(),
k_half_block_bytes);
}
// Check if we are done
// If we just popped the last item, 'parent_block' now contains
// [Left Child | Right Child]. This is exactly what the Root Output needs.
// We break BEFORE compressing, because output_t handles the final
// compression.
if (current_stack_idx == 0UL) {
break;
}
// Compress the Parent
// This produces a new chaining value for the next level up.
compress_block<true>(this->_key, parent_block, 0UL, Flags::FLAG_PARENT,
compress_scratch);
// Update the "Right Child" for the next iteration (Bytes 32-63)
// OPTIMIZATION: Write directly from scratch to parent_block right half.
if consteval {
std::span<uint8_t> block_span{parent_block};
for (size_t idx{}; idx < k_chaining_value_ints; ++idx) {
store_word_le(
block_span.subspan(k_half_block_bytes + (k_bytes_per_word * idx),
k_bytes_per_word),
compress_scratch[idx]);
}
} else {
std::memcpy(parent_block.data() + k_half_block_bytes,
compress_scratch.data(), k_half_block_bytes);
}
}
return {this->_key, parent_block, Flags::FLAG_PARENT};
}
};
// NOLINTEND(cppcoreguidelines-avoid-non-const-global-variables,
// cppcoreguidelines-pro-bounds-constant-array-index)
} // namespace detail
constexpr auto hash_default_size = 32UL;
// 1. Compile-time constant size (Templated input)
template <size_t OutputSize = hash_default_size,
detail::ByteContiguousRange Range>
constexpr auto hash(Range const& input) noexcept
-> std::array<uint8_t, OutputSize> {
detail::hasher_t hasher_obj;
hasher_obj.update(input);
return hasher_obj.finalize().template extract_array<OutputSize>();
}
// 2. Runtime variable size (Templated input)
template <detail::ByteContiguousRange Range>
inline auto hash(Range const& input, size_t const num_bytes) noexcept
-> std::vector<uint8_t> {
detail::hasher_t hasher_obj;
hasher_obj.update(input);
std::vector<uint8_t> output_buffer(num_bytes, uint8_t{});
hasher_obj.finalize().fill_destination(output_buffer);
return output_buffer;
}
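// Usage sketch (illustrative): both overloads read the same XOF root
// output, so longer digests are simply wider reads.
//   auto h32 = blake3::hash("hello"sv);      // 32 bytes, compile-time size
//   auto h64 = blake3::hash<64>("hello"sv);  // 64 bytes, compile-time size
//   auto dyn = blake3::hash("hello"sv, 128); // 128 bytes, runtime size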
} // namespace blake3
// --- Verification ---
#include <print>
#include <string_view>
#include "blake3.hpp"
constexpr size_t hash_test_size = 32UL;
constexpr std::array<uint8_t, hash_test_size> k_expected_hello = {
0xea, 0x8f, 0x16, 0x3d, 0xb3, 0x86, 0x82, 0x92, 0x5e, 0x44, 0x91,
0xc5, 0xe5, 0x8d, 0x4b, 0xb3, 0x50, 0x6e, 0xf8, 0xc1, 0x4e, 0xb7,
0x8a, 0x86, 0xe9, 0x08, 0xc5, 0x62, 0x4a, 0x67, 0x20, 0x0f};
constexpr std::array<uint8_t, hash_test_size> k_expected_empty = {
0xaf, 0x13, 0x49, 0xb9, 0xf5, 0xf9, 0xa1, 0xa6, 0xa0, 0x40, 0x4d,
0xea, 0x36, 0xdc, 0xc9, 0x49, 0x9b, 0xcb, 0x25, 0xc9, 0xad, 0xc1,
0x12, 0xb7, 0xcc, 0x9a, 0x93, 0xca, 0xe4, 0x1f, 0x32, 0x62};
template <size_t HashSize = blake3::hash_default_size>
consteval auto check_hash(std::string_view input_str,
std::array<uint8_t, HashSize> const& expected_bytes)
-> bool {
auto result_bytes = blake3::hash(input_str);
for (size_t index{}; index < HashSize; ++index) {
if (result_bytes.at(index) != expected_bytes.at(index)) {
return false;
}
}
return true;
}
static_assert(check_hash("hello", k_expected_hello),
"Hash of 'hello' failed CTFE check");
static_assert(check_hash("", k_expected_empty),
"Hash of empty string failed CTFE check");
using namespace std::literals;
// clang++ -O3 --std=c++23 main-blake.cpp && ./a.out
auto main() -> int {
auto constexpr hash_val = blake3::hash("hello"sv);
std::print("Input: \"hello\"\n");
std::println("BLAKE3 Hash: size = {} ", hash_val.size());
for (uint8_t const byte_val : hash_val) {
std::print("0x{:02x}, ", byte_val);
}
std::println();
if (hash_val == k_expected_hello) {
std::println("Status: Verified ✅");
} else {
std::println("Status: Failed ❌");
}
return 0;
}