Skip to content

Instantly share code, notes, and snippets.

@Rexicon226
Last active February 4, 2026 19:49
Show Gist options
  • Select an option

  • Save Rexicon226/17727a7f8365144b20c74b0457f2f9b5 to your computer and use it in GitHub Desktop.

Select an option

Save Rexicon226/17727a7f8365144b20c74b0457f2f9b5 to your computer and use it in GitHub Desktop.
Dead simple KangarooTwelve-128 implementation
#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include <stdlib.h>
// Chunk size for tree hashing
#define CHUNK_SIZE 8192
#define ALIGN __attribute__ ((aligned (128)))
#define MODE 128
#if MODE==128
#define CV_SIZE 32
#define RATE 168
#elif
#error "TODO: kt256"
#endif
/* Helper functions */
typedef unsigned char uchar;
typedef unsigned int uint;
typedef unsigned long ulong;
static inline uint64_t rotl(uint64_t x, unsigned int n) {
return (x << n) | (x >> (-n & 63));
}
static inline uint64_t load_u64(const uchar *p) {
uint64_t v;
memcpy(&v, p, sizeof(v));
return v;
}
static inline void store_u64( uint64_t v, uchar *p ) {
memcpy(p, &v, sizeof(v));
}
#define min(x, y) (x<y) ? x : y
typedef struct re {
uchar bytes[9];
uchar len;
} re_t;
void kt_right_encode(re_t *r, ulong x) {
if (x == 0) {
r->bytes[0] = 0;
r->len = 1;
return;
}
uchar temp[9];
ulong len = 0;
for (; x > 0; x /= 256) {
temp[len++] = x % 256;
}
for (int i = 0; i < len; i++) {
r->bytes[i] = temp[len - 1 - i];
}
r->bytes[len] = len;
r->len = len + 1;
}
typedef struct {
const uchar *ptr;
ulong len;
} slice_t;
typedef struct {
slice_t slices[3];
ulong offsets[4];
} msv_t;
void msv_init( msv_t *r,
const uchar *s1, ulong s1len,
const uchar *s2, ulong s2len,
const uchar *s3, ulong s3len ) {
r->slices[0] = (slice_t){s1, s1len};
r->slices[1] = (slice_t){s2, s2len};
r->slices[2] = (slice_t){s3, s3len};
r->offsets[0] = 0;
r->offsets[1] = s1len;
r->offsets[2] = s1len + s2len;
r->offsets[3] = s1len + s2len + s3len;
}
ulong msv_len( const msv_t *v ) {
return v->offsets[3];
}
uchar msv_byte( const msv_t *v, ulong pos ) {
for (int i = 0; i<3; i++) {
if (pos >= v->offsets[i] && pos < v->offsets[i + 1]) {
return *(v->slices[i].ptr + (pos - v->offsets[i]));
}
}
return 0; // impossible to reach
// __builtin_unreachable();
}
const uchar *msv_get_range( const msv_t *v, ulong start, ulong end ) {
for (int i = 0; i<3; i++) {
if (start >= v->offsets[i] && end <= v->offsets[i + 1]) {
return v->slices[i].ptr + (start - v->offsets[i]);
}
}
return NULL;
}
void msv_copy_range( const msv_t *v, ulong start, ulong end, uchar *dst ) {
ulong pos = 0;
for (ulong i = start; i<end; i++) {
dst[pos] = msv_byte( v, i );
pos += 1;
}
}
// Round constants for Keccak-p[1600,12]
uint64_t RC[12] = {
0x000000008000808B,
0x800000000000008B,
0x8000000000008089,
0x8000000000008003,
0x8000000000008002,
0x8000000000000080,
0x000000000000800A,
0x800000008000000A,
0x8000000080008081,
0x8000000000008080,
0x0000000080000001,
0x8000000080008008,
};
#define KECCAK_PERMUTE keccak_perm_scalar
#define vec_t uint64_t
#define xor(a,b) ((a) ^ (b))
#define and(a,b) ((a) & (b))
#define not(a) (~(a))
#define rotlv(a,n) rotl(a,n)
#define splat(x) (x)
#define lanes 1
#define R(x,y,r) \
do { \
const int dst = (y) + 5 * ((2*(x) + 3*(y)) % 5); \
B[dst] = rotlv(state[(x) + 5*(y)], (r)); \
} while (0)
static inline void KECCAK_PERMUTE(vec_t state[25]) {
for (int round=0; round<12; round++) {
vec_t C[5], D[5];
for (int x = 0; x < 5; x++) {
C[x] = xor(
xor(state[x], state[x+5]),
xor(state[x+10], xor(state[x+15], state[x+20]))
);
}
for (int x = 0; x < 5; x++) {
D[x] = xor(C[(x + 4) % 5], rotlv(C[(x + 1) % 5], 1));
}
for (int i = 0; i < 25; i++) {
state[i] = xor(state[i], D[i % 5]);
}
vec_t B[25];
R(0,0, 0); R(1,0, 1); R(2,0, 62); R(3,0, 28); R(4,0, 27);
R(0,1, 36); R(1,1, 44); R(2,1, 6); R(3,1, 55); R(4,1, 20);
R(0,2, 3); R(1,2, 10); R(2,2, 43); R(3,2, 25); R(4,2, 39);
R(0,3, 41); R(1,3, 45); R(2,3, 15); R(3,3, 21); R(4,3, 8);
R(0,4, 18); R(1,4, 2); R(2,4, 61); R(3,4, 56); R(4,4, 14);
for (int y = 0; y < 5; y++) {
vec_t t0 = B[5 * y + 0];
vec_t t1 = B[5 * y + 1];
vec_t t2 = B[5 * y + 2];
vec_t t3 = B[5 * y + 3];
vec_t t4 = B[5 * y + 4];
state[5 * y + 0] = xor(t0, and(not(t1), t2));
state[5 * y + 1] = xor(t1, and(not(t2), t3));
state[5 * y + 2] = xor(t2, and(not(t3), t4));
state[5 * y + 3] = xor(t3, and(not(t4), t0));
state[5 * y + 4] = xor(t4, and(not(t0), t1));
}
state[0] = xor(state[0], splat(RC[round]));
}
}
typedef union keccak_state {
uchar bytes[200];
uint64_t limbs[25];
} keccak_state_t;
void kt_turbo_shake( const msv_t *v, uchar seperation, uchar *out, ulong outlen) {
keccak_state_t state = { 0 };
ulong state_pos = 0;
// Absorb all bytes from the MSV
ulong total = msv_len( v );
ulong position = 0;
while (position < total) {
state.bytes[state_pos] ^= msv_byte( v, position );
state_pos += 1;
position += 1;
if (state_pos == RATE) {
keccak_perm_scalar( state.limbs );
state_pos = 0;
}
}
// Add seperation and padding
state.bytes[state_pos] ^= seperation;
state.bytes[RATE - 1] ^= 0x80;
keccak_perm_scalar( state.limbs );
// Squeeze
ulong offset = 0;
while (offset < outlen) {
ulong chunk = min( RATE, outlen - offset );
memcpy(out + offset, state.bytes, chunk);
offset += chunk;
if (offset < outlen) {
keccak_perm_scalar( state.limbs );
}
}
}
// Turbo-shake state
typedef struct ts {
uchar delim;
ulong offset;
uchar buffer[RATE];
uint64_t state[25];
} ts_t;
void ts_init( ts_t *r, uchar delim ) {
r->delim = delim;
r->offset = 0;
memset(r->state, 0, sizeof(r->state));
}
void ts_add_bytes( ts_t *t, const uchar *msg, ulong len ) {
ulong i = 0;
for (; i + 8 <= len; i += 8) {
t->state[i / 8] ^= load_u64( msg + i );
}
if (i < len) {
uchar padded[8] = { 0 };
memcpy(padded, msg + i, len - i);
t->state[i / 8] ^= load_u64( padded );
}
}
void ts_add_byte( ts_t *t, uchar byte, ulong offset ) {
ulong z = 8 * (offset % 8);
t->state[offset / 8] ^= ((uint64_t)byte) << z;
}
/* Extract the first bytes from the Keccak state */
void ts_extract_bytes( ts_t *t, uchar *out, ulong outlen ) {
ulong i = 0;
for (; i + 8 <= outlen; i += 8) {
store_u64( t->state[i / 8], out + i );
}
if (i < outlen) {
uchar padded[8];
store_u64( t->state[i / 8], padded );
memcpy( out + i, padded, outlen - i );
}
}
/* Absorb bytes into the TurboSHAKE state */
void ts_update( ts_t *t, const uchar *msg, ulong len ) {
ulong i = 0;
if (t->offset > 0) {
ulong left = min(RATE - t->offset, len);
memcpy(t->buffer + t->offset, msg, left);
t->offset += left;
if (left == len) return;
if (t->offset == RATE) {
ts_add_bytes( t, t->buffer, RATE );
keccak_perm_scalar( t->state );
t->offset = 0;
}
i = left;
}
for (; i + RATE < len; i += RATE) {
ts_add_bytes( t, msg + i, RATE );
keccak_perm_scalar( t->state );
}
ulong left = len - i;
if (left > 0) memcpy(t->buffer, msg + i, left);
t->offset = left;
}
/* Squeeze bytes from the TurboSHAKE state */
void ts_squeeze( ts_t *t, uchar *out, ulong outlen ) {
ulong i = 0;
if (t->offset == RATE) {
keccak_perm_scalar( t->state );
} else if (t->offset > 0) {
uchar buffer[RATE];
ts_extract_bytes( t, buffer, RATE );
ulong left = min(RATE - t->offset, outlen);
memcpy(out, buffer + t->offset, left);
if (left == outlen) return;
if (t->offset == RATE) {
t->offset = 0;
keccak_perm_scalar( t->state );
}
i = left;
}
for (; i + RATE < outlen; i += RATE) {
ts_extract_bytes( t, out + i, RATE );
keccak_perm_scalar( t->state );
}
ulong left = outlen - i;
if (left > 0) {
ts_extract_bytes( t, out + i, left );
}
t->offset = left;
}
/* Sets the hash of the absorbed bytes */
void ts_final( ts_t *t, uchar *out, ulong outlen ) {
// Mark the end of the input
ts_add_bytes( t, t->buffer, t->offset );
if (t->offset == RATE) {
keccak_perm_scalar( t->state );
t->offset = 0;
}
ts_add_byte( t, t->delim, t->offset );
ts_add_byte( t, 0x80, RATE - 1 );
keccak_perm_scalar( t->state );
t->offset = 0;
// Squeeze into out from the state.
ulong offset = 0;
ulong full_blocks = outlen - (outlen % RATE);
if (full_blocks > 0) {
ts_squeeze( t, out, full_blocks );
offset += full_blocks;
}
ulong remaining = outlen - offset;
if (remaining > 0) {
uchar buffer[RATE];
ts_squeeze( t, buffer, RATE );
memcpy(out + offset, buffer, remaining);
}
}
// KT
void kt_hash(const uchar *msg, ulong msglen, uchar *out, ulong outlen) {
// Right-encode customization length
re_t custom_len_enc[1];
kt_right_encode(custom_len_enc, 0);
msv_t view[1];
msv_init(
view,
msg, msglen, /* s1 */
NULL, 0, /* s2 */
custom_len_enc->bytes, custom_len_enc->len /* s3 */
);
ulong total_length = msv_len( view );
// If the input fits into a chunk, we can immediately absorb it.
if (total_length <= CHUNK_SIZE) {
kt_turbo_shake( view, 0x07, out, outlen );
return;
}
// Tree mode
// 1. Create a TurboSHAKE state with delimiter 0x06
ts_t state[1];
ts_init( state, 0x06 );
// 2. Absorb first B bytes from input
{
const uchar *ptr = msv_get_range( view, 0, CHUNK_SIZE );
if (ptr) {
ts_update( state, ptr, CHUNK_SIZE );
} else {
uchar first_b_buffer[CHUNK_SIZE];
msv_copy_range( view, 0, CHUNK_SIZE, first_b_buffer );
ts_update( state, first_b_buffer, CHUNK_SIZE );
}
}
// 3. Absorb padding bytes
const uchar padding[8] = { 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 };
ts_update( state, padding, sizeof(padding) );
ulong j = CHUNK_SIZE;
ulong n = 0;
uchar leaves[CHUNK_SIZE * 8] ALIGN;
uchar cv_buffer[CV_SIZE];
// 4. Process leaves in SIMD batches
// 5. Process remaining leaves, one at a time
while (j < total_length) {
ulong chunk = min( CHUNK_SIZE, total_length - j );
msv_t cv[1];
const uchar *ptr = msv_get_range( view, j, j + chunk );
if (ptr) {
msv_init( cv, ptr, chunk, NULL, 0, NULL, 0 );
} else {
msv_init( cv, leaves, chunk, NULL, 0, NULL, 0 );
msv_copy_range( view, j, j + chunk, leaves );
}
kt_turbo_shake( cv, 0x0B, cv_buffer, CV_SIZE );
ts_update( state, cv_buffer, CV_SIZE );
j += CHUNK_SIZE;
n += 1;
}
// 6. Absorb right_encode(n)
re_t n_enc[1];
kt_right_encode( n_enc, n );
ts_update( state, n_enc->bytes, n_enc->len );
// 7. Absorb terminator
const uchar terminator[2] = { 0xFF, 0xFF };
ts_update( state, terminator, 2 );
ts_final( state, out, outlen );
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment