Skip to content

Instantly share code, notes, and snippets.

@phire
Last active December 19, 2025 06:27
Show Gist options
  • Select an option

  • Save phire/f8ce4f3a001c1fc55841f0c9b1b714ad to your computer and use it in GitHub Desktop.

Select an option

Save phire/f8ce4f3a001c1fc55841f0c9b1b714ad to your computer and use it in GitHub Desktop.
Attempt at creating an efficient h264 CAVLC decoder for the n64 (which doesn't have CLZ, and really hates cache misses)
/// Like decode_huffman_lookup, but the first level is unrolled, and much bigger
/// Any code that fits in 5 bits and decodes to 5 bits or less can be done in a single table lookup
pub fn decode_huffman_unrolled(buffer: u64, mut table: &[u8]) -> (u32, usize) {
// Lookup first 5 bits (32 entry table)
let idx = buffer >> 59; // 1 cycle
let lookup = unsafe { *table.get_unchecked(idx as usize) }; // 2 cycles
let mut total_bits = 5; // 1 cycle (load interlock)
let lookup_5 = lookup >> 5; // 1 cycle
let mut neg_bits = lookup_5 as i32 - 3; // 1 cycle
let mut offset = (lookup & 0x1F) as usize; // 1 cycle (branch delay slot)
if neg_bits >= 0 { // 1 cycle
let bits = lookup_5 - 2; // 1 cycle (return delay slot)
return (offset as u32, bits as usize); // 1 cycle, total 10 cycles
}
// scale offset. Allows our initial 5-bit offsets to cover the range 2-64
offset = offset << 1; // 1 cycle
loop { // 9 cycles to reach start of loop
let shift = neg_bits + 64; // 1 cycle
let trimmed = buffer << total_bits; // 1 cycle
let idx = (trimmed >> shift) as usize; // 1 cycle
(_, table) = unsafe { table.split_at_unchecked(offset) }; // Apply offset // 1 cycle
let lookup = unsafe { *table.get_unchecked(idx as usize) }; // 2 cycles
total_bits -= neg_bits; // 1 cycle (load interlock)
let cmp = lookup as i32 - 0xc0; // 1 cycle
offset = (lookup & 0x3f) as usize; // 1 cycle (branch delay)
if cmp >= 0 { // 1 cycle
let codepoint = offset as u32;
return (codepoint, total_bits as usize); // 1 cycle
// (return delay) // 1 cycle
// prolog + one loop iteration is ~21 cycles and (combined) can handle upto 8 bits
// each additional iteration is adds another ~10 cycles and can handle 1-3 bits
}
neg_bits = cmp >> 6; // 1 cycle
}
}
// 0 <= nC < 2
static COEFF_TOKEN_MAPPING_0: [u8; 108] = [
0x12, 0x51, 0x50, 0xeb, 0xa6, 0xa6, 0xa6, 0xa6, 0x81, 0x81, 0x81, 0x81, 0x81, 0x81, 0x81, 0x81, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63,
0xc5, 0xc0, // 1: 00010_(0|1)
0x8a, 0xcf, // 4: 00001_(0|1)
0x12, 0x4e, 0x8c, 0x8a, 0xd7, 0xce, 0xc9, 0xc4, // 2: 00000_(000|001|010|011|100|101|110|111)
0xd3, 0xca, // 6: 00001_0_(0|1)
0xcd, 0xc8, // 3: 00000_011_(0|1)
0xdb, 0xd2, // 8: 00000_010_(0|1)
0xdf, 0xd6, 0xd1, 0xcc, // 5: 00000_001_(00|01|10|11)
0x18, 0x10, 0x4c, 0x48, 0xe3, 0xda, 0xd5, 0xd0, // 7: 00000_000_(000|001|010|011|100|101|110|111)
0xe7, 0xde, 0xd9, 0xd4, // 9: 00000_000_011_(00|01|10|11)
0xdc, 0xe2, 0xdd, 0xd8, // a: 00000_000_010_(00|01|10|11)
0xef, 0xea, 0xe5, 0xe4, 0xeb, 0xe6, 0xe1, 0xe0, // b: 00000_000_001_(000|001|010|011|100|101|110|111)
0x90, 0x5a, 0x56, 0x52, 0x8e, 0x8c, 0x8a, 0x88, // c: 00000_000_000_(000|001|010|011|100|101|110|111)
0xe9, 0xe8, // d: 00000_000_000_111_(0|1)
0xf3, 0xee, // f: 00000_000_000_110_(0|1)
0xed, 0xec, // e: 00000_000_000_101_(0|1)
0xf7, 0xf2, // 12: 00000_000_000_100_(0|1)
0x00, 0xf1, // 11: 00000_000_000_000_(0|1)
0xfb, 0xf6, 0xf5, 0xf0, // 10: 00000_000_000_011_(00|01|10|11)
0xff, 0xfa, 0xf9, 0xf4, // 13: 00000_000_000_010_(00|01|10|11)
0xfc, 0xfe, 0xfd, 0xf8, // 14: 00000_000_000_001_(00|01|10|11)
];
// 2 <= nC < 4
static COEFF_TOKEN_MAPPING_2: [u8; 104] = [
0x16, 0x34, 0x53, 0x52, 0x51, 0x50, 0xf3, 0xe5, 0xcf, 0xcf, 0xcb, 0xcb, 0xa6, 0xa6, 0xa6, 0xa6, 0x81, 0x81, 0x81, 0x81, 0x81, 0x81, 0x81, 0x81, 0x83, 0x83, 0x83, 0x83, 0x83, 0x83, 0x83, 0x83,
0xc9, 0xc0, // 1: 00101_(0|1)
0xd7, 0xca, // 4: 00100_(0|1)
0xcd, 0xc4, // 2: 00011_(0|1)
0xdb, 0xce, // 6: 00010_(0|1)
0xdf, 0xd2, 0xd1, 0xc8, // 3: 00001_(00|01|10|11)
0x14, 0x0c, 0x8a, 0x88, 0xd0, 0xd6, 0xd5, 0xcc, // 5: 00000_(000|001|010|011|100|101|110|111)
0xd9, 0xd4, // 7: 00000_011_(0|1)
0xe3, 0xda, // 9: 00000_010_(0|1)
0xeb, 0xe2, 0xe1, 0xdc, 0xe7, 0xde, 0xdd, 0xd8, // 8: 00000_001_(000|001|010|011|100|101|110|111)
0x5c, 0x58, 0x54, 0x50, 0x8e, 0x8c, 0x8a, 0x88, // a: 00000_000_(000|001|010|011|100|101|110|111)
0xe5, 0xe0, // b: 00000_000_111_(0|1)
0xef, 0xe6, // d: 00000_000_110_(0|1)
0xe9, 0xe4, // c: 00000_000_101_(0|1)
0xe8, 0xea, // e: 00000_000_100_(0|1)
0xf3, 0xee, 0xed, 0xec, // f: 00000_000_011_(00|01|10|11)
0xf7, 0xf2, 0xf1, 0xf0, // 10: 00000_000_010_(00|01|10|11)
0x8a, 0x88, 0xf6, 0xf4, // 11: 00000_000_001_(00|01|10|11)
0x00, 0xfb, 0x8a, 0x88, // 14: 00000_000_000_(00|01|10|11)
0xfa, 0xf5, // 12: 00000_000_001_01_(0|1)
0xf9, 0xf8, // 13: 00000_000_001_00_(0|1)
0xfd, 0xfc, // 15: 00000_000_000_11_(0|1)
0xff, 0xfe, // 16: 00000_000_000_10_(0|1)
];
// 4 <= nC < 8
static COEFF_TOKEN_MAPPING_4: [u8; 90] = [
0x1c, 0x18, 0x36, 0x34, 0x53, 0x52, 0x51, 0x50, 0xf1, 0xf2, 0xed, 0xee, 0xe9, 0xff, 0xea, 0xe5, 0xdb, 0xdb, 0xd7, 0xd7, 0xd3, 0xd3, 0xcf, 0xcf, 0xcb, 0xcb, 0xc6, 0xc6, 0xc1, 0xc1, 0xc3, 0xc3,
0xd5, 0xc0, // 1: 00111_(0|1)
0xe3, 0xd6, // 6: 00110_(0|1)
0xd9, 0xc4, // 2: 00101_(0|1)
0xc8, 0xda, // 3: 00100_(0|1)
0xe7, 0xde, 0xdd, 0xcc, // 4: 00011_(00|01|10|11)
0x94, 0xd4, 0xe2, 0xd0, // 5: 00010_(00|01|10|11)
0xef, 0xea, 0xe5, 0xe0, 0xeb, 0xe6, 0xe1, 0xdc, // 8: 00001_(000|001|010|011|100|101|110|111)
0x5c, 0x58, 0x54, 0x92, 0x90, 0x8e, 0x8c, 0x8a, // 9: 00000_(000|001|010|011|100|101|110|111)
0xd8, 0x00, // 7: 00010_00_(0|1)
0xe9, 0xe4, // a: 00000_111_(0|1)
0xf3, 0xee, // d: 00000_110_(0|1)
0xed, 0xe8, // b: 00000_101_(0|1)
0xec, 0xf2, // c: 00000_100_(0|1)
0x8e, 0xf1, // e: 00000_011_(0|1)
0xf9, 0xf4, 0xf7, 0xf6, // 10: 00000_010_(00|01|10|11)
0xfd, 0xf8, 0xfb, 0xfa, // 11: 00000_001_(00|01|10|11)
0x00, 0xfc, 0xff, 0xfe, // 12: 00000_000_(00|01|10|11)
0xf5, 0xf0, // f: 00000_011_0_(0|1)
];
// nc == -1
static COEFF_TOKEN_MAPPING_420: [u8; 44] = [
0x33, 0x52, 0x51, 0x50, 0xa6, 0xa6, 0xa6, 0xa6, 0x83, 0x83, 0x83, 0x83, 0x83, 0x83, 0x83, 0x83, 0x61, 0x61, 0x61, 0x61, 0x61, 0x61, 0x61, 0x61, 0x61, 0x61, 0x61, 0x61, 0x61, 0x61, 0x61, 0x61,
0xc5, 0xc0, // 1: 00011_(0|1)
0xc4, 0xcb, // 2: 00010_(0|1)
0xcc, 0xc8, // 3: 00001_(0|1)
0xcf, 0x84, 0xca, 0xc9, // 4: 00000_(00|01|10|11)
0xce, 0xcd, // 5: 00000_01_(0|1)
];
// nC == -2
static COEFF_TOKEN_MAPPING_422: [u8; 70] = [
0x50, 0xeb, 0x33, 0x31, 0xa6, 0xa6, 0xa6, 0xa6, 0x81, 0x81, 0x81, 0x81, 0x81, 0x81, 0x81, 0x81, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63,
0x4a, 0xcf, // 2: 00000_(0|1)
0xc9, 0xc5, 0xc4, 0xc0, // 1: 00011_(00|01|10|11)
0xd7, 0xd3, 0xce, 0xca, // 4: 00010_(00|01|10|11)
0x0c, 0x48, 0x86, 0x84, // 6: 00000_0_(00|01|10|11)
0xcc, 0xc8, // 3: 00000_0_11_(0|1)
0xd2, 0xcd, // 5: 00000_0_10_(0|1)
0xdb, 0xd6, 0xd1, 0xd0, // 7: 00000_0_01_(00|01|10|11)
0x00, 0x4c, 0x8a, 0x88, 0xdf, 0xda, 0xd5, 0xd4, // 8: 00000_0_00_(000|001|010|011|100|101|110|111)
0xd9, 0xd8, // 9: 00000_0_00_011_(0|1)
0xde, 0xdd, // b: 00000_0_00_010_(0|1)
0x00, 0x00, 0x00, 0xdc, // a: 00000_0_00_001_(00|01|10|11)
];
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment