Last active
December 19, 2025 06:27
-
-
Save phire/f8ce4f3a001c1fc55841f0c9b1b714ad to your computer and use it in GitHub Desktop.
Attempt at creating an efficient h264 CAVLC decoder for the n64 (which doesn't have CLZ, and really hates cache misses)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| /// Like decode_huffman_lookup, but the first level is unrolled, and much bigger | |
| /// Any code that fits in 5 bits and decodes to 5 bits or less can be done in a single table lookup | |
| pub fn decode_huffman_unrolled(buffer: u64, mut table: &[u8]) -> (u32, usize) { | |
| // Lookup first 5 bits (32 entry table) | |
| let idx = buffer >> 59; // 1 cycle | |
| let lookup = unsafe { *table.get_unchecked(idx as usize) }; // 2 cycles | |
| let mut total_bits = 5; // 1 cycle (load interlock) | |
| let lookup_5 = lookup >> 5; // 1 cycle | |
| let mut neg_bits = lookup_5 as i32 - 3; // 1 cycle | |
| let mut offset = (lookup & 0x1F) as usize; // 1 cycle (branch delay slot) | |
| if neg_bits >= 0 { // 1 cycle | |
| let bits = lookup_5 - 2; // 1 cycle (return delay slot) | |
| return (offset as u32, bits as usize); // 1 cycle, total 10 cycles | |
| } | |
| // scale offset. Allows our initial 5-bit offsets to cover the range 2-64 | |
| offset = offset << 1; // 1 cycle | |
| loop { // 9 cycles to reach start of loop | |
| let shift = neg_bits + 64; // 1 cycle | |
| let trimmed = buffer << total_bits; // 1 cycle | |
| let idx = (trimmed >> shift) as usize; // 1 cycle | |
| (_, table) = unsafe { table.split_at_unchecked(offset) }; // Apply offset // 1 cycle | |
| let lookup = unsafe { *table.get_unchecked(idx as usize) }; // 2 cycles | |
| total_bits -= neg_bits; // 1 cycle (load interlock) | |
| let cmp = lookup as i32 - 0xc0; // 1 cycle | |
| offset = (lookup & 0x3f) as usize; // 1 cycle (branch delay) | |
| if cmp >= 0 { // 1 cycle | |
| let codepoint = offset as u32; | |
| return (codepoint, total_bits as usize); // 1 cycle | |
| // (return delay) // 1 cycle | |
| // prolog + one loop iteration is ~21 cycles and (combined) can handle upto 8 bits | |
| // each additional iteration is adds another ~10 cycles and can handle 1-3 bits | |
| } | |
| neg_bits = cmp >> 6; // 1 cycle | |
| } | |
| } | |
| // 0 <= nC < 2 | |
| static COEFF_TOKEN_MAPPING_0: [u8; 108] = [ | |
| 0x12, 0x51, 0x50, 0xeb, 0xa6, 0xa6, 0xa6, 0xa6, 0x81, 0x81, 0x81, 0x81, 0x81, 0x81, 0x81, 0x81, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, | |
| 0xc5, 0xc0, // 1: 00010_(0|1) | |
| 0x8a, 0xcf, // 4: 00001_(0|1) | |
| 0x12, 0x4e, 0x8c, 0x8a, 0xd7, 0xce, 0xc9, 0xc4, // 2: 00000_(000|001|010|011|100|101|110|111) | |
| 0xd3, 0xca, // 6: 00001_0_(0|1) | |
| 0xcd, 0xc8, // 3: 00000_011_(0|1) | |
| 0xdb, 0xd2, // 8: 00000_010_(0|1) | |
| 0xdf, 0xd6, 0xd1, 0xcc, // 5: 00000_001_(00|01|10|11) | |
| 0x18, 0x10, 0x4c, 0x48, 0xe3, 0xda, 0xd5, 0xd0, // 7: 00000_000_(000|001|010|011|100|101|110|111) | |
| 0xe7, 0xde, 0xd9, 0xd4, // 9: 00000_000_011_(00|01|10|11) | |
| 0xdc, 0xe2, 0xdd, 0xd8, // a: 00000_000_010_(00|01|10|11) | |
| 0xef, 0xea, 0xe5, 0xe4, 0xeb, 0xe6, 0xe1, 0xe0, // b: 00000_000_001_(000|001|010|011|100|101|110|111) | |
| 0x90, 0x5a, 0x56, 0x52, 0x8e, 0x8c, 0x8a, 0x88, // c: 00000_000_000_(000|001|010|011|100|101|110|111) | |
| 0xe9, 0xe8, // d: 00000_000_000_111_(0|1) | |
| 0xf3, 0xee, // f: 00000_000_000_110_(0|1) | |
| 0xed, 0xec, // e: 00000_000_000_101_(0|1) | |
| 0xf7, 0xf2, // 12: 00000_000_000_100_(0|1) | |
| 0x00, 0xf1, // 11: 00000_000_000_000_(0|1) | |
| 0xfb, 0xf6, 0xf5, 0xf0, // 10: 00000_000_000_011_(00|01|10|11) | |
| 0xff, 0xfa, 0xf9, 0xf4, // 13: 00000_000_000_010_(00|01|10|11) | |
| 0xfc, 0xfe, 0xfd, 0xf8, // 14: 00000_000_000_001_(00|01|10|11) | |
| ]; | |
| // 2 <= nC < 4 | |
| static COEFF_TOKEN_MAPPING_2: [u8; 104] = [ | |
| 0x16, 0x34, 0x53, 0x52, 0x51, 0x50, 0xf3, 0xe5, 0xcf, 0xcf, 0xcb, 0xcb, 0xa6, 0xa6, 0xa6, 0xa6, 0x81, 0x81, 0x81, 0x81, 0x81, 0x81, 0x81, 0x81, 0x83, 0x83, 0x83, 0x83, 0x83, 0x83, 0x83, 0x83, | |
| 0xc9, 0xc0, // 1: 00101_(0|1) | |
| 0xd7, 0xca, // 4: 00100_(0|1) | |
| 0xcd, 0xc4, // 2: 00011_(0|1) | |
| 0xdb, 0xce, // 6: 00010_(0|1) | |
| 0xdf, 0xd2, 0xd1, 0xc8, // 3: 00001_(00|01|10|11) | |
| 0x14, 0x0c, 0x8a, 0x88, 0xd0, 0xd6, 0xd5, 0xcc, // 5: 00000_(000|001|010|011|100|101|110|111) | |
| 0xd9, 0xd4, // 7: 00000_011_(0|1) | |
| 0xe3, 0xda, // 9: 00000_010_(0|1) | |
| 0xeb, 0xe2, 0xe1, 0xdc, 0xe7, 0xde, 0xdd, 0xd8, // 8: 00000_001_(000|001|010|011|100|101|110|111) | |
| 0x5c, 0x58, 0x54, 0x50, 0x8e, 0x8c, 0x8a, 0x88, // a: 00000_000_(000|001|010|011|100|101|110|111) | |
| 0xe5, 0xe0, // b: 00000_000_111_(0|1) | |
| 0xef, 0xe6, // d: 00000_000_110_(0|1) | |
| 0xe9, 0xe4, // c: 00000_000_101_(0|1) | |
| 0xe8, 0xea, // e: 00000_000_100_(0|1) | |
| 0xf3, 0xee, 0xed, 0xec, // f: 00000_000_011_(00|01|10|11) | |
| 0xf7, 0xf2, 0xf1, 0xf0, // 10: 00000_000_010_(00|01|10|11) | |
| 0x8a, 0x88, 0xf6, 0xf4, // 11: 00000_000_001_(00|01|10|11) | |
| 0x00, 0xfb, 0x8a, 0x88, // 14: 00000_000_000_(00|01|10|11) | |
| 0xfa, 0xf5, // 12: 00000_000_001_01_(0|1) | |
| 0xf9, 0xf8, // 13: 00000_000_001_00_(0|1) | |
| 0xfd, 0xfc, // 15: 00000_000_000_11_(0|1) | |
| 0xff, 0xfe, // 16: 00000_000_000_10_(0|1) | |
| ]; | |
| // 4 <= nC < 8 | |
| static COEFF_TOKEN_MAPPING_4: [u8; 90] = [ | |
| 0x1c, 0x18, 0x36, 0x34, 0x53, 0x52, 0x51, 0x50, 0xf1, 0xf2, 0xed, 0xee, 0xe9, 0xff, 0xea, 0xe5, 0xdb, 0xdb, 0xd7, 0xd7, 0xd3, 0xd3, 0xcf, 0xcf, 0xcb, 0xcb, 0xc6, 0xc6, 0xc1, 0xc1, 0xc3, 0xc3, | |
| 0xd5, 0xc0, // 1: 00111_(0|1) | |
| 0xe3, 0xd6, // 6: 00110_(0|1) | |
| 0xd9, 0xc4, // 2: 00101_(0|1) | |
| 0xc8, 0xda, // 3: 00100_(0|1) | |
| 0xe7, 0xde, 0xdd, 0xcc, // 4: 00011_(00|01|10|11) | |
| 0x94, 0xd4, 0xe2, 0xd0, // 5: 00010_(00|01|10|11) | |
| 0xef, 0xea, 0xe5, 0xe0, 0xeb, 0xe6, 0xe1, 0xdc, // 8: 00001_(000|001|010|011|100|101|110|111) | |
| 0x5c, 0x58, 0x54, 0x92, 0x90, 0x8e, 0x8c, 0x8a, // 9: 00000_(000|001|010|011|100|101|110|111) | |
| 0xd8, 0x00, // 7: 00010_00_(0|1) | |
| 0xe9, 0xe4, // a: 00000_111_(0|1) | |
| 0xf3, 0xee, // d: 00000_110_(0|1) | |
| 0xed, 0xe8, // b: 00000_101_(0|1) | |
| 0xec, 0xf2, // c: 00000_100_(0|1) | |
| 0x8e, 0xf1, // e: 00000_011_(0|1) | |
| 0xf9, 0xf4, 0xf7, 0xf6, // 10: 00000_010_(00|01|10|11) | |
| 0xfd, 0xf8, 0xfb, 0xfa, // 11: 00000_001_(00|01|10|11) | |
| 0x00, 0xfc, 0xff, 0xfe, // 12: 00000_000_(00|01|10|11) | |
| 0xf5, 0xf0, // f: 00000_011_0_(0|1) | |
| ]; | |
| // nc == -1 | |
| static COEFF_TOKEN_MAPPING_420: [u8; 44] = [ | |
| 0x33, 0x52, 0x51, 0x50, 0xa6, 0xa6, 0xa6, 0xa6, 0x83, 0x83, 0x83, 0x83, 0x83, 0x83, 0x83, 0x83, 0x61, 0x61, 0x61, 0x61, 0x61, 0x61, 0x61, 0x61, 0x61, 0x61, 0x61, 0x61, 0x61, 0x61, 0x61, 0x61, | |
| 0xc5, 0xc0, // 1: 00011_(0|1) | |
| 0xc4, 0xcb, // 2: 00010_(0|1) | |
| 0xcc, 0xc8, // 3: 00001_(0|1) | |
| 0xcf, 0x84, 0xca, 0xc9, // 4: 00000_(00|01|10|11) | |
| 0xce, 0xcd, // 5: 00000_01_(0|1) | |
| ]; | |
| // nC == -2 | |
| static COEFF_TOKEN_MAPPING_422: [u8; 70] = [ | |
| 0x50, 0xeb, 0x33, 0x31, 0xa6, 0xa6, 0xa6, 0xa6, 0x81, 0x81, 0x81, 0x81, 0x81, 0x81, 0x81, 0x81, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, | |
| 0x4a, 0xcf, // 2: 00000_(0|1) | |
| 0xc9, 0xc5, 0xc4, 0xc0, // 1: 00011_(00|01|10|11) | |
| 0xd7, 0xd3, 0xce, 0xca, // 4: 00010_(00|01|10|11) | |
| 0x0c, 0x48, 0x86, 0x84, // 6: 00000_0_(00|01|10|11) | |
| 0xcc, 0xc8, // 3: 00000_0_11_(0|1) | |
| 0xd2, 0xcd, // 5: 00000_0_10_(0|1) | |
| 0xdb, 0xd6, 0xd1, 0xd0, // 7: 00000_0_01_(00|01|10|11) | |
| 0x00, 0x4c, 0x8a, 0x88, 0xdf, 0xda, 0xd5, 0xd4, // 8: 00000_0_00_(000|001|010|011|100|101|110|111) | |
| 0xd9, 0xd8, // 9: 00000_0_00_011_(0|1) | |
| 0xde, 0xdd, // b: 00000_0_00_010_(0|1) | |
| 0x00, 0x00, 0x00, 0xdc, // a: 00000_0_00_001_(00|01|10|11) | |
| ]; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment