Created
February 5, 2026 20:30
-
-
Save agavra/f35157b04000dc611f167968f5cbc164 to your computer and use it in GitHub Desktop.
bitpacking can cause byte-algined compressors to struggle
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| /// ┌──────────────────────────┬──────────┬────────────────┬────────────────┐ | |
| /// │ Scenario │ Raw+zstd │ Bitpacked+zstd │ Winner │ | |
| /// ├──────────────────────────┼──────────┼────────────────┼────────────────┤ | |
| /// │ Uniform random 7-bit │ 8837 │ 8759 │ Bitpack (-78) │ | |
| /// ├──────────────────────────┼──────────┼────────────────┼────────────────┤ | |
| /// │ Sorted 7-bit │ 246 │ 662 │ Raw (+416) │ | |
| /// ├──────────────────────────┼──────────┼────────────────┼────────────────┤ | |
| /// │ Repeated runs 7-bit │ 280 │ 984 │ Raw (+704) │ | |
| /// ├──────────────────────────┼──────────┼────────────────┼────────────────┤ | |
| /// │ Sequential 7-bit │ 147 │ 131 │ Bitpack (-16) │ | |
| /// ├──────────────────────────┼──────────┼────────────────┼────────────────┤ | |
| /// │ Sparse 7-bit (95% zeros) │ 1299 │ 1511 │ Raw (+212) │ | |
| /// ├──────────────────────────┼──────────┼────────────────┼────────────────┤ | |
| /// │ Delta-encoded 4-bit │ 3202 │ 2519 │ Bitpack (-683) │ | |
| /// └──────────────────────────┴──────────┴────────────────┴────────────────┘ | |
| /// Demonstrates that bitpacking before zstd can sometimes produce LARGER output | |
| /// than just compressing raw byte-aligned values with zstd alone. | |
| /// | |
| /// Bitpacking removes redundant bits, but also destroys byte-alignment patterns | |
| /// that zstd's dictionary/entropy coding can exploit effectively. | |
| use std::io::Write; | |
| fn bitpack(values: &[u8], bits_per_value: u8) -> Vec<u8> { | |
| let mask = (1u16 << bits_per_value) - 1; | |
| let mut buf = Vec::new(); | |
| let mut current: u64 = 0; | |
| let mut bits_filled: u32 = 0; | |
| for &v in values { | |
| current |= ((v as u64) & mask as u64) << bits_filled; | |
| bits_filled += bits_per_value as u32; | |
| while bits_filled >= 8 { | |
| buf.push((current & 0xFF) as u8); | |
| current >>= 8; | |
| bits_filled -= 8; | |
| } | |
| } | |
| if bits_filled > 0 { | |
| buf.push((current & 0xFF) as u8); | |
| } | |
| buf | |
| } | |
| fn zstd_compress(data: &[u8], level: i32) -> Vec<u8> { | |
| let mut encoder = zstd::Encoder::new(Vec::new(), level).unwrap(); | |
| encoder.write_all(data).unwrap(); | |
| encoder.finish().unwrap() | |
| } | |
| fn run_experiment(label: &str, values: &[u8], bits_per_value: u8) { | |
| let raw = values; | |
| let raw_zstd = zstd_compress(raw, 3); | |
| let packed = bitpack(values, bits_per_value); | |
| let packed_zstd = zstd_compress(&packed, 3); | |
| let winner = if packed_zstd.len() < raw_zstd.len() { | |
| "BITPACK WINS" | |
| } else { | |
| "RAW WINS" | |
| }; | |
| let delta = packed_zstd.len() as i64 - raw_zstd.len() as i64; | |
| println!( | |
| "--- {} ({} values, {} bits each) ---", | |
| label, | |
| values.len(), | |
| bits_per_value | |
| ); | |
| println!(" Raw bytes: {:>6}", raw.len()); | |
| println!(" Raw + zstd: {:>6}", raw_zstd.len()); | |
| println!(" Bitpacked bytes: {:>6}", packed.len()); | |
| println!(" Bitpacked + zstd: {:>6}", packed_zstd.len()); | |
| println!(" --> {} (delta: {:+} bytes)", winner, delta); | |
| println!(); | |
| } | |
| /// Simple LCG PRNG for reproducibility without extra deps. | |
| struct Rng(u64); | |
| impl Rng { | |
| fn new(seed: u64) -> Self { | |
| Self(seed) | |
| } | |
| fn next_u8(&mut self, max_exclusive: u8) -> u8 { | |
| // LCG constants from Numerical Recipes | |
| self.0 = self.0.wrapping_mul(6364136223846793005).wrapping_add(1); | |
| ((self.0 >> 33) % max_exclusive as u64) as u8 | |
| } | |
| fn next_usize(&mut self, max_exclusive: usize) -> usize { | |
| self.0 = self.0.wrapping_mul(6364136223846793005).wrapping_add(1); | |
| ((self.0 >> 33) % max_exclusive as u64) as usize | |
| } | |
| } | |
| fn main() { | |
| let n = 10_000usize; | |
| let mut rng = Rng::new(42); | |
| // 1: Uniform random 7-bit values | |
| let vals: Vec<u8> = (0..n).map(|_| rng.next_u8(128)).collect(); | |
| run_experiment("Uniform random 7-bit", &vals, 7); | |
| // 2: Sorted 7-bit values — zstd loves the smooth gradient at byte level | |
| let mut vals: Vec<u8> = (0..n).map(|_| rng.next_u8(128)).collect(); | |
| vals.sort(); | |
| run_experiment("Sorted 7-bit", &vals, 7); | |
| // 3: Runs of repeated 7-bit values — highly compressible until bitpacking scrambles it | |
| let mut vals = Vec::new(); | |
| for _ in 0..200 { | |
| let v = rng.next_u8(128); | |
| vals.extend(std::iter::repeat(v).take(50)); | |
| } | |
| run_experiment("Repeated runs 7-bit", &vals, 7); | |
| // 4: Sequential (0,1,2,...,127,0,1,...) — very regular byte-level pattern | |
| let vals: Vec<u8> = (0..n).map(|i| (i % 128) as u8).collect(); | |
| run_experiment("Sequential 7-bit", &vals, 7); | |
| // 5: Sparse data (95% zeros) — zstd run-length encoding loves this | |
| let mut vals = vec![0u8; n]; | |
| for _ in 0..(n / 20) { | |
| let idx = rng.next_usize(n); | |
| vals[idx] = rng.next_u8(127) + 1; | |
| } | |
| run_experiment("Sparse 7-bit (95% zeros)", &vals, 7); | |
| // 6: Small deltas (4-bit) from sorted data | |
| let mut base: Vec<u16> = (0..n).map(|_| rng.next_usize(10000) as u16).collect(); | |
| base.sort(); | |
| let mut deltas = vec![0u8; n]; | |
| for i in 1..n { | |
| deltas[i] = std::cmp::min(base[i] - base[i - 1], 15) as u8; | |
| } | |
| run_experiment("Delta-encoded (4-bit)", &deltas, 4); | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment