Skip to content

Instantly share code, notes, and snippets.

@sparr
Created December 9, 2025 16:46
Show Gist options
  • Select an option

  • Save sparr/59bc773800d37d731cb6d23995b36024 to your computer and use it in GitHub Desktop.

Select an option

Save sparr/59bc773800d37d731cb6d23995b36024 to your computer and use it in GitHub Desktop.
utf16-pack serializes javascript values to binary data packed into a utf-16 string. only partially implemented, focusing on performance problems before finishing implementation
// MessagePack-like encoding targeting UTF-16 storage instead of byte arrays.
// Encoding of each value involves one UTF-16 character for the value or a header,
// possibly some additional characters for further fixed size value or length information,
// then possibly some additional characters or encoded values
// 15-bit signed integer 0b0xxxxxxxxxxxxxxx
// types with 32-bit value, first 2 bits -------------vv followed by two characters containing 15 bits of value each
const INT_32_MASK = 0b1000000000000100; // 0b0xxxxxxxxxxxxxxx 0b0xxxxxxxxxxxxxxx
const UINT_32_MASK = 0b1000000000001000; // 0b0xxxxxxxxxxxxxxx 0b0xxxxxxxxxxxxxxx
const FLOAT_32_MASK = 0b1000000000001100; // 0b0xxxxxxxxxxxxxxx 0b0xxxxxxxxxxxxxxx
// types with 64-bit value, first 4 bits -----------vvvv followed by four characters containing 15 bits of value each
const INT_64_MASK = 0b1000000000010000; // 0b0xxxxxxxxxxxxxxx 0b0xxxxxxxxxxxxxxx 0b0xxxxxxxxxxxxxxx 0b0xxxxxxxxxxxxxxx
const UINT_64_MASK = 0b1000000000100000; // 0b0xxxxxxxxxxxxxxx 0b0xxxxxxxxxxxxxxx 0b0xxxxxxxxxxxxxxx 0b0xxxxxxxxxxxxxxx
const FLOAT_64_MASK = 0b1000000000110000; // 0b0xxxxxxxxxxxxxxx 0b0xxxxxxxxxxxxxxx 0b0xxxxxxxxxxxxxxx 0b0xxxxxxxxxxxxxxx
// more 64-bit value types 0b10000000xxxxyyyy
// types with 8-bit value ----------------------vvvvvvvv
const CHAR_MASK = 0b1000000100000000;
// more 8-bit types, 128-bit headers 0b100xxxx1yyyyyyyy
// types with 10-bit length ------------------vvvvvvvvvv
const STRING_10_MASK = 0b1010000000000000;
const BYTEARRAY_10_MASK = 0b1010010000000000;
const ARRAY_10_MASK = 0b1010100000000000;
const MAP_10_MASK = 0b1010110000000000;
// types with 40-bit length, first 10 bits ---vvvvvvvvvv followed by two characters containing 15 bits of length each
const STRING_40_MASK = 0b1011000000000000; // 0b0xxxxxxxxxxxxxxx 0b0xxxxxxxxxxxxxxx
const BYTEARRAY_40_MASK = 0b1011010000000000; // 0b0xxxxxxxxxxxxxxx 0b0xxxxxxxxxxxxxxx
const ARRAY_40_MASK = 0b1011100000000000; // 0b0xxxxxxxxxxxxxxx 0b0xxxxxxxxxxxxxxx
const MAP_40_MASK = 0b1011110000000000; // 0b0xxxxxxxxxxxxxxx 0b0xxxxxxxxxxxxxxx
// available values 0b1100xxxxxxxxxxxx
// available values 0b11010xxxxxxxxxxx
// UTF-16 surrogates, invalid values 0b11011xxxxxxxxxxx
// constants
const NULL_CODE = 0b1110000000000000;
const UNDEFINED_CODE = 0b1110000000000001;
const FALSE_CODE = 0b1110000000000010;
const TRUE_CODE = 0b1110000000000011;
// available values 0b111xxxxxxxxxxxxx
// Array type headers are followed by `length` additional encoded values
// Map type headers are followed by `length` pairs of alternating encoded keys and values
// String type headers are followed by `length` characters
const STRING_40_CHAR = String.fromCharCode(STRING_40_MASK);
const BYTEARRAY_40_CHAR = String.fromCharCode(BYTEARRAY_40_MASK);
const ARRAY_40_CHAR = String.fromCharCode(ARRAY_40_MASK);
const MAP_40_CHAR = String.fromCharCode(MAP_40_MASK);
const INT_32_CHAR = String.fromCharCode(INT_32_MASK);
const UINT_32_CHAR = String.fromCharCode(UINT_32_MASK);
const FLOAT_32_CHAR = String.fromCharCode(FLOAT_32_MASK);
const INT_64_CHAR = String.fromCharCode(INT_64_MASK);
const UINT_64_CHAR = String.fromCharCode(UINT_64_MASK);
const FLOAT_64_CHAR = String.fromCharCode(FLOAT_64_MASK);
const NULL_CHAR = String.fromCharCode(NULL_CODE);
const UNDEFINED_CHAR = String.fromCharCode(UNDEFINED_CODE);
const FALSE_CHAR = String.fromCharCode(FALSE_CODE);
const TRUE_CHAR = String.fromCharCode(TRUE_CODE);
const BITMASK_2 = 2**2-1;
const BITMASK_4 = 2**4-1;
const BITMASK_8 = 2**8-1;
const BITMASK_10 = 2**10-1;
const BITMASK_15 = 2**15-1;
const BITMASK_30 = 2**30-1;
// ENCODE
function encodeObject(obj: any, entries: string[]) {
const keys = Object.keys(obj);
const length = keys.length;
if (length < 2**10) {
entries.push(String.fromCharCode(MAP_10_MASK | length));
} else if (length < 2**30) {
entries.push(
MAP_40_CHAR,
String.fromCharCode(
length >> 15 & BITMASK_15,
length & BITMASK_15
)
);
} else if (length < 2**40) {
entries.push(
String.fromCharCode(
MAP_40_MASK | (length/2**30|0),
(length/2**15|0) & BITMASK_15,
length & BITMASK_15
)
);
} else {
throw new Error(`Object.keys.length ${length} >= 2**40`);
}
for (let i=0; i<length; i++) {
const key = keys[i]!;
_encode(key, entries);
_encode(obj[key], entries);
}
}
function encodeArray(arr: any[], entries: string[]) {
const length = arr.length;
if (length < 2**10) {
entries.push(String.fromCharCode(ARRAY_10_MASK | length));
} else if (length < 2**30) {
entries.push(
ARRAY_40_CHAR,
String.fromCharCode(
length >> 15 & BITMASK_15,
length & BITMASK_15
)
);
} else if (length < 2**40) {
entries.push(
String.fromCharCode(
ARRAY_40_MASK | (length/2**30|0),
(length/2**15|0) & BITMASK_15,
length & BITMASK_15
)
);
} else {
throw new Error(`Array.length ${length} >= 2**40`);
}
for (let i=0; i<length; i++) {
_encode(arr[i], entries);
}
}
function encodeString(s: string, entries: string[]) {
const length = s.length;
if (length < 2**10) {
entries.push(String.fromCharCode(STRING_10_MASK | length));
} else if (length < 2**30) {
entries.push(
STRING_40_CHAR,
String.fromCharCode(
length >> 15 & BITMASK_15,
length & BITMASK_15
)
);
} else if (length < 2**40) {
entries.push(
String.fromCharCode(
STRING_40_MASK | (length/2**30|0),
(length/2**15|0) & BITMASK_15,
length & BITMASK_15
)
);
} else {
throw new Error(`String.length ${length} >= 2**40`);
}
entries.push(s);
}
function encodeNumber(n: number, entries: string[]) {
if (Number.isInteger(n)) {
if (Math.abs(n)<=BITMASK_15) {
entries.push(String.fromCharCode(n&BITMASK_15));
} else {
// TODO INT_32, UINT_32
}
} else { // float
// TODO FLOAT_32, FLOAT_64
}
}
function _encode(v: any, entries: string[]) {
switch(typeof v) {
case 'string':
encodeString(v, entries);
break;
case 'number':
encodeNumber(v, entries);
break;
case 'boolean':
entries.push(v?TRUE_CHAR:FALSE_CHAR);
break;
case 'undefined':
entries.push(UNDEFINED_CHAR);
break;
case 'object':
if (v===null) {
entries.push(NULL_CHAR);
} else if (Array.isArray(v)) {
encodeArray(v, entries);
} else if (v instanceof ArrayBuffer || ArrayBuffer.isView(v)) {
throw new Error('TODO ArrayBuffer as Byte Array');
} else {
encodeObject(v, entries);
}
break;
default:
throw new Error(`Unrecognized type ${typeof(v)}`);
}
}
export function encode(v: any): string {
const entries: string[] = [];
_encode(v, entries);
return entries.join('');
}
// DECODE
type DecodeIndex = [string, number];
function _decode(v: DecodeIndex): any {
const str = v[0];
const index = v[1];
const code = str.charCodeAt(index);
if ((code & 0b1000000000000000) === 0) {
// 15-bit signed integer
v[1]++;
return code<<17>>17;
}
if ((code&NULL_CODE)===NULL_CODE) {
// TODO benchmark removing the conditional
// constants
switch (code) {
case NULL_CODE:
v[1]++;
return null;
case UNDEFINED_CODE:
v[1]++;
return undefined;
case FALSE_CODE:
v[1]++;
return false;
case TRUE_CODE:
v[1]++;
return true;
}
throw new Error(`Unrecognized constant code ${code.toString(2).padStart(16,'0')}`);
}
if (code>>13 === 0b101) {
// things with 10 bits of payload
const subcode = (code>>10)&3;
let length = code&BITMASK_10;
let offset = 1;
if (code>>12 === 0b1011) {
length = length * 2**30 + str.charCodeAt(index+1) * 2**15 + str.charCodeAt(index+2);
offset = 3;
}
switch (subcode) {
case 0b00: // string
v[1]+=offset+length;
return str.slice(index+offset,index+offset+length);
case 0b01: // byte array
throw new Error('TODO ArrayBuffer as Byte Array');
case 0b10: // array
const arr = [];
v[1]+=offset
for (let i=0; i<length; i++) {
arr.push(_decode(v));
}
return arr;
case 0b11: // map
const obj: any = {};
v[1]+=offset
for (let i=0; i<length; i++) {
const key = _decode(v);
obj[key] = _decode(v);
}
return obj;
}
}
}
export function decode(v:string): any {
return _decode([v,0]);
}
import { readFileSync } from 'fs';
import { encode, decode } from 'utf16-pack.js';
const fileContent = readFileSync('memory.json', 'utf8');
const data = JSON.parse(fileContent);
const testCases = [
0,1,100,1000,10000,-1,-999,
null,undefined,true,false,
"a","abc","ᴏᴠᴇʀᴍɪɴᴅ",
[],[0],["a"],[1,-1,null,undefined,true,false,"a"],
{},{a:0},{0:"a"},{zzz:null,yyy:undefined,asdf:true,foo:false}
]
let fails = 0;
for (const testCase of testCases) {
const json = JSON.stringify(testCase);
const encoded = encode(testCase);
const decoded = decode(encoded);
const jsond = JSON.stringify(decoded);
const pass = json === jsond;
if (!pass) {
fails++;
console.log("FAIL");
console.log(json);
console.log(jsond);
console.log(encoded.split("")
.map(c => c.charCodeAt(0).toString(2).padStart(16, "0"))
.join(" "));
console.log();
}
}
console.log(`Passed ${testCases.length-fails} out of ${testCases.length} tests`);
function measureTime(fn) {
const start = process.hrtime.bigint();
fn();
const end = process.hrtime.bigint();
return Number(end - start) / 1_000_000; // Convert to milliseconds
}
console.log('JSON.stringify time', measureTime(()=>{
for(let i=0;i<1000;i++) {
JSON.stringify(data);
}
}));
console.log('utf16-pack.enc time', measureTime(()=>{
console.profile();
for(let i=0;i<1000;i++) {
encode(data);
}
console.profileEnd();
}));
console.log('JSON.stringify length', JSON.stringify(data).length);
console.log('utf16-pack.enc length', encode(data).length);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment