Created
December 9, 2025 16:46
-
-
Save sparr/59bc773800d37d731cb6d23995b36024 to your computer and use it in GitHub Desktop.
utf16-pack serializes javascript values to binary data packed into a utf-16 string. only partially implemented, focusing on performance problems before finishing implementation
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| // MessagePack-like encoding targeting UTF-16 storage instead of byte arrays. | |
| // Encoding of each value involves one UTF-16 character for the value or a header, | |
| // possibly some additional characters for further fixed size value or length information, | |
| // then possibly some additional characters or encoded values | |
| // 15-bit signed integer 0b0xxxxxxxxxxxxxxx | |
| // types with 32-bit value, first 2 bits -------------vv followed by two characters containing 15 bits of value each | |
| const INT_32_MASK = 0b1000000000000100; // 0b0xxxxxxxxxxxxxxx 0b0xxxxxxxxxxxxxxx | |
| const UINT_32_MASK = 0b1000000000001000; // 0b0xxxxxxxxxxxxxxx 0b0xxxxxxxxxxxxxxx | |
| const FLOAT_32_MASK = 0b1000000000001100; // 0b0xxxxxxxxxxxxxxx 0b0xxxxxxxxxxxxxxx | |
| // types with 64-bit value, first 4 bits -----------vvvv followed by four characters containing 15 bits of value each | |
| const INT_64_MASK = 0b1000000000010000; // 0b0xxxxxxxxxxxxxxx 0b0xxxxxxxxxxxxxxx 0b0xxxxxxxxxxxxxxx 0b0xxxxxxxxxxxxxxx | |
| const UINT_64_MASK = 0b1000000000100000; // 0b0xxxxxxxxxxxxxxx 0b0xxxxxxxxxxxxxxx 0b0xxxxxxxxxxxxxxx 0b0xxxxxxxxxxxxxxx | |
| const FLOAT_64_MASK = 0b1000000000110000; // 0b0xxxxxxxxxxxxxxx 0b0xxxxxxxxxxxxxxx 0b0xxxxxxxxxxxxxxx 0b0xxxxxxxxxxxxxxx | |
| // more 64-bit value types 0b10000000xxxxyyyy | |
| // types with 8-bit value ----------------------vvvvvvvv | |
| const CHAR_MASK = 0b1000000100000000; | |
| // more 8-bit types, 128-bit headers 0b100xxxx1yyyyyyyy | |
| // types with 10-bit length ------------------vvvvvvvvvv | |
| const STRING_10_MASK = 0b1010000000000000; | |
| const BYTEARRAY_10_MASK = 0b1010010000000000; | |
| const ARRAY_10_MASK = 0b1010100000000000; | |
| const MAP_10_MASK = 0b1010110000000000; | |
| // types with 40-bit length, first 10 bits ---vvvvvvvvvv followed by two characters containing 15 bits of length each | |
| const STRING_40_MASK = 0b1011000000000000; // 0b0xxxxxxxxxxxxxxx 0b0xxxxxxxxxxxxxxx | |
| const BYTEARRAY_40_MASK = 0b1011010000000000; // 0b0xxxxxxxxxxxxxxx 0b0xxxxxxxxxxxxxxx | |
| const ARRAY_40_MASK = 0b1011100000000000; // 0b0xxxxxxxxxxxxxxx 0b0xxxxxxxxxxxxxxx | |
| const MAP_40_MASK = 0b1011110000000000; // 0b0xxxxxxxxxxxxxxx 0b0xxxxxxxxxxxxxxx | |
| // available values 0b1100xxxxxxxxxxxx | |
| // available values 0b11010xxxxxxxxxxx | |
| // UTF-16 surrogates, invalid values 0b11011xxxxxxxxxxx | |
| // constants | |
| const NULL_CODE = 0b1110000000000000; | |
| const UNDEFINED_CODE = 0b1110000000000001; | |
| const FALSE_CODE = 0b1110000000000010; | |
| const TRUE_CODE = 0b1110000000000011; | |
| // available values 0b111xxxxxxxxxxxxx | |
| // Array type headers are followed by `length` additional encoded values | |
| // Map type headers are followed by `length` pairs of alternating encoded keys and values | |
| // String type headers are followed by `length` characters | |
| const STRING_40_CHAR = String.fromCharCode(STRING_40_MASK); | |
| const BYTEARRAY_40_CHAR = String.fromCharCode(BYTEARRAY_40_MASK); | |
| const ARRAY_40_CHAR = String.fromCharCode(ARRAY_40_MASK); | |
| const MAP_40_CHAR = String.fromCharCode(MAP_40_MASK); | |
| const INT_32_CHAR = String.fromCharCode(INT_32_MASK); | |
| const UINT_32_CHAR = String.fromCharCode(UINT_32_MASK); | |
| const FLOAT_32_CHAR = String.fromCharCode(FLOAT_32_MASK); | |
| const INT_64_CHAR = String.fromCharCode(INT_64_MASK); | |
| const UINT_64_CHAR = String.fromCharCode(UINT_64_MASK); | |
| const FLOAT_64_CHAR = String.fromCharCode(FLOAT_64_MASK); | |
| const NULL_CHAR = String.fromCharCode(NULL_CODE); | |
| const UNDEFINED_CHAR = String.fromCharCode(UNDEFINED_CODE); | |
| const FALSE_CHAR = String.fromCharCode(FALSE_CODE); | |
| const TRUE_CHAR = String.fromCharCode(TRUE_CODE); | |
| const BITMASK_2 = 2**2-1; | |
| const BITMASK_4 = 2**4-1; | |
| const BITMASK_8 = 2**8-1; | |
| const BITMASK_10 = 2**10-1; | |
| const BITMASK_15 = 2**15-1; | |
| const BITMASK_30 = 2**30-1; | |
| // ENCODE | |
| function encodeObject(obj: any, entries: string[]) { | |
| const keys = Object.keys(obj); | |
| const length = keys.length; | |
| if (length < 2**10) { | |
| entries.push(String.fromCharCode(MAP_10_MASK | length)); | |
| } else if (length < 2**30) { | |
| entries.push( | |
| MAP_40_CHAR, | |
| String.fromCharCode( | |
| length >> 15 & BITMASK_15, | |
| length & BITMASK_15 | |
| ) | |
| ); | |
| } else if (length < 2**40) { | |
| entries.push( | |
| String.fromCharCode( | |
| MAP_40_MASK | (length/2**30|0), | |
| (length/2**15|0) & BITMASK_15, | |
| length & BITMASK_15 | |
| ) | |
| ); | |
| } else { | |
| throw new Error(`Object.keys.length ${length} >= 2**40`); | |
| } | |
| for (let i=0; i<length; i++) { | |
| const key = keys[i]!; | |
| _encode(key, entries); | |
| _encode(obj[key], entries); | |
| } | |
| } | |
| function encodeArray(arr: any[], entries: string[]) { | |
| const length = arr.length; | |
| if (length < 2**10) { | |
| entries.push(String.fromCharCode(ARRAY_10_MASK | length)); | |
| } else if (length < 2**30) { | |
| entries.push( | |
| ARRAY_40_CHAR, | |
| String.fromCharCode( | |
| length >> 15 & BITMASK_15, | |
| length & BITMASK_15 | |
| ) | |
| ); | |
| } else if (length < 2**40) { | |
| entries.push( | |
| String.fromCharCode( | |
| ARRAY_40_MASK | (length/2**30|0), | |
| (length/2**15|0) & BITMASK_15, | |
| length & BITMASK_15 | |
| ) | |
| ); | |
| } else { | |
| throw new Error(`Array.length ${length} >= 2**40`); | |
| } | |
| for (let i=0; i<length; i++) { | |
| _encode(arr[i], entries); | |
| } | |
| } | |
| function encodeString(s: string, entries: string[]) { | |
| const length = s.length; | |
| if (length < 2**10) { | |
| entries.push(String.fromCharCode(STRING_10_MASK | length)); | |
| } else if (length < 2**30) { | |
| entries.push( | |
| STRING_40_CHAR, | |
| String.fromCharCode( | |
| length >> 15 & BITMASK_15, | |
| length & BITMASK_15 | |
| ) | |
| ); | |
| } else if (length < 2**40) { | |
| entries.push( | |
| String.fromCharCode( | |
| STRING_40_MASK | (length/2**30|0), | |
| (length/2**15|0) & BITMASK_15, | |
| length & BITMASK_15 | |
| ) | |
| ); | |
| } else { | |
| throw new Error(`String.length ${length} >= 2**40`); | |
| } | |
| entries.push(s); | |
| } | |
| function encodeNumber(n: number, entries: string[]) { | |
| if (Number.isInteger(n)) { | |
| if (Math.abs(n)<=BITMASK_15) { | |
| entries.push(String.fromCharCode(n&BITMASK_15)); | |
| } else { | |
| // TODO INT_32, UINT_32 | |
| } | |
| } else { // float | |
| // TODO FLOAT_32, FLOAT_64 | |
| } | |
| } | |
| function _encode(v: any, entries: string[]) { | |
| switch(typeof v) { | |
| case 'string': | |
| encodeString(v, entries); | |
| break; | |
| case 'number': | |
| encodeNumber(v, entries); | |
| break; | |
| case 'boolean': | |
| entries.push(v?TRUE_CHAR:FALSE_CHAR); | |
| break; | |
| case 'undefined': | |
| entries.push(UNDEFINED_CHAR); | |
| break; | |
| case 'object': | |
| if (v===null) { | |
| entries.push(NULL_CHAR); | |
| } else if (Array.isArray(v)) { | |
| encodeArray(v, entries); | |
| } else if (v instanceof ArrayBuffer || ArrayBuffer.isView(v)) { | |
| throw new Error('TODO ArrayBuffer as Byte Array'); | |
| } else { | |
| encodeObject(v, entries); | |
| } | |
| break; | |
| default: | |
| throw new Error(`Unrecognized type ${typeof(v)}`); | |
| } | |
| } | |
| export function encode(v: any): string { | |
| const entries: string[] = []; | |
| _encode(v, entries); | |
| return entries.join(''); | |
| } | |
| // DECODE | |
| type DecodeIndex = [string, number]; | |
| function _decode(v: DecodeIndex): any { | |
| const str = v[0]; | |
| const index = v[1]; | |
| const code = str.charCodeAt(index); | |
| if ((code & 0b1000000000000000) === 0) { | |
| // 15-bit signed integer | |
| v[1]++; | |
| return code<<17>>17; | |
| } | |
| if ((code&NULL_CODE)===NULL_CODE) { | |
| // TODO benchmark removing the conditional | |
| // constants | |
| switch (code) { | |
| case NULL_CODE: | |
| v[1]++; | |
| return null; | |
| case UNDEFINED_CODE: | |
| v[1]++; | |
| return undefined; | |
| case FALSE_CODE: | |
| v[1]++; | |
| return false; | |
| case TRUE_CODE: | |
| v[1]++; | |
| return true; | |
| } | |
| throw new Error(`Unrecognized constant code ${code.toString(2).padStart(16,'0')}`); | |
| } | |
| if (code>>13 === 0b101) { | |
| // things with 10 bits of payload | |
| const subcode = (code>>10)&3; | |
| let length = code&BITMASK_10; | |
| let offset = 1; | |
| if (code>>12 === 0b1011) { | |
| length = length * 2**30 + str.charCodeAt(index+1) * 2**15 + str.charCodeAt(index+2); | |
| offset = 3; | |
| } | |
| switch (subcode) { | |
| case 0b00: // string | |
| v[1]+=offset+length; | |
| return str.slice(index+offset,index+offset+length); | |
| case 0b01: // byte array | |
| throw new Error('TODO ArrayBuffer as Byte Array'); | |
| case 0b10: // array | |
| const arr = []; | |
| v[1]+=offset | |
| for (let i=0; i<length; i++) { | |
| arr.push(_decode(v)); | |
| } | |
| return arr; | |
| case 0b11: // map | |
| const obj: any = {}; | |
| v[1]+=offset | |
| for (let i=0; i<length; i++) { | |
| const key = _decode(v); | |
| obj[key] = _decode(v); | |
| } | |
| return obj; | |
| } | |
| } | |
| } | |
| export function decode(v:string): any { | |
| return _decode([v,0]); | |
| } |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import { readFileSync } from 'fs'; | |
| import { encode, decode } from 'utf16-pack.js'; | |
| const fileContent = readFileSync('memory.json', 'utf8'); | |
| const data = JSON.parse(fileContent); | |
| const testCases = [ | |
| 0,1,100,1000,10000,-1,-999, | |
| null,undefined,true,false, | |
| "a","abc","ᴏᴠᴇʀᴍɪɴᴅ", | |
| [],[0],["a"],[1,-1,null,undefined,true,false,"a"], | |
| {},{a:0},{0:"a"},{zzz:null,yyy:undefined,asdf:true,foo:false} | |
| ] | |
| let fails = 0; | |
| for (const testCase of testCases) { | |
| const json = JSON.stringify(testCase); | |
| const encoded = encode(testCase); | |
| const decoded = decode(encoded); | |
| const jsond = JSON.stringify(decoded); | |
| const pass = json === jsond; | |
| if (!pass) { | |
| fails++; | |
| console.log("FAIL"); | |
| console.log(json); | |
| console.log(jsond); | |
| console.log(encoded.split("") | |
| .map(c => c.charCodeAt(0).toString(2).padStart(16, "0")) | |
| .join(" ")); | |
| console.log(); | |
| } | |
| } | |
| console.log(`Passed ${testCases.length-fails} out of ${testCases.length} tests`); | |
| function measureTime(fn) { | |
| const start = process.hrtime.bigint(); | |
| fn(); | |
| const end = process.hrtime.bigint(); | |
| return Number(end - start) / 1_000_000; // Convert to milliseconds | |
| } | |
| console.log('JSON.stringify time', measureTime(()=>{ | |
| for(let i=0;i<1000;i++) { | |
| JSON.stringify(data); | |
| } | |
| })); | |
| console.log('utf16-pack.enc time', measureTime(()=>{ | |
| console.profile(); | |
| for(let i=0;i<1000;i++) { | |
| encode(data); | |
| } | |
| console.profileEnd(); | |
| })); | |
| console.log('JSON.stringify length', JSON.stringify(data).length); | |
| console.log('utf16-pack.enc length', encode(data).length); |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment