@PsyChip · Created June 10, 2025
/**
 * Chonkie.js - Lightweight text chunking library
 * Extracted core functionality from the Python Chonkie library
 */
class Chonkie {
  /**
   * Initialize Chonkie with configuration
   * @param {Object} config - Configuration options
   * @param {string} config.tokenizer - Tokenizer type ('word', 'character', 'gpt')
   * @param {number} config.chunkSize - Maximum tokens per chunk (default: 512)
   * @param {number} config.minChunkSize - Minimum tokens per chunk (default: 2; reserved, not used by the strategies below)
   * @param {number} config.overlap - Token overlap between chunks (default: 0)
   * @param {number} config.minCharacters - Minimum characters for a sentence to count in sentence chunking (default: 12)
   */
  constructor(config = {}) {
    this.tokenizer = config.tokenizer || 'word';
    this.chunkSize = config.chunkSize || 512;
    this.minChunkSize = config.minChunkSize || 2;
    this.overlap = config.overlap || 0;
    this.minCharacters = config.minCharacters || 12;
    // GPT-like tokenizer approximation (rough estimate: ~4 chars per token)
    this.CHARS_PER_TOKEN = 4;
  }
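
  // Illustrative configuration (a sketch, not part of the library):
  //   const chunker = new Chonkie({ tokenizer: 'word', chunkSize: 128, overlap: 16 });
  // Every option is optional; unspecified fields fall back to the defaults above.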
  /**
   * Count tokens in text based on tokenizer type
   * @param {string} text - Input text
   * @returns {number} Token count
   */
  countTokens(text) {
    if (!text) return 0;
    switch (this.tokenizer) {
      case 'character':
        return text.length;
      case 'word':
        return text.trim().split(/\s+/).filter(word => word.length > 0).length;
      case 'gpt':
        // Rough approximation for GPT tokenization
        return Math.ceil(text.length / this.CHARS_PER_TOKEN);
      default:
        return text.trim().split(/\s+/).filter(word => word.length > 0).length;
    }
  }
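
  // Worked example (illustrative): for the 11-character input 'hello world',
  //   'character' -> 11, 'word' -> 2, 'gpt' -> Math.ceil(11 / 4) = 3.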
  /**
   * Create a chunk object with metadata
   * @param {string} text - Chunk text
   * @param {number} startIndex - Start position in original text
   * @param {number} endIndex - End position in original text
   * @param {number} tokenCount - Number of tokens
   * @returns {Object} Chunk object
   */
  createChunk(text, startIndex, endIndex, tokenCount) {
    return {
      text,
      startIndex,
      endIndex,
      tokenCount,
      length: text.length
    };
  }
  /**
   * Token-based chunking - splits text into fixed-size token chunks
   * @param {string} text - Input text to chunk
   * @returns {Array} Array of chunk objects
   */
  tokenChunk(text) {
    if (!text) return [];
    const chunks = [];
    let currentIndex = 0;
    while (currentIndex < text.length) {
      let chunkEnd = currentIndex;
      let tokenCount = 0;
      // Find chunk boundary respecting token limits
      while (chunkEnd < text.length && tokenCount < this.chunkSize) {
        if (this.tokenizer === 'character') {
          tokenCount++;
          chunkEnd++;
        } else if (this.tokenizer === 'word') {
          // Advance to the next whitespace boundary (any whitespace,
          // consistent with countTokens, which splits on /\s+/)
          const relativeEnd = text.slice(chunkEnd).search(/\s/);
          chunkEnd = relativeEnd === -1 ? text.length : chunkEnd + relativeEnd;
          tokenCount++;
          // Skip whitespace between words
          while (chunkEnd < text.length && /\s/.test(text[chunkEnd])) {
            chunkEnd++;
          }
        } else { // gpt approximation
          const estimatedTokens = Math.ceil((chunkEnd - currentIndex + 1) / this.CHARS_PER_TOKEN);
          if (estimatedTokens <= this.chunkSize) {
            chunkEnd++;
            tokenCount = estimatedTokens;
          } else {
            break;
          }
        }
      }
      // Ensure we have some content so the loop always makes progress
      if (chunkEnd === currentIndex) {
        chunkEnd = Math.min(currentIndex + 1, text.length);
      }
      const chunkText = text.slice(currentIndex, chunkEnd);
      const actualTokenCount = this.countTokens(chunkText);
      chunks.push(this.createChunk(chunkText, currentIndex, chunkEnd, actualTokenCount));
      // Apply overlap: converted to characters via the GPT approximation,
      // so overlap is only approximate for the 'word' tokenizer
      const overlapChars = Math.floor(this.overlap * this.CHARS_PER_TOKEN);
      currentIndex = Math.max(currentIndex + 1, chunkEnd - overlapChars);
    }
    return chunks;
  }
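
  // Illustrative: with { tokenizer: 'word', chunkSize: 3 },
  //   tokenChunk('one two three four five six')
  //   // -> [ 'one two three ' (3 tokens), 'four five six' (3 tokens) ]
  // Note the first chunk keeps the whitespace it consumed up to the boundary.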
  /**
   * Sentence-based chunking - splits on sentence boundaries
   * @param {string} text - Input text to chunk
   * @returns {Array} Array of chunk objects
   */
  sentenceChunk(text) {
    if (!text) return [];
    // Split into sentences using common delimiters
    const sentenceDelimiters = /[.!?]+\s+/g;
    const sentences = [];
    let lastIndex = 0;
    let match;
    while ((match = sentenceDelimiters.exec(text)) !== null) {
      const sentence = text.slice(lastIndex, match.index + match[0].length).trim();
      // Fragments shorter than minCharacters are silently dropped
      if (sentence.length >= this.minCharacters) {
        sentences.push({
          text: sentence,
          startIndex: lastIndex,
          endIndex: match.index + match[0].length
        });
      }
      lastIndex = match.index + match[0].length;
    }
    // Add remaining text as the final sentence
    if (lastIndex < text.length) {
      const remaining = text.slice(lastIndex).trim();
      if (remaining.length >= this.minCharacters) {
        sentences.push({
          text: remaining,
          startIndex: lastIndex,
          endIndex: text.length
        });
      }
    }
    // Group consecutive sentences into chunks up to chunkSize tokens
    const chunks = [];
    let currentChunk = [];
    let currentTokens = 0;
    for (const sentence of sentences) {
      const sentenceTokens = this.countTokens(sentence.text);
      if (currentTokens + sentenceTokens > this.chunkSize && currentChunk.length > 0) {
        // Flush the current group as a chunk. Note: sentences are re-joined
        // with single spaces, so chunk text may differ slightly from the
        // original slice between startIndex and endIndex.
        const chunkText = currentChunk.map(s => s.text).join(' ');
        chunks.push(this.createChunk(
          chunkText,
          currentChunk[0].startIndex,
          currentChunk[currentChunk.length - 1].endIndex,
          currentTokens
        ));
        // Start a new chunk with this sentence
        currentChunk = [sentence];
        currentTokens = sentenceTokens;
      } else {
        currentChunk.push(sentence);
        currentTokens += sentenceTokens;
      }
    }
    // Add the final chunk
    if (currentChunk.length > 0) {
      const chunkText = currentChunk.map(s => s.text).join(' ');
      chunks.push(this.createChunk(
        chunkText,
        currentChunk[0].startIndex,
        currentChunk[currentChunk.length - 1].endIndex,
        currentTokens
      ));
    }
    return chunks;
  }
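
  // Illustrative: with { tokenizer: 'word', chunkSize: 10 },
  //   sentenceChunk('This is sentence one. This is sentence two.')
  //   // -> one chunk of 8 tokens, since both sentences fit together.
  // With chunkSize 4, the same input yields two chunks of 4 tokens each.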
  /**
   * Recursive chunking - hierarchical splitting with multiple delimiters
   * @param {string} text - Input text to chunk
   * @param {Array} delimiters - Array of delimiter patterns (default: paragraph, line, sentence, word)
   * @returns {Array} Array of chunk objects
   */
  recursiveChunk(text, delimiters = ['\n\n', '\n', '. ', ' ']) {
    if (!text) return [];
    return this._recursiveChunkHelper(text, delimiters, 0, 0);
  }
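
  // Illustrative: with { tokenizer: 'word', chunkSize: 5 }, a text containing a
  // blank line splits on '\n\n' first; any piece still over 5 tokens falls
  // through '\n', then '. ', then ' ' until every chunk fits.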
  /**
   * Helper method for recursive chunking
   * @private
   */
  _recursiveChunkHelper(text, delimiters, level, startOffset) {
    const tokenCount = this.countTokens(text);
    // Base case: text fits in chunk size or no more delimiters
    if (tokenCount <= this.chunkSize || level >= delimiters.length) {
      return [this.createChunk(text, startOffset, startOffset + text.length, tokenCount)];
    }
    const delimiter = delimiters[level];
    const splits = text.split(delimiter);
    // If no splits occurred, try the next delimiter level
    if (splits.length === 1) {
      return this._recursiveChunkHelper(text, delimiters, level + 1, startOffset);
    }
    // Merge adjacent splits greedily up to chunkSize
    const mergedSplits = this._mergeSplits(splits, delimiter);
    const chunks = [];
    let currentOffset = startOffset;
    for (const split of mergedSplits) {
      const splitTokenCount = this.countTokens(split);
      if (splitTokenCount > this.chunkSize) {
        // Recursively chunk splits that are still too large
        const subChunks = this._recursiveChunkHelper(split, delimiters, level + 1, currentOffset);
        chunks.push(...subChunks);
      } else {
        // Create a chunk for an appropriately sized split
        chunks.push(this.createChunk(split, currentOffset, currentOffset + split.length, splitTokenCount));
      }
      // Account for the delimiter consumed by split()
      currentOffset += split.length + delimiter.length;
    }
    return chunks;
  }
  /**
   * Greedily merge adjacent splits so each merged piece stays within chunkSize
   * (note: minChunkSize is not enforced here)
   * @private
   */
  _mergeSplits(splits, delimiter) {
    const merged = [];
    let current = '';
    for (const split of splits) {
      const testMerged = current ? current + delimiter + split : split;
      const tokenCount = this.countTokens(testMerged);
      if (tokenCount <= this.chunkSize) {
        current = testMerged;
      } else {
        if (current) {
          merged.push(current);
        }
        current = split;
      }
    }
    if (current) {
      merged.push(current);
    }
    return merged;
  }
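
  // Illustrative: with { tokenizer: 'word', chunkSize: 4 },
  //   _mergeSplits(['a b', 'c d', 'e f'], ' ')
  //   // -> ['a b c d', 'e f'] (greedy left-to-right packing)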
  /**
   * Main chunking method - automatically selects best strategy
   * @param {string} text - Input text to chunk
   * @param {string} strategy - Chunking strategy ('token', 'sentence', 'recursive', 'auto')
   * @returns {Array} Array of chunk objects
   */
  chunk(text, strategy = 'auto') {
    if (!text || typeof text !== 'string') {
      return [];
    }
    // Auto-select strategy based on text characteristics
    if (strategy === 'auto') {
      const hasMultipleParagraphs = text.includes('\n\n');
      const hasSentences = /[.!?]+\s+/.test(text);
      if (hasMultipleParagraphs) {
        strategy = 'recursive';
      } else if (hasSentences) {
        strategy = 'sentence';
      } else {
        strategy = 'token';
      }
    }
    switch (strategy) {
      case 'token':
        return this.tokenChunk(text);
      case 'sentence':
        return this.sentenceChunk(text);
      case 'recursive':
        return this.recursiveChunk(text);
      default:
        return this.tokenChunk(text);
    }
  }
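
  // Illustrative auto-selection:
  //   chunk('Para one.\n\nPara two.')      // blank line present -> 'recursive'
  //   chunk('One sentence. Another one.')  // sentence delimiter -> 'sentence'
  //   chunk('no punctuation here')         // neither -> 'token'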
  /**
   * Batch process multiple texts
   * @param {Array} texts - Array of text strings
   * @param {string} strategy - Chunking strategy
   * @returns {Array} Array of arrays containing chunks for each text
   */
  chunkBatch(texts, strategy = 'auto') {
    return texts.map(text => this.chunk(text, strategy));
  }
  /**
   * Get chunk statistics
   * @param {Array} chunks - Array of chunk objects
   * @returns {Object} Statistics object
   */
  getStats(chunks) {
    if (!chunks || chunks.length === 0) {
      return { count: 0, totalTokens: 0, avgTokens: 0, minTokens: 0, maxTokens: 0 };
    }
    const tokenCounts = chunks.map(chunk => chunk.tokenCount);
    const totalTokens = tokenCounts.reduce((sum, count) => sum + count, 0);
    return {
      count: chunks.length,
      totalTokens,
      avgTokens: Math.round(totalTokens / chunks.length),
      minTokens: Math.min(...tokenCounts),
      maxTokens: Math.max(...tokenCounts)
    };
  }
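
  // Illustrative: for two chunks of 3 tokens each, getStats returns
  //   { count: 2, totalTokens: 6, avgTokens: 3, minTokens: 3, maxTokens: 3 }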
}
// Export for different module systems
if (typeof module !== 'undefined' && module.exports) {
  module.exports = Chonkie;
} else if (typeof window !== 'undefined') {
  window.Chonkie = Chonkie;
}
// Usage examples:
/*
// Basic usage
const chunker = new Chonkie({ chunkSize: 100, tokenizer: 'word' });
const chunks = chunker.chunk("Your text here...");

// Different strategies
const tokenChunks = chunker.chunk(text, 'token');
const sentenceChunks = chunker.chunk(text, 'sentence');
const recursiveChunks = chunker.chunk(text, 'recursive');

// Batch processing
const multipleTexts = ["Text 1", "Text 2", "Text 3"];
const batchResults = chunker.chunkBatch(multipleTexts);

// Get statistics
const stats = chunker.getStats(chunks);
console.log(`Created ${stats.count} chunks with avg ${stats.avgTokens} tokens`);
*/