Created June 10, 2025 20:40
/**
 * Chonkie.js - Lightweight text chunking library
 * Extracted core functionality from the Python Chonkie library
 */
class Chonkie {
  /**
   * Initialize Chonkie with configuration
   * @param {Object} config - Configuration options
   * @param {string} config.tokenizer - Tokenizer type ('word', 'character', 'gpt')
   * @param {number} config.chunkSize - Maximum tokens per chunk (default: 512)
   * @param {number} config.minChunkSize - Minimum tokens per chunk (default: 2; stored but not yet enforced by the chunkers below)
   * @param {number} config.overlap - Token overlap between chunks (default: 0)
   * @param {number} config.minCharacters - Minimum characters for a stand-alone sentence (default: 12)
   */
  constructor(config = {}) {
    this.tokenizer = config.tokenizer || 'word';
    this.chunkSize = config.chunkSize || 512;
    this.minChunkSize = config.minChunkSize || 2;
    this.overlap = config.overlap || 0;
    this.minCharacters = config.minCharacters || 12;
    // GPT-like tokenizer approximation (rough estimate: ~4 chars per token)
    this.CHARS_PER_TOKEN = 4;
  }
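
  /*
   * Illustrative sketch: a chunker with every option spelled out,
   * using the same values the defaults above fall back to:
   *
   *   const chunker = new Chonkie({
   *     tokenizer: 'word',   // 'word' | 'character' | 'gpt'
   *     chunkSize: 512,      // max tokens per chunk
   *     minChunkSize: 2,     // documented but not yet enforced
   *     overlap: 0,          // tokens shared between neighbouring chunks
   *     minCharacters: 12    // shortest stand-alone sentence
   *   });
   */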

  /**
   * Count tokens in text based on tokenizer type
   * @param {string} text - Input text
   * @returns {number} Token count
   */
  countTokens(text) {
    if (!text) return 0;
    switch (this.tokenizer) {
      case 'character':
        return text.length;
      case 'word':
        return text.trim().split(/\s+/).filter(word => word.length > 0).length;
      case 'gpt':
        // Rough approximation for GPT tokenization
        return Math.ceil(text.length / this.CHARS_PER_TOKEN);
      default:
        return text.trim().split(/\s+/).filter(word => word.length > 0).length;
    }
  }
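
  /*
   * Example: for the 21-character string "hello brave new world":
   *   'character' -> 21
   *   'word'      -> 4
   *   'gpt'       -> Math.ceil(21 / 4) === 6  (rough estimate only)
   */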

  /**
   * Create a chunk object with metadata
   * @param {string} text - Chunk text
   * @param {number} startIndex - Start position in original text
   * @param {number} endIndex - End position in original text
   * @param {number} tokenCount - Number of tokens
   * @returns {Object} Chunk object
   */
  createChunk(text, startIndex, endIndex, tokenCount) {
    return {
      text,
      startIndex,
      endIndex,
      tokenCount,
      length: text.length
    };
  }

  /**
   * Token-based chunking - splits text into fixed-size token chunks
   * @param {string} text - Input text to chunk
   * @returns {Array} Array of chunk objects
   */
  tokenChunk(text) {
    if (!text) return [];
    const chunks = [];
    let currentIndex = 0;
    while (currentIndex < text.length) {
      let chunkEnd = currentIndex;
      let tokenCount = 0;
      // Find chunk boundary respecting token limits
      while (chunkEnd < text.length && tokenCount < this.chunkSize) {
        if (this.tokenizer === 'character') {
          tokenCount++;
          chunkEnd++;
        } else if (this.tokenizer === 'word') {
          // Advance past the next word...
          const nextSpace = text.indexOf(' ', chunkEnd);
          chunkEnd = nextSpace === -1 ? text.length : nextSpace;
          tokenCount++;
          // ...and the whitespace that follows it
          while (chunkEnd < text.length && text[chunkEnd] === ' ') {
            chunkEnd++;
          }
        } else { // gpt approximation
          const estimatedTokens = Math.ceil((chunkEnd - currentIndex + 1) / this.CHARS_PER_TOKEN);
          if (estimatedTokens <= this.chunkSize) {
            chunkEnd++;
            tokenCount = estimatedTokens;
          } else {
            break;
          }
        }
      }
      // Ensure we make progress even on degenerate input
      if (chunkEnd === currentIndex) {
        chunkEnd = Math.min(currentIndex + 1, text.length);
      }
      const chunkText = text.slice(currentIndex, chunkEnd);
      const actualTokenCount = this.countTokens(chunkText);
      chunks.push(this.createChunk(chunkText, currentIndex, chunkEnd, actualTokenCount));
      // Stop once the end of the text is reached; otherwise the overlap
      // step-back below would re-emit shrinking trailing fragments.
      if (chunkEnd >= text.length) break;
      // Apply overlap: step back by the overlap expressed in characters.
      // One token is one character for the 'character' tokenizer; the
      // other tokenizers fall back to the rough chars-per-token estimate.
      const overlapChars = this.tokenizer === 'character'
        ? this.overlap
        : Math.floor(this.overlap * this.CHARS_PER_TOKEN);
      currentIndex = Math.max(currentIndex + 1, chunkEnd - overlapChars);
    }
    return chunks;
  }
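
  /*
   * Example sketch: with { tokenizer: 'word', chunkSize: 3, overlap: 1 },
   * tokenChunk("one two three four five") yields two chunks:
   *   "one two three "  and  "ree four five"
   * The step-back is overlap * CHARS_PER_TOKEN = 4 characters, so the
   * second chunk starts mid-word; the overlap is character-based and
   * only approximately word-aligned.
   */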

  /**
   * Sentence-based chunking - splits on sentence boundaries
   * @param {string} text - Input text to chunk
   * @returns {Array} Array of chunk objects
   */
  sentenceChunk(text) {
    if (!text) return [];
    // Split into sentences using common delimiters
    const sentenceDelimiters = /[.!?]+\s+/g;
    const sentences = [];
    // Record a sentence; fragments shorter than minCharacters are merged
    // into the previous sentence instead of being silently dropped.
    const addSentence = (startIndex, endIndex) => {
      const sentence = text.slice(startIndex, endIndex).trim();
      if (!sentence) return;
      if (sentence.length >= this.minCharacters || sentences.length === 0) {
        sentences.push({ text: sentence, startIndex, endIndex });
      } else {
        const prev = sentences[sentences.length - 1];
        prev.text = text.slice(prev.startIndex, endIndex).trim();
        prev.endIndex = endIndex;
      }
    };
    let lastIndex = 0;
    let match;
    while ((match = sentenceDelimiters.exec(text)) !== null) {
      const end = match.index + match[0].length;
      addSentence(lastIndex, end);
      lastIndex = end;
    }
    // Add remaining text as the final sentence
    if (lastIndex < text.length) {
      addSentence(lastIndex, text.length);
    }
    // Group sentences into chunks
    const chunks = [];
    let currentChunk = [];
    let currentTokens = 0;
    for (const sentence of sentences) {
      const sentenceTokens = this.countTokens(sentence.text);
      if (currentTokens + sentenceTokens > this.chunkSize && currentChunk.length > 0) {
        // Flush the current chunk before starting a new one
        const chunkText = currentChunk.map(s => s.text).join(' ');
        chunks.push(this.createChunk(
          chunkText,
          currentChunk[0].startIndex,
          currentChunk[currentChunk.length - 1].endIndex,
          currentTokens
        ));
        currentChunk = [sentence];
        currentTokens = sentenceTokens;
      } else {
        currentChunk.push(sentence);
        currentTokens += sentenceTokens;
      }
    }
    // Add final chunk
    if (currentChunk.length > 0) {
      const chunkText = currentChunk.map(s => s.text).join(' ');
      chunks.push(this.createChunk(
        chunkText,
        currentChunk[0].startIndex,
        currentChunk[currentChunk.length - 1].endIndex,
        currentTokens
      ));
    }
    return chunks;
  }
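
  /*
   * Example sketch: with { tokenizer: 'word', chunkSize: 8 },
   * sentenceChunk("Cats sleep all day. Dogs bark at the mail. Fish swim.")
   * returns two chunks built from whole sentences:
   *   "Cats sleep all day."
   *   "Dogs bark at the mail. Fish swim."
   * ("Fish swim." is shorter than minCharacters, so it is merged into
   * the sentence before it rather than standing alone.)
   */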

  /**
   * Recursive chunking - hierarchical splitting with multiple delimiters
   * @param {string} text - Input text to chunk
   * @param {Array} delimiters - Delimiter patterns, coarsest first (default: paragraph, line, sentence, word)
   * @returns {Array} Array of chunk objects
   */
  recursiveChunk(text, delimiters = ['\n\n', '\n', '. ', ' ']) {
    if (!text) return [];
    return this._recursiveChunkHelper(text, delimiters, 0, 0);
  }
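
  /*
   * Example sketch: with { tokenizer: 'word', chunkSize: 5 }, the text
   * "Intro line.\n\nA much longer second paragraph follows here."
   * keeps the short first paragraph intact and splits only the second:
   *   "Intro line."
   *   "A much longer second paragraph"
   *   "follows here."
   */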

  /**
   * Helper method for recursive chunking
   * @private
   */
  _recursiveChunkHelper(text, delimiters, level, startOffset) {
    const tokenCount = this.countTokens(text);
    // Base case: text fits in chunk size, or no more delimiters to try
    if (tokenCount <= this.chunkSize || level >= delimiters.length) {
      return [this.createChunk(text, startOffset, startOffset + text.length, tokenCount)];
    }
    const delimiter = delimiters[level];
    const splits = text.split(delimiter);
    // If no splits occurred, try the next delimiter level
    if (splits.length === 1) {
      return this._recursiveChunkHelper(text, delimiters, level + 1, startOffset);
    }
    // Merge splits that are too small
    const mergedSplits = this._mergeSplits(splits, delimiter);
    const chunks = [];
    let currentOffset = startOffset;
    for (const split of mergedSplits) {
      const splitTokenCount = this.countTokens(split);
      if (splitTokenCount > this.chunkSize) {
        // Recursively chunk splits that are still too large
        chunks.push(...this._recursiveChunkHelper(split, delimiters, level + 1, currentOffset));
      } else {
        chunks.push(this.createChunk(split, currentOffset, currentOffset + split.length, splitTokenCount));
      }
      // Advance past this split and the delimiter that followed it
      currentOffset += split.length + delimiter.length;
    }
    return chunks;
  }

  /**
   * Greedily merge small splits so each merged piece stays within chunkSize
   * @private
   */
  _mergeSplits(splits, delimiter) {
    const merged = [];
    let current = '';
    for (const split of splits) {
      const testMerged = current ? current + delimiter + split : split;
      if (this.countTokens(testMerged) <= this.chunkSize) {
        current = testMerged;
      } else {
        if (current) {
          merged.push(current);
        }
        current = split;
      }
    }
    if (current) {
      merged.push(current);
    }
    return merged;
  }

  /**
   * Main chunking method - automatically selects the best strategy
   * @param {string} text - Input text to chunk
   * @param {string} strategy - Chunking strategy ('token', 'sentence', 'recursive', 'auto')
   * @returns {Array} Array of chunk objects
   */
  chunk(text, strategy = 'auto') {
    if (!text || typeof text !== 'string') {
      return [];
    }
    // Auto-select strategy based on text characteristics
    if (strategy === 'auto') {
      const hasMultipleParagraphs = text.includes('\n\n');
      const hasSentences = /[.!?]+\s+/.test(text);
      if (hasMultipleParagraphs) {
        strategy = 'recursive';
      } else if (hasSentences) {
        strategy = 'sentence';
      } else {
        strategy = 'token';
      }
    }
    switch (strategy) {
      case 'token':
        return this.tokenChunk(text);
      case 'sentence':
        return this.sentenceChunk(text);
      case 'recursive':
        return this.recursiveChunk(text);
      default:
        return this.tokenChunk(text);
    }
  }
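
  /*
   * Example of the auto-selection above:
   *   chunker.chunk("para one\n\npara two");  // -> 'recursive'
   *   chunker.chunk("One. Two. Three.");      // -> 'sentence'
   *   chunker.chunk("one-long-identifier");   // -> 'token'
   */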

  /**
   * Batch process multiple texts
   * @param {Array} texts - Array of text strings
   * @param {string} strategy - Chunking strategy
   * @returns {Array} Array of arrays containing chunks for each text
   */
  chunkBatch(texts, strategy = 'auto') {
    return texts.map(text => this.chunk(text, strategy));
  }

  /**
   * Get chunk statistics
   * @param {Array} chunks - Array of chunk objects
   * @returns {Object} Statistics object
   */
  getStats(chunks) {
    if (!chunks || chunks.length === 0) {
      return { count: 0, totalTokens: 0, avgTokens: 0, minTokens: 0, maxTokens: 0 };
    }
    const tokenCounts = chunks.map(chunk => chunk.tokenCount);
    const totalTokens = tokenCounts.reduce((sum, count) => sum + count, 0);
    return {
      count: chunks.length,
      totalTokens,
      avgTokens: Math.round(totalTokens / chunks.length),
      minTokens: Math.min(...tokenCounts),
      maxTokens: Math.max(...tokenCounts)
    };
  }
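
  /*
   * Example: getStats over chunks of 4, 7 and 7 tokens returns
   *   { count: 3, totalTokens: 18, avgTokens: 6, minTokens: 4, maxTokens: 7 }
   */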
}

// Export for CommonJS and browser environments
if (typeof module !== 'undefined' && module.exports) {
  module.exports = Chonkie;
} else if (typeof window !== 'undefined') {
  window.Chonkie = Chonkie;
}

// Usage examples:
/*
// Basic usage
const chunker = new Chonkie({ chunkSize: 100, tokenizer: 'word' });
const text = "Your text here...";
const chunks = chunker.chunk(text);

// Different strategies
const tokenChunks = chunker.chunk(text, 'token');
const sentenceChunks = chunker.chunk(text, 'sentence');
const recursiveChunks = chunker.chunk(text, 'recursive');

// Batch processing
const multipleTexts = ["Text 1", "Text 2", "Text 3"];
const batchResults = chunker.chunkBatch(multipleTexts);

// Get statistics
const stats = chunker.getStats(chunks);
console.log(`Created ${stats.count} chunks with avg ${stats.avgTokens} tokens`);
*/