Created June 10, 2025 20:40
/**
 * Chonkie.js - Lightweight text chunking library
 * Extracted core functionality from the Python Chonkie library
 */
class Chonkie {
  /**
   * Initialize Chonkie with configuration
   * @param {Object} config - Configuration options
   * @param {string} config.tokenizer - Tokenizer type ('word', 'character', 'gpt')
   * @param {number} config.chunkSize - Maximum tokens per chunk (default: 512)
   * @param {number} config.minChunkSize - Minimum tokens per chunk (default: 2; stored but not yet enforced by the chunkers below)
   * @param {number} config.overlap - Token overlap between chunks (default: 0)
   * @param {number} config.minCharacters - Minimum characters for a stand-alone sentence (default: 12)
   */
  constructor(config = {}) {
    this.tokenizer = config.tokenizer || 'word';
    this.chunkSize = config.chunkSize || 512;
    this.minChunkSize = config.minChunkSize || 2;
    this.overlap = config.overlap || 0;
    this.minCharacters = config.minCharacters || 12;
    // GPT-like tokenizer approximation (rough estimate: ~4 chars per token)
    this.CHARS_PER_TOKEN = 4;
  }
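
  /*
   * Illustrative sketch: a chunker with every option spelled out,
   * using the same values the defaults above fall back to:
   *
   *   const chunker = new Chonkie({
   *     tokenizer: 'word',   // 'word' | 'character' | 'gpt'
   *     chunkSize: 512,      // max tokens per chunk
   *     minChunkSize: 2,     // documented but not yet enforced
   *     overlap: 0,          // tokens shared between neighbouring chunks
   *     minCharacters: 12    // shortest stand-alone sentence
   *   });
   */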

  /**
   * Count tokens in text based on tokenizer type
   * @param {string} text - Input text
   * @returns {number} Token count
   */
  countTokens(text) {
    if (!text) return 0;
    switch (this.tokenizer) {
      case 'character':
        return text.length;
      case 'word':
        return text.trim().split(/\s+/).filter(word => word.length > 0).length;
      case 'gpt':
        // Rough approximation for GPT tokenization
        return Math.ceil(text.length / this.CHARS_PER_TOKEN);
      default:
        return text.trim().split(/\s+/).filter(word => word.length > 0).length;
    }
  }
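
  /*
   * Example: for the 21-character string "hello brave new world":
   *   'character' -> 21
   *   'word'      -> 4
   *   'gpt'       -> Math.ceil(21 / 4) === 6  (rough estimate only)
   */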

  /**
   * Create a chunk object with metadata
   * @param {string} text - Chunk text
   * @param {number} startIndex - Start position in original text
   * @param {number} endIndex - End position in original text
   * @param {number} tokenCount - Number of tokens
   * @returns {Object} Chunk object
   */
  createChunk(text, startIndex, endIndex, tokenCount) {
    return {
      text,
      startIndex,
      endIndex,
      tokenCount,
      length: text.length
    };
  }

  /**
   * Token-based chunking - splits text into fixed-size token chunks
   * @param {string} text - Input text to chunk
   * @returns {Array} Array of chunk objects
   */
  tokenChunk(text) {
    if (!text) return [];
    const chunks = [];
    let currentIndex = 0;
    while (currentIndex < text.length) {
      let chunkEnd = currentIndex;
      let tokenCount = 0;
      // Find chunk boundary respecting token limits
      while (chunkEnd < text.length && tokenCount < this.chunkSize) {
        if (this.tokenizer === 'character') {
          tokenCount++;
          chunkEnd++;
        } else if (this.tokenizer === 'word') {
          // Advance past the next word...
          const nextSpace = text.indexOf(' ', chunkEnd);
          chunkEnd = nextSpace === -1 ? text.length : nextSpace;
          tokenCount++;
          // ...and the whitespace that follows it
          while (chunkEnd < text.length && text[chunkEnd] === ' ') {
            chunkEnd++;
          }
        } else { // gpt approximation
          const estimatedTokens = Math.ceil((chunkEnd - currentIndex + 1) / this.CHARS_PER_TOKEN);
          if (estimatedTokens <= this.chunkSize) {
            chunkEnd++;
            tokenCount = estimatedTokens;
          } else {
            break;
          }
        }
      }
      // Ensure we make progress even on degenerate input
      if (chunkEnd === currentIndex) {
        chunkEnd = Math.min(currentIndex + 1, text.length);
      }
      const chunkText = text.slice(currentIndex, chunkEnd);
      const actualTokenCount = this.countTokens(chunkText);
      chunks.push(this.createChunk(chunkText, currentIndex, chunkEnd, actualTokenCount));
      // Stop once the end of the text is reached; otherwise the overlap
      // step-back below would re-emit shrinking trailing fragments.
      if (chunkEnd >= text.length) break;
      // Apply overlap: step back by the overlap expressed in characters.
      // One token is one character for the 'character' tokenizer; the
      // other tokenizers fall back to the rough chars-per-token estimate.
      const overlapChars = this.tokenizer === 'character'
        ? this.overlap
        : Math.floor(this.overlap * this.CHARS_PER_TOKEN);
      currentIndex = Math.max(currentIndex + 1, chunkEnd - overlapChars);
    }
    return chunks;
  }
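
  /*
   * Example sketch: with { tokenizer: 'word', chunkSize: 3, overlap: 1 },
   * tokenChunk("one two three four five") yields two chunks:
   *   "one two three "  and  "ree four five"
   * The step-back is overlap * CHARS_PER_TOKEN = 4 characters, so the
   * second chunk starts mid-word; the overlap is character-based and
   * only approximately word-aligned.
   */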

  /**
   * Sentence-based chunking - splits on sentence boundaries
   * @param {string} text - Input text to chunk
   * @returns {Array} Array of chunk objects
   */
  sentenceChunk(text) {
    if (!text) return [];
    // Split into sentences using common delimiters
    const sentenceDelimiters = /[.!?]+\s+/g;
    const sentences = [];
    // Record a sentence; fragments shorter than minCharacters are merged
    // into the previous sentence instead of being silently dropped.
    const addSentence = (startIndex, endIndex) => {
      const sentence = text.slice(startIndex, endIndex).trim();
      if (!sentence) return;
      if (sentence.length >= this.minCharacters || sentences.length === 0) {
        sentences.push({ text: sentence, startIndex, endIndex });
      } else {
        const prev = sentences[sentences.length - 1];
        prev.text = text.slice(prev.startIndex, endIndex).trim();
        prev.endIndex = endIndex;
      }
    };
    let lastIndex = 0;
    let match;
    while ((match = sentenceDelimiters.exec(text)) !== null) {
      const end = match.index + match[0].length;
      addSentence(lastIndex, end);
      lastIndex = end;
    }
    // Add remaining text as the final sentence
    if (lastIndex < text.length) {
      addSentence(lastIndex, text.length);
    }
    // Group sentences into chunks
    const chunks = [];
    let currentChunk = [];
    let currentTokens = 0;
    for (const sentence of sentences) {
      const sentenceTokens = this.countTokens(sentence.text);
      if (currentTokens + sentenceTokens > this.chunkSize && currentChunk.length > 0) {
        // Flush the current chunk before starting a new one
        const chunkText = currentChunk.map(s => s.text).join(' ');
        chunks.push(this.createChunk(
          chunkText,
          currentChunk[0].startIndex,
          currentChunk[currentChunk.length - 1].endIndex,
          currentTokens
        ));
        currentChunk = [sentence];
        currentTokens = sentenceTokens;
      } else {
        currentChunk.push(sentence);
        currentTokens += sentenceTokens;
      }
    }
    // Add final chunk
    if (currentChunk.length > 0) {
      const chunkText = currentChunk.map(s => s.text).join(' ');
      chunks.push(this.createChunk(
        chunkText,
        currentChunk[0].startIndex,
        currentChunk[currentChunk.length - 1].endIndex,
        currentTokens
      ));
    }
    return chunks;
  }
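
  /*
   * Example sketch: with { tokenizer: 'word', chunkSize: 8 },
   * sentenceChunk("Cats sleep all day. Dogs bark at the mail. Fish swim.")
   * returns two chunks built from whole sentences:
   *   "Cats sleep all day."
   *   "Dogs bark at the mail. Fish swim."
   * ("Fish swim." is shorter than minCharacters, so it is merged into
   * the sentence before it rather than standing alone.)
   */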

  /**
   * Recursive chunking - hierarchical splitting with multiple delimiters
   * @param {string} text - Input text to chunk
   * @param {Array} delimiters - Delimiter patterns, coarsest first (default: paragraph, line, sentence, word)
   * @returns {Array} Array of chunk objects
   */
  recursiveChunk(text, delimiters = ['\n\n', '\n', '. ', ' ']) {
    if (!text) return [];
    return this._recursiveChunkHelper(text, delimiters, 0, 0);
  }
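
  /*
   * Example sketch: with { tokenizer: 'word', chunkSize: 5 }, the text
   * "Intro line.\n\nA much longer second paragraph follows here."
   * keeps the short first paragraph intact and splits only the second:
   *   "Intro line."
   *   "A much longer second paragraph"
   *   "follows here."
   */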

  /**
   * Helper method for recursive chunking
   * @private
   */
  _recursiveChunkHelper(text, delimiters, level, startOffset) {
    const tokenCount = this.countTokens(text);
    // Base case: text fits in chunk size, or no more delimiters to try
    if (tokenCount <= this.chunkSize || level >= delimiters.length) {
      return [this.createChunk(text, startOffset, startOffset + text.length, tokenCount)];
    }
    const delimiter = delimiters[level];
    const splits = text.split(delimiter);
    // If no splits occurred, try the next delimiter level
    if (splits.length === 1) {
      return this._recursiveChunkHelper(text, delimiters, level + 1, startOffset);
    }
    // Merge splits that are too small
    const mergedSplits = this._mergeSplits(splits, delimiter);
    const chunks = [];
    let currentOffset = startOffset;
    for (const split of mergedSplits) {
      const splitTokenCount = this.countTokens(split);
      if (splitTokenCount > this.chunkSize) {
        // Recursively chunk splits that are still too large
        chunks.push(...this._recursiveChunkHelper(split, delimiters, level + 1, currentOffset));
      } else {
        chunks.push(this.createChunk(split, currentOffset, currentOffset + split.length, splitTokenCount));
      }
      // Advance past this split and the delimiter that followed it
      currentOffset += split.length + delimiter.length;
    }
    return chunks;
  }

  /**
   * Greedily merge small splits so each merged piece stays within chunkSize
   * @private
   */
  _mergeSplits(splits, delimiter) {
    const merged = [];
    let current = '';
    for (const split of splits) {
      const testMerged = current ? current + delimiter + split : split;
      if (this.countTokens(testMerged) <= this.chunkSize) {
        current = testMerged;
      } else {
        if (current) {
          merged.push(current);
        }
        current = split;
      }
    }
    if (current) {
      merged.push(current);
    }
    return merged;
  }

  /**
   * Main chunking method - automatically selects the best strategy
   * @param {string} text - Input text to chunk
   * @param {string} strategy - Chunking strategy ('token', 'sentence', 'recursive', 'auto')
   * @returns {Array} Array of chunk objects
   */
  chunk(text, strategy = 'auto') {
    if (!text || typeof text !== 'string') {
      return [];
    }
    // Auto-select strategy based on text characteristics
    if (strategy === 'auto') {
      const hasMultipleParagraphs = text.includes('\n\n');
      const hasSentences = /[.!?]+\s+/.test(text);
      if (hasMultipleParagraphs) {
        strategy = 'recursive';
      } else if (hasSentences) {
        strategy = 'sentence';
      } else {
        strategy = 'token';
      }
    }
    switch (strategy) {
      case 'token':
        return this.tokenChunk(text);
      case 'sentence':
        return this.sentenceChunk(text);
      case 'recursive':
        return this.recursiveChunk(text);
      default:
        return this.tokenChunk(text);
    }
  }
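
  /*
   * Example of the auto-selection above:
   *   chunker.chunk("para one\n\npara two");  // -> 'recursive'
   *   chunker.chunk("One. Two. Three.");      // -> 'sentence'
   *   chunker.chunk("one-long-identifier");   // -> 'token'
   */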

  /**
   * Batch process multiple texts
   * @param {Array} texts - Array of text strings
   * @param {string} strategy - Chunking strategy
   * @returns {Array} Array of arrays containing chunks for each text
   */
  chunkBatch(texts, strategy = 'auto') {
    return texts.map(text => this.chunk(text, strategy));
  }

  /**
   * Get chunk statistics
   * @param {Array} chunks - Array of chunk objects
   * @returns {Object} Statistics object
   */
  getStats(chunks) {
    if (!chunks || chunks.length === 0) {
      return { count: 0, totalTokens: 0, avgTokens: 0, minTokens: 0, maxTokens: 0 };
    }
    const tokenCounts = chunks.map(chunk => chunk.tokenCount);
    const totalTokens = tokenCounts.reduce((sum, count) => sum + count, 0);
    return {
      count: chunks.length,
      totalTokens,
      avgTokens: Math.round(totalTokens / chunks.length),
      minTokens: Math.min(...tokenCounts),
      maxTokens: Math.max(...tokenCounts)
    };
  }
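
  /*
   * Example: getStats over chunks of 4, 7 and 7 tokens returns
   *   { count: 3, totalTokens: 18, avgTokens: 6, minTokens: 4, maxTokens: 7 }
   */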
}

// Export for CommonJS and browser environments
if (typeof module !== 'undefined' && module.exports) {
  module.exports = Chonkie;
} else if (typeof window !== 'undefined') {
  window.Chonkie = Chonkie;
}

// Usage examples:
/*
// Basic usage
const chunker = new Chonkie({ chunkSize: 100, tokenizer: 'word' });
const text = "Your text here...";
const chunks = chunker.chunk(text);

// Different strategies
const tokenChunks = chunker.chunk(text, 'token');
const sentenceChunks = chunker.chunk(text, 'sentence');
const recursiveChunks = chunker.chunk(text, 'recursive');

// Batch processing
const multipleTexts = ["Text 1", "Text 2", "Text 3"];
const batchResults = chunker.chunkBatch(multipleTexts);

// Get statistics
const stats = chunker.getStats(chunks);
console.log(`Created ${stats.count} chunks with avg ${stats.avgTokens} tokens`);
*/