ipid · February 12, 2026 11:24
diff --git a/tokenizerReader.ts b/tokenizerReader.ts
 /* eslint-disable @typescript-eslint/no-unsafe-return */
 /* eslint-disable @typescript-eslint/no-unsafe-member-access */
 /* eslint-disable @typescript-eslint/no-explicit-any */

 import * as fs from 'fs'
 import { Buffer } from 'buffer'

 // ========== 命令行参数解析 ==========

 // The tokenizer.json path is the first user-supplied command-line argument
 // (process.argv[0] and [1] are the runtime and the script itself).
 const tokenizerFilePath = process.argv[2]
 if (!tokenizerFilePath) {
  // Missing or empty path: print the usage hint on stderr and exit with a failure code.
  console.error('用法: npx tsx src/tokenizerReader/index.ts <tokenizer.json 路径>')
  process.exit(1)
 }

 // ========== BPE Unicode 编码映射 ==========

 /**
 * Builds the inverse of the GPT-2 byte-level BPE "bytes to unicode" table.
 *
 * GPT-2 style tokenizers represent every byte value as one printable Unicode
 * character so that each vocabulary entry is a printable string. Bytes in the
 * ranges 33..126, 161..172 and 174..255 map to the character with the same
 * code point; the remaining 68 bytes are assigned, in ascending byte order,
 * to the code points starting at 256.
 *
 * @returns a map from each of those 256 characters back to its byte value
 */
 function buildUnicodeToByteMap(): Map<string, number> {
  // Bytes that stand for themselves (printable ASCII and printable Latin-1).
  const isDirect = (byte: number): boolean =>
    (byte >= 33 && byte <= 126) || (byte >= 161 && byte <= 172) || (byte >= 174 && byte <= 255)

  const map = new Map<string, number>()

  // Directly mapped bytes first: character code point === byte value.
  for (let byte = 0; byte < 256; byte++) {
    if (isDirect(byte)) map.set(String.fromCharCode(byte), byte)
  }

  // Every other byte is shifted to the next free code point at or above 256.
  let nextCodePoint = 256
  for (let byte = 0; byte < 256; byte++) {
    if (!isDirect(byte)) {
      map.set(String.fromCharCode(nextCodePoint), byte)
      nextCodePoint++
    }
  }
  return map
 }

 const unicodeToByteMap = buildUnicodeToByteMap()

 /**
 * Decodes one vocabulary entry into readable text.
 *
 * Strategy:
 * 1. Translate every character back to a byte through the GPT-2 table. If all
 *    characters are in the table AND the bytes are valid UTF-8, return the
 *    decoded text.
 * 2. Otherwise the entry is treated as raw text (SentencePiece-style or plain
 *    Unicode vocabularies): it is returned as stored, with the U+2581 word
 *    marker replaced by a space.
 *
 * The UTF-8 validity check matters because raw-text entries such as "café"
 * consist only of characters that also occur in the byte table, yet their
 * bytes are not valid UTF-8; decoding them anyway would turn the accented
 * letters into U+FFFD. The same fallback applies to byte-level entries that
 * hold an incomplete multi-byte sequence: they are shown in their stored form.
 *
 * @param token vocabulary entry as stored in the tokenizer file
 * @returns the readable form of the entry
 */
 function decodeBpeToken(token: string): string {
  const asRawText = (): string => token.replace(/\u2581/g, ' ')

  const bytes: number[] = []
  for (const char of token) {
    const byte = unicodeToByteMap.get(char)
    if (byte === undefined) {
      // A character outside the byte table: this is not a byte-level entry.
      return asRawText()
    }
    bytes.push(byte)
  }

  const raw = Buffer.from(bytes)
  const text = raw.toString('utf-8')
  // Buffer decoding never throws; invalid sequences silently become U+FFFD.
  // Re-encoding reproduces the input bytes only when nothing was replaced.
  if (Buffer.from(text, 'utf-8').equals(raw)) {
    return text
  }
  return asRawText()
 }

 // ========== 十六进制 token 匹配 ==========

 /** Matches byte tokens written as <0xHH> (exactly two hex digits), as used by Llama-style vocabularies. */
 const HEX_TOKEN_PATTERN = /^<0x([0-9A-Fa-f]{2})>$/

 /**
 * Turns a vocabulary entry into readable text.
 *
 * A <0xHH> entry becomes the single character whose code point equals the
 * byte value; every other entry is handed to decodeBpeToken.
 *
 * @param token vocabulary entry as stored in the tokenizer file
 * @returns the readable form of the entry
 */
 function decodeToken(token: string): string {
  const match = token.match(HEX_TOKEN_PATTERN)
  if (match === null) {
    return decodeBpeToken(token)
  }
  // latin1 maps one byte to the code point of the same value, so the byte is never dropped or merged.
  const byteValue = Number.parseInt(match[1], 16)
  return Buffer.from([byteValue]).toString('latin1')
 }

 // ========== 主逻辑 ==========

 /**
 * Extracts the token → id table from a parsed tokenizer.json.
 *
 * Supported layouts:
 * - HuggingFace BPE / WordPiece / WordLevel: `model.vocab` is an object
 *   mapping token to id.
 * - HuggingFace Unigram: `model.vocab` is an array of `[piece, score]` pairs
 *   where the id of a piece is its position in the array.
 * - Flat files: the same shapes under a top-level `vocab` key.
 *
 * @param tokenizerJson parsed content of the tokenizer file
 * @returns a token → id mapping
 * @throws Error when no vocabulary is present (including null or non-object input)
 */
 function extractVocab(tokenizerJson: any): Record<string, number> {
  // Optional chaining keeps null / primitive JSON from raising an unrelated TypeError.
  const vocab: unknown = tokenizerJson?.model?.vocab || tokenizerJson?.vocab

  if (Array.isArray(vocab)) {
    // List form: the id is the list index, the score (if any) is ignored.
    const pairs: [string, number][] = []
    vocab.forEach((entry: unknown, id: number) => {
      const piece: unknown = Array.isArray(entry) ? entry[0] : entry
      if (typeof piece === 'string') pairs.push([piece, id])
    })
    return Object.fromEntries(pairs)
  }

  if (typeof vocab === 'object' && vocab !== null) {
    return vocab as Record<string, number>
  }

  throw new Error('在 tokenizer.json 中未找到词汇表（vocab）')
 }

 /** One vocabulary entry together with its readable form. */
 interface DecodedToken {
  /** Token exactly as it appears in the vocabulary. */
  original: string
  /** Readable form of the token; this is the text that gets printed and measured. */
  decoded: string
  /** Value stored for the token in the vocabulary. */
  tokenNum: number
 }

 /**
 * Prints the entries with the longest decoded text, longest first.
 *
 * Each line has the form `<rank>. <decoded> - <tokenNum>` with ranks starting
 * at 1. The input array is left untouched; entries of equal length keep their
 * relative order.
 *
 * @param decodedTokens entries to rank
 * @param maxLength maximum number of entries to print
 */
 function printWords(decodedTokens: DecodedToken[], maxLength: number): void {
  // Sort a copy so the caller's array keeps its order.
  const longestFirst = [...decodedTokens].sort((left, right) => right.decoded.length - left.decoded.length)
  const shown = longestFirst.slice(0, maxLength)

  let rank = 0
  for (const entry of shown) {
    rank += 1
    console.log(`${rank}. ${entry.decoded} - ${entry.tokenNum}`)
  }
 }

 try {
  // Load the tokenizer file and pull out its token → id table.
  const parsed = JSON.parse(fs.readFileSync(tokenizerFilePath, 'utf-8'))
  const vocab = extractVocab(parsed)

  // Decode every entry, spelling out newline / carriage return / tab so that
  // each entry stays on one output line.
  const decodedTokens: DecodedToken[] = []
  for (const [token, tokenNum] of Object.entries(vocab)) {
    const decoded = decodeToken(token).replace(/\n/g, '\\n').replace(/\r/g, '\\r').replace(/\t/g, '\\t')
    decodedTokens.push({ original: token, decoded, tokenNum })
  }

  // Entries containing a run of at least 16 ASCII letters.
  console.log('英文:')
  const englishTokens = decodedTokens.filter((entry) => /[a-z]{16}/iu.test(entry.decoded))
  printWords(englishTokens, 50)

  // Entries containing a run of at least 3 Han characters.
  console.log('中文:')
  const chineseTokens = decodedTokens.filter((entry) => /[\p{Script=Han}]{3}/iu.test(entry.decoded))
  printWords(chineseTokens, 50)
 } catch (err) {
  // Any failure (unreadable file, invalid JSON, missing vocab) ends the run with exit code 1.
  const reason = err instanceof Error ? err.message : String(err)
  console.error('读取 tokenizer 失败:', reason)
  process.exit(1)
 }
	/* eslint-disable @typescript-eslint/no-unsafe-return */
	/* eslint-disable @typescript-eslint/no-unsafe-member-access */
	/* eslint-disable @typescript-eslint/no-explicit-any */

	import * as fs from 'fs'
	import { Buffer } from 'buffer'

	// ========== 命令行参数解析 ==========

	// The tokenizer.json path is the first user-supplied command-line argument
	// (process.argv[0] and [1] are the runtime and the script itself).
	const tokenizerFilePath = process.argv[2]
	if (!tokenizerFilePath) {
	  // Missing or empty path: print the usage hint on stderr and exit with a failure code.
	  console.error('用法: npx tsx src/tokenizerReader/index.ts <tokenizer.json 路径>')
	  process.exit(1)
	}

	// ========== BPE Unicode 编码映射 ==========

	/**
	 * Builds the inverse of the GPT-2 byte-level BPE "bytes to unicode" table.
	 *
	 * GPT-2 style tokenizers represent every byte value as one printable Unicode
	 * character so that each vocabulary entry is a printable string. Bytes in the
	 * ranges 33..126, 161..172 and 174..255 map to the character with the same
	 * code point; the remaining 68 bytes are assigned, in ascending byte order,
	 * to the code points starting at 256.
	 *
	 * @returns a map from each of those 256 characters back to its byte value
	 */
	function buildUnicodeToByteMap(): Map<string, number> {
	  // Bytes that stand for themselves (printable ASCII and printable Latin-1).
	  const isDirect = (byte: number): boolean =>
	    (byte >= 33 && byte <= 126) || (byte >= 161 && byte <= 172) || (byte >= 174 && byte <= 255)

	  const map = new Map<string, number>()

	  // Directly mapped bytes first: character code point === byte value.
	  for (let byte = 0; byte < 256; byte++) {
	    if (isDirect(byte)) map.set(String.fromCharCode(byte), byte)
	  }

	  // Every other byte is shifted to the next free code point at or above 256.
	  let nextCodePoint = 256
	  for (let byte = 0; byte < 256; byte++) {
	    if (!isDirect(byte)) {
	      map.set(String.fromCharCode(nextCodePoint), byte)
	      nextCodePoint++
	    }
	  }
	  return map
	}

	const unicodeToByteMap = buildUnicodeToByteMap()

	/**
	 * Decodes one vocabulary entry into readable text.
	 *
	 * Strategy:
	 * 1. Translate every character back to a byte through the GPT-2 table. If all
	 *    characters are in the table AND the bytes are valid UTF-8, return the
	 *    decoded text.
	 * 2. Otherwise the entry is treated as raw text (SentencePiece-style or plain
	 *    Unicode vocabularies): it is returned as stored, with the U+2581 word
	 *    marker replaced by a space.
	 *
	 * The UTF-8 validity check matters because raw-text entries such as "café"
	 * consist only of characters that also occur in the byte table, yet their
	 * bytes are not valid UTF-8; decoding them anyway would turn the accented
	 * letters into U+FFFD. The same fallback applies to byte-level entries that
	 * hold an incomplete multi-byte sequence: they are shown in their stored form.
	 *
	 * @param token vocabulary entry as stored in the tokenizer file
	 * @returns the readable form of the entry
	 */
	function decodeBpeToken(token: string): string {
	  const asRawText = (): string => token.replace(/\u2581/g, ' ')

	  const bytes: number[] = []
	  for (const char of token) {
	    const byte = unicodeToByteMap.get(char)
	    if (byte === undefined) {
	      // A character outside the byte table: this is not a byte-level entry.
	      return asRawText()
	    }
	    bytes.push(byte)
	  }

	  const raw = Buffer.from(bytes)
	  const text = raw.toString('utf-8')
	  // Buffer decoding never throws; invalid sequences silently become U+FFFD.
	  // Re-encoding reproduces the input bytes only when nothing was replaced.
	  if (Buffer.from(text, 'utf-8').equals(raw)) {
	    return text
	  }
	  return asRawText()
	}

	// ========== 十六进制 token 匹配 ==========

	/** Matches byte tokens written as <0xHH> (exactly two hex digits), as used by Llama-style vocabularies. */
	const HEX_TOKEN_PATTERN = /^<0x([0-9A-Fa-f]{2})>$/

	/**
	 * Turns a vocabulary entry into readable text.
	 *
	 * A <0xHH> entry becomes the single character whose code point equals the
	 * byte value; every other entry is handed to decodeBpeToken.
	 *
	 * @param token vocabulary entry as stored in the tokenizer file
	 * @returns the readable form of the entry
	 */
	function decodeToken(token: string): string {
	  const match = token.match(HEX_TOKEN_PATTERN)
	  if (match === null) {
	    return decodeBpeToken(token)
	  }
	  // latin1 maps one byte to the code point of the same value, so the byte is never dropped or merged.
	  const byteValue = Number.parseInt(match[1], 16)
	  return Buffer.from([byteValue]).toString('latin1')
	}

	// ========== 主逻辑 ==========

	/**
	 * Extracts the token → id table from a parsed tokenizer.json.
	 *
	 * Supported layouts:
	 * - HuggingFace BPE / WordPiece / WordLevel: `model.vocab` is an object
	 *   mapping token to id.
	 * - HuggingFace Unigram: `model.vocab` is an array of `[piece, score]` pairs
	 *   where the id of a piece is its position in the array.
	 * - Flat files: the same shapes under a top-level `vocab` key.
	 *
	 * @param tokenizerJson parsed content of the tokenizer file
	 * @returns a token → id mapping
	 * @throws Error when no vocabulary is present (including null or non-object input)
	 */
	function extractVocab(tokenizerJson: any): Record<string, number> {
	  // Optional chaining keeps null / primitive JSON from raising an unrelated TypeError.
	  const vocab: unknown = tokenizerJson?.model?.vocab || tokenizerJson?.vocab

	  if (Array.isArray(vocab)) {
	    // List form: the id is the list index, the score (if any) is ignored.
	    const pairs: [string, number][] = []
	    vocab.forEach((entry: unknown, id: number) => {
	      const piece: unknown = Array.isArray(entry) ? entry[0] : entry
	      if (typeof piece === 'string') pairs.push([piece, id])
	    })
	    return Object.fromEntries(pairs)
	  }

	  if (typeof vocab === 'object' && vocab !== null) {
	    return vocab as Record<string, number>
	  }

	  throw new Error('在 tokenizer.json 中未找到词汇表（vocab）')
	}

	/** One vocabulary entry together with its readable form. */
	interface DecodedToken {
	  /** Token exactly as it appears in the vocabulary. */
	  original: string
	  /** Readable form of the token; this is the text that gets printed and measured. */
	  decoded: string
	  /** Value stored for the token in the vocabulary. */
	  tokenNum: number
	}

	/**
	 * Prints the entries with the longest decoded text, longest first.
	 *
	 * Each line has the form `<rank>. <decoded> - <tokenNum>` with ranks starting
	 * at 1. The input array is left untouched; entries of equal length keep their
	 * relative order.
	 *
	 * @param decodedTokens entries to rank
	 * @param maxLength maximum number of entries to print
	 */
	function printWords(decodedTokens: DecodedToken[], maxLength: number): void {
	  // Sort a copy so the caller's array keeps its order.
	  const longestFirst = [...decodedTokens].sort((left, right) => right.decoded.length - left.decoded.length)
	  const shown = longestFirst.slice(0, maxLength)

	  let rank = 0
	  for (const entry of shown) {
	    rank += 1
	    console.log(`${rank}. ${entry.decoded} - ${entry.tokenNum}`)
	  }
	}

	try {
	  // Load the tokenizer file and pull out its token → id table.
	  const parsed = JSON.parse(fs.readFileSync(tokenizerFilePath, 'utf-8'))
	  const vocab = extractVocab(parsed)

	  // Decode every entry, spelling out newline / carriage return / tab so that
	  // each entry stays on one output line.
	  const decodedTokens: DecodedToken[] = []
	  for (const [token, tokenNum] of Object.entries(vocab)) {
	    const decoded = decodeToken(token).replace(/\n/g, '\\n').replace(/\r/g, '\\r').replace(/\t/g, '\\t')
	    decodedTokens.push({ original: token, decoded, tokenNum })
	  }

	  // Entries containing a run of at least 16 ASCII letters.
	  console.log('英文:')
	  const englishTokens = decodedTokens.filter((entry) => /[a-z]{16}/iu.test(entry.decoded))
	  printWords(englishTokens, 50)

	  // Entries containing a run of at least 3 Han characters.
	  console.log('中文:')
	  const chineseTokens = decodedTokens.filter((entry) => /[\p{Script=Han}]{3}/iu.test(entry.decoded))
	  printWords(chineseTokens, 50)
	} catch (err) {
	  // Any failure (unreadable file, invalid JSON, missing vocab) ends the run with exit code 1.
	  const reason = err instanceof Error ? err.message : String(err)
	  console.error('读取 tokenizer 失败:', reason)
	  process.exit(1)
	}
No results found