// microgpt.js — tiny dependency-free GPT in JavaScript
// ~ port of karpathy/microgpt.py https://gist.github.com/karpathy/8627fe009c40f57531cb18360106ce95
const fs = require("fs");
const https = require("https");

// -------------------- Helpers --------------------
function download(url, dest) {
  return new Promise((resolve, reject) => {
    const file = fs.createWriteStream(dest);
    https
      .get(url, (response) => {
        response.pipe(file);
        file.on("finish", () => {
          file.close();
          resolve();
        });
      })
      .on("error", reject);
  });
}
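// Note: this minimal downloader assumes the URL answers with the file body
// directly; it does not check response.statusCode or follow redirects.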
// Simple seedable PRNG (mulberry32)
function makePRNG(seed) {
  let state = seed | 0;
  return function () {
    state = (state + 0x6d2b79f5) | 0;
    let t = Math.imul(state ^ (state >>> 15), 1 | state);
    t = (t + Math.imul(t ^ (t >>> 7), 61 | t)) ^ t;
    return ((t ^ (t >>> 14)) >>> 0) / 4294967296;
  };
}
const rng = makePRNG(42);
function randomGauss(mean = 0, std = 1) {
  // Box-Muller transform using seeded rng
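  // u is re-drawn while exactly 0 so that Math.log(u) stays finite
  // (v likewise, by convention).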
  let u = 0,
    v = 0;
  while (u === 0) u = rng();
  while (v === 0) v = rng();
  return (
    mean + std * Math.sqrt(-2.0 * Math.log(u)) * Math.cos(2.0 * Math.PI * v)
  );
}
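// Weighted sampling via the cumulative distribution: draw r uniformly in
// [0, sum(weights)) and return the first choice whose running total passes r.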
function randomChoice(choices, weights) {
  if (!weights) {
    return choices[Math.floor(rng() * choices.length)];
  }
  let sum = weights.reduce((a, b) => a + b, 0);
  let r = rng() * sum;
  let current = 0;
  for (let i = 0; i < choices.length; i++) {
    current += weights[i];
    if (r <= current) return choices[i];
  }
  return choices[choices.length - 1];
}
// Seeded shuffle (Fisher–Yates)
function shuffle(arr) {
  for (let i = arr.length - 1; i > 0; i--) {
    const j = Math.floor(rng() * (i + 1));
    [arr[i], arr[j]] = [arr[j], arr[i]];
  }
}

// -------------------- Value (micrograd-like autograd) --------------------
class Value {
  constructor(data, children = [], localGrads = []) {
    this.data = data;
    this.grad = 0;
    this._children = children;
    this._localGrads = localGrads;
  }
  // Operations
  add(other) {
    other = other instanceof Value ? other : new Value(other);
    return new Value(this.data + other.data, [this, other], [1, 1]);
  }
  mul(other) {
    other = other instanceof Value ? other : new Value(other);
    return new Value(
      this.data * other.data,
      [this, other],
      [other.data, this.data],
    );
  }
  pow(exp) {
    return new Value(
      Math.pow(this.data, exp),
      [this],
      [exp * Math.pow(this.data, exp - 1)],
    );
  }
  log() {
    return new Value(Math.log(this.data), [this], [1 / this.data]);
  }
  exp() {
    return new Value(Math.exp(this.data), [this], [Math.exp(this.data)]);
  }
  relu() {
    return new Value(Math.max(0, this.data), [this], [this.data > 0 ? 1 : 0]);
  }
  neg() {
    return this.mul(-1);
  }
  sub(other) {
    other = other instanceof Value ? other : new Value(other);
    return this.add(other.neg());
  }
  div(other) {
    other = other instanceof Value ? other : new Value(other);
    return this.mul(other.pow(-1));
  }
  // Topological sort + backward (matches Python exactly)
  backward() {
    const topo = [];
    const visited = new Set();
    const buildTopo = (v) => {
      if (!visited.has(v)) {
        visited.add(v);
        for (const child of v._children) {
          buildTopo(child);
        }
        topo.push(v);
      }
    };
    buildTopo(this);
    this.grad = 1;
    for (let i = topo.length - 1; i >= 0; i--) {
      const node = topo[i];
      for (let j = 0; j < node._children.length; j++) {
        node._children[j].grad += node._localGrads[j] * node.grad;
      }
    }
  }
}
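// Worked example of the autograd above (illustration only, not executed):
//   const a = new Value(2), b = new Value(3);
//   const y = a.mul(b).add(a); // y.data === 8
//   y.backward();              // a.grad === 4 (b + 1), b.grad === 2 (a)
// backward() visits nodes in reverse topological order and accumulates
// child.grad += localGrad * parent.grad (the chain rule).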
// -------------------- Model --------------------
// Hyperparameters (matching Python exactly)
const n_embd = 16;
const n_head = 4;
const n_layer = 1;
const block_size = 16; // Python: 16
const head_dim = Math.floor(n_embd / n_head);
const learning_rate = 0.01;
const beta1 = 0.85; // Python: 0.85
const beta2 = 0.99; // Python: 0.99
const eps_adam = 1e-8;
const num_steps = 1000; // Python: 1000
const temperature = 0.5;

// Load data
const inputPath = "input.txt";
const namesUrl =
  "https://raw.githubusercontent.com/karpathy/makemore/refs/heads/master/names.txt";
async function loadData() {
  if (!fs.existsSync(inputPath)) {
    console.log("Downloading names.txt...");
    await download(namesUrl, inputPath);
  }
  const text = fs.readFileSync(inputPath, "utf-8");
  const docs = text
    .trim()
    .split("\n")
    .map((l) => l.trim())
    .filter((l) => l);
  shuffle(docs);
  console.log(`num docs: ${docs.length}`);
  return docs;
}

// Math helpers
function linear(x, w) {
  // x: array of Value, w: matrix (array of arrays of Value)
  // matches Python: [sum(wi * xi for wi, xi in zip(wo, x)) for wo in w]
  return w.map((row) => {
    let s = row[0].mul(x[0]);
    for (let i = 1; i < row.length; i++) {
      s = s.add(row[i].mul(x[i]));
    }
    return s;
  });
}
function softmax(logits) {
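  // Subtracting the max logit before exponentiating leaves the result
  // unchanged (softmax is shift-invariant) but prevents Math.exp overflow.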
  const maxVal = Math.max(...logits.map((v) => v.data));
  const exps = logits.map((v) => v.sub(maxVal).exp());
  let total = exps[0];
  for (let i = 1; i < exps.length; i++) {
    total = total.add(exps[i]);
  }
  return exps.map((e) => e.div(total));
}
function rmsnorm(x) {
  // Python: ms = sum(xi * xi for xi in x) / len(x)
  //         scale = (ms + 1e-5) ** -0.5
  //         return [xi * scale for xi in x]
  // This does autograd through the scaling
  let ms = x[0].mul(x[0]);
  for (let i = 1; i < x.length; i++) {
    ms = ms.add(x[i].mul(x[i]));
  }
  ms = ms.mul(1 / x.length);
  const scale = ms.add(1e-5).pow(-0.5);
  return x.map((xi) => xi.mul(scale));
}

// Forward function
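// gpt() processes one token per call; the keys/values arrays act as a
// growing KV cache, so the token at position t attends to positions 0..t
// only and causal masking falls out implicitly, with no mask matrix needed.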
function gpt(tokenId, posId, keys, values, stateDict) {
  // tok_emb + pos_emb
  let x = stateDict.wte[tokenId].map((v, i) => v.add(stateDict.wpe[posId][i]));
  x = rmsnorm(x);
  for (let li = 0; li < n_layer; li++) {
    // 1) Multi-head attention block
    const xResidual = [...x];
    x = rmsnorm(x);
    const q = linear(x, stateDict[`layer${li}.attn_wq`]);
    const k = linear(x, stateDict[`layer${li}.attn_wk`]);
    const v = linear(x, stateDict[`layer${li}.attn_wv`]);
    keys[li].push(k);
    values[li].push(v);
    let xAttn = [];
    for (let h = 0; h < n_head; h++) {
      const hs = h * head_dim;
      const qh = q.slice(hs, hs + head_dim);
      const kh = keys[li].map((kk) => kk.slice(hs, hs + head_dim));
      const vh = values[li].map((vv) => vv.slice(hs, hs + head_dim));
      // Attention logits via Value ops (autograd-compatible, matching Python)
      const attnLogits = kh.map((kk) => {
        let dot = qh[0].mul(kk[0]);
        for (let j = 1; j < head_dim; j++) {
          dot = dot.add(qh[j].mul(kk[j]));
        }
        return dot.mul(1 / Math.sqrt(head_dim));
      });
      const attnWeights = softmax(attnLogits);
      const headOut = [];
      for (let j = 0; j < head_dim; j++) {
        let s = attnWeights[0].mul(vh[0][j]);
        for (let t = 1; t < vh.length; t++) {
          s = s.add(attnWeights[t].mul(vh[t][j]));
        }
        headOut.push(s);
      }
      xAttn.push(...headOut);
    }
    x = linear(xAttn, stateDict[`layer${li}.attn_wo`]);
    x = x.map((v, i) => v.add(xResidual[i]));
    // 2) MLP block
    const xResMlp = [...x];
    x = rmsnorm(x);
    x = linear(x, stateDict[`layer${li}.mlp_fc1`]);
    x = x.map((xi) => xi.relu()); // Python: relu only (no squared)
    x = linear(x, stateDict[`layer${li}.mlp_fc2`]);
    x = x.map((v, i) => v.add(xResMlp[i]));
  }
  return linear(x, stateDict.lm_head);
}

// Main training loop
async function main() {
  const docs = await loadData();
  // Vocabulary (must be after loadData so docs is populated)
  const uchars = [...new Set(docs.join(""))].sort();
  const BOS = uchars.length;
  const vocab_size = uchars.length + 1;
  console.log(`vocab size: ${vocab_size}`);
  // Parameter matrices (array of arrays of Value)
  function createMatrix(nout, nin, std = 0.08) {
    return Array.from({ length: nout }, () =>
      Array.from({ length: nin }, () => new Value(randomGauss(0, std))),
    );
  }
  const stateDict = {
    wte: createMatrix(vocab_size, n_embd),
    wpe: createMatrix(block_size, n_embd),
    lm_head: createMatrix(vocab_size, n_embd),
  };
  for (let i = 0; i < n_layer; i++) {
    stateDict[`layer${i}.attn_wq`] = createMatrix(n_embd, n_embd);
    stateDict[`layer${i}.attn_wk`] = createMatrix(n_embd, n_embd);
    stateDict[`layer${i}.attn_wv`] = createMatrix(n_embd, n_embd);
    stateDict[`layer${i}.attn_wo`] = createMatrix(n_embd, n_embd);
    stateDict[`layer${i}.mlp_fc1`] = createMatrix(4 * n_embd, n_embd);
    stateDict[`layer${i}.mlp_fc2`] = createMatrix(n_embd, 4 * n_embd);
  }
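  // Assuming input.txt is names.txt (26 lowercase letters, so vocab_size = 27),
  // this works out to 2*27*16 + 16*16 + 4*16*16 + 2*4*16*16 = 4192 parameters.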
  // Flatten all parameters for optimizer
  const params = [];
  Object.values(stateDict).forEach((mat) => {
    mat.forEach((row) => row.forEach((v) => params.push(v)));
  });
  console.log(`num params: ${params.length}`);
  // Adam optimizer buffers
  const m = new Array(params.length).fill(0);
  const v = new Array(params.length).fill(0);
  // Training
  for (let step = 0; step < num_steps; step++) {
    const doc = docs[step % docs.length];
    const tokens = [BOS, ...doc.split("").map((c) => uchars.indexOf(c)), BOS];
    const n = Math.min(block_size, tokens.length - 1);
    const keys = Array.from({ length: n_layer }, () => []);
    const values = Array.from({ length: n_layer }, () => []);
    const losses = [];
    for (let pos = 0; pos < n; pos++) {
      const tokenId = tokens[pos];
      const targetId = tokens[pos + 1];
      const logits = gpt(tokenId, pos, keys, values, stateDict);
      const probs = softmax(logits);
      const lossT = probs[targetId].log().neg();
      losses.push(lossT);
    }
    // loss = (1/n) * sum(losses)
    let loss = losses[0];
    for (let i = 1; i < losses.length; i++) {
      loss = loss.add(losses[i]);
    }
    loss = loss.mul(1 / n);
    // Backward
    loss.backward();
    // Adam optimizer update with linear lr decay (matching Python)
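    // mHat/vHat below divide out the bias from the zero-initialized moment
    // buffers m and v, which would otherwise shrink the earliest updates.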
    const lr_t = learning_rate * (1 - step / num_steps);
    for (let i = 0; i < params.length; i++) {
      const p = params[i];
      m[i] = beta1 * m[i] + (1 - beta1) * p.grad;
      v[i] = beta2 * v[i] + (1 - beta2) * p.grad * p.grad;
      const mHat = m[i] / (1 - Math.pow(beta1, step + 1));
      const vHat = v[i] / (1 - Math.pow(beta2, step + 1));
      p.data -= (lr_t * mHat) / (Math.sqrt(vHat) + eps_adam);
      p.grad = 0; // zero grad after update (matching Python)
    }
    console.log(
      `step ${String(step + 1).padStart(4)} / ${String(num_steps).padStart(4)} | loss ${loss.data.toFixed(4)}`,
    );
  }
  // Inference
  console.log("\n--- inference (new, hallucinated names) ---");
  for (let sampleIdx = 0; sampleIdx < 20; sampleIdx++) {
    const keys = Array.from({ length: n_layer }, () => []);
    const values = Array.from({ length: n_layer }, () => []);
    let tokenId = BOS;
    let sample = [];
    for (let pos = 0; pos < block_size; pos++) {
      const logits = gpt(tokenId, pos, keys, values, stateDict);
      // Python: softmax([l / temperature for l in logits])
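      // temperature < 1 sharpens the distribution toward the model's top
      // picks; temperature > 1 would flatten it toward uniform.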
      const scaledLogits = logits.map((l) => l.mul(1 / temperature));
      const probs = softmax(scaledLogits);
      const weights = probs.map((p) => p.data);
      tokenId = randomChoice([...Array(vocab_size).keys()], weights);
      if (tokenId === BOS) break;
      sample.push(uchars[tokenId]);
    }
    console.log(
      `sample ${String(sampleIdx + 1).padStart(2)}: ${sample.join("")}`,
    );
  }
}
main();
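// Usage (a sketch; assumes a recent Node.js with network access):
//   node microgpt.js
// The first run downloads names.txt to input.txt, then training prints the
// loss for each of the 1000 steps and sampling prints 20 generated names.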