// microgpt.js — tiny dependency-free GPT in JavaScript
// ~ port of karpathy/microgpt.py https://gist.github.com/karpathy/8627fe009c40f57531cb18360106ce95
const fs = require("fs");
const https = require("https");

// -------------------- Helpers --------------------
function download(url, dest) {
  return new Promise((resolve, reject) => {
    const file = fs.createWriteStream(dest);
    https
      .get(url, (response) => {
        response.pipe(file);
        file.on("finish", () => {
          file.close();
          resolve();
        });
      })
      .on("error", reject);
  });
}
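// Note: this minimal downloader assumes the URL answers with the file body
// directly; it does not check response.statusCode or follow redirects.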
// Simple seedable PRNG (mulberry32)
function makePRNG(seed) {
  let state = seed | 0;
  return function () {
    state = (state + 0x6d2b79f5) | 0;
    let t = Math.imul(state ^ (state >>> 15), 1 | state);
    t = (t + Math.imul(t ^ (t >>> 7), 61 | t)) ^ t;
    return ((t ^ (t >>> 14)) >>> 0) / 4294967296;
  };
}
const rng = makePRNG(42);
function randomGauss(mean = 0, std = 1) {
  // Box-Muller transform using seeded rng
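  // u is re-drawn while exactly 0 so that Math.log(u) stays finite
  // (v likewise, by convention).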
  let u = 0,
    v = 0;
  while (u === 0) u = rng();
  while (v === 0) v = rng();
  return (
    mean + std * Math.sqrt(-2.0 * Math.log(u)) * Math.cos(2.0 * Math.PI * v)
  );
}
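// Weighted sampling via the cumulative distribution: draw r uniformly in
// [0, sum(weights)) and return the first choice whose running total passes r.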
function randomChoice(choices, weights) {
  if (!weights) {
    return choices[Math.floor(rng() * choices.length)];
  }
  let sum = weights.reduce((a, b) => a + b, 0);
  let r = rng() * sum;
  let current = 0;
  for (let i = 0; i < choices.length; i++) {
    current += weights[i];
    if (r <= current) return choices[i];
  }
  return choices[choices.length - 1];
}
// Seeded shuffle (Fisher–Yates)
function shuffle(arr) {
  for (let i = arr.length - 1; i > 0; i--) {
    const j = Math.floor(rng() * (i + 1));
    [arr[i], arr[j]] = [arr[j], arr[i]];
  }
}

// -------------------- Value (micrograd-like autograd) --------------------
class Value {
  constructor(data, children = [], localGrads = []) {
    this.data = data;
    this.grad = 0;
    this._children = children;
    this._localGrads = localGrads;
  }
  // Operations
  add(other) {
    other = other instanceof Value ? other : new Value(other);
    return new Value(this.data + other.data, [this, other], [1, 1]);
  }
  mul(other) {
    other = other instanceof Value ? other : new Value(other);
    return new Value(
      this.data * other.data,
      [this, other],
      [other.data, this.data],
    );
  }
  pow(exp) {
    return new Value(
      Math.pow(this.data, exp),
      [this],
      [exp * Math.pow(this.data, exp - 1)],
    );
  }
  log() {
    return new Value(Math.log(this.data), [this], [1 / this.data]);
  }
  exp() {
    return new Value(Math.exp(this.data), [this], [Math.exp(this.data)]);
  }
  relu() {
    return new Value(Math.max(0, this.data), [this], [this.data > 0 ? 1 : 0]);
  }
  neg() {
    return this.mul(-1);
  }
  sub(other) {
    other = other instanceof Value ? other : new Value(other);
    return this.add(other.neg());
  }
  div(other) {
    other = other instanceof Value ? other : new Value(other);
    return this.mul(other.pow(-1));
  }
  // Topological sort + backward (matches Python exactly)
  backward() {
    const topo = [];
    const visited = new Set();
    const buildTopo = (v) => {
      if (!visited.has(v)) {
        visited.add(v);
        for (const child of v._children) {
          buildTopo(child);
        }
        topo.push(v);
      }
    };
    buildTopo(this);
    this.grad = 1;
    for (let i = topo.length - 1; i >= 0; i--) {
      const node = topo[i];
      for (let j = 0; j < node._children.length; j++) {
        node._children[j].grad += node._localGrads[j] * node.grad;
      }
    }
  }
}
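// Worked example of the autograd above (illustration only, not executed):
//   const a = new Value(2), b = new Value(3);
//   const y = a.mul(b).add(a); // y.data === 8
//   y.backward();              // a.grad === 4 (b + 1), b.grad === 2 (a)
// backward() visits nodes in reverse topological order and accumulates
// child.grad += localGrad * parent.grad (the chain rule).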
// -------------------- Model --------------------
// Hyperparameters (matching Python exactly)
const n_embd = 16;
const n_head = 4;
const n_layer = 1;
const block_size = 16; // Python: 16
const head_dim = Math.floor(n_embd / n_head);
const learning_rate = 0.01;
const beta1 = 0.85; // Python: 0.85
const beta2 = 0.99; // Python: 0.99
const eps_adam = 1e-8;
const num_steps = 1000; // Python: 1000
const temperature = 0.5;

// Load data
const inputPath = "input.txt";
const namesUrl =
  "https://raw.githubusercontent.com/karpathy/makemore/refs/heads/master/names.txt";
async function loadData() {
  if (!fs.existsSync(inputPath)) {
    console.log("Downloading names.txt...");
    await download(namesUrl, inputPath);
  }
  const text = fs.readFileSync(inputPath, "utf-8");
  const docs = text
    .trim()
    .split("\n")
    .map((l) => l.trim())
    .filter((l) => l);
  shuffle(docs);
  console.log(`num docs: ${docs.length}`);
  return docs;
}

// Math helpers
function linear(x, w) {
  // x: array of Value, w: matrix (array of arrays of Value)
  // matches Python: [sum(wi * xi for wi, xi in zip(wo, x)) for wo in w]
  return w.map((row) => {
    let s = row[0].mul(x[0]);
    for (let i = 1; i < row.length; i++) {
      s = s.add(row[i].mul(x[i]));
    }
    return s;
  });
}
function softmax(logits) {
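  // Subtracting the max logit before exponentiating leaves the result
  // unchanged (softmax is shift-invariant) but prevents Math.exp overflow.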
  const maxVal = Math.max(...logits.map((v) => v.data));
  const exps = logits.map((v) => v.sub(maxVal).exp());
  let total = exps[0];
  for (let i = 1; i < exps.length; i++) {
    total = total.add(exps[i]);
  }
  return exps.map((e) => e.div(total));
}
function rmsnorm(x) {
  // Python: ms = sum(xi * xi for xi in x) / len(x)
  //         scale = (ms + 1e-5) ** -0.5
  //         return [xi * scale for xi in x]
  // This does autograd through the scaling
  let ms = x[0].mul(x[0]);
  for (let i = 1; i < x.length; i++) {
    ms = ms.add(x[i].mul(x[i]));
  }
  ms = ms.mul(1 / x.length);
  const scale = ms.add(1e-5).pow(-0.5);
  return x.map((xi) => xi.mul(scale));
}

// Forward function
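// gpt() processes one token per call; the keys/values arrays act as a
// growing KV cache, so the token at position t attends to positions 0..t
// only and causal masking falls out implicitly, with no mask matrix needed.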
function gpt(tokenId, posId, keys, values, stateDict) {
  // tok_emb + pos_emb
  let x = stateDict.wte[tokenId].map((v, i) => v.add(stateDict.wpe[posId][i]));
  x = rmsnorm(x);
  for (let li = 0; li < n_layer; li++) {
    // 1) Multi-head attention block
    const xResidual = [...x];
    x = rmsnorm(x);
    const q = linear(x, stateDict[`layer${li}.attn_wq`]);
    const k = linear(x, stateDict[`layer${li}.attn_wk`]);
    const v = linear(x, stateDict[`layer${li}.attn_wv`]);
    keys[li].push(k);
    values[li].push(v);
    let xAttn = [];
    for (let h = 0; h < n_head; h++) {
      const hs = h * head_dim;
      const qh = q.slice(hs, hs + head_dim);
      const kh = keys[li].map((kk) => kk.slice(hs, hs + head_dim));
      const vh = values[li].map((vv) => vv.slice(hs, hs + head_dim));
      // Attention logits via Value ops (autograd-compatible, matching Python)
      const attnLogits = kh.map((kk) => {
        let dot = qh[0].mul(kk[0]);
        for (let j = 1; j < head_dim; j++) {
          dot = dot.add(qh[j].mul(kk[j]));
        }
        return dot.mul(1 / Math.sqrt(head_dim));
      });
      const attnWeights = softmax(attnLogits);
      const headOut = [];
      for (let j = 0; j < head_dim; j++) {
        let s = attnWeights[0].mul(vh[0][j]);
        for (let t = 1; t < vh.length; t++) {
          s = s.add(attnWeights[t].mul(vh[t][j]));
        }
        headOut.push(s);
      }
      xAttn.push(...headOut);
    }
    x = linear(xAttn, stateDict[`layer${li}.attn_wo`]);
    x = x.map((v, i) => v.add(xResidual[i]));
    // 2) MLP block
    const xResMlp = [...x];
    x = rmsnorm(x);
    x = linear(x, stateDict[`layer${li}.mlp_fc1`]);
    x = x.map((xi) => xi.relu()); // Python: relu only (no squared)
    x = linear(x, stateDict[`layer${li}.mlp_fc2`]);
    x = x.map((v, i) => v.add(xResMlp[i]));
  }
  return linear(x, stateDict.lm_head);
}

// Main training loop
async function main() {
  const docs = await loadData();
  // Vocabulary (must be after loadData so docs is populated)
  const uchars = [...new Set(docs.join(""))].sort();
  const BOS = uchars.length;
  const vocab_size = uchars.length + 1;
  console.log(`vocab size: ${vocab_size}`);
  // Parameter matrices (array of arrays of Value)
  function createMatrix(nout, nin, std = 0.08) {
    return Array.from({ length: nout }, () =>
      Array.from({ length: nin }, () => new Value(randomGauss(0, std))),
    );
  }
  const stateDict = {
    wte: createMatrix(vocab_size, n_embd),
    wpe: createMatrix(block_size, n_embd),
    lm_head: createMatrix(vocab_size, n_embd),
  };
  for (let i = 0; i < n_layer; i++) {
    stateDict[`layer${i}.attn_wq`] = createMatrix(n_embd, n_embd);
    stateDict[`layer${i}.attn_wk`] = createMatrix(n_embd, n_embd);
    stateDict[`layer${i}.attn_wv`] = createMatrix(n_embd, n_embd);
    stateDict[`layer${i}.attn_wo`] = createMatrix(n_embd, n_embd);
    stateDict[`layer${i}.mlp_fc1`] = createMatrix(4 * n_embd, n_embd);
    stateDict[`layer${i}.mlp_fc2`] = createMatrix(n_embd, 4 * n_embd);
  }
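  // Assuming input.txt is names.txt (26 lowercase letters, so vocab_size = 27),
  // this works out to 2*27*16 + 16*16 + 4*16*16 + 2*4*16*16 = 4192 parameters.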
  // Flatten all parameters for optimizer
  const params = [];
  Object.values(stateDict).forEach((mat) => {
    mat.forEach((row) => row.forEach((v) => params.push(v)));
  });
  console.log(`num params: ${params.length}`);
  // Adam optimizer buffers
  const m = new Array(params.length).fill(0);
  const v = new Array(params.length).fill(0);
  // Training
  for (let step = 0; step < num_steps; step++) {
    const doc = docs[step % docs.length];
    const tokens = [BOS, ...doc.split("").map((c) => uchars.indexOf(c)), BOS];
    const n = Math.min(block_size, tokens.length - 1);
    const keys = Array.from({ length: n_layer }, () => []);
    const values = Array.from({ length: n_layer }, () => []);
    const losses = [];
    for (let pos = 0; pos < n; pos++) {
      const tokenId = tokens[pos];
      const targetId = tokens[pos + 1];
      const logits = gpt(tokenId, pos, keys, values, stateDict);
      const probs = softmax(logits);
      const lossT = probs[targetId].log().neg();
      losses.push(lossT);
    }
    // loss = (1/n) * sum(losses)
    let loss = losses[0];
    for (let i = 1; i < losses.length; i++) {
      loss = loss.add(losses[i]);
    }
    loss = loss.mul(1 / n);
    // Backward
    loss.backward();
    // Adam optimizer update with linear lr decay (matching Python)
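    // mHat/vHat below divide out the bias from the zero-initialized moment
    // buffers m and v, which would otherwise shrink the earliest updates.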
    const lr_t = learning_rate * (1 - step / num_steps);
    for (let i = 0; i < params.length; i++) {
      const p = params[i];
      m[i] = beta1 * m[i] + (1 - beta1) * p.grad;
      v[i] = beta2 * v[i] + (1 - beta2) * p.grad * p.grad;
      const mHat = m[i] / (1 - Math.pow(beta1, step + 1));
      const vHat = v[i] / (1 - Math.pow(beta2, step + 1));
      p.data -= (lr_t * mHat) / (Math.sqrt(vHat) + eps_adam);
      p.grad = 0; // zero grad after update (matching Python)
    }
    console.log(
      `step ${String(step + 1).padStart(4)} / ${String(num_steps).padStart(4)} | loss ${loss.data.toFixed(4)}`,
    );
  }
  // Inference
  console.log("\n--- inference (new, hallucinated names) ---");
  for (let sampleIdx = 0; sampleIdx < 20; sampleIdx++) {
    const keys = Array.from({ length: n_layer }, () => []);
    const values = Array.from({ length: n_layer }, () => []);
    let tokenId = BOS;
    let sample = [];
    for (let pos = 0; pos < block_size; pos++) {
      const logits = gpt(tokenId, pos, keys, values, stateDict);
      // Python: softmax([l / temperature for l in logits])
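      // temperature < 1 sharpens the distribution toward the model's top
      // picks; temperature > 1 would flatten it toward uniform.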
      const scaledLogits = logits.map((l) => l.mul(1 / temperature));
      const probs = softmax(scaledLogits);
      const weights = probs.map((p) => p.data);
      tokenId = randomChoice([...Array(vocab_size).keys()], weights);
      if (tokenId === BOS) break;
      sample.push(uchars[tokenId]);
    }
    console.log(
      `sample ${String(sampleIdx + 1).padStart(2)}: ${sample.join("")}`,
    );
  }
}
main();
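// Usage (a sketch; assumes a recent Node.js with network access):
//   node microgpt.js
// The first run downloads names.txt to input.txt, then training prints the
// loss for each of the 1000 steps and sampling prints 20 generated names.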