// microgpt.js — tiny dependency-free GPT in JavaScript
// ~ port of karpathy/microgpt.py https://gist.github.com/karpathy/8627fe009c40f57531cb18360106ce95
const fs = require("fs");
const https = require("https");
// -------------------- Helpers --------------------
function download(url, dest) {
  return new Promise((resolve, reject) => {
    const file = fs.createWriteStream(dest);
    https
      .get(url, (response) => {
        response.pipe(file);
        file.on("finish", () => {
          file.close();
          resolve();
        });
      })
      .on("error", reject);
  });
}
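// Note: this helper assumes the server answers with the file body directly
// (no redirect or non-200 handling), which holds for the raw GitHub URL below.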
// Simple seedable PRNG (mulberry32)
function makePRNG(seed) {
  let state = seed | 0;
  return function () {
    state = (state + 0x6d2b79f5) | 0;
    let t = Math.imul(state ^ (state >>> 15), 1 | state);
    t = (t + Math.imul(t ^ (t >>> 7), 61 | t)) ^ t;
    return ((t ^ (t >>> 14)) >>> 0) / 4294967296;
  };
}
const rng = makePRNG(42);
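// The seeded stream is deterministic, so every run of this script sees the
// same random sequence. A quick check (kept as a comment; not executed):
//   const r1 = makePRNG(42), r2 = makePRNG(42);
//   r1() === r2(); // true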
function randomGauss(mean = 0, std = 1) {
  // Box-Muller transform using seeded rng
  let u = 0,
    v = 0;
  while (u === 0) u = rng();
  while (v === 0) v = rng();
  return (
    mean + std * Math.sqrt(-2.0 * Math.log(u)) * Math.cos(2.0 * Math.PI * v)
  );
}
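// Samples are drawn from N(mean, std^2); createMatrix in main() calls this
// as randomGauss(0, 0.08) for weight initialization.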
function randomChoice(choices, weights) {
  if (!weights) {
    return choices[Math.floor(rng() * choices.length)];
  }
  const sum = weights.reduce((a, b) => a + b, 0);
  const r = rng() * sum;
  let current = 0;
  for (let i = 0; i < choices.length; i++) {
    current += weights[i];
    if (r <= current) return choices[i];
  }
  return choices[choices.length - 1];
}
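// Worked example: randomChoice(["a", "b", "c"], [1, 1, 2]) returns "c" with
// probability 0.5. The inference loop below samples token ids this way from
// softmax probabilities.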
// Seeded shuffle (Fisher–Yates)
function shuffle(arr) {
  for (let i = arr.length - 1; i > 0; i--) {
    const j = Math.floor(rng() * (i + 1));
    [arr[i], arr[j]] = [arr[j], arr[i]];
  }
}
// -------------------- Value (micrograd-like autograd) --------------------
class Value {
  constructor(data, children = [], localGrads = []) {
    this.data = data;
    this.grad = 0;
    this._children = children;
    this._localGrads = localGrads;
  }
  // Operations
  add(other) {
    other = other instanceof Value ? other : new Value(other);
    return new Value(this.data + other.data, [this, other], [1, 1]);
  }
  mul(other) {
    other = other instanceof Value ? other : new Value(other);
    return new Value(
      this.data * other.data,
      [this, other],
      [other.data, this.data],
    );
  }
  pow(exp) {
    return new Value(
      Math.pow(this.data, exp),
      [this],
      [exp * Math.pow(this.data, exp - 1)],
    );
  }
  log() {
    return new Value(Math.log(this.data), [this], [1 / this.data]);
  }
  exp() {
    return new Value(Math.exp(this.data), [this], [Math.exp(this.data)]);
  }
  relu() {
    return new Value(Math.max(0, this.data), [this], [this.data > 0 ? 1 : 0]);
  }
  neg() {
    return this.mul(-1);
  }
  sub(other) {
    other = other instanceof Value ? other : new Value(other);
    return this.add(other.neg());
  }
  div(other) {
    other = other instanceof Value ? other : new Value(other);
    return this.mul(other.pow(-1));
  }
  // Topological sort + backward (matches Python exactly)
  backward() {
    const topo = [];
    const visited = new Set();
    const buildTopo = (v) => {
      if (!visited.has(v)) {
        visited.add(v);
        for (const child of v._children) {
          buildTopo(child);
        }
        topo.push(v);
      }
    };
    buildTopo(this);
    this.grad = 1;
    for (let i = topo.length - 1; i >= 0; i--) {
      const node = topo[i];
      for (let j = 0; j < node._children.length; j++) {
        node._children[j].grad += node._localGrads[j] * node.grad;
      }
    }
  }
}
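// Each op records its children and local derivatives; backward() then walks
// the graph in reverse topological order applying the chain rule. A worked
// example (kept as a comment; not executed):
//   const a = new Value(3);
//   const b = new Value(2);
//   const c = a.mul(b).add(a); // c.data === 9
//   c.backward();              // a.grad === 3 (b + 1), b.grad === 3 (a)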
// -------------------- Model --------------------
// Hyperparameters (matching Python exactly)
const n_embd = 16;
const n_head = 4;
const n_layer = 1;
const block_size = 16; // Python: 16
const head_dim = Math.floor(n_embd / n_head);
const learning_rate = 0.01;
const beta1 = 0.85; // Python: 0.85
const beta2 = 0.99; // Python: 0.99
const eps_adam = 1e-8;
const num_steps = 1000; // Python: 1000
const temperature = 0.5;
// Load data
const inputPath = "input.txt";
const namesUrl =
  "https://raw.githubusercontent.com/karpathy/makemore/refs/heads/master/names.txt";
async function loadData() {
  if (!fs.existsSync(inputPath)) {
    console.log("Downloading names.txt...");
    await download(namesUrl, inputPath);
  }
  const text = fs.readFileSync(inputPath, "utf-8");
  const docs = text
    .trim()
    .split("\n")
    .map((l) => l.trim())
    .filter((l) => l);
  shuffle(docs);
  console.log(`num docs: ${docs.length}`);
  return docs;
}
// Math helpers
function linear(x, w) {
  // x: array of Value, w: matrix (array of arrays of Value)
  // matches Python: [sum(wi * xi for wi, xi in zip(wo, x)) for wo in w]
  return w.map((row) => {
    let s = row[0].mul(x[0]);
    for (let i = 1; i < row.length; i++) {
      s = s.add(row[i].mul(x[i]));
    }
    return s;
  });
}
function softmax(logits) {
  const maxVal = Math.max(...logits.map((v) => v.data));
  const exps = logits.map((v) => v.sub(maxVal).exp());
  let total = exps[0];
  for (let i = 1; i < exps.length; i++) {
    total = total.add(exps[i]);
  }
  return exps.map((e) => e.div(total));
}
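// Subtracting the max logit keeps exp() from overflowing without changing the
// result, since softmax(x - c) === softmax(x). Worked example: softmax of
// logits [1, 2, 3] is approximately [0.090, 0.245, 0.665].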
function rmsnorm(x) {
  // Python: ms = sum(xi * xi for xi in x) / len(x)
  //         scale = (ms + 1e-5) ** -0.5
  //         return [xi * scale for xi in x]
  // This does autograd through the scaling
  let ms = x[0].mul(x[0]);
  for (let i = 1; i < x.length; i++) {
    ms = ms.add(x[i].mul(x[i]));
  }
  ms = ms.mul(1 / x.length);
  const scale = ms.add(1e-5).pow(-0.5);
  return x.map((xi) => xi.mul(scale));
}
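// Worked example: rmsnorm of [3, 4] has ms = (9 + 16) / 2 = 12.5, so
// scale ≈ 12.5^-0.5 ≈ 0.283 and the output is approximately [0.849, 1.131].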
// Forward function
function gpt(tokenId, posId, keys, values, stateDict) {
  // tok_emb + pos_emb
  let x = stateDict.wte[tokenId].map((v, i) => v.add(stateDict.wpe[posId][i]));
  x = rmsnorm(x);
  for (let li = 0; li < n_layer; li++) {
    // 1) Multi-head attention block
    const xResidual = [...x];
    x = rmsnorm(x);
    const q = linear(x, stateDict[`layer${li}.attn_wq`]);
    const k = linear(x, stateDict[`layer${li}.attn_wk`]);
    const v = linear(x, stateDict[`layer${li}.attn_wv`]);
    keys[li].push(k);
    values[li].push(v);
    let xAttn = [];
    for (let h = 0; h < n_head; h++) {
      const hs = h * head_dim;
      const qh = q.slice(hs, hs + head_dim);
      const kh = keys[li].map((kk) => kk.slice(hs, hs + head_dim));
      const vh = values[li].map((vv) => vv.slice(hs, hs + head_dim));
      // Attention logits via Value ops (autograd-compatible, matching Python)
      const attnLogits = kh.map((kk) => {
        let dot = qh[0].mul(kk[0]);
        for (let j = 1; j < head_dim; j++) {
          dot = dot.add(qh[j].mul(kk[j]));
        }
        return dot.mul(1 / Math.sqrt(head_dim));
      });
      const attnWeights = softmax(attnLogits);
      const headOut = [];
      for (let j = 0; j < head_dim; j++) {
        let s = attnWeights[0].mul(vh[0][j]);
        for (let t = 1; t < vh.length; t++) {
          s = s.add(attnWeights[t].mul(vh[t][j]));
        }
        headOut.push(s);
      }
      xAttn.push(...headOut);
    }
    x = linear(xAttn, stateDict[`layer${li}.attn_wo`]);
    x = x.map((v, i) => v.add(xResidual[i]));
    // 2) MLP block
    const xResMlp = [...x];
    x = rmsnorm(x);
    x = linear(x, stateDict[`layer${li}.mlp_fc1`]);
    x = x.map((xi) => xi.relu()); // Python: relu only (no squared)
    x = linear(x, stateDict[`layer${li}.mlp_fc2`]);
    x = x.map((v, i) => v.add(xResMlp[i]));
  }
  return linear(x, stateDict.lm_head);
}
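// Note: gpt() processes one token per call; `keys`/`values` act as a
// per-layer KV cache, so attention at position t sees all t + 1 cached
// tokens. Running a sequence therefore means calling gpt() once per
// position, which is exactly what the training and inference loops below do.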
// Main training loop
async function main() {
  const docs = await loadData();
  // Vocabulary (must be after loadData so docs is populated)
  const uchars = [...new Set(docs.join(""))].sort();
  const BOS = uchars.length;
  const vocab_size = uchars.length + 1;
  console.log(`vocab size: ${vocab_size}`);
  // Parameter matrices (array of arrays of Value)
  function createMatrix(nout, nin, std = 0.08) {
    return Array.from({ length: nout }, () =>
      Array.from({ length: nin }, () => new Value(randomGauss(0, std))),
    );
  }
  const stateDict = {
    wte: createMatrix(vocab_size, n_embd),
    wpe: createMatrix(block_size, n_embd),
    lm_head: createMatrix(vocab_size, n_embd),
  };
  for (let i = 0; i < n_layer; i++) {
    stateDict[`layer${i}.attn_wq`] = createMatrix(n_embd, n_embd);
    stateDict[`layer${i}.attn_wk`] = createMatrix(n_embd, n_embd);
    stateDict[`layer${i}.attn_wv`] = createMatrix(n_embd, n_embd);
    stateDict[`layer${i}.attn_wo`] = createMatrix(n_embd, n_embd);
    stateDict[`layer${i}.mlp_fc1`] = createMatrix(4 * n_embd, n_embd);
    stateDict[`layer${i}.mlp_fc2`] = createMatrix(n_embd, 4 * n_embd);
  }
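  // Assuming names.txt is the usual all-lowercase list (a-z, so
  // vocab_size = 27), this works out to 27*16 (wte) + 16*16 (wpe) +
  // 27*16 (lm_head) + 3072 per layer (four 16x16 attention matrices plus
  // the 64x16 and 16x64 MLP matrices) = 4,192 parameters, which the
  // log line below should confirm.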
  // Flatten all parameters for optimizer
  const params = [];
  Object.values(stateDict).forEach((mat) => {
    mat.forEach((row) => row.forEach((v) => params.push(v)));
  });
  console.log(`num params: ${params.length}`);
  // Adam optimizer buffers
  const m = new Array(params.length).fill(0);
  const v = new Array(params.length).fill(0);
  // Training
  for (let step = 0; step < num_steps; step++) {
    const doc = docs[step % docs.length];
    const tokens = [BOS, ...doc.split("").map((c) => uchars.indexOf(c)), BOS];
    const n = Math.min(block_size, tokens.length - 1);
    const keys = Array.from({ length: n_layer }, () => []);
    const values = Array.from({ length: n_layer }, () => []);
    const losses = [];
    for (let pos = 0; pos < n; pos++) {
      const tokenId = tokens[pos];
      const targetId = tokens[pos + 1];
      const logits = gpt(tokenId, pos, keys, values, stateDict);
      const probs = softmax(logits);
      const lossT = probs[targetId].log().neg();
      losses.push(lossT);
    }
    // loss = (1/n) * sum(losses)
    let loss = losses[0];
    for (let i = 1; i < losses.length; i++) {
      loss = loss.add(losses[i]);
    }
    loss = loss.mul(1 / n);
    // Backward
    loss.backward();
    // Adam optimizer update with linear lr decay (matching Python)
    const lr_t = learning_rate * (1 - step / num_steps);
    for (let i = 0; i < params.length; i++) {
      const p = params[i];
      m[i] = beta1 * m[i] + (1 - beta1) * p.grad;
      v[i] = beta2 * v[i] + (1 - beta2) * p.grad * p.grad;
      const mHat = m[i] / (1 - Math.pow(beta1, step + 1));
      const vHat = v[i] / (1 - Math.pow(beta2, step + 1));
      p.data -= (lr_t * mHat) / (Math.sqrt(vHat) + eps_adam);
      p.grad = 0; // zero grad after update (matching Python)
    }
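    // mHat/vHat are Adam's bias corrections: m and v start at zero, so early
    // estimates are rescaled by 1/(1 - beta^(step+1)) to keep them unbiased.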
    console.log(
      `step ${String(step + 1).padStart(4)} / ${String(num_steps).padStart(4)} | loss ${loss.data.toFixed(4)}`,
    );
  }
  // Inference
  console.log("\n--- inference (new, hallucinated names) ---");
  for (let sampleIdx = 0; sampleIdx < 20; sampleIdx++) {
    const keys = Array.from({ length: n_layer }, () => []);
    const values = Array.from({ length: n_layer }, () => []);
    let tokenId = BOS;
    const sample = [];
    for (let pos = 0; pos < block_size; pos++) {
      const logits = gpt(tokenId, pos, keys, values, stateDict);
      // Python: softmax([l / temperature for l in logits])
      const scaledLogits = logits.map((l) => l.mul(1 / temperature));
      const probs = softmax(scaledLogits);
      const weights = probs.map((p) => p.data);
      tokenId = randomChoice([...Array(vocab_size).keys()], weights);
      if (tokenId === BOS) break;
      sample.push(uchars[tokenId]);
    }
    console.log(
      `sample ${String(sampleIdx + 1).padStart(2)}: ${sample.join("")}`,
    );
  }
}
main();
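// Usage: `node microgpt.js`. The first run downloads names.txt to input.txt;
// training then runs for num_steps steps and prints 20 sampled names.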