microgpt (karpathy)
| """ | |
| The most atomic way to train and inference a GPT in pure, dependency-free Python. | |
| This file is the complete algorithm. Everything else is just efficiency. | |
| @karpathy and @claude with the refactoring barkie dog :ruff: | |
| """ | |
| import logging | |
| import math # math.log, math.exp | |
| import operator | |
| import random # random.seed, random.choices, random.gauss, random.shuffle | |
| from itertools import chain, repeat | |
| from pathlib import Path | |
| logging.basicConfig(level=logging.INFO, format="%(message)s") | |
| logger = logging.getLogger(__name__) | |
| # Let there be order among chaos | |
| random.seed(42) | |
| # Let there be an input dataset `docs`: list[str] of documents (e.g. a dataset of names) | |
| if not Path("input.txt").exists(): | |
| import urllib.request | |
| urllib.request.urlretrieve( | |
| "https://raw.githubusercontent.com/karpathy/makemore/refs/heads/master/names.txt", | |
| "input.txt", | |
| ) | |
| docs = [ | |
| line.strip() | |
| for line in Path("input.txt") | |
| .read_text(encoding="utf-8") | |
| .strip() | |
| .split("\n") | |
| if line.strip() | |
| ] # list[str] of documents | |
| random.shuffle(docs) | |
| logger.info("num docs: %s", len(docs)) | |
| # Let there be a Tokenizer to translate strings to discrete symbols and back | |
| chars = ["<BOS>", *sorted(set("".join(docs)))] # character-level tokenizer | |
| vocab_size = len(chars) | |
| stoi = {char: i for i, char in enumerate(chars)} # encoding: map string to integer | |
| itos = dict(enumerate(chars)) # decoding: map integer to string | |
| BOS = stoi["<BOS>"] | |
| logger.info("vocab size: %s", vocab_size) | |
# Let there be an Autograd to apply the chain rule recursively across a
# computation graph and so calculate gradients of the loss wrt model parameters.
class Value:
    """Stores a single scalar value and its gradient for automatic differentiation.
    Implements a minimal autograd engine supporting basic arithmetic operations,
    mathematical functions (log, exp), and ReLU activation with automatic
    backpropagation through reverse topological sort.
    References:
        - micrograd: https://github.com/karpathy/micrograd
        - Automatic Differentiation: https://en.wikipedia.org/wiki/Automatic_differentiation
    """

    def __init__(self, data, _children=(), _op=""):
        """Initialize autograd node with scalar value and gradient tracking.
        Args:
            data: Scalar numerical value.
            _children: Tuple of child nodes in computation graph.
            _op: String name of operation that produced this node.
        """
        self.data = data
        self.grad = 0
        self._backward = lambda: None
        self._prev = set(_children)
        self._op = _op  # the op that produced this node, for graphviz / debugging / etc

    def __add__(self, other):
        """Element-wise addition with automatic differentiation.
        Args:
            other: Scalar, int, or Value to add.
        Returns:
            New Value with sum, tracking gradient for backprop.
        References:
            - micrograd: https://github.com/karpathy/micrograd
        """
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data + other.data, (self, other), "+")

        def _backward():
            self.grad += out.grad
            other.grad += out.grad

        out._backward = _backward
        return out

    def __mul__(self, other):
        """Element-wise multiplication with automatic differentiation.
        Args:
            other: Scalar, int, or Value to multiply.
        Returns:
            New Value with product, tracking gradient for backprop.
        References:
            - micrograd: https://github.com/karpathy/micrograd
        """
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data * other.data, (self, other), "*")

        def _backward():
            self.grad += other.data * out.grad
            other.grad += self.data * out.grad

        out._backward = _backward
        return out

    def __pow__(self, other):
        """Power operation with automatic differentiation.
        Args:
            other: Int or float exponent (must be numeric type).
        Returns:
            New Value with power result, tracking gradient for backprop.
        Raises:
            TypeError: If exponent is not int or float.
        References:
            - micrograd: https://github.com/karpathy/micrograd
        """
        if not isinstance(other, (int, float)):
            msg = "only supporting int/float powers for now"
            raise TypeError(msg)
        out = Value(self.data**other, (self,), f"**{other}")

        def _backward():
            self.grad += (other * self.data ** (other - 1)) * out.grad

        out._backward = _backward
        return out

    def log(self):
        """Natural logarithm with automatic differentiation.
        Returns:
            New Value with ln(self), tracking gradient for backprop.
        References:
            - micrograd: https://github.com/karpathy/micrograd
        """
        out = Value(math.log(self.data), (self,), "log")

        def _backward():
            self.grad += (1 / self.data) * out.grad

        out._backward = _backward
        return out

    def exp(self):
        """Exponential function (e^x) with automatic differentiation.
        Returns:
            New Value with exp(self), tracking gradient for backprop.
        References:
            - micrograd: https://github.com/karpathy/micrograd
        """
        out = Value(math.exp(self.data), (self,), "exp")

        def _backward():
            self.grad += out.data * out.grad

        out._backward = _backward
        return out

    def relu(self):
        """Rectified Linear Unit activation with automatic differentiation.
        Returns max(0, x) which provides non-linearity for neural networks.
        Returns:
            New Value with ReLU(self), tracking gradient for backprop.
        References:
            - Krizhevsky et al. (2012): https://papers.nips.cc/paper/2012/hash/c399862d3b9d6b76c8436e924a68c45b-Abstract.html
            - micrograd: https://github.com/karpathy/micrograd
        """
        out = Value(max(self.data, 0), (self,), "ReLU")

        def _backward():
            self.grad += (out.data > 0) * out.grad

        out._backward = _backward
        return out

    def backward(self):
        """Backpropagation: compute gradients via reverse topological sort.
        Performs automatic differentiation by traversing computation graph in
        reverse topological order, applying chain rule at each node.
        References:
            - Rumelhart et al. (1986): https://www.nature.com/articles/323533a0
            - Automatic Differentiation: https://en.wikipedia.org/wiki/Automatic_differentiation
            - Griewank & Walther (2008): Evaluating Derivatives, 2nd ed., SIAM
            - micrograd: https://github.com/karpathy/micrograd
        """
        # topological order all of the children in the graph
        topo = []
        visited = set()

        def build_topo(v):
            if v not in visited:
                visited.add(v)
                for child in v._prev:
                    build_topo(child)
                topo.append(v)

        build_topo(self)
        # go one variable at a time and apply the chain rule to get its gradient
        self.grad = 1
        for v in reversed(topo):
            v._backward()

    def __neg__(self):
        """Unary negation operator (-x).
        Returns:
            New Value with negated value.
        References:
            - micrograd: https://github.com/karpathy/micrograd
        """
        return self * -1

    def __radd__(self, other):
        """Right addition operator (other + self) for reverse operations.
        Args:
            other: Scalar, int, or Value.
        Returns:
            New Value with sum (delegates to __add__).
        References:
            - micrograd: https://github.com/karpathy/micrograd
        """
        return self + other

    def __sub__(self, other):
        """Subtraction operator (self - other).
        Args:
            other: Scalar, int, or Value to subtract.
        Returns:
            New Value with difference.
        References:
            - micrograd: https://github.com/karpathy/micrograd
        """
        return self + (-other)

    def __rsub__(self, other):
        """Right subtraction operator (other - self) for reverse operations.
        Args:
            other: Scalar, int, or Value.
        Returns:
            New Value with difference.
        References:
            - micrograd: https://github.com/karpathy/micrograd
        """
        return other + (-self)

    def __rmul__(self, other):
        """Right multiplication operator (other * self) for reverse operations.
        Args:
            other: Scalar, int, or Value.
        Returns:
            New Value with product (delegates to __mul__).
        References:
            - micrograd: https://github.com/karpathy/micrograd
        """
        return self * other

    def __truediv__(self, other):
        """Division operator (self / other).
        Args:
            other: Scalar, int, or Value denominator.
        Returns:
            New Value with quotient.
        References:
            - micrograd: https://github.com/karpathy/micrograd
        """
        return self * other**-1

    def __rtruediv__(self, other):
        """Right division operator (other / self) for reverse operations.
        Args:
            other: Scalar, int, or Value numerator.
        Returns:
            New Value with quotient.
        References:
            - micrograd: https://github.com/karpathy/micrograd
        """
        return other * self**-1

    def __repr__(self):
        """String representation showing data and gradient.
        Returns:
            String in format "Value(data=..., grad=...)".
        References:
            - micrograd: https://github.com/karpathy/micrograd
        """
        return f"Value(data={self.data}, grad={self.grad})"
# Initialize the parameters, to store the knowledge of the model.
n_embd = 16  # embedding dimension
n_head = 4  # number of attention heads
n_layer = 1  # number of layers
block_size = 8  # maximum sequence length
head_dim = n_embd // n_head  # dimension of each head


def matrix(nout, nin, std=0.02):
    """Create random parameter matrix with Gaussian initialization.
    Args:
        nout: Output dimension (number of neurons).
        nin: Input dimension (fan-in).
        std: Standard deviation for Gaussian initialization (default 0.02).
    Returns:
        2D list of Value objects representing weight matrix shape (nout, nin).
    References:
        - He et al. (2015): https://arxiv.org/abs/1502.01852
        - nanoGPT: https://github.com/karpathy/nanoGPT
    """
    return [[Value(random.gauss(0, std)) for _ in range(nin)] for _ in range(nout)]


state_dict = {
    "wte": matrix(vocab_size, n_embd),
    "wpe": matrix(block_size, n_embd),
    "lm_head": matrix(vocab_size, n_embd),
}
for layer_idx in range(n_layer):
    state_dict[f"layer{layer_idx}.attn_wq"] = matrix(n_embd, n_embd)
    state_dict[f"layer{layer_idx}.attn_wk"] = matrix(n_embd, n_embd)
    state_dict[f"layer{layer_idx}.attn_wv"] = matrix(n_embd, n_embd)
    state_dict[f"layer{layer_idx}.attn_wo"] = matrix(n_embd, n_embd, std=0)
    state_dict[f"layer{layer_idx}.mlp_fc1"] = matrix(4 * n_embd, n_embd)
    state_dict[f"layer{layer_idx}.mlp_fc2"] = matrix(n_embd, 4 * n_embd, std=0)
params = list(
    chain.from_iterable(chain.from_iterable(mat) for mat in state_dict.values()),
)  # flatten params into a single list[Value]
logger.info("num params: %s", len(params))
# Define the model architecture, a stateless function mapping token streams
# and model parameters to logits over what comes next. Follow GPT-2 (blessed
# among the GPTs) with minor differences: layernorm -> rmsnorm, no biases,
# GeLU -> ReLU^2, no weight tying


def linear(x, w):
    """Matrix-vector multiplication for forward pass.
    Args:
        x: Input vector (list of Values).
        w: Weight matrix (list of lists of Values) shape (out_dim, in_dim).
    Returns:
        Output vector after linear transformation shape (out_dim,).
    References:
        - Transformer architecture: https://arxiv.org/abs/1706.03762
        - nanoGPT: https://github.com/karpathy/nanoGPT
    """
    return [sum(wi * xi for wi, xi in zip(wo, x)) for wo in w]


def softmax(logits):
    """Convert logits to probability distribution with numerical stability.
    Uses max subtraction trick to prevent numerical overflow in exponentials.
    Args:
        logits: List of Value objects representing unnormalized scores.
    Returns:
        List of Value objects normalized to sum to 1.
    References:
        - Goodfellow et al. (2016): Deep Learning, Section 4.1
        - Numerical stability: https://en.wikipedia.org/wiki/Softmax_function
    """
    max_val = max(val.data for val in logits)
    exps = [(val - max_val).exp() for val in logits]
    total = sum(exps)
    return [e / total for e in exps]
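# Note on the max-subtraction trick above: softmax is shift-invariant,
# softmax(z)_i = exp(z_i - c) / sum_j exp(z_j - c) for any constant c, so subtracting
# max(z) leaves the output unchanged while keeping every exponent <= 0 and therefore
# safely representable as a float.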
def rmsnorm(x):
    """Root Mean Square layer normalization.
    Alternative to LayerNorm that normalizes using RMS instead of mean+std.
    More efficient and used in modern architectures like T5.
    Args:
        x: Input vector (list of Values).
    Returns:
        Normalized vector with RMS stabilization.
    References:
        - Zhang & Sennrich (2019): https://arxiv.org/abs/1910.07467
        - T5 paper: https://arxiv.org/abs/1910.10683
        - nanoGPT: https://github.com/karpathy/nanoGPT
    """
    ms = sum(xi * xi for xi in x) / len(x)
    scale = (ms + 1e-5) ** -0.5  # keep the sqrt inside autograd; math.sqrt on a Value would fail
    return [xi * scale for xi in x]
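# RMSNorm above computes x_i / sqrt(mean(x^2) + eps) with eps = 1e-5; unlike LayerNorm
# it does not subtract the mean, and this dependency-free version also omits the
# learnable gain that the RMSNorm paper includes.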
def attention_head(q_h, k_h, v_h, head_dim):
    """Compute single attention head output.
    Performs scaled dot-product attention for one attention head.
    Args:
        q_h: Query vector for this head (list of Values).
        k_h: Key vectors for all positions in this head (list of lists).
        v_h: Value vectors for all positions in this head (list of lists).
        head_dim: Dimension of each attention head.
    Returns:
        Output vector for this attention head (list of Values).
    References:
        - Attention Is All You Need (Vaswani et al. 2017):
          https://arxiv.org/abs/1706.03762
    """
    attn_logits = [
        sum(
            q_h[dim_idx] * k_h[seq_idx][dim_idx]
            for dim_idx in range(head_dim)
        )
        / math.sqrt(head_dim)
        for seq_idx in range(len(k_h))
    ]
    attn_weights = softmax(attn_logits)
    head_out = [
        sum(
            attn_weights[seq_idx] * v_h[seq_idx][dim_idx]
            for seq_idx in range(len(v_h))
        )
        for dim_idx in range(head_dim)
    ]
    return head_out
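# The head above is plain scaled dot-product attention, out = softmax(q·K^T / sqrt(d)) V,
# computed for a single query position against all cached positions; causal masking is
# implicit here because the KV cache only ever contains past and current positions.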
def mlp_block(x, layer_idx):
    """Compute MLP (feed-forward) layer with ReLU^2 activation.
    Applies two linear transformations with ReLU squared non-linearity.
    Args:
        x: Input vector (list of Values).
        layer_idx: Index of the transformer layer.
    Returns:
        Output vector after MLP transformation (list of Values).
    References:
        - Attention Is All You Need (Vaswani et al. 2017):
          https://arxiv.org/abs/1706.03762
        - nanoGPT: https://github.com/karpathy/nanoGPT
    """
    x = linear(x, state_dict[f"layer{layer_idx}.mlp_fc1"])
    x = [xi.relu() ** 2 for xi in x]
    x = linear(x, state_dict[f"layer{layer_idx}.mlp_fc2"])
    return x
def gpt(token_id, pos_id, keys, values):
    """Forward pass through GPT model: embedding → multi-head attention → MLP.
    Implements a simplified GPT-2 style transformer layer with:
    - Token and position embeddings
    - Multi-head scaled dot-product attention
    - Feed-forward MLP with ReLU^2 activation
    - Residual connections and RMSNorm
    Args:
        token_id: Token index in vocabulary [0, vocab_size).
        pos_id: Position in sequence [0, block_size).
        keys: Cached key vectors for each layer (mutated in-place).
        values: Cached value vectors for each layer (mutated in-place).
    Returns:
        Logits over vocabulary for next token prediction.
    References:
        - Attention Is All You Need (Vaswani et al. 2017):
          https://arxiv.org/abs/1706.03762
        - Language Models are Unsupervised Multitask Learners (Radford et al. 2019):
          https://github.com/tpn/pdfs/blob/master/Language%20Models%20are%20Unsupervised%20Multitask%20Learners.pdf
        - nanoGPT: https://github.com/karpathy/nanoGPT
        - makemore: https://github.com/karpathy/makemore
    """
    x = list(map(operator.add, state_dict["wte"][token_id], state_dict["wpe"][pos_id]))
    x = rmsnorm(x)
    for layer_idx in range(n_layer):
        # 1) Multi-head attention block
        x_residual = x
        x = rmsnorm(x)
        q = linear(x, state_dict[f"layer{layer_idx}.attn_wq"])
        k = linear(x, state_dict[f"layer{layer_idx}.attn_wk"])
        v = linear(x, state_dict[f"layer{layer_idx}.attn_wv"])
        keys[layer_idx].append(k)
        values[layer_idx].append(v)
        x_attn = []
        for head_idx in range(n_head):
            head_start = head_idx * head_dim
            q_h = q[head_start : head_start + head_dim]
            k_h = [ki[head_start : head_start + head_dim] for ki in keys[layer_idx]]
            v_h = [vi[head_start : head_start + head_dim] for vi in values[layer_idx]]
            head_out = attention_head(q_h, k_h, v_h, head_dim)
            x_attn.extend(head_out)
        x = linear(x_attn, state_dict[f"layer{layer_idx}.attn_wo"])
        x = list(map(operator.add, x, x_residual))
        # 2) MLP block
        x_residual = x
        x = rmsnorm(x)
        x = mlp_block(x, layer_idx)
        x = list(map(operator.add, x, x_residual))
    return linear(x, state_dict["lm_head"])
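# Illustrative call pattern (mirrors the loops below): the caches are per-layer lists
# that gpt() appends to, so a fresh sequence starts from empty caches, e.g.
#   keys, values = [[] for _ in repeat(None, n_layer)], [[] for _ in repeat(None, n_layer)]
#   logits = gpt(BOS, 0, keys, values)  # logits over the next token at position 0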
# Let there be Adam, the blessed optimizer and its buffers
learning_rate, beta1, beta2, eps_adam = 1e-2, 0.9, 0.95, 1e-8
m = [0.0] * len(params)  # first moment buffer
v = [0.0] * len(params)  # second moment buffer
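# The update applied in the training loop below follows Adam (Kingma & Ba, 2014,
# https://arxiv.org/abs/1412.6980): m <- b1*m + (1-b1)*g, v <- b2*v + (1-b2)*g^2, bias
# correction m_hat = m / (1 - b1^t), v_hat = v / (1 - b2^t), and the parameter step
# p <- p - lr_t * m_hat / (sqrt(v_hat) + eps), with lr_t decaying linearly toward zero.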
# Repeat in sequence
num_steps = 500  # number of training steps
for step in range(num_steps):
    # Take single document, tokenize it, surround with BOS tokens on both sides
    doc = docs[step % len(docs)]
    tokens = list(chain([BOS], (stoi[char] for char in doc), [BOS]))
    num_positions = min(block_size, len(tokens) - 1)
    # Forward the token sequence through the model, building up the computation
    # graph all the way to the loss.
    keys, values = (
        [[] for _ in repeat(None, n_layer)],
        [[] for _ in repeat(None, n_layer)],
    )
    losses = []
    for pos_id in range(num_positions):
        token_id, target_id = tokens[pos_id], tokens[pos_id + 1]
        logits = gpt(token_id, pos_id, keys, values)
        probs = softmax(logits)
        loss_t = -probs[target_id].log()
        losses.append(loss_t)
    loss = (1 / num_positions) * sum(
        losses,
    )  # final average loss over the document sequence. May yours be low.
    # Backward the loss, calculating the gradients with respect to all model parameters.
    loss.backward()
    # Adam optimizer update: update the model parameters based on gradients.
    lr_t = learning_rate * (1 - step / num_steps)
    for param_idx, p in enumerate(params):
        m[param_idx] = beta1 * m[param_idx] + (1 - beta1) * p.grad
        v[param_idx] = beta2 * v[param_idx] + (1 - beta2) * p.grad**2
        m_hat = m[param_idx] / (1 - beta1 ** (step + 1))
        v_hat = v[param_idx] / (1 - beta2 ** (step + 1))
        p.data -= lr_t * m_hat / (math.sqrt(v_hat) + eps_adam)
        p.grad = 0
    logger.info("step %s / %s | loss %.4f", step + 1, num_steps, loss.data)
# Inference: may the model babble back to us
temperature = 0.6  # in (0, 1], control the "creativity" of generated text, low to high
logger.info("\n--- inference ---")
for sample_idx in range(20):
    keys, values = (
        [[] for _ in repeat(None, n_layer)],
        [[] for _ in repeat(None, n_layer)],
    )
    token_id = BOS
    sample_text = ""
    for pos_id in range(block_size):
        logits = gpt(token_id, pos_id, keys, values)
        probs = softmax([logit / temperature for logit in logits])
        weights = [p.data for p in probs]
        token_id = random.choices(range(vocab_size), weights=weights)[0]
        if token_id == BOS:
            break
        sample_text += itos[token_id]
    logger.info("sample %s: %s", sample_idx + 1, sample_text)