microgpt (karpathy)
"""
The most atomic way to train and inference a GPT in pure, dependency-free Python.
This file is the complete algorithm. Everything else is just efficiency.
@karpathy and @claude with the refactoring barkie dog :ruff:
"""
import logging
import math # math.log, math.exp
import operator
import random # random.seed, random.choices, random.gauss, random.shuffle
from itertools import chain, repeat
from pathlib import Path
logging.basicConfig(level=logging.INFO, format="%(message)s")
logger = logging.getLogger(__name__)
# Let there be order among chaos
random.seed(42)
# Let there be an input dataset `docs`: list[str] of documents (e.g. a dataset of names)
if not Path("input.txt").exists():
import urllib.request
urllib.request.urlretrieve(
"https://raw.githubusercontent.com/karpathy/makemore/refs/heads/master/names.txt",
"input.txt",
)
docs = [
line.strip()
for line in Path("input.txt")
.read_text(encoding="utf-8")
.strip()
.split("\n")
if line.strip()
] # list[str] of documents
random.shuffle(docs)
logger.info("num docs: %s", len(docs))
# Let there be a Tokenizer to translate strings to discrete symbols and back
chars = ["<BOS>", *sorted(set("".join(docs)))] # character-level tokenizer
vocab_size = len(chars)
stoi = {char: i for i, char in enumerate(chars)} # encoding: map string to integer
itos = dict(enumerate(chars)) # decoding: map integer to string
BOS = stoi["<BOS>"]
logger.info("vocab size: %s", vocab_size)
# Let there be an Autograd to apply the chain rule recursively across a
# computation graph and so calculate gradients of the loss wrt model parameters.
class Value:
"""Stores a single scalar value and its gradient for automatic differentiation.
Implements a minimal autograd engine supporting basic arithmetic operations,
mathematical functions (log, exp), and ReLU activation with automatic
backpropagation through reverse topological sort.
References:
- micrograd: https://github.com/karpathy/micrograd
- Automatic Differentiation: https://en.wikipedia.org/wiki/Automatic_differentiation
"""
def __init__(self, data, _children=(), _op=""):
"""Initialize autograd node with scalar value and gradient tracking.
Args:
data: Scalar numerical value.
_children: Tuple of child nodes in computation graph.
_op: String name of operation that produced this node.
"""
self.data = data
self.grad = 0
self._backward = lambda: None
self._prev = set(_children)
self._op = _op # the op that produced this node, for graphviz / debugging / etc
def __add__(self, other):
"""Element-wise addition with automatic differentiation.
Args:
other: Scalar, int, or Value to add.
Returns:
New Value with sum, tracking gradient for backprop.
References:
- micrograd: https://github.com/karpathy/micrograd
"""
other = other if isinstance(other, Value) else Value(other)
out = Value(self.data + other.data, (self, other), "+")
def _backward():
self.grad += out.grad
other.grad += out.grad
out._backward = _backward
return out
def __mul__(self, other):
"""Element-wise multiplication with automatic differentiation.
Args:
other: Scalar, int, or Value to multiply.
Returns:
New Value with product, tracking gradient for backprop.
References:
- micrograd: https://github.com/karpathy/micrograd
"""
other = other if isinstance(other, Value) else Value(other)
out = Value(self.data * other.data, (self, other), "*")
def _backward():
self.grad += other.data * out.grad
other.grad += self.data * out.grad
out._backward = _backward
return out
def __pow__(self, other):
"""Power operation with automatic differentiation.
Args:
other: Int or float exponent (must be numeric type).
Returns:
New Value with power result, tracking gradient for backprop.
Raises:
TypeError: If exponent is not int or float.
References:
- micrograd: https://github.com/karpathy/micrograd
"""
if not isinstance(other, (int, float)):
msg = "only supporting int/float powers for now"
raise TypeError(msg)
out = Value(self.data**other, (self,), f"**{other}")
def _backward():
self.grad += (other * self.data ** (other - 1)) * out.grad
out._backward = _backward
return out
def log(self):
"""Natural logarithm with automatic differentiation.
Returns:
New Value with ln(self), tracking gradient for backprop.
References:
- micrograd: https://github.com/karpathy/micrograd
"""
out = Value(math.log(self.data), (self,), "log")
def _backward():
self.grad += (1 / self.data) * out.grad
out._backward = _backward
return out
def exp(self):
"""Exponential function (e^x) with automatic differentiation.
Returns:
New Value with exp(self), tracking gradient for backprop.
References:
- micrograd: https://github.com/karpathy/micrograd
"""
out = Value(math.exp(self.data), (self,), "exp")
def _backward():
self.grad += out.data * out.grad
out._backward = _backward
return out
def relu(self):
"""Rectified Linear Unit activation with automatic differentiation.
Returns max(0, x) which provides non-linearity for neural networks.
Returns:
New Value with ReLU(self), tracking gradient for backprop.
References:
- Krizhevsky et al. (2012): https://papers.nips.cc/paper/2012/hash/c399862d3b9d6b76c8436e924a68c45b-Abstract.html
- micrograd: https://github.com/karpathy/micrograd
"""
out = Value(max(self.data, 0), (self,), "ReLU")
def _backward():
self.grad += (out.data > 0) * out.grad
out._backward = _backward
return out
def backward(self):
"""Backpropagation: compute gradients via reverse topological sort.
Performs automatic differentiation by traversing computation graph in
reverse topological order, applying chain rule at each node.
References:
- Rumelhart et al. (1986): https://www.nature.com/articles/323533a0
- Automatic Differentiation: https://en.wikipedia.org/wiki/Automatic_differentiation
- Griewank & Walther (2008): Evaluating Derivatives, 2nd ed., SIAM
- micrograd: https://github.com/karpathy/micrograd
"""
# topological order all of the children in the graph
topo = []
visited = set()
def build_topo(v):
if v not in visited:
visited.add(v)
for child in v._prev:
build_topo(child)
topo.append(v)
build_topo(self)
# go one variable at a time and apply the chain rule to get its gradient
self.grad = 1
for v in reversed(topo):
v._backward()
def __neg__(self):
"""Unary negation operator (-x).
Returns:
New Value with negated value.
References:
- micrograd: https://github.com/karpathy/micrograd
"""
return self * -1
def __radd__(self, other):
"""Right addition operator (other + self) for reverse operations.
Args:
other: Scalar, int, or Value.
Returns:
New Value with sum (delegates to __add__).
References:
- micrograd: https://github.com/karpathy/micrograd
"""
return self + other
def __sub__(self, other):
"""Subtraction operator (self - other).
Args:
other: Scalar, int, or Value to subtract.
Returns:
New Value with difference.
References:
- micrograd: https://github.com/karpathy/micrograd
"""
return self + (-other)
def __rsub__(self, other):
"""Right subtraction operator (other - self) for reverse operations.
Args:
other: Scalar, int, or Value.
Returns:
New Value with difference.
References:
- micrograd: https://github.com/karpathy/micrograd
"""
return other + (-self)
def __rmul__(self, other):
"""Right multiplication operator (other * self) for reverse operations.
Args:
other: Scalar, int, or Value.
Returns:
New Value with product (delegates to __mul__).
References:
- micrograd: https://github.com/karpathy/micrograd
"""
return self * other
def __truediv__(self, other):
"""Division operator (self / other).
Args:
other: Scalar, int, or Value denominator.
Returns:
New Value with quotient.
References:
- micrograd: https://github.com/karpathy/micrograd
"""
return self * other**-1
def __rtruediv__(self, other):
"""Right division operator (other / self) for reverse operations.
Args:
other: Scalar, int, or Value numerator.
Returns:
New Value with quotient.
References:
- micrograd: https://github.com/karpathy/micrograd
"""
return other * self**-1
def __repr__(self):
"""String representation showing data and gradient.
Returns:
String in format "Value(data=..., grad=...)".
References:
- micrograd: https://github.com/karpathy/micrograd
"""
return f"Value(data={self.data}, grad={self.grad})"
# Initialize the parameters, to store the knowledge of the model.
n_embd = 16 # embedding dimension
n_head = 4 # number of attention heads
n_layer = 1 # number of layers
block_size = 8 # maximum sequence length
head_dim = n_embd // n_head # dimension of each head
def matrix(nout, nin, std=0.02):
"""Create random parameter matrix with Gaussian initialization.
Args:
nout: Output dimension (number of neurons).
nin: Input dimension (fan-in).
std: Standard deviation for Gaussian initialization (default 0.02).
Returns:
2D list of Value objects representing weight matrix shape (nout, nin).
References:
- He et al. (2015): https://arxiv.org/abs/1502.01852
- nanoGPT: https://github.com/karpathy/nanoGPT
"""
return [
[Value(random.gauss(0, std)) for _ in range(nin)] for _ in range(nout)
]
state_dict = {
"wte": matrix(vocab_size, n_embd),
"wpe": matrix(block_size, n_embd),
"lm_head": matrix(vocab_size, n_embd),
}
for layer_idx in range(n_layer):
state_dict[f"layer{layer_idx}.attn_wq"] = matrix(n_embd, n_embd)
state_dict[f"layer{layer_idx}.attn_wk"] = matrix(n_embd, n_embd)
state_dict[f"layer{layer_idx}.attn_wv"] = matrix(n_embd, n_embd)
state_dict[f"layer{layer_idx}.attn_wo"] = matrix(n_embd, n_embd, std=0)
state_dict[f"layer{layer_idx}.mlp_fc1"] = matrix(4 * n_embd, n_embd)
state_dict[f"layer{layer_idx}.mlp_fc2"] = matrix(n_embd, 4 * n_embd, std=0)
params = list(
chain.from_iterable(
chain.from_iterable(mat) for mat in state_dict.values()
),
) # flatten params into a single list[Value]
logger.info("num params: %s", len(params))
# Define the model architecture, a stateless function mapping token streams
# and model parameters to logits over what comes next. Follow GPT-2 (blessed
# among the GPTs) with minor differences: layernorm -> rmsnorm, no biases,
# GeLU -> ReLU^2, no weight tying
def linear(x, w):
"""Matrix-vector multiplication for forward pass.
Args:
x: Input vector (list of Values).
w: Weight matrix (list of lists of Values) shape (out_dim, in_dim).
Returns:
Output vector after linear transformation shape (out_dim,).
References:
- Transformer architecture: https://arxiv.org/abs/1706.03762
- nanoGPT: https://github.com/karpathy/nanoGPT
"""
return [sum(wi * xi for wi, xi in zip(wo, x)) for wo in w]
def softmax(logits):
"""Convert logits to probability distribution with numerical stability.
Uses max subtraction trick to prevent numerical overflow in exponentials.
Args:
logits: List of Value objects representing unnormalized scores.
Returns:
List of Value objects normalized to sum to 1.
References:
- Goodfellow et al. (2016): Deep Learning, Section 4.1
- Numerical stability: https://en.wikipedia.org/wiki/Softmax_function
"""
max_val = max(val.data for val in logits)
exps = [(val - max_val).exp() for val in logits]
total = sum(exps)
return [e / total for e in exps]
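# In formula form: softmax(z)_i = exp(z_i - max(z)) / sum_j exp(z_j - max(z)).
# The subtracted max(z) cancels in the ratio, so the result is identical to the
# textbook exp(z_i) / sum_j exp(z_j); it only keeps exp() from overflowing.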
def rmsnorm(x):
"""Root Mean Square layer normalization.
Alternative to LayerNorm that normalizes using RMS instead of mean+std.
More efficient and used in modern architectures like T5.
Args:
x: Input vector (list of Values).
Returns:
Normalized vector with RMS stabilization.
References:
- Zhang & Sennrich (2019), Root Mean Square Layer Normalization: https://arxiv.org/abs/1910.07467
- T5 paper: https://arxiv.org/abs/1910.10683
- nanoGPT: https://github.com/karpathy/nanoGPT
"""
ms = sum(xi * xi for xi in x) / len(x)
scale = (ms + 1e-5) ** -0.5  # via Value.__pow__, so gradients flow through the normalization
return [xi * scale for xi in x]
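# In formula form: rmsnorm(x)_i = x_i / sqrt(mean(x^2) + eps) with eps = 1e-5.
# Unlike LayerNorm there is no mean subtraction and, in this minimal version,
# no learned gain, so the operation adds no parameters.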
def attention_head(q_h, k_h, v_h, head_dim):
"""Compute single attention head output.
Performs scaled dot-product attention for one attention head.
Args:
q_h: Query vector for this head (list of Values).
k_h: Key vectors for all positions in this head (list of lists).
v_h: Value vectors for all positions in this head (list of lists).
head_dim: Dimension of each attention head.
Returns:
Output vector for this attention head (list of Values).
References:
- Attention Is All You Need (Vaswani et al. 2017):
https://arxiv.org/abs/1706.03762
"""
attn_logits = [
sum(
q_h[dim_idx] * k_h[seq_idx][dim_idx]
for dim_idx in range(head_dim)
)
/ math.sqrt(head_dim)
for seq_idx in range(len(k_h))
]
attn_weights = softmax(attn_logits)
head_out = [
sum(
attn_weights[seq_idx] * v_h[seq_idx][dim_idx]
for seq_idx in range(len(v_h))
)
for dim_idx in range(head_dim)
]
return head_out
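# This is scaled dot-product attention for a single head and a single query:
#   out = softmax(q . K^T / sqrt(head_dim)) @ V
# where K and V contain one row per cached position, so the output mixes the
# current token with everything that came before it (causal by construction,
# because the cache never contains future positions).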
def mlp_block(x, layer_idx):
"""Compute MLP (feed-forward) layer with ReLU^2 activation.
Applies two linear transformations with ReLU squared non-linearity.
Args:
x: Input vector (list of Values).
layer_idx: Index of the transformer layer.
Returns:
Output vector after MLP transformation (list of Values).
References:
- Attention Is All You Need (Vaswani et al. 2017):
https://arxiv.org/abs/1706.03762
- nanoGPT: https://github.com/karpathy/nanoGPT
"""
x = linear(x, state_dict[f"layer{layer_idx}.mlp_fc1"])
x = [xi.relu() ** 2 for xi in x]
x = linear(x, state_dict[f"layer{layer_idx}.mlp_fc2"])
return x
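# i.e. mlp(x) = W2 @ relu(W1 @ x)**2, expanding to 4 * n_embd hidden units and
# back down. Squared ReLU is the stand-in for GPT-2's GELU mentioned in the
# architecture note above.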
def gpt(token_id, pos_id, keys, values):
"""Forward pass through GPT model: embedding → multi-head attention → MLP.
Implements a simplified GPT-2 style transformer layer with:
- Token and position embeddings
- Multi-head scaled dot-product attention
- Feed-forward MLP with ReLU^2 activation
- Residual connections and RMSNorm
Args:
token_id: Token index in vocabulary [0, vocab_size).
pos_id: Position in sequence [0, block_size).
keys: Cached key vectors for each layer (mutated in-place).
values: Cached value vectors for each layer (mutated in-place).
Returns:
Logits over vocabulary for next token prediction.
References:
- Attention Is All You Need (Vaswani et al. 2017):
https://arxiv.org/abs/1706.03762
- Language Models are Unsupervised Multitask Learners (Radford et al. 2019):
https://github.com/tpn/pdfs/blob/master/Language%20Models%20are%20Unsupervised%20Multitask%20Learners.pdf
- nanoGPT: https://github.com/karpathy/nanoGPT
- makemore: https://github.com/karpathy/makemore
"""
x = list(map(operator.add, state_dict["wte"][token_id], state_dict["wpe"][pos_id]))
x = rmsnorm(x)
for layer_idx in range(n_layer):
# 1) Multi-head attention block
x_residual = x
x = rmsnorm(x)
q = linear(x, state_dict[f"layer{layer_idx}.attn_wq"])
k = linear(x, state_dict[f"layer{layer_idx}.attn_wk"])
v = linear(x, state_dict[f"layer{layer_idx}.attn_wv"])
keys[layer_idx].append(k)
values[layer_idx].append(v)
x_attn = []
for head_idx in range(n_head):
head_start = head_idx * head_dim
q_h = q[head_start : head_start + head_dim]
k_h = [
ki[head_start : head_start + head_dim] for ki in keys[layer_idx]
]
v_h = [
vi[head_start : head_start + head_dim] for vi in values[layer_idx]
]
head_out = attention_head(q_h, k_h, v_h, head_dim)
x_attn.extend(head_out)
x = linear(x_attn, state_dict[f"layer{layer_idx}.attn_wo"])
x = list(map(operator.add, x, x_residual))
# 2) MLP block
x_residual = x
x = rmsnorm(x)
x = mlp_block(x, layer_idx)
x = list(map(operator.add, x, x_residual))
return linear(x, state_dict["lm_head"])
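# Design note: `keys` and `values` act as a per-layer KV cache. Each call
# processes exactly one token, appends that token's key/value to the cache, and
# attends over everything cached so far, so causality comes from the calling
# order rather than from an explicit attention mask.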
# Let there be Adam, the blessed optimizer and its buffers
learning_rate, beta1, beta2, eps_adam = 1e-2, 0.9, 0.95, 1e-8
m = [0.0] * len(params) # first moment buffer
v = [0.0] * len(params) # second moment buffer
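# The update applied below, per parameter p with gradient g at step t
# (Adam, Kingma & Ba 2015: https://arxiv.org/abs/1412.6980):
#   m_t = beta1 * m_{t-1} + (1 - beta1) * g          # first moment (momentum)
#   v_t = beta2 * v_{t-1} + (1 - beta2) * g**2       # second moment (scale)
#   m_hat = m_t / (1 - beta1**t); v_hat = v_t / (1 - beta2**t)   # bias correction
#   p -= lr_t * m_hat / (sqrt(v_hat) + eps_adam)
# with lr_t decayed linearly from learning_rate toward 0 over num_steps.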
# Repeat in sequence
num_steps = 500 # number of training steps
for step in range(num_steps):
# Take a single document, tokenize it, and surround it with BOS tokens on both sides
doc = docs[step % len(docs)]
tokens = list(chain([BOS], (stoi[char] for char in doc), [BOS]))
num_positions = min(block_size, len(tokens) - 1)
# Forward the token sequence through the model, building up the computation
# graph all the way to the loss.
keys, values = (
[[] for _ in repeat(None, n_layer)],
[[] for _ in repeat(None, n_layer)],
)
losses = []
for pos_id in range(num_positions):
token_id, target_id = tokens[pos_id], tokens[pos_id + 1]
logits = gpt(token_id, pos_id, keys, values)
probs = softmax(logits)
loss_t = -probs[target_id].log()
losses.append(loss_t)
loss = (1 / num_positions) * sum(
losses,
) # final average loss over the document sequence. May yours be low.
# Backward the loss, calculating the gradients with respect to all model parameters.
loss.backward()
# Adam optimizer update: update the model parameters based on gradients.
lr_t = learning_rate * (1 - step / num_steps)
for param_idx, p in enumerate(params):
m[param_idx] = beta1 * m[param_idx] + (1 - beta1) * p.grad
v[param_idx] = beta2 * v[param_idx] + (1 - beta2) * p.grad**2
m_hat = m[param_idx] / (1 - beta1 ** (step + 1))
v_hat = v[param_idx] / (1 - beta2 ** (step + 1))
p.data -= lr_t * m_hat / (math.sqrt(v_hat) + eps_adam)
p.grad = 0
logger.info("step %s / %s | loss %.4f", step + 1, num_steps, loss.data)
# Inference: may the model babble back to us
temperature = 0.6 # in (0, 1], controls the "creativity" of generated text, low to high
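# Temperature scaling: the sampling distribution is softmax(logits / temperature).
# Lower values sharpen it toward greedy argmax decoding, temperature = 1 keeps
# the model's raw distribution, and higher values flatten it.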
logger.info("\n--- inference ---")
for sample_idx in range(20):
keys, values = (
[[] for _ in repeat(None, n_layer)],
[[] for _ in repeat(None, n_layer)],
)
token_id = BOS
sample_text = ""
for pos_id in range(block_size):
logits = gpt(token_id, pos_id, keys, values)
probs = softmax([logit / temperature for logit in logits])
weights = [p.data for p in probs]
token_id = random.choices(range(vocab_size), weights=weights)[0]
if token_id == BOS:
break
sample_text += itos[token_id]
logger.info("sample %s: %s", sample_idx + 1, sample_text)