// Program.cs
// Port of @karpathy's microgpt.py gist to a single, dependency-free C# file.
// Warning: extremely slow (scalar autograd + tons of allocations). Educational only.
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Net.Http;
public static class Program
{
    // -----------------------------
    // Minimal scalar autograd engine
    // -----------------------------
    public sealed class Value
    {
        public double Data;
        public double Grad;
        private readonly Value[] _children;
        private readonly double[] _localGrads; // d(this)/d(child_i)
        public Value(double data)
        {
            Data = data;
            Grad = 0.0;
            _children = Array.Empty<Value>();
            _localGrads = Array.Empty<double>();
        }
        private Value(double data, Value[] children, double[] localGrads)
        {
            Data = data;
            Grad = 0.0;
            _children = children;
            _localGrads = localGrads;
        }
        public static Value operator +(Value a, Value b)
            => new Value(a.Data + b.Data, new[] { a, b }, new[] { 1.0, 1.0 });
        public static Value operator +(Value a, double b) => a + new Value(b);
        public static Value operator +(double a, Value b) => new Value(a) + b;
        public static Value operator *(Value a, Value b)
            => new Value(a.Data * b.Data, new[] { a, b }, new[] { b.Data, a.Data });
        public static Value operator *(Value a, double b) => a * new Value(b);
        public static Value operator *(double a, Value b) => new Value(a) * b;
        public static Value operator -(Value a) => a * -1.0;
        public static Value operator -(Value a, Value b) => a + (-b);
        public static Value operator -(Value a, double b) => a + (-new Value(b));
        public static Value operator -(double a, Value b) => new Value(a) + (-b);
        public static Value operator /(Value a, Value b) => a * b.Pow(-1.0);
        public static Value operator /(Value a, double b) => a * new Value(b).Pow(-1.0);
        public static Value operator /(double a, Value b) => new Value(a) * b.Pow(-1.0);
        public Value Pow(double p)
        {
            // y = x^p ; dy/dx = p*x^(p-1)
            var y = Math.Pow(Data, p);
            var local = p * Math.Pow(Data, p - 1.0);
            return new Value(y, new[] { this }, new[] { local });
        }
        public Value Log()
        {
            // y = log(x) ; dy/dx = 1/x
            return new Value(Math.Log(Data), new[] { this }, new[] { 1.0 / Data });
        }
        public Value Exp()
        {
            // y = exp(x) ; dy/dx = exp(x)
            var e = Math.Exp(Data);
            return new Value(e, new[] { this }, new[] { e });
        }
        public Value Relu()
        {
            // y = max(0,x) ; dy/dx = 1 if x>0 else 0
            var y = Math.Max(0.0, Data);
            var local = Data > 0.0 ? 1.0 : 0.0;
            return new Value(y, new[] { this }, new[] { local });
        }
        public void Backward()
        {
            // topo sort
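            // Recursive DFS: a node is appended only after all of its children,
            // so the reverse walk below always propagates a fully-accumulated
            // Grad. Very deep graphs could overflow the call stack; fine at
            // this model's scale.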
            var topo = new List<Value>(capacity: 1024);
            var visited = new HashSet<Value>(ReferenceEqualityComparer<Value>.Instance);
            void Build(Value v)
            {
                if (visited.Add(v))
                {
                    foreach (var ch in v._children) Build(ch);
                    topo.Add(v);
                }
            }
            Build(this);
            Grad = 1.0;
            for (int i = topo.Count - 1; i >= 0; i--)
            {
                var v = topo[i];
                for (int ci = 0; ci < v._children.Length; ci++)
                {
                    var child = v._children[ci];
                    var local = v._localGrads[ci];
                    child.Grad += local * v.Grad;
                }
            }
        }
    }
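    // Tiny sanity check for the engine (not wired into Main):
    //   var a = new Value(2.0); var b = new Value(3.0);
    //   var c = a * b + a;   // c.Data == 8
    //   c.Backward();        // a.Grad == 4 (dc/da = b + 1), b.Grad == 2 (dc/db = a)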
    // Reference equality comparer for topo traversal
    private sealed class ReferenceEqualityComparer<T> : IEqualityComparer<T> where T : class
    {
        public static readonly ReferenceEqualityComparer<T> Instance = new();
        public bool Equals(T? x, T? y) => ReferenceEquals(x, y);
        public int GetHashCode(T obj) => System.Runtime.CompilerServices.RuntimeHelpers.GetHashCode(obj);
    }
    // -----------------------------
    // Random helpers (Gaussian + weighted choice)
    // -----------------------------
    private static readonly Random Rng = new Random(42);
    private static double NextGaussian(double mean = 0.0, double std = 1.0)
    {
        // Box-Muller transform
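        // NextDouble() returns values in [0, 1); using 1 - u maps that to
        // (0, 1], which keeps Math.Log from ever seeing 0.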
        double u1 = 1.0 - Rng.NextDouble();
        double u2 = 1.0 - Rng.NextDouble();
        double z0 = Math.Sqrt(-2.0 * Math.Log(u1)) * Math.Cos(2.0 * Math.PI * u2);
        return mean + std * z0;
    }
    private static int WeightedChoice(IReadOnlyList<double> weights)
    {
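        // Inverse-CDF sampling: draw r uniformly in [0, total), then walk the
        // cumulative sum until it first reaches r.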
        double total = 0.0;
        for (int i = 0; i < weights.Count; i++) total += weights[i];
        double r = Rng.NextDouble() * total;
        double c = 0.0;
        for (int i = 0; i < weights.Count; i++)
        {
            c += weights[i];
            if (r <= c) return i;
        }
        return weights.Count - 1; // fallback
    }
    // -----------------------------
    // Model hyperparams (same as the Python original)
    // -----------------------------
    private const int n_embd = 16;
    private const int n_head = 4;
    private const int n_layer = 1;
    private const int block_size = 8;
    private const int head_dim = n_embd / n_head;
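    // With the names.txt vocab (26 lowercase letters + BOS = 27 tokens) these
    // settings give 4,064 scalar parameters: 2*27*16 (wte, lm_head) + 8*16 (wpe)
    // + 3072 per layer (four 16x16 attention mats + 64x16 and 16x64 MLP mats).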
    // state dict of parameter matrices
    private static Dictionary<string, List<List<Value>>> state_dict = new();
    private static List<Value> parameters = new();
    private static List<List<Value>> Matrix(int nout, int nin, double std = 0.02)
    {
        var mat = new List<List<Value>>(nout);
        for (int o = 0; o < nout; o++)
        {
            var row = new List<Value>(nin);
            for (int i = 0; i < nin; i++) row.Add(new Value(NextGaussian(0.0, std)));
            mat.Add(row);
        }
        return mat;
    }
    private static void InitModel(int vocabSize)
    {
        state_dict = new Dictionary<string, List<List<Value>>>();
        state_dict["wte"] = Matrix(vocabSize, n_embd);
        state_dict["wpe"] = Matrix(block_size, n_embd);
        state_dict["lm_head"] = Matrix(vocabSize, n_embd);
        for (int i = 0; i < n_layer; i++)
        {
            state_dict[$"layer{i}.attn_wq"] = Matrix(n_embd, n_embd);
            state_dict[$"layer{i}.attn_wk"] = Matrix(n_embd, n_embd);
            state_dict[$"layer{i}.attn_wv"] = Matrix(n_embd, n_embd);
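            // std: 0.0 below zero-initializes the projections that feed the
            // residual stream, so every block starts out as the identity.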
            state_dict[$"layer{i}.attn_wo"] = Matrix(n_embd, n_embd, std: 0.0);
            state_dict[$"layer{i}.mlp_fc1"] = Matrix(4 * n_embd, n_embd);
            state_dict[$"layer{i}.mlp_fc2"] = Matrix(n_embd, 4 * n_embd, std: 0.0);
        }
        parameters = new List<Value>();
        foreach (var mat in state_dict.Values)
            foreach (var row in mat)
                parameters.AddRange(row);
    }
    // -----------------------------
    // Model ops (linear, softmax, rmsnorm, gpt)
    // -----------------------------
    private static List<Value> Linear(List<Value> x, List<List<Value>> w)
    {
        // w: [nout][nin], x: [nin] -> out: [nout]
        var output = new List<Value>(w.Count);
        for (int o = 0; o < w.Count; o++)
        {
            Value sum = new Value(0.0);
            var row = w[o];
            for (int i = 0; i < row.Count; i++)
                sum = sum + (row[i] * x[i]);
            output.Add(sum);
        }
        return output;
    }
    private static List<Value> Softmax(List<Value> logits)
    {
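        // Shift by the max logit before exponentiating: softmax is invariant
        // to a constant shift, and this avoids overflow in Exp().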
        double maxVal = logits.Max(v => v.Data);
        var exps = new List<Value>(logits.Count);
        for (int i = 0; i < logits.Count; i++)
            exps.Add((logits[i] - maxVal).Exp());
        Value total = new Value(0.0);
        for (int i = 0; i < exps.Count; i++) total = total + exps[i];
        var probs = new List<Value>(logits.Count);
        for (int i = 0; i < exps.Count; i++) probs.Add(exps[i] / total);
        return probs;
    }
    private static List<Value> RmsNorm(List<Value> x)
    {
        // ms = mean(x^2), scale = (ms + 1e-5)^-0.5, return x*scale
        Value ms = new Value(0.0);
        for (int i = 0; i < x.Count; i++) ms = ms + (x[i] * x[i]);
        ms = ms / (double)x.Count;
        Value scale = (ms + 1e-5).Pow(-0.5);
        var y = new List<Value>(x.Count);
        for (int i = 0; i < x.Count; i++) y.Add(x[i] * scale);
        return y;
    }
    private static List<Value> Slice(List<Value> v, int start, int length)
    {
        var r = new List<Value>(length);
        for (int i = 0; i < length; i++) r.Add(v[start + i]);
        return r;
    }
    private static List<Value> GPT(
        int tokenId,
        int posId,
        List<List<List<Value>>> keys,   // [layer][time][n_embd]
        List<List<List<Value>>> values) // [layer][time][n_embd]
    {
        var tok_emb = state_dict["wte"][tokenId]; // length n_embd
        var pos_emb = state_dict["wpe"][posId];   // length n_embd
        var x = new List<Value>(n_embd);
        for (int i = 0; i < n_embd; i++) x.Add(tok_emb[i] + pos_emb[i]);
        x = RmsNorm(x);
        for (int li = 0; li < n_layer; li++)
        {
            // 1) Multi-head attention block
            var x_residual = x;
            x = RmsNorm(x);
            var q = Linear(x, state_dict[$"layer{li}.attn_wq"]);
            var k = Linear(x, state_dict[$"layer{li}.attn_wk"]);
            var v = Linear(x, state_dict[$"layer{li}.attn_wv"]);
            keys[li].Add(k);
            values[li].Add(v);
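            // keys/values double as a KV cache: position posId attends over
            // every stored position <= posId, so causality holds by
            // construction and no attention mask is needed.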
            var x_attn = new List<Value>(n_embd);
            for (int h = 0; h < n_head; h++)
            {
                int hs = h * head_dim;
                var q_h = Slice(q, hs, head_dim);
                var k_h = keys[li].Select(kk => Slice(kk, hs, head_dim)).ToList();
                var v_h = values[li].Select(vv => Slice(vv, hs, head_dim)).ToList();
                // attn_logits[t] = dot(q_h, k_h[t]) / sqrt(head_dim)
                var attn_logits = new List<Value>(k_h.Count);
                for (int t = 0; t < k_h.Count; t++)
                {
                    Value dot = new Value(0.0);
                    for (int j = 0; j < head_dim; j++)
                        dot = dot + (q_h[j] * k_h[t][j]);
                    attn_logits.Add(dot / Math.Sqrt(head_dim));
                }
                var attn_weights = Softmax(attn_logits); // length time
                // head_out[j] = sum_t attn_weights[t] * v_h[t][j]
                for (int j = 0; j < head_dim; j++)
                {
                    Value sum = new Value(0.0);
                    for (int t = 0; t < v_h.Count; t++)
                        sum = sum + (attn_weights[t] * v_h[t][j]);
                    x_attn.Add(sum);
                }
            }
            x = Linear(x_attn, state_dict[$"layer{li}.attn_wo"]);
            var x_added = new List<Value>(n_embd);
            for (int i = 0; i < n_embd; i++) x_added.Add(x[i] + x_residual[i]);
            x = x_added;
            // 2) MLP block
            x_residual = x;
            x = RmsNorm(x);
            x = Linear(x, state_dict[$"layer{li}.mlp_fc1"]);
            for (int i = 0; i < x.Count; i++) x[i] = x[i].Relu().Pow(2.0);
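            // Squared ReLU (relu(x)^2): a ReLU variant whose gradient rises
            // continuously from 0 rather than jumping to 1.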
            x = Linear(x, state_dict[$"layer{li}.mlp_fc2"]);
            x_added = new List<Value>(n_embd);
            for (int i = 0; i < n_embd; i++) x_added.Add(x[i] + x_residual[i]);
            x = x_added;
        }
        var logits = Linear(x, state_dict["lm_head"]);
        return logits;
    }
    // -----------------------------
    // Main: dataset, tokenizer, train, inference
    // -----------------------------
    public static void Main()
    {
        // Dataset
        const string inputPath = "input.txt";
        if (!File.Exists(inputPath))
        {
            Console.WriteLine("input.txt not found, downloading names dataset...");
            var url = "https://raw.githubusercontent.com/karpathy/makemore/refs/heads/master/names.txt";
            using var http = new HttpClient();
            var bytes = http.GetByteArrayAsync(url).GetAwaiter().GetResult();
            File.WriteAllBytes(inputPath, bytes);
        }
        var docs = File.ReadAllLines(inputPath)
            .Select(l => l.Trim())
            .Where(l => !string.IsNullOrWhiteSpace(l))
            .ToList();
        // Shuffle
        docs = docs.OrderBy(_ => Rng.Next()).ToList();
        Console.WriteLine($"num docs: {docs.Count}");
        // Tokenizer: unique chars become 0..n-1, BOS is last id
        var uchars = docs.SelectMany(s => s).Distinct().OrderBy(c => c).ToList();
        int BOS = uchars.Count;
        int vocabSize = uchars.Count + 1;
        Console.WriteLine($"vocab size: {vocabSize}");
        // Model init
        InitModel(vocabSize);
        Console.WriteLine($"num params: {parameters.Count}");
        // Adam buffers
        double learningRate = 1e-2, beta1 = 0.9, beta2 = 0.95, epsAdam = 1e-8;
        var m = new double[parameters.Count];
        var v = new double[parameters.Count];
        // Train
        int numSteps = 500;
        for (int step = 0; step < numSteps; step++)
        {
            string doc = docs[step % docs.Count];
            // tokens = [BOS] + chars + [BOS]
            var tokens = new List<int>(doc.Length + 2) { BOS };
            for (int i = 0; i < doc.Length; i++)
                tokens.Add(uchars.IndexOf(doc[i]));
            tokens.Add(BOS);
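            // BOS brackets every name, so the model learns both where words
            // start and where they end.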
            int n = Math.Min(block_size, tokens.Count - 1);
            var keys = Enumerable.Range(0, n_layer).Select(_ => new List<List<Value>>()).ToList();
            var values = Enumerable.Range(0, n_layer).Select(_ => new List<List<Value>>()).ToList();
            var losses = new List<Value>(n);
            for (int posId = 0; posId < n; posId++)
            {
                int tokenId = tokens[posId];
                int targetId = tokens[posId + 1];
                var logits = GPT(tokenId, posId, keys, values);
                var probs = Softmax(logits);
                var loss_t = -probs[targetId].Log();
                losses.Add(loss_t);
            }
            // loss = mean(losses)
            Value loss = new Value(0.0);
            for (int i = 0; i < losses.Count; i++) loss = loss + losses[i];
            loss = (1.0 / n) * loss;
            // backward
            loss.Backward();
            // cosine lr decay
            double lr_t = learningRate * 0.5 * (1.0 + Math.Cos(Math.PI * step / numSteps));
            // adam update
            for (int i = 0; i < parameters.Count; i++)
            {
                var p = parameters[i];
                m[i] = beta1 * m[i] + (1.0 - beta1) * p.Grad;
                v[i] = beta2 * v[i] + (1.0 - beta2) * (p.Grad * p.Grad);
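                // Bias correction: m and v start at zero, so early estimates
                // are rescaled upward before use.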
                double mHat = m[i] / (1.0 - Math.Pow(beta1, step + 1));
                double vHat = v[i] / (1.0 - Math.Pow(beta2, step + 1));
                p.Data -= lr_t * mHat / (Math.Sqrt(vHat) + epsAdam);
                p.Grad = 0.0;
            }
            Console.WriteLine($"step {step + 1,4} / {numSteps,4} | loss {loss.Data:F4}");
        }
        // Inference
        double temperature = 0.5;
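        // Temperature < 1 sharpens the next-token distribution (safer, more
        // repetitive samples); > 1 flattens it toward uniform.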
        Console.WriteLine("\n--- inference ---");
        for (int sampleIdx = 0; sampleIdx < 20; sampleIdx++)
        {
            var keys = Enumerable.Range(0, n_layer).Select(_ => new List<List<Value>>()).ToList();
            var values = Enumerable.Range(0, n_layer).Select(_ => new List<List<Value>>()).ToList();
            int tokenId = BOS;
            var sample = new List<char>();
            for (int posId = 0; posId < block_size; posId++)
            {
                var logits = GPT(tokenId, posId, keys, values);
                // probs = softmax(logits / temperature)
                var scaled = new List<Value>(logits.Count);
                for (int i = 0; i < logits.Count; i++) scaled.Add(logits[i] / temperature);
                var probs = Softmax(scaled);
                var w = probs.Select(p => p.Data).ToList();
                tokenId = WeightedChoice(w);
                if (tokenId == BOS) break;
                sample.Add(uchars[tokenId]);
            }
            Console.WriteLine($"sample {sampleIdx + 1,2}: {new string(sample.ToArray())}");
        }
    }
}
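To try it (a sketch, assuming a recent .NET SDK): create a fresh console project with `dotnet new console`, replace its Program.cs with this file, and `dotnet run`. Only the base class library is used, so no packages are needed.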
Original Python gist: https://gist.github.com/karpathy/8627fe009c40f57531cb18360106ce95