// Program.cs
// Port of @karpathy's microgpt.py gist to a single, dependency-free C# file.
// Warning: extremely slow (scalar autograd + tons of allocations). Educational only.
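// Architecture (as implemented below): a char-level GPT. Token + position embeddings,
// n_layer pre-norm transformer blocks (RMSNorm -> causal multi-head attention -> residual,
// then RMSNorm -> squared-ReLU MLP -> residual), and a linear lm_head; trained with Adam
// plus cosine learning-rate decay on the makemore names dataset.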
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Net.Http;
public static class Program
{
    // -----------------------------
    // Minimal scalar autograd engine
    // -----------------------------
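    // Each Value node records its inputs (children) and the local derivatives
    // d(this)/d(child) at construction time; Backward() then applies the chain
    // rule over the whole graph in reverse topological order.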
    public sealed class Value
    {
        public double Data;
        public double Grad;
        private readonly Value[] _children;
        private readonly double[] _localGrads; // d(this)/d(child_i)

        public Value(double data)
        {
            Data = data;
            Grad = 0.0;
            _children = Array.Empty<Value>();
            _localGrads = Array.Empty<double>();
        }

        private Value(double data, Value[] children, double[] localGrads)
        {
            Data = data;
            Grad = 0.0;
            _children = children;
            _localGrads = localGrads;
        }

        public static Value operator +(Value a, Value b)
            => new Value(a.Data + b.Data, new[] { a, b }, new[] { 1.0, 1.0 });
        public static Value operator +(Value a, double b) => a + new Value(b);
        public static Value operator +(double a, Value b) => new Value(a) + b;
        public static Value operator *(Value a, Value b)
            => new Value(a.Data * b.Data, new[] { a, b }, new[] { b.Data, a.Data });
        public static Value operator *(Value a, double b) => a * new Value(b);
        public static Value operator *(double a, Value b) => new Value(a) * b;
        public static Value operator -(Value a) => a * -1.0;
        public static Value operator -(Value a, Value b) => a + (-b);
        public static Value operator -(Value a, double b) => a + (-new Value(b));
        public static Value operator -(double a, Value b) => new Value(a) + (-b);
        public static Value operator /(Value a, Value b) => a * b.Pow(-1.0);
        public static Value operator /(Value a, double b) => a * new Value(b).Pow(-1.0);
        public static Value operator /(double a, Value b) => new Value(a) * b.Pow(-1.0);

        public Value Pow(double p)
        {
            // y = x^p ; dy/dx = p*x^(p-1)
            var y = Math.Pow(Data, p);
            var local = p * Math.Pow(Data, p - 1.0);
            return new Value(y, new[] { this }, new[] { local });
        }

        public Value Log()
        {
            // y = log(x) ; dy/dx = 1/x
            return new Value(Math.Log(Data), new[] { this }, new[] { 1.0 / Data });
        }

        public Value Exp()
        {
            // y = exp(x) ; dy/dx = exp(x)
            var e = Math.Exp(Data);
            return new Value(e, new[] { this }, new[] { e });
        }

        public Value Relu()
        {
            // y = max(0,x) ; dy/dx = 1 if x>0 else 0
            var y = Math.Max(0.0, Data);
            var local = Data > 0.0 ? 1.0 : 0.0;
            return new Value(y, new[] { this }, new[] { local });
        }
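        // Reverse-mode autodiff: seed d(this)/d(this) = 1, then visit nodes in
        // reverse topological order, accumulating child.Grad += local * v.Grad
        // (the chain rule). Build's recursion depth equals the longest dependency
        // chain, which stays manageable at this model size.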
        public void Backward()
        {
            // topo sort
            var topo = new List<Value>(capacity: 1024);
            var visited = new HashSet<Value>(ReferenceEqualityComparer<Value>.Instance);
            void Build(Value v)
            {
                if (visited.Add(v))
                {
                    foreach (var ch in v._children) Build(ch);
                    topo.Add(v);
                }
            }
            Build(this);
            Grad = 1.0;
            for (int i = topo.Count - 1; i >= 0; i--)
            {
                var v = topo[i];
                for (int ci = 0; ci < v._children.Length; ci++)
                {
                    var child = v._children[ci];
                    var local = v._localGrads[ci];
                    child.Grad += local * v.Grad;
                }
            }
        }
    }

    // Reference equality comparer for topo traversal
    private sealed class ReferenceEqualityComparer<T> : IEqualityComparer<T> where T : class
    {
        public static readonly ReferenceEqualityComparer<T> Instance = new();
        public bool Equals(T? x, T? y) => ReferenceEquals(x, y);
        public int GetHashCode(T obj) => System.Runtime.CompilerServices.RuntimeHelpers.GetHashCode(obj);
    }
    // -----------------------------
    // Random helpers (Gaussian + weighted choice)
    // -----------------------------
    private static readonly Random Rng = new Random(42);
    private static double NextGaussian(double mean = 0.0, double std = 1.0)
    {
        // Box-Muller transform
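        // 1 - NextDouble() maps [0, 1) to (0, 1], so Math.Log(u1) never sees zero.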
        double u1 = 1.0 - Rng.NextDouble();
        double u2 = 1.0 - Rng.NextDouble();
        double z0 = Math.Sqrt(-2.0 * Math.Log(u1)) * Math.Cos(2.0 * Math.PI * u2);
        return mean + std * z0;
    }
    private static int WeightedChoice(IReadOnlyList<double> weights)
    {
        double total = 0.0;
        for (int i = 0; i < weights.Count; i++) total += weights[i];
        double r = Rng.NextDouble() * total;
        double c = 0.0;
        for (int i = 0; i < weights.Count; i++)
        {
            c += weights[i];
            if (r <= c) return i;
        }
        return weights.Count - 1; // fallback
    }
    // -----------------------------
    // Model hyperparams (same as the Python gist)
    // -----------------------------
    private const int n_embd = 16;
    private const int n_head = 4;
    private const int n_layer = 1;
    private const int block_size = 8;
    private const int head_dim = n_embd / n_head;

    // state dict of parameter matrices
    private static Dictionary<string, List<List<Value>>> state_dict = new();
    private static List<Value> parameters = new();
    private static List<List<Value>> Matrix(int nout, int nin, double std = 0.02)
    {
        var mat = new List<List<Value>>(nout);
        for (int o = 0; o < nout; o++)
        {
            var row = new List<Value>(nin);
            for (int i = 0; i < nin; i++) row.Add(new Value(NextGaussian(0.0, std)));
            mat.Add(row);
        }
        return mat;
    }
    private static void InitModel(int vocabSize)
    {
        state_dict = new Dictionary<string, List<List<Value>>>();
        state_dict["wte"] = Matrix(vocabSize, n_embd);
        state_dict["wpe"] = Matrix(block_size, n_embd);
        state_dict["lm_head"] = Matrix(vocabSize, n_embd);
        for (int i = 0; i < n_layer; i++)
        {
            state_dict[$"layer{i}.attn_wq"] = Matrix(n_embd, n_embd);
            state_dict[$"layer{i}.attn_wk"] = Matrix(n_embd, n_embd);
            state_dict[$"layer{i}.attn_wv"] = Matrix(n_embd, n_embd);
            state_dict[$"layer{i}.attn_wo"] = Matrix(n_embd, n_embd, std: 0.0);
            state_dict[$"layer{i}.mlp_fc1"] = Matrix(4 * n_embd, n_embd);
            state_dict[$"layer{i}.mlp_fc2"] = Matrix(n_embd, 4 * n_embd, std: 0.0);
        }
        parameters = new List<Value>();
        foreach (var mat in state_dict.Values)
            foreach (var row in mat)
                parameters.AddRange(row);
    }
    // -----------------------------
    // Model ops (linear, softmax, rmsnorm, gpt)
    // -----------------------------
    private static List<Value> Linear(List<Value> x, List<List<Value>> w)
    {
        // w: [nout][nin], x: [nin] -> out: [nout], out[o] = sum_i w[o][i] * x[i]
        var output = new List<Value>(w.Count);
        for (int o = 0; o < w.Count; o++)
        {
            Value sum = new Value(0.0);
            var row = w[o];
            for (int i = 0; i < row.Count; i++)
                sum = sum + (row[i] * x[i]);
            output.Add(sum);
        }
        return output;
    }

    private static List<Value> Softmax(List<Value> logits)
    {
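        // Subtract the max logit before exponentiating: softmax is invariant to
        // this shift, and it keeps Exp() from overflowing.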
        double maxVal = logits.Max(v => v.Data);
        var exps = new List<Value>(logits.Count);
        for (int i = 0; i < logits.Count; i++)
            exps.Add((logits[i] - maxVal).Exp());
        Value total = new Value(0.0);
        for (int i = 0; i < exps.Count; i++) total = total + exps[i];
        var probs = new List<Value>(logits.Count);
        for (int i = 0; i < exps.Count; i++) probs.Add(exps[i] / total);
        return probs;
    }
    private static List<Value> RmsNorm(List<Value> x)
    {
        // ms = mean(x^2), scale = (ms + 1e-5)^-0.5, return x*scale
        Value ms = new Value(0.0);
        for (int i = 0; i < x.Count; i++) ms = ms + (x[i] * x[i]);
        ms = ms / (double)x.Count;
        Value scale = (ms + 1e-5).Pow(-0.5);
        var y = new List<Value>(x.Count);
        for (int i = 0; i < x.Count; i++) y.Add(x[i] * scale);
        return y;
    }

    private static List<Value> Slice(List<Value> v, int start, int length)
    {
        var r = new List<Value>(length);
        for (int i = 0; i < length; i++) r.Add(v[start + i]);
        return r;
    }
    private static List<Value> GPT(
        int tokenId,
        int posId,
        List<List<List<Value>>> keys,   // [layer][time][n_embd]
        List<List<List<Value>>> values) // [layer][time][n_embd]
    {
        var tok_emb = state_dict["wte"][tokenId]; // length n_embd
        var pos_emb = state_dict["wpe"][posId];   // length n_embd
        var x = new List<Value>(n_embd);
        for (int i = 0; i < n_embd; i++) x.Add(tok_emb[i] + pos_emb[i]);
        x = RmsNorm(x);
        for (int li = 0; li < n_layer; li++)
        {
            // 1) Multi-head attention block
            var x_residual = x;
            x = RmsNorm(x);
            var q = Linear(x, state_dict[$"layer{li}.attn_wq"]);
            var k = Linear(x, state_dict[$"layer{li}.attn_wk"]);
            var v = Linear(x, state_dict[$"layer{li}.attn_wv"]);
            keys[li].Add(k);
            values[li].Add(v);
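            // keys/values act as a per-layer KV cache: this position attends over
            // every position seen so far, and causality is implicit because future
            // tokens have not been appended yet.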
            var x_attn = new List<Value>(n_embd);
            for (int h = 0; h < n_head; h++)
            {
                int hs = h * head_dim;
                var q_h = Slice(q, hs, head_dim);
                var k_h = keys[li].Select(kk => Slice(kk, hs, head_dim)).ToList();
                var v_h = values[li].Select(vv => Slice(vv, hs, head_dim)).ToList();
                // attn_logits[t] = dot(q_h, k_h[t]) / sqrt(head_dim)
                var attn_logits = new List<Value>(k_h.Count);
                for (int t = 0; t < k_h.Count; t++)
                {
                    Value dot = new Value(0.0);
                    for (int j = 0; j < head_dim; j++)
                        dot = dot + (q_h[j] * k_h[t][j]);
                    attn_logits.Add(dot / Math.Sqrt(head_dim));
                }
                var attn_weights = Softmax(attn_logits); // length time
                // head_out[j] = sum_t attn_weights[t] * v_h[t][j]
                for (int j = 0; j < head_dim; j++)
                {
                    Value sum = new Value(0.0);
                    for (int t = 0; t < v_h.Count; t++)
                        sum = sum + (attn_weights[t] * v_h[t][j]);
                    x_attn.Add(sum);
                }
            }
            x = Linear(x_attn, state_dict[$"layer{li}.attn_wo"]);
            var x_added = new List<Value>(n_embd);
            for (int i = 0; i < n_embd; i++) x_added.Add(x[i] + x_residual[i]);
            x = x_added;
            // 2) MLP block
            x_residual = x;
            x = RmsNorm(x);
            x = Linear(x, state_dict[$"layer{li}.mlp_fc1"]);
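            // squared-ReLU nonlinearity: max(0, x)^2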
            for (int i = 0; i < x.Count; i++) x[i] = x[i].Relu().Pow(2.0);
            x = Linear(x, state_dict[$"layer{li}.mlp_fc2"]);
            x_added = new List<Value>(n_embd);
            for (int i = 0; i < n_embd; i++) x_added.Add(x[i] + x_residual[i]);
            x = x_added;
        }
        var logits = Linear(x, state_dict["lm_head"]);
        return logits;
    }
    // -----------------------------
    // Main: dataset, tokenizer, train, inference
    // -----------------------------
    public static void Main()
    {
        // Dataset
        const string inputPath = "input.txt";
        if (!File.Exists(inputPath))
        {
            Console.WriteLine("input.txt not found, downloading names dataset...");
            var url = "https://raw.githubusercontent.com/karpathy/makemore/refs/heads/master/names.txt";
            using var http = new HttpClient();
            var bytes = http.GetByteArrayAsync(url).GetAwaiter().GetResult();
            File.WriteAllBytes(inputPath, bytes);
        }
        var docs = File.ReadAllLines(inputPath)
            .Select(l => l.Trim())
            .Where(l => !string.IsNullOrWhiteSpace(l))
            .ToList();
        // Shuffle (OrderBy evaluates each random key once, so this is a valid shuffle)
        docs = docs.OrderBy(_ => Rng.Next()).ToList();
        Console.WriteLine($"num docs: {docs.Count}");
        // Tokenizer: unique chars become 0..n-1, BOS is the last id
        var uchars = docs.SelectMany(s => s).Distinct().OrderBy(c => c).ToList();
        int BOS = uchars.Count;
        int vocabSize = uchars.Count + 1;
        Console.WriteLine($"vocab size: {vocabSize}");
        // Model init
        InitModel(vocabSize);
        Console.WriteLine($"num params: {parameters.Count}");
        // Adam buffers
        double learningRate = 1e-2, beta1 = 0.9, beta2 = 0.95, epsAdam = 1e-8;
        var m = new double[parameters.Count]; // first-moment (mean) estimates
        var v = new double[parameters.Count]; // second-moment (uncentered variance) estimates
        // Train
        int numSteps = 500;
        for (int step = 0; step < numSteps; step++)
        {
            string doc = docs[step % docs.Count];
            // tokens = [BOS] + chars + [BOS]
            var tokens = new List<int>(doc.Length + 2) { BOS };
            for (int i = 0; i < doc.Length; i++)
                tokens.Add(uchars.IndexOf(doc[i]));
            tokens.Add(BOS);
            int n = Math.Min(block_size, tokens.Count - 1);
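            // Crop to at most block_size input positions; the target for position t is tokens[t + 1].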
            var keys = Enumerable.Range(0, n_layer).Select(_ => new List<List<Value>>()).ToList();
            var values = Enumerable.Range(0, n_layer).Select(_ => new List<List<Value>>()).ToList();
            var losses = new List<Value>(n);
            for (int posId = 0; posId < n; posId++)
            {
                int tokenId = tokens[posId];
                int targetId = tokens[posId + 1];
                var logits = GPT(tokenId, posId, keys, values);
                var probs = Softmax(logits);
                var loss_t = -probs[targetId].Log(); // cross-entropy: -log p(target)
                losses.Add(loss_t);
            }
            // loss = mean(losses)
            Value loss = new Value(0.0);
            for (int i = 0; i < losses.Count; i++) loss = loss + losses[i];
            loss = (1.0 / n) * loss;
            // backward
            loss.Backward();
            // cosine lr decay
            double lr_t = learningRate * 0.5 * (1.0 + Math.Cos(Math.PI * step / numSteps));
            // adam update
            for (int i = 0; i < parameters.Count; i++)
            {
                var p = parameters[i];
                m[i] = beta1 * m[i] + (1.0 - beta1) * p.Grad;
                v[i] = beta2 * v[i] + (1.0 - beta2) * (p.Grad * p.Grad);
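                // Bias-corrected moment estimates: divide out the (1 - beta^t) startup bias of the EMAs.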
                double mHat = m[i] / (1.0 - Math.Pow(beta1, step + 1));
                double vHat = v[i] / (1.0 - Math.Pow(beta2, step + 1));
                p.Data -= lr_t * mHat / (Math.Sqrt(vHat) + epsAdam);
                p.Grad = 0.0;
            }
            Console.WriteLine($"step {step + 1,4} / {numSteps,4} | loss {loss.Data:F4}");
        }

        // Inference
        double temperature = 0.5;
        Console.WriteLine("\n--- inference ---");
        for (int sampleIdx = 0; sampleIdx < 20; sampleIdx++)
        {
            var keys = Enumerable.Range(0, n_layer).Select(_ => new List<List<Value>>()).ToList();
            var values = Enumerable.Range(0, n_layer).Select(_ => new List<List<Value>>()).ToList();
            int tokenId = BOS;
            var sample = new List<char>();
            for (int posId = 0; posId < block_size; posId++)
            {
                var logits = GPT(tokenId, posId, keys, values);
                // probs = softmax(logits / temperature)
                var scaled = new List<Value>(logits.Count);
                for (int i = 0; i < logits.Count; i++) scaled.Add(logits[i] / temperature);
                var probs = Softmax(scaled);
                var w = probs.Select(p => p.Data).ToList();
                tokenId = WeightedChoice(w);
                if (tokenId == BOS) break; // BOS doubles as the end-of-sequence token
                sample.Add(uchars[tokenId]);
            }
            Console.WriteLine($"sample {sampleIdx + 1,2}: {new string(sample.ToArray())}");
        }
    }
}