@BexTuychiev
Created February 4, 2026 10:34
LSTM Sentiment Classifier for IMDB Reviews - PyTorch Implementation
"""
PyTorch Built-in LSTM for IMDB Sentiment Classification
Demonstrates nn.LSTM with a real NLP task.
"""
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from datasets import load_dataset
from collections import Counter
import numpy as np
torch.manual_seed(42)
np.random.seed(42)


class LSTMClassifier(nn.Module):
    """LSTM sentiment classifier using PyTorch's nn.LSTM."""

    def __init__(
        self,
        vocab_size,
        embedding_dim,
        hidden_size,
        output_size,
        num_layers=1,
        dropout=0.0,
        bidirectional=False,
        padding_idx=0,
    ):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.bidirectional = bidirectional
        self.embedding = nn.Embedding(
            vocab_size, embedding_dim, padding_idx=padding_idx
        )
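        # nn.LSTM with batch_first=True expects input of shape
        # (batch, seq_len, embedding_dim); inter-layer dropout only
        # applies when num_layers > 1, so it is zeroed out otherwise.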
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout if num_layers > 1 else 0,
            bidirectional=bidirectional,
        )
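        # A bidirectional LSTM concatenates forward and backward states,
        # so the classifier head sees twice as many features.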
        fc_input_size = hidden_size * 2 if bidirectional else hidden_size
        self.fc = nn.Linear(fc_input_size, output_size)

    def forward(self, x):
        embedded = self.embedding(x)
        lstm_out, (h_n, c_n) = self.lstm(embedded)
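        # h_n has shape (num_layers * num_directions, batch, hidden_size);
        # for a bidirectional model the last two entries are the final
        # forward and backward hidden states of the top layer.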
        if self.bidirectional:
            h_final = torch.cat([h_n[-2], h_n[-1]], dim=1)
        else:
            h_final = h_n[-1]
        return self.fc(h_final)


def build_vocab(texts, max_vocab_size=10000):
    counter = Counter()
    for text in texts:
        counter.update(text.lower().split())
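    # Index 0 is reserved for padding (must match padding_idx in the model)
    # and index 1 for out-of-vocabulary words.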
    vocab = {"<pad>": 0, "<unk>": 1}
    for word, _ in counter.most_common(max_vocab_size - 2):
        vocab[word] = len(vocab)
    return vocab


def tokenize_and_pad(texts, vocab, max_length=200):
    sequences = []
    for text in texts:
        tokens = text.lower().split()
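        # Truncate to max_length, map unknown words to <unk>,
        # then right-pad every sequence to the same length.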
        ids = [vocab.get(t, vocab["<unk>"]) for t in tokens[:max_length]]
        ids = ids + [vocab["<pad>"]] * (max_length - len(ids))
        sequences.append(ids)
    return torch.tensor(sequences, dtype=torch.long)


def load_imdb_data(max_samples=20000, max_length=200, max_vocab=10000):
    print("Loading IMDB dataset...")
    dataset = load_dataset("imdb")
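    # The Hugging Face "imdb" dataset has 25,000 train and 25,000 test
    # reviews; shuffling before slicing keeps both classes in the subsets.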
    train_data = dataset["train"].shuffle(seed=42)
    test_data = dataset["test"].shuffle(seed=42)
    train_texts = train_data["text"][:max_samples]
    train_labels = train_data["label"][:max_samples]
    test_texts = test_data["text"][: max_samples // 4]
    test_labels = test_data["label"][: max_samples // 4]
    print(f" Train: {len(train_texts)}, Test: {len(test_texts)}")
    vocab = build_vocab(train_texts, max_vocab)
    print(f" Vocab: {len(vocab)}, Seq length: {max_length}")
    X_train = tokenize_and_pad(train_texts, vocab, max_length)
    X_test = tokenize_and_pad(test_texts, vocab, max_length)
    y_train = torch.tensor(train_labels, dtype=torch.long)
    y_test = torch.tensor(test_labels, dtype=torch.long)
    return X_train, y_train, X_test, y_test, vocab


def train_model(model, train_loader, test_loader, n_epochs=5, lr=0.001):
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    for epoch in range(n_epochs):
        model.train()
        total_loss, correct, total = 0, 0, 0
        for batch_texts, batch_labels in train_loader:
            optimizer.zero_grad()
            outputs = model(batch_texts)
            loss = criterion(outputs, batch_labels)
            loss.backward()
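            # Clip the global gradient norm to guard against the exploding
            # gradients that recurrent nets are prone to.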
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=5.0)
            optimizer.step()
            total_loss += loss.item()
            _, predicted = outputs.max(1)
            correct += (predicted == batch_labels).sum().item()
            total += batch_labels.size(0)
        train_acc = correct / total
        model.eval()
        test_correct, test_total = 0, 0
        with torch.no_grad():
            for batch_texts, batch_labels in test_loader:
                outputs = model(batch_texts)
                _, predicted = outputs.max(1)
                test_correct += (predicted == batch_labels).sum().item()
                test_total += batch_labels.size(0)
        test_acc = test_correct / test_total
        print(
            f" Epoch {epoch+1}/{n_epochs}: Loss={total_loss/len(train_loader):.4f}, "
            f"Train={train_acc:.3f}, Test={test_acc:.3f}"
        )
    return test_acc


if __name__ == "__main__":
    print("=" * 60)
    print("LSTM Sentiment Classification - IMDB Dataset")
    print("=" * 60)

    # Load data
    X_train, y_train, X_test, y_test, vocab = load_imdb_data(
        max_samples=10000, max_length=100, max_vocab=5000
    )

    # Create data loaders (TensorDataset bundles tensors together)
    train_loader = DataLoader(
        TensorDataset(X_train, y_train), batch_size=64, shuffle=True
    )
    test_loader = DataLoader(TensorDataset(X_test, y_test), batch_size=64)

    # Create model
    model = LSTMClassifier(
        vocab_size=len(vocab),
        embedding_dim=32,
        hidden_size=32,
        output_size=2,
        padding_idx=vocab["<pad>"],
    )
    n_params = sum(p.numel() for p in model.parameters())
    print(f"\nModel: embedding=32, hidden=32, params={n_params:,}")

    # Train
    print("\nTraining...")
    final_acc = train_model(model, train_loader, test_loader, n_epochs=3, lr=0.001)
    print(f"\nFinal test accuracy: {final_acc:.1%}")