LSTM Sentiment Classifier for IMDB Reviews - PyTorch Implementation
| """ | |
| PyTorch Built-in LSTM for IMDB Sentiment Classification | |
| Demonstrates nn.LSTM with a real NLP task. | |
| """ | |
| import torch | |
| import torch.nn as nn | |
| from torch.utils.data import DataLoader, TensorDataset | |
| from datasets import load_dataset | |
| from collections import Counter | |
| import numpy as np | |
| torch.manual_seed(42) | |
| np.random.seed(42) | |
| class LSTMClassifier(nn.Module): | |
| """LSTM sentiment classifier using PyTorch's nn.LSTM.""" | |
| def __init__( | |
| self, | |
| vocab_size, | |
| embedding_dim, | |
| hidden_size, | |
| output_size, | |
| num_layers=1, | |
| dropout=0.0, | |
| bidirectional=False, | |
| padding_idx=0, | |
| ): | |
| super().__init__() | |
| self.hidden_size = hidden_size | |
| self.num_layers = num_layers | |
| self.bidirectional = bidirectional | |
| self.embedding = nn.Embedding( | |
| vocab_size, embedding_dim, padding_idx=padding_idx | |
| ) | |
| self.lstm = nn.LSTM( | |
| input_size=embedding_dim, | |
| hidden_size=hidden_size, | |
| num_layers=num_layers, | |
| batch_first=True, | |
| dropout=dropout if num_layers > 1 else 0, | |
| bidirectional=bidirectional, | |
| ) | |
| fc_input_size = hidden_size * 2 if bidirectional else hidden_size | |
| self.fc = nn.Linear(fc_input_size, output_size) | |
| def forward(self, x): | |
| embedded = self.embedding(x) | |
| lstm_out, (h_n, c_n) = self.lstm(embedded) | |
| if self.bidirectional: | |
| h_final = torch.cat([h_n[-2], h_n[-1]], dim=1) | |
| else: | |
| h_final = h_n[-1] | |
| return self.fc(h_final) | |
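
# Shape walkthrough (illustrative, for the batch_first=True setup above; batch size
# and sequence length are arbitrary examples):
#   x:        (batch, seq_len)                               integer token ids
#   embedded: (batch, seq_len, embedding_dim)
#   lstm_out: (batch, seq_len, hidden_size * num_directions)
#   h_n:      (num_layers * num_directions, batch, hidden_size)
#             h_n[-1] (or the concatenation of h_n[-2] and h_n[-1] when bidirectional)
#             is the final hidden state fed to the linear head
#   logits:   (batch, output_size)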


def build_vocab(texts, max_vocab_size=10000):
    counter = Counter()
    for text in texts:
        counter.update(text.lower().split())
    vocab = {"<pad>": 0, "<unk>": 1}
    for word, _ in counter.most_common(max_vocab_size - 2):
        vocab[word] = len(vocab)
    return vocab


def tokenize_and_pad(texts, vocab, max_length=200):
    sequences = []
    for text in texts:
        tokens = text.lower().split()
        ids = [vocab.get(t, vocab["<unk>"]) for t in tokens[:max_length]]
        ids = ids + [vocab["<pad>"]] * (max_length - len(ids))
        sequences.append(ids)
    return torch.tensor(sequences, dtype=torch.long)
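
# Example with a hypothetical toy vocab (not the real IMDB vocab built above):
#   vocab = {"<pad>": 0, "<unk>": 1, "great": 2}
#   tokenize_and_pad(["great movie"], vocab, max_length=4)
#   -> tensor([[2, 1, 0, 0]])   # "movie" is out-of-vocabulary -> <unk>; rest is <pad>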


def load_imdb_data(max_samples=20000, max_length=200, max_vocab=10000):
    print("Loading IMDB dataset...")
    dataset = load_dataset("imdb")
    train_data = dataset["train"].shuffle(seed=42)
    test_data = dataset["test"].shuffle(seed=42)
    train_texts = train_data["text"][:max_samples]
    train_labels = train_data["label"][:max_samples]
    test_texts = test_data["text"][: max_samples // 4]
    test_labels = test_data["label"][: max_samples // 4]
    print(f" Train: {len(train_texts)}, Test: {len(test_texts)}")
    vocab = build_vocab(train_texts, max_vocab)
    print(f" Vocab: {len(vocab)}, Seq length: {max_length}")
    X_train = tokenize_and_pad(train_texts, vocab, max_length)
    X_test = tokenize_and_pad(test_texts, vocab, max_length)
    y_train = torch.tensor(train_labels, dtype=torch.long)
    y_test = torch.tensor(test_labels, dtype=torch.long)
    return X_train, y_train, X_test, y_test, vocab
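
# With the arguments used in __main__ below (max_samples=10000, max_length=100),
# X_train has shape (10000, 100) and y_train has shape (10000,); in the Hugging Face
# "imdb" dataset, label 0 is negative and label 1 is positive.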


def train_model(model, train_loader, test_loader, n_epochs=5, lr=0.001):
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    for epoch in range(n_epochs):
        model.train()
        total_loss, correct, total = 0, 0, 0
        for batch_texts, batch_labels in train_loader:
            optimizer.zero_grad()
            outputs = model(batch_texts)
            loss = criterion(outputs, batch_labels)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=5.0)
            optimizer.step()
            total_loss += loss.item()
            _, predicted = outputs.max(1)
            correct += (predicted == batch_labels).sum().item()
            total += batch_labels.size(0)
        train_acc = correct / total
        model.eval()
        test_correct, test_total = 0, 0
        with torch.no_grad():
            for batch_texts, batch_labels in test_loader:
                outputs = model(batch_texts)
                _, predicted = outputs.max(1)
                test_correct += (predicted == batch_labels).sum().item()
                test_total += batch_labels.size(0)
        test_acc = test_correct / test_total
        print(
            f" Epoch {epoch+1}/{n_epochs}: Loss={total_loss/len(train_loader):.4f}, "
            f"Train={train_acc:.3f}, Test={test_acc:.3f}"
        )
    return test_acc
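
# Note: nn.CrossEntropyLoss applies log-softmax internally, which is why
# LSTMClassifier.forward returns raw logits without a softmax.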


if __name__ == "__main__":
    print("=" * 60)
    print("LSTM Sentiment Classification - IMDB Dataset")
    print("=" * 60)

    # Load data
    X_train, y_train, X_test, y_test, vocab = load_imdb_data(
        max_samples=10000, max_length=100, max_vocab=5000
    )

    # Create data loaders (TensorDataset bundles tensors together)
    train_loader = DataLoader(
        TensorDataset(X_train, y_train), batch_size=64, shuffle=True
    )
    test_loader = DataLoader(TensorDataset(X_test, y_test), batch_size=64)

    # Create model
    model = LSTMClassifier(
        vocab_size=len(vocab),
        embedding_dim=32,
        hidden_size=32,
        output_size=2,
        padding_idx=vocab["<pad>"],
    )
    n_params = sum(p.numel() for p in model.parameters())
    print(f"\nModel: embedding=32, hidden=32, params={n_params:,}")

    # Train
    print("\nTraining...")
    final_acc = train_model(model, train_loader, test_loader, n_epochs=3, lr=0.001)
    print(f"\nFinal test accuracy: {final_acc:.1%}")