LSTM Sentiment Classifier for IMDB Reviews - PyTorch Implementation
| """ | |
| PyTorch Built-in LSTM for IMDB Sentiment Classification | |
| Demonstrates nn.LSTM with a real NLP task. | |
| """ | |
| import torch | |
| import torch.nn as nn | |
| from torch.utils.data import DataLoader, TensorDataset | |
| from datasets import load_dataset | |
| from collections import Counter | |
| import numpy as np | |
| torch.manual_seed(42) | |
| np.random.seed(42) | |
| class LSTMClassifier(nn.Module): | |
| """LSTM sentiment classifier using PyTorch's nn.LSTM.""" | |
| def __init__( | |
| self, | |
| vocab_size, | |
| embedding_dim, | |
| hidden_size, | |
| output_size, | |
| num_layers=1, | |
| dropout=0.0, | |
| bidirectional=False, | |
| padding_idx=0, | |
| ): | |
| super().__init__() | |
| self.hidden_size = hidden_size | |
| self.num_layers = num_layers | |
| self.bidirectional = bidirectional | |
| self.embedding = nn.Embedding( | |
| vocab_size, embedding_dim, padding_idx=padding_idx | |
| ) | |
| self.lstm = nn.LSTM( | |
| input_size=embedding_dim, | |
| hidden_size=hidden_size, | |
| num_layers=num_layers, | |
| batch_first=True, | |
| dropout=dropout if num_layers > 1 else 0, | |
| bidirectional=bidirectional, | |
| ) | |
| fc_input_size = hidden_size * 2 if bidirectional else hidden_size | |
| self.fc = nn.Linear(fc_input_size, output_size) | |
| def forward(self, x): | |
| embedded = self.embedding(x) | |
| lstm_out, (h_n, c_n) = self.lstm(embedded) | |
| if self.bidirectional: | |
| h_final = torch.cat([h_n[-2], h_n[-1]], dim=1) | |
| else: | |
| h_final = h_n[-1] | |
| return self.fc(h_final) | |
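
# Shape walkthrough (illustrative, for the batch_first=True setup above; batch size
# and sequence length are arbitrary examples):
#   x:        (batch, seq_len)                               integer token ids
#   embedded: (batch, seq_len, embedding_dim)
#   lstm_out: (batch, seq_len, hidden_size * num_directions)
#   h_n:      (num_layers * num_directions, batch, hidden_size)
#             h_n[-1] (or the concatenation of h_n[-2] and h_n[-1] when bidirectional)
#             is the final hidden state fed to the linear head
#   logits:   (batch, output_size)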


def build_vocab(texts, max_vocab_size=10000):
    counter = Counter()
    for text in texts:
        counter.update(text.lower().split())
    vocab = {"<pad>": 0, "<unk>": 1}
    for word, _ in counter.most_common(max_vocab_size - 2):
        vocab[word] = len(vocab)
    return vocab


def tokenize_and_pad(texts, vocab, max_length=200):
    sequences = []
    for text in texts:
        tokens = text.lower().split()
        ids = [vocab.get(t, vocab["<unk>"]) for t in tokens[:max_length]]
        ids = ids + [vocab["<pad>"]] * (max_length - len(ids))
        sequences.append(ids)
    return torch.tensor(sequences, dtype=torch.long)
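
# Example with a hypothetical toy vocab (not the real IMDB vocab built above):
#   vocab = {"<pad>": 0, "<unk>": 1, "great": 2}
#   tokenize_and_pad(["great movie"], vocab, max_length=4)
#   -> tensor([[2, 1, 0, 0]])   # "movie" is out-of-vocabulary -> <unk>; rest is <pad>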


def load_imdb_data(max_samples=20000, max_length=200, max_vocab=10000):
    print("Loading IMDB dataset...")
    dataset = load_dataset("imdb")
    train_data = dataset["train"].shuffle(seed=42)
    test_data = dataset["test"].shuffle(seed=42)
    train_texts = train_data["text"][:max_samples]
    train_labels = train_data["label"][:max_samples]
    test_texts = test_data["text"][: max_samples // 4]
    test_labels = test_data["label"][: max_samples // 4]
    print(f" Train: {len(train_texts)}, Test: {len(test_texts)}")
    vocab = build_vocab(train_texts, max_vocab)
    print(f" Vocab: {len(vocab)}, Seq length: {max_length}")
    X_train = tokenize_and_pad(train_texts, vocab, max_length)
    X_test = tokenize_and_pad(test_texts, vocab, max_length)
    y_train = torch.tensor(train_labels, dtype=torch.long)
    y_test = torch.tensor(test_labels, dtype=torch.long)
    return X_train, y_train, X_test, y_test, vocab
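
# With the arguments used in __main__ below (max_samples=10000, max_length=100),
# X_train has shape (10000, 100) and y_train has shape (10000,); in the Hugging Face
# "imdb" dataset, label 0 is negative and label 1 is positive.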


def train_model(model, train_loader, test_loader, n_epochs=5, lr=0.001):
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    for epoch in range(n_epochs):
        model.train()
        total_loss, correct, total = 0, 0, 0
        for batch_texts, batch_labels in train_loader:
            optimizer.zero_grad()
            outputs = model(batch_texts)
            loss = criterion(outputs, batch_labels)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=5.0)
            optimizer.step()
            total_loss += loss.item()
            _, predicted = outputs.max(1)
            correct += (predicted == batch_labels).sum().item()
            total += batch_labels.size(0)
        train_acc = correct / total
        model.eval()
        test_correct, test_total = 0, 0
        with torch.no_grad():
            for batch_texts, batch_labels in test_loader:
                outputs = model(batch_texts)
                _, predicted = outputs.max(1)
                test_correct += (predicted == batch_labels).sum().item()
                test_total += batch_labels.size(0)
        test_acc = test_correct / test_total
        print(
            f" Epoch {epoch+1}/{n_epochs}: Loss={total_loss/len(train_loader):.4f}, "
            f"Train={train_acc:.3f}, Test={test_acc:.3f}"
        )
    return test_acc
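
# Note: nn.CrossEntropyLoss applies log-softmax internally, which is why
# LSTMClassifier.forward returns raw logits without a softmax.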


if __name__ == "__main__":
    print("=" * 60)
    print("LSTM Sentiment Classification - IMDB Dataset")
    print("=" * 60)

    # Load data
    X_train, y_train, X_test, y_test, vocab = load_imdb_data(
        max_samples=10000, max_length=100, max_vocab=5000
    )

    # Create data loaders (TensorDataset bundles tensors together)
    train_loader = DataLoader(
        TensorDataset(X_train, y_train), batch_size=64, shuffle=True
    )
    test_loader = DataLoader(TensorDataset(X_test, y_test), batch_size=64)

    # Create model
    model = LSTMClassifier(
        vocab_size=len(vocab),
        embedding_dim=32,
        hidden_size=32,
        output_size=2,
        padding_idx=vocab["<pad>"],
    )
    n_params = sum(p.numel() for p in model.parameters())
    print(f"\nModel: embedding=32, hidden=32, params={n_params:,}")

    # Train
    print("\nTraining...")
    final_acc = train_model(model, train_loader, test_loader, n_epochs=3, lr=0.001)
    print(f"\nFinal test accuracy: {final_acc:.1%}")