PyTorch Inefficiency [closed]

6 hours ago 1
ARTICLE AD BOX

I’m working on a small language model project in Python using PyTorch, and I could really use some guidance on improving both performance and overall design.

Right now I’m trying to train a word-level LSTM on a collection of Wikipedia articles stored locally in a folder. The program loads the files, tokenizes them into words, builds a vocabulary, encodes everything into tensors, and then trains using random batches. It technically works, but it’s extremely slow—much slower than I expected—even when using a relatively small subset of the dataset.

Some specific issues I’m running into:

Data loading and preprocessing take a long time, especially when dealing with larger datasets (millions of tokens).

Training steps themselves are slow; even a few iterations can take minutes.

Memory usage seems high, especially when building the token list and vocabulary.

I’m not sure whether my batching approach (slicing tensors inside a Python loop) is inefficient.

I suspect I might be CPU-bound rather than actually using the GPU effectively.

For context, I’m still pretty new to Python and machine learning, so I wouldn’t be surprised if there are fundamental mistakes in how I structured this. I’m mainly trying to understand:

What are the biggest inefficiencies in this kind of setup?

How should large text datasets be handled properly (streaming, datasets, etc.)?

What’s the “correct” way to batch data for sequence models in PyTorch?

Are there major architectural issues with using a simple word-level LSTM like this?

How can I better utilize hardware (CPU vs GPU) for this kind of task?

import os import torch import torch.nn as nn import torch.optim as optim MODEL_PATH = "general_word_model.pt" DATA_FOLDER = "dataset" DEVICE = "cuda" if torch.cuda.is_available() else "cpu" BATCH_SIZE = 32 SEQ_LEN = 32 LR = 0.001 STEPS = 500 def iter_files(folder): for root, _, files in os.walk(folder): for file in files: yield os.path.join(root, file) def tokenize_streaming(folder): tokens = [] for path in iter_files(folder): try: with open(path, "r", encoding="utf-8", errors="ignore") as f: for line in f: line = line.replace("\n", " \n ") tokens.extend(line.split()) except Exception: continue return tokens def tokenize(text): text = text.replace("\n", " \n ") return text.split() def detokenize(tokens): return " ".join(tokens).replace(" \n ", "\n") class WordLSTM(nn.Module): def __init__(self, vocab_size, embed_size=128, hidden_size=256): super().__init__() self.embedding = nn.Embedding(vocab_size, embed_size) self.lstm = nn.LSTM(embed_size, hidden_size, batch_first=True) self.fc = nn.Linear(hidden_size, vocab_size) def forward(self, x, hidden=None): x = self.embedding(x) out, hidden = self.lstm(x, hidden) logits = self.fc(out) return logits, hidden def get_batch(data, seq_len=32, batch_size=32): ix = torch.randint(0, len(data) - seq_len - 1, (batch_size,)) x = torch.stack([data[i:i+seq_len] for i in ix]) y = torch.stack([data[i+1:i+seq_len+1] for i in ix]) return x, y tokens = tokenize_streaming(DATA_FOLDER) print(f"Dataset size: {len(tokens)} tokens") vocab = sorted(set(tokens)) stoi = {w: i for i, w in enumerate(vocab)} itos = {i: w for w, i in stoi.items()} data_encoded = torch.tensor([stoi[w] for w in tokens], dtype=torch.long) vocab_size = len(vocab) model = WordLSTM(vocab_size).to(DEVICE) if os.path.exists(MODEL_PATH): print("\n📦 Loading model...\n") checkpoint = torch.load(MODEL_PATH, map_location=DEVICE) model.load_state_dict(checkpoint["model_state"]) stoi = checkpoint["stoi"] itos = checkpoint["itos"] else: print("\n🧠 Training...\n") optimizer = 
optim.Adam(model.parameters(), lr=LR) loss_fn = nn.CrossEntropyLoss() scheduler = optim.lr_scheduler.StepLR( optimizer, step_size=5000, gamma=0.5 ) model.train() seed_texts = ["What is Albuquerque?"] for step in range(STEPS): x, y = get_batch(data_encoded, SEQ_LEN, BATCH_SIZE) x, y = x.to(DEVICE), y.to(DEVICE) logits, _ = model(x, None) loss = loss_fn( logits.view(-1, vocab_size), y.view(-1) ) optimizer.zero_grad() loss.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) optimizer.step() scheduler.step() if step % 5 == 0: print(f"\nStep {step}, Loss: {loss.item():.4f}") model.eval() for seed in seed_texts: tokens_in = tokenize(seed) idx = torch.tensor([[stoi.get(t, 0) for t in tokens_in]], dtype=torch.long).to(DEVICE) hidden = None out = [] for _ in range(40): logits, hidden = model(idx, hidden) probs = torch.softmax(logits[:, -1, :], dim=-1) next_id = torch.multinomial(probs, 1).item() out.append(next_id) idx = torch.cat([ idx, torch.tensor([[next_id]], device=DEVICE) ], dim=1) if idx.size(1) > SEQ_LEN: idx = idx[:, -SEQ_LEN:] print("Sample:", seed, detokenize([itos[i] for i in out])) model.train() torch.save({ "model_state": model.state_dict(), "stoi": stoi, "itos": itos, "vocab_size": vocab_size }, MODEL_PATH) print("\n💾 Model saved.\n") def chat(): print("\n💬 General Word LSTM Chat\n") print("Type 'exit' to quit\n") while True: user = input("You: ") if user.lower() == "exit": break tokens_in = tokenize(user) idx = torch.tensor([[stoi.get(t, 0) for t in tokens_in]], dtype=torch.long).to(DEVICE) model.eval() hidden = None out = [] for _ in range(60): logits, hidden = model(idx, hidden) probs = torch.softmax(logits[:, -1, :], dim=-1) next_id = torch.multinomial(probs, 1).item() out.append(next_id) idx = torch.cat([ idx, torch.tensor([[next_id]], device=DEVICE) ], dim=1) if idx.size(1) > SEQ_LEN: idx = idx[:, -SEQ_LEN:] print("\nAI:", detokenize([itos[i] for i in out]), "\n") chat()
Read Entire Article