PyTorch: From Tensors to Training Loops for Developers (2026)
Why PyTorch?
PyTorch is the dominant framework for AI research and is rapidly taking over production ML too. HuggingFace, most LLM research, and a growing share of production systems run on PyTorch.
Bash
pip install torch torchvision transformers datasets

Tensors: NumPy with Superpowers
Tensors are like NumPy arrays but can run on GPU and track gradients.
Python
import torch
import numpy as np
# --- Creating tensors ---
x = torch.tensor([1.0, 2.0, 3.0])  # 1-D tensor built from a Python list
y = torch.zeros(3, 4)  # 3x4 tensor filled with zeros
z = torch.randn(2, 3, 4)  # standard-normal samples, shape (2, 3, 4)

# --- Inspecting shape and dtype ---
print(x.shape)  # torch.Size([3])
print(z.shape)  # torch.Size([2, 3, 4])
print(x.dtype)  # torch.float32

# --- Picking a device: CUDA first, then Apple MPS, otherwise CPU ---
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
x_gpu = x.to(device)

# --- Arithmetic (the operators mirror NumPy) ---
a = torch.tensor([[1.0, 2.0], [3.0, 4.0]])
b = torch.tensor([[5.0, 6.0], [7.0, 8.0]])
print(a + b)  # element-wise addition
print(a @ b)  # matrix product
print(a.T)  # transpose
print(a.sum())  # sum of all elements: 10
print(a.mean(dim=0))  # column means (reduces the first dimension)

# --- Tensor -> NumPy (x lives on the CPU here) ---
arr = x.numpy()
t = torch.from_numpy(arr) # numpy → tensor

Autograd: Automatic Differentiation
PyTorch tracks computations and computes gradients automatically:
Python
# requires_grad=True tells PyTorch to record operations on this tensor
w = torch.tensor(3.0, requires_grad=True)
b = torch.tensor(1.0, requires_grad=True)
x = torch.tensor(2.0)  # plain input: no gradient is tracked for it

# Forward pass: compute output
y_pred = w * x + b  # y = 3*2 + 1 = 7
loss = (y_pred - 5.0) ** 2  # (7 - 5)^2 = 4

# Backward pass: populates w.grad and b.grad
loss.backward()
print(w.grad)  # d(loss)/dw = 2*(y_pred - 5) * x = 2*2*2 = 8
print(b.grad)  # d(loss)/db = 2*(y_pred - 5) * 1 = 4

# Update weights manually. The in-place updates must be indented under
# no_grad(): modifying a leaf tensor that requires grad in place is an error
# while autograd is recording.
with torch.no_grad():
    w -= 0.1 * w.grad  # gradient descent step: 3 - 0.1*8 = 2.2
    b -= 0.1 * b.grad  # 1 - 0.1*4 = 0.6

# clear gradients (they accumulate otherwise!)
w.grad.zero_()
b.grad.zero_()

Building Models with nn.Module
Every PyTorch model is an nn.Module:
Python
import torch.nn as nn
class MLP(nn.Module):
    """Multi-layer perceptron.

    Each hidden layer is ``Linear -> LayerNorm -> GELU -> Dropout``; a final
    ``Linear`` maps the last hidden width to ``output_size``.

    Args:
        input_size: Number of input features.
        hidden_sizes: Width of each hidden layer, in order. May be empty, in
            which case the network is a single linear layer.
        output_size: Number of output features.
        dropout: Dropout probability applied after every hidden activation.
    """

    def __init__(
        self,
        input_size: int,
        hidden_sizes: list[int],
        output_size: int,
        dropout: float = 0.3,
    ) -> None:
        super().__init__()
        layers = []
        prev_size = input_size
        for h in hidden_sizes:
            layers.extend([
                nn.Linear(prev_size, h),
                nn.LayerNorm(h),
                nn.GELU(),
                nn.Dropout(dropout),
            ])
            prev_size = h  # the next layer consumes this layer's output
        layers.append(nn.Linear(prev_size, output_size))
        self.net = nn.Sequential(*layers)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Run ``x`` through the stacked layers and return the result."""
        return self.net(x)
model = MLP(input_size=128, hidden_sizes=[256, 128], output_size=10)
print(model)

# Count parameters: three Linear layers (33,024 + 32,896 + 1,290) plus two
# LayerNorm layers (512 + 256)
n_params = sum(p.numel() for p in model.parameters())
print(f"Parameters: {n_params:,}")  # 67,978

# Sample forward pass
x = torch.randn(32, 128)  # batch of 32
out = model(x)
print(out.shape) # (32, 10)

The Training Loop
Python
import torch
from torch.utils.data import DataLoader, TensorDataset
def train(model, train_loader, val_loader, epochs=20, lr=1e-3):
    """Train a classifier with AdamW and record validation accuracy per epoch.

    Uses cross-entropy loss, cosine learning-rate annealing over ``epochs``
    and gradient-norm clipping at 1.0. Batches are moved to the device of the
    model's first parameter.

    Args:
        model: Module mapping an input batch to class logits.
        train_loader: Iterable of ``(inputs, targets)`` training batches.
        val_loader: Iterable of ``(inputs, targets)`` validation batches.
        epochs: Number of passes over ``train_loader``.
        lr: Initial learning rate for AdamW.

    Returns:
        Dict with two lists holding one entry per epoch: ``"train_loss"``
        (mean training loss per sample) and ``"val_acc"`` (fraction of
        validation samples classified correctly). An entry is ``nan`` when
        the corresponding loader yielded no samples.
    """
    device = next(model.parameters()).device
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=0.01)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs)
    criterion = torch.nn.CrossEntropyLoss()
    history = {"train_loss": [], "val_acc": []}

    for epoch in range(epochs):
        # ── Training ──────────────────────────────────────────────
        model.train()  # enable dropout + batch norm training mode
        total_loss = 0.0
        seen = 0
        for x_batch, y_batch in train_loader:
            x_batch, y_batch = x_batch.to(device), y_batch.to(device)
            optimizer.zero_grad()  # clear previous gradients
            output = model(x_batch)  # forward pass
            loss = criterion(output, y_batch)  # mean loss over this batch
            loss.backward()  # backpropagation
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)  # gradient clipping
            optimizer.step()  # update weights
            # Weight each batch mean by its size so a short final batch does
            # not skew the epoch average.
            batch_size = y_batch.size(0)
            total_loss += loss.item() * batch_size
            seen += batch_size

        # ── Validation ────────────────────────────────────────────
        model.eval()  # disable dropout
        correct = total = 0
        with torch.no_grad():  # no gradient tracking needed
            for x_val, y_val in val_loader:
                x_val, y_val = x_val.to(device), y_val.to(device)
                preds = model(x_val).argmax(dim=1)
                correct += (preds == y_val).sum().item()
                total += y_val.size(0)

        # An empty loader yields nan instead of raising ZeroDivisionError.
        val_acc = correct / total if total else float("nan")
        avg_loss = total_loss / seen if seen else float("nan")
        scheduler.step()
        history["train_loss"].append(avg_loss)
        history["val_acc"].append(val_acc)
        if (epoch + 1) % 5 == 0:
            print(f"Epoch {epoch+1}/{epochs} | Loss: {avg_loss:.4f} | Val Acc: {val_acc:.1%}")
    return history
# Example usage: 1,000 random 128-feature samples with labels in [0, 10)
X = torch.randn(size=(1000, 128))
y = torch.randint(low=0, high=10, size=(1000,))
dataset = TensorDataset(X, y)

# 800 samples for training, 200 for validation
train_set, val_set = torch.utils.data.random_split(dataset, lengths=[800, 200])
train_loader = DataLoader(dataset=train_set, batch_size=32, shuffle=True)
val_loader = DataLoader(dataset=val_set, batch_size=64)

model = MLP(input_size=128, hidden_sizes=[256, 128], output_size=10).to(device)
history = train(model, train_loader, val_loader, epochs=20)

Saving and Loading Models
Python
# Save: model weights only (recommended)
torch.save(model.state_dict(), "model.pt")
# Load
loaded_model = MLP(128, [256, 128], 10)
loaded_model.load_state_dict(torch.load("model.pt", map_location=device))
loaded_model.eval()
# Save: full model (architecture + weights, less portable)
torch.save(model, "full_model.pt")
loaded = torch.load("full_model.pt")GPU Acceleration
Python
# Report which accelerators this machine exposes
has_cuda = torch.cuda.is_available()  # NVIDIA GPU
has_mps = torch.backends.mps.is_available()  # Apple Silicon
print(has_cuda)
print(has_mps)

# Recommended device selection: CUDA first, then MPS, otherwise CPU
if has_cuda:
    device = "cuda"
elif has_mps:
    device = "mps"
else:
    device = "cpu"

# Model and data must live on the same device
model = model.to(device)
x = x.to(device)
# With NVIDIA: multi-GPU training
if torch.cuda.device_count() > 1:
    model = nn.DataParallel(model)

Working with HuggingFace
HuggingFace Transformers is built on top of PyTorch:
Python
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
# Load pre-trained model
model_name = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
model.eval()

# Inference
texts = ["This movie was fantastic!", "I hated every minute of it."]
inputs = tokenizer(texts, return_tensors="pt", truncation=True, padding=True, max_length=128)
with torch.no_grad():  # only the forward pass needs to run without autograd
    outputs = model(**inputs)
logits = outputs.logits
predictions = logits.softmax(dim=-1)  # one probability row per input text

# Read the class names from the checkpoint's config instead of hard-coding
# their order (for this checkpoint: ["NEGATIVE", "POSITIVE"]).
labels = [model.config.id2label[i] for i in range(model.config.num_labels)]
for text, pred in zip(texts, predictions):
    label = labels[pred.argmax()]
    confidence = pred.max().item()
    print(f"{label} ({confidence:.1%}): {text}")

Debugging Tips
Python
# 1. Fail fast if the model produced any NaN values
assert not output.isnan().any(), "NaN in model output!"
# 2. Print shapes at each layer
class DebugNet(nn.Module):
    """Single-layer module that prints tensor shapes as data flows through.

    Args:
        in_features: Size of the last dimension of the input.
        out_features: Size of the last dimension of the output.
    """

    def __init__(self, in_features: int = 128, out_features: int = 64) -> None:
        super().__init__()
        # forward() reads self.layer1, so it has to be registered here.
        self.layer1 = nn.Linear(in_features, out_features)

    def forward(self, x):
        """Apply ``layer1`` to ``x``, printing the shape before and after."""
        print(f"Input: {x.shape}")
        x = self.layer1(x)
        print(f"After layer1: {x.shape}")
        return x
# 3. Check gradient flow (parameters whose .grad is still None are skipped)
for name, param in model.named_parameters():
    if param.grad is not None:
        print(f"{name}: grad norm = {param.grad.norm():.4f}")

# 4. Memory usage on GPU (bytes held by tensors, reported in GB)
print(f"GPU memory: {torch.cuda.memory_allocated() / 1e9:.2f} GB")
torch.cuda.empty_cache() # free cached memory

What to Learn Next
- How transformers work → How LLMs Work
- Fine-tuning LLMs with HuggingFace → Fine-Tuning LLMs Guide
- Build a project → RAG Document Assistant