AI Learning Hub Roadmap Resources Blog Projects Paths

PyTorch for AI Developers: Practical Guide from Tensors to Training

· 4 min read · AI Learning Hub

Why PyTorch?

PyTorch is the dominant framework for AI research and is rapidly taking over production ML too. HuggingFace, most LLM research, and a growing share of production systems run on PyTorch.

pip install torch torchvision transformers datasets

Tensors: NumPy with Superpowers

Tensors are like NumPy arrays, but they can also run on a GPU and track gradients for automatic differentiation.

import torch
import numpy as np

# Create tensors
x = torch.tensor([1.0, 2.0, 3.0])   # 1-D float tensor built from a Python list
y = torch.zeros(3, 4)               # 3x4 tensor of zeros
z = torch.randn(2, 3, 4)            # random normal, shape (2, 3, 4)

# Shape, dtype, device
print(x.shape)    # torch.Size([3])
print(z.shape)    # torch.Size([2, 3, 4])
print(x.dtype)    # torch.float32

# Move to GPU (if available)
# Preference order: NVIDIA CUDA, then Apple-Silicon MPS, then plain CPU.
device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"
x_gpu = x.to(device)   # x_gpu lives on `device`; the name `x` still refers to the CPU tensor

# Operations (same as NumPy)
a = torch.tensor([[1., 2.], [3., 4.]])
b = torch.tensor([[5., 6.], [7., 8.]])
print(a + b)          # element-wise
print(a @ b)          # matrix multiply
print(a.T)            # transpose
print(a.sum())        # sum of every element: 1 + 2 + 3 + 4 = 10
print(a.mean(dim=0))  # mean along first dimension: [2., 3.]

# Convert to/from NumPy
# Uses `x` (still on the CPU) rather than `x_gpu`, because .numpy() needs a CPU tensor.
# NOTE: per the PyTorch docs the array and the tensor share memory — confirm
# before mutating either one in place.
arr = x.numpy()         # tensor → numpy (CPU only)
t = torch.from_numpy(arr)  # numpy → tensor

Autograd: Automatic Differentiation

PyTorch tracks computations and computes gradients automatically:

# requires_grad=True tells PyTorch to track this tensor
# NOTE: `b` and `x` below rebind the names used in the tensor example above.
w = torch.tensor(3.0, requires_grad=True)
b = torch.tensor(1.0, requires_grad=True)

# Plain input: no requires_grad, so no gradient is computed for it
x = torch.tensor(2.0)

# Forward pass: compute output
y_pred = w * x + b   # y = 3*2 + 1 = 7
loss = (y_pred - 5.0) ** 2  # (7 - 5)^2 = 4

# Backward pass: compute gradients
# Fills in .grad on every tensor created with requires_grad=True (w and b).
loss.backward()

print(w.grad)  # d(loss)/dw = 2*(y_pred - 5) * x = 2*2*2 = 8
print(b.grad)  # d(loss)/db = 2*(y_pred - 5) * 1 = 4

# Update weights manually
with torch.no_grad():  # don't track gradient for this operation
    w -= 0.1 * w.grad   # gradient descent step: 3.0 - 0.1*8 = 2.2
    b -= 0.1 * b.grad   # 1.0 - 0.1*4 = 0.6
    w.grad.zero_()       # clear gradients (they accumulate otherwise!)
    b.grad.zero_()

Building Models with nn.Module

Every PyTorch model is an nn.Module:

import torch.nn as nn

class MLP(nn.Module):
    """Multi-layer perceptron.

    Every entry in ``hidden_sizes`` adds one block of
    ``Linear -> LayerNorm -> GELU -> Dropout``; a closing ``Linear`` maps the
    last hidden width to ``output_size``. The stack is stored as ``self.net``.
    """

    def __init__(self, input_size, hidden_sizes, output_size, dropout=0.3):
        super().__init__()
        # Widths seen along the stack: input first, then each hidden size.
        widths = [input_size, *hidden_sizes]
        modules = []
        # Walk consecutive width pairs: (input, h0), (h0, h1), ...
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            modules += [
                nn.Linear(fan_in, fan_out),
                nn.LayerNorm(fan_out),
                nn.GELU(),
                nn.Dropout(dropout),
            ]
        # Output projection from the last width (the input width when there
        # are no hidden layers).
        modules.append(nn.Linear(widths[-1], output_size))
        self.net = nn.Sequential(*modules)

    def forward(self, x):
        """Pass ``x`` through the layer stack and return its output."""
        return self.net(x)


# Two hidden blocks (256 then 128 units) mapping 128 input features to 10 outputs
model = MLP(input_size=128, hidden_sizes=[256, 128], output_size=10)
print(model)

# Count parameters
# Linear 128→256: 33,024 · LayerNorm(256): 512 · Linear 256→128: 32,896
# LayerNorm(128): 256 · Linear 128→10: 1,290
n_params = sum(p.numel() for p in model.parameters())
print(f"Parameters: {n_params:,}")   # 67,978

# Sample forward pass
x = torch.randn(32, 128)   # batch of 32
out = model(x)
print(out.shape)   # torch.Size([32, 10])

The Training Loop

import torch
from torch.utils.data import DataLoader, TensorDataset

def train(model, train_loader, val_loader, epochs=20, lr=1e-3):
    """Train ``model`` with cross-entropy loss and record per-epoch metrics.

    Uses AdamW (weight decay 0.01), a cosine-annealed learning rate spanning
    ``epochs`` steps, and gradient-norm clipping at 1.0. Batches are moved to
    the device the model's parameters already live on.

    Args:
        model: ``nn.Module`` with at least one parameter; updated in place.
        train_loader: iterable yielding ``(inputs, targets)`` training batches.
        val_loader: iterable yielding ``(inputs, targets)`` validation batches.
        epochs: number of passes over ``train_loader``.
        lr: initial learning rate for AdamW.

    Returns:
        dict with two lists, one entry per epoch:
        ``"train_loss"`` (mean loss over the epoch's batches) and
        ``"val_acc"`` (fraction of validation samples predicted correctly).
        An entry is NaN when the corresponding loader yielded no data.

    Raises:
        StopIteration: if ``model`` has no parameters.
    """
    device = next(model.parameters()).device
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=0.01)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs)
    # Spelled via `torch.nn` (as the clipping call below already is) so this
    # function only relies on the `torch` import.
    criterion = torch.nn.CrossEntropyLoss()
    history = {"train_loss": [], "val_acc": []}

    for epoch in range(epochs):
        # ── Training ──────────────────────────────────────────────
        model.train()   # enable dropout + batch norm training mode
        total_loss = 0.0
        # Count batches ourselves instead of calling len(train_loader), so
        # loaders without __len__ and empty loaders are both handled.
        num_batches = 0
        for x_batch, y_batch in train_loader:
            x_batch, y_batch = x_batch.to(device), y_batch.to(device)

            optimizer.zero_grad()           # clear previous gradients
            output = model(x_batch)         # forward pass
            loss = criterion(output, y_batch)  # compute loss
            loss.backward()                 # backpropagation
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)  # gradient clipping
            optimizer.step()                # update weights
            total_loss += loss.item()
            num_batches += 1

        # ── Validation ────────────────────────────────────────────
        model.eval()    # disable dropout
        correct = total = 0
        with torch.no_grad():   # no gradient tracking needed
            for x_val, y_val in val_loader:
                x_val, y_val = x_val.to(device), y_val.to(device)
                preds = model(x_val).argmax(dim=1)
                correct += (preds == y_val).sum().item()
                total += len(y_val)

        # NaN (rather than ZeroDivisionError) marks "no data this epoch".
        val_acc = correct / total if total else float("nan")
        avg_loss = total_loss / num_batches if num_batches else float("nan")
        scheduler.step()

        history["train_loss"].append(avg_loss)
        history["val_acc"].append(val_acc)

        # Progress report every fifth epoch
        if (epoch + 1) % 5 == 0:
            print(f"Epoch {epoch+1}/{epochs} | Loss: {avg_loss:.4f} | Val Acc: {val_acc:.1%}")

    return history


# Example usage
# Synthetic data: 1000 samples with 128 features and integer labels in [0, 10).
# The labels are random, so accuracy is only expected to be around chance level.
X = torch.randn(1000, 128)
y = torch.randint(0, 10, (1000,))
dataset = TensorDataset(X, y)
# 800 training samples / 200 validation samples
train_set, val_set = torch.utils.data.random_split(dataset, [800, 200])
train_loader = DataLoader(train_set, batch_size=32, shuffle=True)
val_loader = DataLoader(val_set, batch_size=64)

# `device` is the string chosen in the tensor example earlier in the article;
# train() reads the device back from the model's parameters.
model = MLP(128, [256, 128], 10).to(device)
history = train(model, train_loader, val_loader, epochs=20)

Saving and Loading Models

# Save: model weights only (recommended)
torch.save(model.state_dict(), "model.pt")

# Load: rebuild the same architecture, then copy the saved weights into it.
# weights_only=True restricts unpickling to tensors and plain containers,
# which is all a state_dict contains.
loaded_model = MLP(128, [256, 128], 10)
loaded_model.load_state_dict(
    torch.load("model.pt", map_location=device, weights_only=True)
)
loaded_model.eval()

# Save: full model (architecture + weights, less portable)
torch.save(model, "full_model.pt")
# A pickled nn.Module cannot be read with weights_only=True, which is the
# default from PyTorch 2.6 on, so it must be disabled explicitly here.
# WARNING: this runs arbitrary pickle code — only load files you trust.
loaded = torch.load("full_model.pt", weights_only=False)

GPU Acceleration

# Check available hardware
print(torch.cuda.is_available())          # NVIDIA GPU
print(torch.backends.mps.is_available())  # Apple Silicon

# Recommended device selection
# Same preference order as the one-liner in the tensor example:
# CUDA first, then MPS, then CPU as the fallback.
device = (
    "cuda" if torch.cuda.is_available()
    else "mps" if torch.backends.mps.is_available()
    else "cpu"
)

# Move model and data to device
# Both results are assigned back so `model` and `x` refer to the on-device objects.
model = model.to(device)
x = x.to(device)

# With NVIDIA: multi-GPU training
# Only wraps the model when more than one CUDA device is visible.
# NOTE(review): the PyTorch docs recommend DistributedDataParallel over
# DataParallel for multi-GPU training — confirm before using this in production.
if torch.cuda.device_count() > 1:
    model = nn.DataParallel(model)

Working with HuggingFace

Hugging Face Transformers uses PyTorch as its primary backend, so its pretrained models load as ordinary PyTorch modules:

from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Load pre-trained model
# NOTE: this rebinds `model`, replacing the MLP from the earlier sections.
model_name = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
model.eval()   # inference mode, as with the MLP above

# Inference
texts = ["This movie was fantastic!", "I hated every minute of it."]
# Tokenize both texts into one batch of PyTorch tensors ("pt"), padded and
# truncated so they fit together, with at most 128 tokens each.
inputs = tokenizer(texts, return_tensors="pt", truncation=True, padding=True, max_length=128)

with torch.no_grad():   # no gradients needed for inference
    outputs = model(**inputs)
    logits = outputs.logits
    predictions = logits.softmax(dim=-1)   # normalise scores over the last (class) dimension

# Assumes class index 0 is NEGATIVE and 1 is POSITIVE for this checkpoint —
# TODO confirm against model.config.id2label.
labels = ["NEGATIVE", "POSITIVE"]
for text, pred in zip(texts, predictions):
    label = labels[pred.argmax()]      # most probable class for this text
    confidence = pred.max().item()     # its probability as a Python float
    print(f"{label} ({confidence:.1%}): {text}")

Debugging Tips

# 1. Check for NaN in outputs
# `output` must be a tensor already in scope, e.g. place this right after
# `output = model(x_batch)` inside the training loop.
assert not torch.isnan(output).any(), "NaN in model output!"

# 2. Print shapes at each layer
class DebugNet(nn.Module):
    """Minimal module that prints tensor shapes as data flows through it.

    Args:
        in_features: width of the input expected by ``layer1``.
        out_features: width of the output produced by ``layer1``.
    """

    def __init__(self, in_features=128, out_features=64):
        super().__init__()
        # forward() reads self.layer1, so it has to be created here;
        # without it every call would fail with AttributeError.
        self.layer1 = nn.Linear(in_features, out_features)

    def forward(self, x):
        """Apply ``layer1`` to ``x``, printing the shape before and after."""
        print(f"Input: {x.shape}")
        x = self.layer1(x)
        print(f"After layer1: {x.shape}")
        return x

# 3. Check gradient flow
# Run this after loss.backward(); parameters whose .grad is still None are
# skipped rather than reported.
for name, param in model.named_parameters():
    if param.grad is not None:
        print(f"{name}: grad norm = {param.grad.norm():.4f}")

# 4. Memory usage on GPU
print(f"GPU memory: {torch.cuda.memory_allocated() / 1e9:.2f} GB")  # bytes → GB
torch.cuda.empty_cache()  # free cached memory

What to Learn Next

← Back to all articles