Introduction
Weight decay helps prevent a model from memorizing its training data by keeping the weights small, which keeps the model simple.
Jump into concepts and practice - no test required
# General form: weight decay is enabled by passing `weight_decay` when the
# optimizer is constructed. `learning_rate` and `decay_rate` are placeholders
# that this snippet does not define; `model` must already exist.
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, weight_decay=decay_rate)
# Concrete SGD example: learning rate 0.1, weight decay 0.01.
optimizer = torch.optim.SGD(model.parameters(), lr=0.1, weight_decay=0.01)
# Concrete Adam example: learning rate 0.001, weight decay 0.0001.
# Each assignment rebinds `optimizer`, so only the last one stays in effect
# if these three lines are run together.
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=0.0001)
# Fit y = 2x + 1 with a one-feature linear model, trained for 10 epochs of
# full-batch SGD with weight decay, printing the loss after every epoch.
import torch
import torch.nn as nn
import torch.optim as optim


# Simple linear model
class SimpleModel(nn.Module):
    """Single ``nn.Linear(1, 1)`` layer: one input feature, one output."""

    def __init__(self):
        super().__init__()
        self.linear = nn.Linear(1, 1)

    def forward(self, x):
        """Return the linear layer applied to ``x``."""
        return self.linear(x)


# Create model
model = SimpleModel()

# Dummy data: y = 2x + 1
x_train = torch.tensor([[1.0], [2.0], [3.0], [4.0]])
y_train = torch.tensor([[3.0], [5.0], [7.0], [9.0]])

# Loss function
criterion = nn.MSELoss()

# Optimizer with weight decay (L2 regularization).
# model.parameters() yields both the weight and the bias of the linear
# layer, so the decay is applied to both.
optimizer = optim.SGD(model.parameters(), lr=0.1, weight_decay=0.01)

# Training loop
for epoch in range(10):
    optimizer.zero_grad()  # clear gradients left over from the previous step
    outputs = model(x_train)
    loss = criterion(outputs, y_train)
    loss.backward()
    optimizer.step()
    print(f"Epoch {epoch+1}, Loss: {loss.item():.4f}")

# Print final weights
for name, param in model.named_parameters():
    # detach() gives a view without autograd history; values are unchanged.
    print(f"{name}: {param.detach().flatten().tolist()}")
# Page text fused onto this line by extraction (leading words appear to be
# missing from each fragment):
#   "weight_decay to apply L2 regularization."
#   "weight_decay correctly."

# Measure how far a single SGD step with weight decay moves the weights of a
# 2-input linear layer, and print the total absolute change.
import torch

model = torch.nn.Linear(2, 1)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1, weight_decay=0.01)

# Snapshot the weights; clone() so the in-place optimizer step cannot alter it.
initial_weight = model.weight.detach().clone()

optimizer.zero_grad()
output = model(torch.tensor([[1.0, 2.0]]))
loss = output.sum()
loss.backward()  # gradient of the loss w.r.t. the weight is the input row [1.0, 2.0]
optimizer.step()

# No clone needed here: this is read immediately and never modified again.
updated_weight = model.weight.detach()

# Each weight moves by lr * (grad + weight_decay * weight), so the printed
# total is about 0.1 * (1 + 2) = 0.3 plus a small decay term that depends on
# the randomly initialised weights.
print((initial_weight - updated_weight).abs().sum().item())
# Adam with weight_decay=0.1, which is 1000x the 0.0001 used in the Adam
# example earlier on this page.
# NOTE(review): presumably shown as an overly strong setting; the surrounding
# explanation is not visible here, so confirm the intent. `model` must already
# be defined.
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=0.1)