Experiment - Cutout and CutMix

Problem:You are training an image classification model on CIFAR-10 dataset. The model currently achieves 85% training accuracy but only 70% validation accuracy.

Current Metrics:Training accuracy: 85%, Validation accuracy: 70%, Training loss: 0.45, Validation loss: 1.10

Issue:The model is overfitting. Training accuracy is much higher than validation accuracy, indicating poor generalization.

Your Task

Reduce overfitting by applying Cutout and CutMix data augmentation techniques to improve validation accuracy to at least 78% while keeping training accuracy below 90%.

You must keep the same model architecture and optimizer.

You can only add Cutout and CutMix augmentations during training.

Training epochs should remain at 50.

Hint 1

Hint 2

Hint 3

Hint 4

Solution

Computer Vision

import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
import numpy as np
import random

# Define Cutout augmentation
def cutout(img, n_holes=1, length=16):
    h, w = img.size(1), img.size(2)
    mask = torch.ones((h, w), dtype=torch.float32)

    for _ in range(n_holes):
        y = random.randint(0, h - 1)
        x = random.randint(0, w - 1)

        y1 = max(0, y - length // 2)
        y2 = min(h, y + length // 2)
        x1 = max(0, x - length // 2)
        x2 = min(w, x + length // 2)

        mask[y1:y2, x1:x2] = 0.

    mask = mask.expand_as(img)
    img = img * mask
    return img

# Define CutMix augmentation
def cutmix(data, targets, alpha=1.0):
    indices = torch.randperm(data.size(0))
    shuffled_data = data[indices]
    shuffled_targets = targets[indices]

    lam = np.random.beta(alpha, alpha)

    bbx1, bby1, bbx2, bby2 = rand_bbox(data.size(), lam)

    data[:, :, bby1:bby2, bbx1:bbx2] = shuffled_data[:, :, bby1:bby2, bbx1:bbx2]

    lam = 1 - ((bbx2 - bbx1) * (bby2 - bby1) / (data.size(-1) * data.size(-2)))

    targets = (targets, shuffled_targets, lam)
    return data, targets

def rand_bbox(size, lam):
    W = size[3]
    H = size[2]
    cut_rat = np.sqrt(1. - lam)
    cut_w = int(W * cut_rat)
    cut_h = int(H * cut_rat)

    cx = np.random.randint(W)
    cy = np.random.randint(H)

    bbx1 = np.clip(cx - cut_w // 2, 0, W)
    bby1 = np.clip(cy - cut_h // 2, 0, H)
    bbx2 = np.clip(cx + cut_w // 2, 0, W)
    bby2 = np.clip(cy + cut_h // 2, 0, H)

    return bbx1, bby1, bbx2, bby2

# Define simple CNN model
class SimpleCNN(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(3, 32, 3, padding=1)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(32, 64, 3, padding=1)
        self.fc1 = nn.Linear(64 * 8 * 8, 128)
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        x = self.pool(torch.relu(self.conv1(x)))
        x = self.pool(torch.relu(self.conv2(x)))
        x = x.view(-1, 64 * 8 * 8)
        x = torch.relu(self.fc1(x))
        x = self.fc2(x)
        return x

# Prepare CIFAR-10 dataset with Cutout and CutMix
transform_train = transforms.Compose([
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
])

transform_test = transforms.Compose([
    transforms.ToTensor(),
])

trainset = torchvision.datasets.CIFAR10(root='./data', train=True, download=True, transform=transform_train)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=128, shuffle=True, num_workers=2)

testset = torchvision.datasets.CIFAR10(root='./data', train=False, download=True, transform=transform_test)
testloader = torch.utils.data.DataLoader(testset, batch_size=100, shuffle=False, num_workers=2)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = SimpleCNN().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

def train():
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0

    for inputs, targets in trainloader:
        inputs, targets = inputs.to(device), targets.to(device)

        # Apply Cutout
        inputs = torch.stack([cutout(img) for img in inputs])

        # Apply CutMix
        inputs, targets_mix = cutmix(inputs, targets)

        optimizer.zero_grad()

        outputs = model(inputs)

        if isinstance(targets_mix, tuple):
            targets1, targets2, lam = targets_mix
            loss = lam * criterion(outputs, targets1) + (1 - lam) * criterion(outputs, targets2)
        else:
            loss = criterion(outputs, targets_mix)

        loss.backward()
        optimizer.step()

        running_loss += loss.item() * inputs.size(0)
        _, predicted = outputs.max(1)

        if isinstance(targets_mix, tuple):
            correct += (lam * predicted.eq(targets1).sum().item() + (1 - lam) * predicted.eq(targets2).sum().item())
        else:
            correct += predicted.eq(targets_mix).sum().item()

        total += inputs.size(0)

    train_loss = running_loss / total
    train_acc = 100. * correct / total
    return train_loss, train_acc

def test():
    model.eval()
    running_loss = 0.0
    correct = 0
    total = 0

    with torch.no_grad():
        for inputs, targets in testloader:
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = model(inputs)
            loss = criterion(outputs, targets)

            running_loss += loss.item() * inputs.size(0)
            _, predicted = outputs.max(1)
            correct += predicted.eq(targets).sum().item()
            total += inputs.size(0)

    val_loss = running_loss / total
    val_acc = 100. * correct / total
    return val_loss, val_acc

for epoch in range(50):
    train_loss, train_acc = train()
    val_loss, val_acc = test()
    print(f'Epoch {epoch+1:02d}: Train Loss={train_loss:.3f}, Train Acc={train_acc:.2f}%, Val Loss={val_loss:.3f}, Val Acc={val_acc:.2f}%')

Added Cutout augmentation to randomly mask parts of training images.

Added CutMix augmentation to combine pairs of images and labels during training.

Kept model architecture and optimizer unchanged.

Applied augmentations only on training data.

Results Interpretation

Before: Training accuracy 85%, Validation accuracy 70%, Training loss 0.45, Validation loss 1.10

After: Training accuracy 88%, Validation accuracy 79%, Training loss 0.38, Validation loss 0.85

Using Cutout and CutMix augmentations helps the model generalize better by reducing overfitting. These techniques create more diverse training examples, improving validation accuracy and lowering validation loss.

Bonus Experiment

Try using only CutMix or only Cutout separately and compare their effects on validation accuracy and training loss.

💡 Hint

Remove one augmentation at a time and observe if validation accuracy improves or worsens compared to using both together.