import random

import torch
import torchvision
import torchvision.transforms as T
from torch.utils.data import DataLoader
from torchvision.datasets import VOCDetection
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torchvision.transforms import functional as TF
# --- Data pipeline -----------------------------------------------------------
# BUG FIX: VOCDetection's `transforms=` argument expects a JOINT callable with
# signature (image, target) -> (image, target).  The original code passed
# one-argument classification transforms (T.Compose / T.ToTensor) there, which
# raises a TypeError the first time a sample is fetched.  Worse, a plain
# RandomHorizontalFlip would flip the image while leaving the VOC bounding
# boxes untouched, silently corrupting the training targets.  The callables
# below transform image and annotation together.

def _hflip_voc_target(target, image_width):
    """Mirror every bndbox in a VOC annotation dict around the vertical axis.

    Mutates and returns `target`.  VOC stores coordinates as strings, so the
    new values are written back as strings to keep the format unchanged.
    """
    objs = target['annotation']['object']
    if not isinstance(objs, list):  # single-object images are a bare dict
        objs = [objs]
    for obj in objs:
        bb = obj['bndbox']
        xmin, xmax = float(bb['xmin']), float(bb['xmax'])
        # xmin and xmax swap roles after mirroring.
        bb['xmin'] = str(image_width - xmax)
        bb['xmax'] = str(image_width - xmin)
    return target


def train_transforms(image, target):
    """Joint train-time transform: random horizontal flip (p=0.5) + ToTensor."""
    if random.random() < 0.5:
        target = _hflip_voc_target(target, image.width)
        image = TF.hflip(image)
    return TF.to_tensor(image), target


def val_transforms(image, target):
    """Joint eval-time transform: ToTensor only, no augmentation."""
    return TF.to_tensor(image), target


# Load the Pascal VOC 2007 splits; `transforms=` applies the joint callables.
train_dataset = VOCDetection('./data', year='2007', image_set='train', download=True, transforms=train_transforms)
val_dataset = VOCDetection('./data', year='2007', image_set='val', download=True, transforms=val_transforms)


def _detection_collate(batch):
    """Collate variable-sized detection samples into (images, targets) tuples
    instead of stacking; a named function (unlike a lambda) is also picklable
    if num_workers is ever raised above 0."""
    return tuple(zip(*batch))


train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True, collate_fn=_detection_collate)
val_loader = DataLoader(val_dataset, batch_size=4, shuffle=False, collate_fn=_detection_collate)
# --- Model -------------------------------------------------------------------
# Start from a COCO-pretrained Faster R-CNN and swap in a fresh box predictor
# sized for Pascal VOC: 20 object categories plus 1 background slot.
model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True)
num_classes = 21  # 20 VOC classes + background
feature_dim = model.roi_heads.box_predictor.cls_score.in_features
model.roi_heads.box_predictor = FastRCNNPredictor(feature_dim, num_classes)

# Train on GPU when available, otherwise fall back to CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
# --- Pascal VOC label space --------------------------------------------------
# Index 0 is reserved for the implicit background class used by Faster R-CNN;
# the 20 VOC object categories follow.
CLASSES = [
    "__background__",
    "aeroplane", "bicycle", "bird", "boat", "bottle",
    "bus", "car", "cat", "chair", "cow",
    "diningtable", "dog", "horse", "motorbike", "person",
    "pottedplant", "sheep", "sofa", "train", "tvmonitor",
]
# Reverse lookup: class name -> integer label expected by the model.
class_to_idx = dict(zip(CLASSES, range(len(CLASSES))))
# --- Optimization ------------------------------------------------------------
# SGD over only the trainable parameters, with momentum and L2 weight decay.
optimizer = torch.optim.SGD(
    (p for p in model.parameters() if p.requires_grad),
    lr=0.005,
    momentum=0.9,
    weight_decay=0.0005,
)
# Step decay: cut the learning rate by 10x every 3 epochs.
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.1)
num_epochs = 10


def _voc_dict_to_tensors(annotation, device):
    """Convert one raw VOCDetection target into the {'boxes', 'labels'} dict
    of tensors that torchvision detection models expect.

    `annotation` has the shape {'annotation': {'object': ...}} where 'object'
    is a single dict for one instance or a list of dicts for several; bndbox
    coordinates arrive as strings and are parsed to floats here.
    """
    objs = annotation['annotation']['object']
    if not isinstance(objs, list):  # single-object images are not wrapped in a list
        objs = [objs]
    boxes = [
        [float(o['bndbox'][k]) for k in ('xmin', 'ymin', 'xmax', 'ymax')]
        for o in objs
    ]
    labels = [class_to_idx[o['name']] for o in objs]
    return {
        'boxes': torch.as_tensor(boxes, dtype=torch.float32).to(device),
        'labels': torch.as_tensor(labels, dtype=torch.int64).to(device),
    }


for epoch in range(num_epochs):
    model.train()
    for images, targets in train_loader:
        images = [img.to(device) for img in images]
        targets_formatted = [_voc_dict_to_tensors(t, device) for t in targets]

        # In train mode, torchvision detection models return a dict of losses;
        # optimize their sum.
        loss_dict = model(images, targets_formatted)
        losses = sum(loss for loss in loss_dict.values())

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()
    # Advance the LR schedule once per epoch (not per batch).
    lr_scheduler.step()
# NOTE(review): evaluation (mAP on val_loader) is still omitted, as in the
# original; add it before relying on these checkpoints.