import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
class SimpleTransformerEncoder(nn.Module):
    """Transformer encoder for binary sequence classification.

    Embeds token ids, adds fixed sinusoidal positional encodings, runs a
    stack of TransformerEncoder layers, mean-pools over the sequence, and
    projects to 2 logits.

    Fix vs. original: the original had no positional encoding, and since
    the output is mean-pooled over positions, the model was completely
    insensitive to token order.
    """

    def __init__(self, vocab_size, embed_dim, num_heads, hidden_dim, num_layers,
                 dropout=0.1, max_len=512):
        """
        Args:
            vocab_size: number of token ids in the embedding table.
            embed_dim: embedding / model dimension (d_model).
            num_heads: attention heads per encoder layer.
            hidden_dim: feed-forward width inside each encoder layer.
            num_layers: number of stacked encoder layers.
            dropout: dropout probability used throughout.
            max_len: longest sequence length supported by the positional table.
        """
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        # Fixed sinusoidal positional encodings (Vaswani et al., 2017).
        # Registered as a buffer so it follows .to(device) but is not trained.
        position = torch.arange(max_len, dtype=torch.float32).unsqueeze(1)
        freq = 10000.0 ** (torch.arange(0, embed_dim, 2, dtype=torch.float32) / embed_dim)
        pe = torch.zeros(max_len, embed_dim)
        pe[:, 0::2] = torch.sin(position / freq)
        pe[:, 1::2] = torch.cos(position / freq[: embed_dim // 2])  # handles odd embed_dim
        self.register_buffer("pos_encoding", pe)
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=embed_dim, nhead=num_heads,
            dim_feedforward=hidden_dim, dropout=dropout,
        )
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.fc = nn.Linear(embed_dim, 2)  # binary classification head
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Map token ids (batch_size, seq_len) to logits (batch_size, 2)."""
        # Scale embeddings by sqrt(d_model) as in the original Transformer paper.
        embedded = self.embedding(x) * (self.embedding.embedding_dim ** 0.5)
        # Inject order information for the first seq_len positions.
        embedded = embedded + self.pos_encoding[: x.size(1)]
        # nn.TransformerEncoder defaults to (seq_len, batch, embed_dim) layout.
        encoded = self.transformer_encoder(embedded.permute(1, 0, 2))
        pooled = encoded.mean(dim=0)  # average-pool over sequence positions
        return self.fc(self.dropout(pooled))
# Hyperparameters for the demo model and its synthetic dataset.
vocab_size, embed_dim, num_heads = 1000, 64, 4
hidden_dim, num_layers = 128, 2
batch_size, seq_len = 32, 10

def _random_split(n_samples):
    """Return (tokens, labels): random token-id rows with binary labels."""
    tokens = torch.randint(0, vocab_size, (n_samples, seq_len))
    labels = torch.randint(0, 2, (n_samples,))
    return tokens, labels

# 500 training / 100 validation "sentences" simulating tokenized text.
X_train, y_train = _random_split(500)
X_val, y_val = _random_split(100)
train_dataset = TensorDataset(X_train, y_train)
val_dataset = TensorDataset(X_val, y_val)
# Shuffle only the training split; validation order is irrelevant.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)
# Build the classifier with a heavier dropout than the class default.
model = SimpleTransformerEncoder(
    vocab_size, embed_dim, num_heads, hidden_dim, num_layers, dropout=0.3
)
# Cross-entropy over the two output logits; AdamW with mild weight decay.
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=5e-4, weight_decay=1e-4)
# Training loop with early stopping on validation accuracy.
# Fix vs. original: the original tracked best_val_acc but never saved the
# corresponding weights, so after the patience-triggered break the model
# held the *last* (worse) epoch's parameters. We now snapshot the best
# state_dict and restore it once training stops.
best_val_acc = 0.0
best_state = None  # weights at the best validation accuracy seen so far
patience = 3       # epochs without improvement tolerated before stopping
trigger_times = 0
for epoch in range(30):
    # ---- training pass ----
    model.train()
    total_loss = 0.0
    correct = 0
    total = 0
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        # Weight batch loss by batch size so the epoch average is exact
        # even when the last batch is smaller.
        total_loss += loss.item() * inputs.size(0)
        preds = outputs.argmax(dim=1)
        correct += (preds == labels).sum().item()
        total += labels.size(0)
    train_loss = total_loss / total
    train_acc = correct / total * 100
    # ---- validation pass ----
    model.eval()
    val_loss = 0.0
    val_correct = 0
    val_total = 0
    with torch.no_grad():
        for inputs, labels in val_loader:
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            val_loss += loss.item() * inputs.size(0)
            preds = outputs.argmax(dim=1)
            val_correct += (preds == labels).sum().item()
            val_total += labels.size(0)
    val_loss /= val_total
    val_acc = val_correct / val_total * 100
    # ---- early stopping bookkeeping ----
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        # Clone every tensor so later optimizer steps can't mutate the snapshot.
        best_state = {k: v.detach().clone() for k, v in model.state_dict().items()}
        trigger_times = 0
    else:
        trigger_times += 1
        if trigger_times >= patience:
            break
# Restore the weights that achieved the best validation accuracy.
if best_state is not None:
    model.load_state_dict(best_state)
print(f"Final Training Accuracy: {train_acc:.2f}%")
print(f"Final Validation Accuracy: {val_acc:.2f}%")
print(f"Final Training Loss: {train_loss:.4f}")
print(f"Final Validation Loss: {val_loss:.4f}")