import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset, random_split
import matplotlib.pyplot as plt
# ---------------------------------------------------------------------------
# Dummy dataset: synthetic "text" classification over random token ids.
# The label is a deterministic function of the tokens (parity of the token
# sum), so the task is learnable rather than pure noise.
# ---------------------------------------------------------------------------
torch.manual_seed(0)  # fix the RNG so data, split, and model init are reproducible

X = torch.randint(0, 1000, (1000, 50))  # 1000 samples, 50 token ids each, vocab 1000
Y = (X.sum(dim=1) % 2).long()           # binary label: parity of the token sum

dataset = TensorDataset(X, Y)
train_size = int(0.8 * len(dataset))    # 80/20 train/validation split
val_size = len(dataset) - train_size
train_ds, val_ds = random_split(dataset, [train_size, val_size])

train_loader = DataLoader(train_ds, batch_size=32, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=32, shuffle=False)
class SimpleLLM(nn.Module):
    """A tiny transformer-encoder classifier over token-id sequences.

    Pipeline: token embedding -> stack of TransformerEncoderLayers ->
    mean-pool over the sequence -> linear head producing 2 logits.
    """

    def __init__(self, vocab_size, embed_dim, num_layers, hidden_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        # One independent encoder layer per depth step (4 attention heads each).
        self.layers = nn.ModuleList(
            nn.TransformerEncoderLayer(d_model=embed_dim, nhead=4, dim_feedforward=hidden_dim)
            for _ in range(num_layers)
        )
        self.classifier = nn.Linear(embed_dim, 2)

    def forward(self, x):
        # (batch, seq) token ids -> (batch, seq, embed_dim)
        hidden = self.embedding(x)
        # Encoder layers default to (seq, batch, embed_dim) ordering.
        hidden = hidden.transpose(0, 1)
        for block in self.layers:
            hidden = block(hidden)
        # Mean-pool across the sequence dimension -> (batch, embed_dim).
        pooled = hidden.mean(dim=0)
        return self.classifier(pooled)
# Training function
def train_model(model, dataloader, epochs=5, lr=0.001):
    """Train `model` in place with Adam + cross-entropy loss.

    Args:
        model: module mapping a batch of inputs to (batch, num_classes) logits.
        dataloader: iterable of (inputs, labels) batches.
        epochs: number of full passes over `dataloader`.
        lr: Adam learning rate; new keyword, defaults to the previously
            hard-coded 0.001 so existing callers are unaffected.

    Returns:
        The same `model` instance, trained in place.
    """
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)
    model.train()
    for _ in range(epochs):
        for xb, yb in dataloader:
            optimizer.zero_grad()
            loss = criterion(model(xb), yb)
            loss.backward()
            optimizer.step()
    return model
# Evaluation function
def evaluate_model(model, dataloader):
    """Return the classification accuracy of `model` over `dataloader`, in percent."""
    model.eval()
    hits, seen = 0, 0
    with torch.no_grad():
        for inputs, labels in dataloader:
            guesses = model(inputs).argmax(dim=1)
            hits += (guesses == labels).sum().item()
            seen += labels.size(0)
    return hits / seen * 100
# Parameter counting
def count_params(model):
    """Return the total number of parameters in `model`, in millions."""
    total = 0
    for p in model.parameters():
        total += p.numel()
    return total / 1e6
# Model sizes to test
# Three model scales to sweep (roughly 0.1M / 0.7M / 3M parameters).
model_configs = [
    {"num_layers": 2, "hidden_dim": 128, "embed_dim": 64},   # ~0.1M params
    {"num_layers": 4, "hidden_dim": 256, "embed_dim": 128},  # ~0.7M params
    {"num_layers": 6, "hidden_dim": 512, "embed_dim": 256},  # ~3M params
]

results = []
for cfg in model_configs:
    net = SimpleLLM(
        vocab_size=1000,
        embed_dim=cfg["embed_dim"],
        num_layers=cfg["num_layers"],
        hidden_dim=cfg["hidden_dim"],
    )
    millions = count_params(net)
    net = train_model(net, train_loader, epochs=5)
    acc = evaluate_model(net, val_loader)
    results.append({"params_approx": f"{millions:.1f}M", "accuracy": acc})
# ---------------------------------------------------------------------------
# Report and plot validation accuracy versus model size.
# ---------------------------------------------------------------------------
for r in results:
    # Use single-quoted dict keys inside the double-quoted f-string: reusing
    # double quotes is a SyntaxError on Python < 3.12 (only legal via PEP 701).
    print(f"Model size approx: {r['params_approx']} params, Validation accuracy: {r['accuracy']:.2f}%")

# Recover the numeric size in millions by stripping the trailing "M".
sizes = [float(r["params_approx"][:-1]) for r in results]
accs = [r["accuracy"] for r in results]

plt.plot(sizes, accs, marker='o')
plt.xlabel('Model size (millions of params)')
plt.ylabel('Validation Accuracy (%)')
plt.title('LLM Scaling Law: Accuracy vs Model Size')
plt.grid(True)
plt.show()