import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import numpy as np
# Dummy dataset for semantic search embeddings
class SemanticSearchDataset(Dataset):
    def __init__(self, queries, positives, negatives):
        self.queries = queries
        self.positives = positives
        self.negatives = negatives

    def __len__(self):
        return len(self.queries)

    def __getitem__(self, idx):
        return self.queries[idx], self.positives[idx], self.negatives[idx]
# Simple embedding model
class EmbeddingModel(nn.Module):
    def __init__(self, input_dim, embed_dim):
        super().__init__()
        self.fc = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Linear(128, embed_dim)
        )

    def forward(self, x):
        x = self.fc(x)
        x = nn.functional.normalize(x, p=2, dim=1)  # Normalize embeddings
        return x
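# Note: with unit-norm embeddings, the squared Euclidean distance between two
# vectors equals 2 - 2 * cosine_similarity, so the p=2 triplet loss below
# effectively ranks by cosine similarity.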
# Triplet loss function
triplet_loss = nn.TripletMarginLoss(margin=1.0, p=2)
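# TripletMarginLoss computes max(d(q, pos) - d(q, neg) + margin, 0), with d the
# p-norm distance: it pulls each query toward its positive and pushes the
# negative at least `margin` further away.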
# Example training loop
def train(model, dataloader, optimizer, epochs=10):
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for q, pos, neg in dataloader:
            optimizer.zero_grad()
            q_embed = model(q)
            pos_embed = model(pos)
            neg_embed = model(neg)
            loss = triplet_loss(q_embed, pos_embed, neg_embed)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch+1}, Loss: {total_loss/len(dataloader):.4f}")
# Simulated data (random vectors for example)
np.random.seed(0)
torch.manual_seed(0)  # the NumPy seed does not cover torch.randn below
queries = torch.tensor(np.random.rand(100, 50), dtype=torch.float32)
positives = queries + 0.05 * torch.randn(100, 50)  # small perturbations: "similar" vectors
negatives = torch.tensor(np.random.rand(100, 50), dtype=torch.float32)  # unrelated random vectors
# Dataset and loader
dataset = SemanticSearchDataset(queries, positives, negatives)
dataloader = DataLoader(dataset, batch_size=16, shuffle=True)
# Model, optimizer
model = EmbeddingModel(input_dim=50, embed_dim=32)
optimizer = optim.Adam(model.parameters(), lr=0.001)
# Train
train(model, dataloader, optimizer, epochs=20)
# After training, queries sit closer to their positives than to random negatives,
# which is the separation nearest-neighbour semantic search relies on.
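# A minimal retrieval sketch (illustrative only; it reuses `negatives` as a
# stand-in document collection and the first query as the search query).
# Because the model L2-normalizes its outputs, a dot product equals cosine similarity.
model.eval()
with torch.no_grad():
    corpus = model(negatives)              # embed the "documents", shape (100, 32)
    query_vec = model(queries[:1])         # embed one query, shape (1, 32)
    scores = query_vec @ corpus.T          # cosine similarities, shape (1, 100)
    top5 = torch.topk(scores, k=5, dim=1)  # indices of the 5 nearest documents
    print("Top-5 document indices:", top5.indices.tolist())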