import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
# Simulated user-item rating data.
np.random.seed(42)  # reproducible data generation
num_users, num_items = 100, 50
ratings = np.random.randint(1, 6, size=(num_users, num_items)).astype(float)

# Hide ~30% of the entries (zero = unobserved) to mimic missing ratings.
mask = np.random.rand(num_users, num_items) < 0.7
ratings_masked = ratings * mask  # boolean mask zeroes out the hidden cells

# Collect the (user, item) coordinates of every observed rating and
# split them 80/20 into train / validation index sets.
observed_indices = np.transpose(np.nonzero(ratings_masked))
train_idx, val_idx = train_test_split(observed_indices, test_size=0.2, random_state=42)
# Matrix factorization with regularization
class MatrixFactorization:
    """Matrix factorization via SGD with L2 regularization (Funk-SVD style).

    Learns user factors ``P`` (num_users x K) and item factors ``Q``
    (num_items x K) such that ``R[u, i] ~= P[u] . Q[i]`` on observed entries.
    Entries equal to 0 in ``R`` are treated as missing by the callers that
    build the index sets.
    """

    def __init__(self, R, K, alpha, beta, iterations):
        """
        Parameters
        ----------
        R : np.ndarray
            User-item rating matrix; 0 marks an unobserved entry.
        K : int
            Number of latent factors.
        alpha : float
            SGD learning rate.
        beta : float
            L2 regularization strength.
        iterations : int
            Maximum number of training epochs.
        """
        self.R = R
        self.num_users, self.num_items = R.shape
        self.K = K
        self.alpha = alpha
        self.beta = beta
        self.iterations = iterations

    def train(self, train_idx, val_idx):
        """Run SGD over the training entries.

        Parameters
        ----------
        train_idx, val_idx : array-like of (user, item) index pairs.

        Returns
        -------
        (float, float)
            Final training RMSE and validation RMSE. Also records the
            per-epoch histories in ``self.train_errors`` / ``self.val_errors``.
        """
        self.P = np.random.normal(scale=1. / self.K, size=(self.num_users, self.K))
        self.Q = np.random.normal(scale=1. / self.K, size=(self.num_items, self.K))
        self.train_errors = []
        self.val_errors = []
        for epoch in range(self.iterations):
            for u, i in train_idx:
                e_ui = self.R[u, i] - self.predict_single(u, i)
                # BUG FIX: snapshot P[u] so BOTH factor updates use the
                # pre-update values (simultaneous SGD step). Previously the
                # Q update consumed the already-updated P row, which is not
                # the gradient of the regularized squared error at the
                # current point.
                p_u = self.P[u, :].copy()
                self.P[u, :] += self.alpha * (e_ui * self.Q[i, :] - self.beta * p_u)
                self.Q[i, :] += self.alpha * (e_ui * p_u - self.beta * self.Q[i, :])
            self.train_errors.append(self.compute_rmse(train_idx))
            self.val_errors.append(self.compute_rmse(val_idx))
            # Early stopping: bail out once validation error turns upward
            # (after a few warm-up epochs).
            if epoch > 5 and self.val_errors[-1] > self.val_errors[-2]:
                break
        return self.train_errors[-1], self.val_errors[-1]

    def predict_single(self, u, i):
        """Predicted rating for user ``u`` on item ``i`` (factor dot product)."""
        # .T on a 1-D array is a no-op, so a plain dot product suffices.
        return np.dot(self.P[u, :], self.Q[i, :])

    def compute_rmse(self, idx):
        """Root-mean-square error over the (user, item) pairs in ``idx``."""
        errors = []
        for u, i in idx:
            errors.append((self.R[u, i] - self.predict_single(u, i)) ** 2)
        return np.sqrt(np.mean(errors))
# Hyperparameters tuned to reduce overfitting.
K = 10           # fewer latent factors
alpha = 0.005    # learning rate
beta = 0.1       # stronger regularization
iterations = 50  # max epochs (early stopping may cut training short)

# Fit the factorization on the masked matrix and report both errors.
mf = MatrixFactorization(ratings_masked, K, alpha, beta, iterations)
train_rmse, val_rmse = mf.train(train_idx, val_idx)

print(f"Training RMSE: {train_rmse:.3f}")
print(f"Validation RMSE: {val_rmse:.3f}")