import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, log_loss
# Build a reproducible synthetic dataset: one categorical column, one numeric
# column, and a binary target whose positive rate depends on the category.
np.random.seed(42)
n_rows = 1000
category_draws = np.random.choice(['A', 'B', 'C', 'D'], size=n_rows)
numeric_draws = np.random.randn(n_rows)
data = pd.DataFrame({'category': category_draws, 'feature_num': numeric_draws})

# Probability of a positive label for each category.
cat_probs = {'A': 0.1, 'B': 0.3, 'C': 0.7, 'D': 0.9}
uniform_draws = np.random.rand(n_rows)
positive_rate = data['category'].map(cat_probs).values
data['target'] = (uniform_draws < positive_rate).astype(int)

# Hold out 20% of the rows for validation.
train_df, val_df = train_test_split(data, test_size=0.2, random_state=42)
# Target encoder class with sigmoid-weighted smoothing toward the global mean
class TargetEncoder:
    """Encode a categorical Series as smoothed per-category target means.

    Each category's mean target is blended with the global target mean. The
    weight given to the category's own mean is
    ``1 / (1 + exp(-(count - smoothing)))``, where ``count`` is the number of
    training rows in that category. A category seen exactly ``smoothing``
    times is weighted 50/50; rarer categories lean toward the global mean and
    more frequent ones toward their own mean.

    Args:
        smoothing: Category count at which the category mean and the global
            mean receive equal weight.
    """

    def __init__(self, smoothing=1):
        self.smoothing = smoothing
        # Both are populated by fit(); None means "not fitted yet".
        self.target_means = None
        self.global_mean = None

    def fit(self, X, y):
        """Learn the smoothed target mean of every category in ``X``.

        Args:
            X: pandas Series of category labels.
            y: pandas Series of targets. It is grouped by ``X``, so it is
                expected to share ``X``'s index.

        Returns:
            The fitted encoder itself, so calls can be chained.
        """
        self.global_mean = y.mean()
        stats = pd.DataFrame({
            'count': X.groupby(X).size(),
            'mean': y.groupby(X).mean(),
        })
        # Sigmoid weight on the per-category mean, centred at `smoothing`.
        weight = 1 / (1 + np.exp(-(stats['count'] - self.smoothing)))
        self.target_means = (
            self.global_mean * (1 - weight) + stats['mean'] * weight
        )
        return self

    def transform(self, X):
        """Map each category in ``X`` to its learned encoding.

        Categories that were not seen during ``fit`` are encoded as the
        global target mean.

        Args:
            X: pandas Series of category labels.

        Returns:
            A pandas Series of encoded values with the same index as ``X``.

        Raises:
            RuntimeError: If the encoder has not been fitted.
        """
        if self.target_means is None:
            raise RuntimeError(
                "TargetEncoder must be fitted before calling transform()."
            )
        return X.map(self.target_means).fillna(self.global_mean)

    def fit_transform(self, X, y):
        """Fit on ``(X, y)`` and return the encoding of ``X``."""
        return self.fit(X, y).transform(X)
# Target-encode the category column: the encoder is fitted on the training
# split only and then reused for the validation split.
encoder = TargetEncoder(smoothing=10)
train_cat_encoded = encoder.fit_transform(train_df['category'], train_df['target'])
val_cat_encoded = encoder.transform(val_df['category'])


def _assemble_features(encoded_category, frame):
    """Return the two-column model input: encoded category plus numeric feature."""
    return pd.DataFrame({
        'category_encoded': encoded_category,
        'feature_num': frame['feature_num'],
    })


# Feature matrices and label vectors for both splits.
X_train, y_train = _assemble_features(train_cat_encoded, train_df), train_df['target']
X_val, y_val = _assemble_features(val_cat_encoded, val_df), val_df['target']

# Fit the classifier on the training split.
model = LogisticRegression(max_iter=200)
model.fit(X_train, y_train)

# Hard labels feed the accuracy metric; positive-class probabilities feed log loss.
train_preds, val_preds = model.predict(X_train), model.predict(X_val)
train_probs = model.predict_proba(X_train)[:, 1]
val_probs = model.predict_proba(X_val)[:, 1]

# Accuracies are reported as percentages.
train_acc = accuracy_score(y_train, train_preds) * 100
val_acc = accuracy_score(y_val, val_preds) * 100
val_loss = log_loss(y_val, val_probs)

print(
    f"Training accuracy: {train_acc:.2f}%",
    f"Validation accuracy: {val_acc:.2f}%",
    f"Validation loss: {val_loss:.3f}",
    sep="\n",
)