import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
# Build a small toy dataset for the continuous-vs-binned Age comparison:
# 14 samples with an Age column, one binary feature, and a binary target.
data = pd.DataFrame(
    {
        'Age': [22, 25, 47, 52, 46, 56, 55, 60, 18, 30, 40, 70, 80, 85],
        'Feature1': [1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0],
        'Target': [0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1],
    }
)
# Baseline: fit a logistic regression using the raw, continuous Age value.
baseline_features = data[['Age', 'Feature1']]
y = data['Target']
X_train_orig, X_test_orig, y_train, y_test = train_test_split(
    baseline_features, y, test_size=0.3, random_state=42
)
baseline_clf = LogisticRegression()
baseline_clf.fit(X_train_orig, y_train)
baseline_preds = baseline_clf.predict(X_test_orig)
# Accuracy is kept as a percentage for the final summary printout.
orig_accuracy = accuracy_score(y_test, baseline_preds) * 100
# Discretize Age into four ordered buckets, then refit the model on a
# one-hot encoding of those buckets to compare against the continuous baseline.
# right=False makes each bin left-closed: [0, 25), [25, 50), [50, 70), [70, 100).
bins = [0, 25, 50, 70, 100]
labels = ['Young', 'Adult', 'Senior', 'Elder']
data['Age_binned'] = pd.cut(data['Age'], bins=bins, labels=labels, right=False)
# One-hot encode the bins so the model treats them as independent indicator
# columns (Age_Young, Age_Adult, ...) rather than a single ordinal code.
age_dummies = pd.get_dummies(data['Age_binned'], prefix='Age')
X_binned = pd.concat([age_dummies, data['Feature1']], axis=1)
# Reusing random_state=42 reproduces the exact row split used for the baseline
# model, so both accuracies are measured on the same test rows.
X_train_bin, X_test_bin, y_train, y_test = train_test_split(X_binned, y, test_size=0.3, random_state=42)
model_bin = LogisticRegression()
model_bin.fit(X_train_bin, y_train)
y_pred_bin = model_bin.predict(X_test_bin)
bin_accuracy = accuracy_score(y_test, y_pred_bin) * 100
# Report both accuracies as percentages.
print(f"Original model accuracy with continuous Age: {orig_accuracy:.2f}%")
print(f"Model accuracy with binned Age: {bin_accuracy:.2f}%")