# This example shows how adding an engineered feature, 'income_per_person',
# can help a model predict house prices better by lowering the error (MSE).
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
# Toy dataset: 5 samples relating age, income, and family size to house price.
data = {'age': [25, 32, 47, 51, 62],
        'income': [50000, 60000, 80000, 90000, 120000],
        'family_size': [3, 4, 2, 5, 3],
        'house_price': [200000, 250000, 320000, 360000, 400000]}
df = pd.DataFrame(data)

# Feature engineering: income available per household member.
df['income_per_person'] = df['income'] / df['family_size']

# Baseline features vs. baseline + engineered feature.
base_features = ['age', 'income', 'family_size']
eng_features = base_features + ['income_per_person']
y = df['house_price']

# Split ONCE on the full (engineered) feature matrix, then derive the
# baseline matrix by column selection. This guarantees both models are
# trained and evaluated on exactly the same rows; two independent
# train_test_split calls only stay aligned as long as random_state is
# kept in sync by hand.
X_eng_train, X_eng_test, y_train, y_test = train_test_split(
    df[eng_features], y, random_state=42)
X_train = X_eng_train[base_features]
X_test = X_eng_test[base_features]

# Train and evaluate the baseline model (no engineered feature).
model = LinearRegression()
model.fit(X_train, y_train)
mse_without = mean_squared_error(y_test, model.predict(X_test))

# Train and evaluate the model with the engineered feature added.
model_eng = LinearRegression()
model_eng.fit(X_eng_train, y_train)
mse_with = mean_squared_error(y_test, model_eng.predict(X_eng_test))

print(f"MSE without engineered feature: {mse_without:.2e}")
print(f"MSE with engineered feature: {mse_with:.2e}")