from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_openml
from sklearn.metrics import accuracy_score
import pandas as pd
# Load the 'adult' dataset (version 2) from OpenML as a pandas DataFrame
adult = fetch_openml(name='adult', version=2, as_frame=True)
df = adult.frame
# Prepare features and a binary target: 1 where 'class' equals '>50K', else 0
X = df.drop(columns=['class'])
y = (df['class'] == '>50K').astype(int)
# Identify categorical features by name
cat_features = X.select_dtypes(include=['category', 'object']).columns.tolist()
# CatBoost refuses NaN in categorical features ("cat_features must be integer
# or string ... NaN values should be converted to string"), and the
# categorical columns loaded here may contain missing values. Cast them to
# plain strings with an explicit token so missing entries become their own
# category instead of aborting Pool construction.
if cat_features:
    X[cat_features] = (
        X[cat_features].astype(object).fillna('missing').astype(str)
    )
# Split data 80/20, stratified on the target so both parts keep the class ratio
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
# Create Pool objects for CatBoost, declaring which columns are categorical
train_pool = Pool(X_train, y_train, cat_features=cat_features)
val_pool = Pool(X_val, y_val, cat_features=cat_features)
# Hyperparameters for the classifier: silent training and a fixed seed
model_params = {
    'iterations': 500,
    'learning_rate': 0.05,
    'depth': 6,
    'l2_leaf_reg': 10,
    'early_stopping_rounds': 50,
    'verbose': 0,
    'random_seed': 42,
}
model = CatBoostClassifier(**model_params)
# Fit on the training pool, monitoring the validation pool during training
model.fit(train_pool, eval_set=val_pool, use_best_model=True)
# Predict labels for both splits, then turn accuracy into a percentage
train_pred, val_pred = (model.predict(split) for split in (X_train, X_val))
train_acc = 100 * accuracy_score(y_train, train_pred)
val_acc = 100 * accuracy_score(y_val, val_pred)
# NOTE(review): val_pool is also the eval_set used for early stopping and
# best-model selection, so the validation figure may be optimistic -- confirm
# whether a separate held-out test split is wanted.
print(f'Training accuracy: {train_acc:.2f}%')
print(f'Validation accuracy: {val_acc:.2f}%')