import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.base import BaseEstimator, TransformerMixin
import re
class TextCleaner(BaseEstimator, TransformerMixin):
    """Stateless transformer that normalizes raw text documents.

    Each document is lowercased, every character other than ``a-z`` is
    treated as a separator, each run of separators is collapsed into a
    single space, and leading/trailing spaces are removed.
    """

    # Compiled once at class creation rather than resolved per document.
    # Replacing each run of non-letters with one space gives the same result
    # as replacing every non-letter with a space and then squeezing the
    # resulting whitespace, but needs only a single pass over the text.
    _NON_LETTER_RUN = re.compile(r'[^a-z]+')

    def fit(self, X, y=None):
        """Return ``self`` unchanged; the cleaner learns nothing from ``X``."""
        return self

    def transform(self, X):
        """Return a list holding the cleaned form of each document in ``X``.

        Every element of ``X`` must provide ``lower()`` (i.e. be a string).
        """
        squeeze = self._NON_LETTER_RUN.sub
        return [squeeze(' ', doc.lower()).strip() for doc in X]
# Fetch the 'train' subset for three newsgroups with the 'headers', 'footers'
# and 'quotes' parts removed, then hold out 20% of the posts for validation.
categories = ['alt.atheism', 'sci.space', 'comp.graphics']
data = fetch_20newsgroups(
    subset='train',
    categories=categories,
    remove=('headers', 'footers', 'quotes'),
)
X_train, X_val, y_train, y_val = train_test_split(
    data.data,
    data.target,
    test_size=0.2,
    random_state=42,
)
# Classification pipeline: text cleaning -> TF-IDF features -> MLP classifier.
pipeline = Pipeline(steps=[
    ('cleaner', TextCleaner()),
    ('tfidf', TfidfVectorizer(stop_words='english', max_features=1000)),
    (
        'clf',
        MLPClassifier(
            hidden_layer_sizes=(50,),
            alpha=0.01,
            max_iter=100,
            early_stopping=True,
            random_state=42,
        ),
    ),
])
# Fit every pipeline stage on the training split.
pipeline.fit(X_train, y_train)

# Score the fitted pipeline on the training split (accuracy as a percentage).
train_preds = pipeline.predict(X_train)
train_acc = accuracy_score(y_train, train_preds) * 100

# Score it on the held-out validation split the same way.
val_preds = pipeline.predict(X_val)
val_acc = accuracy_score(y_val, val_preds) * 100

# Report both figures, training first.
print(f'Training accuracy: {train_acc:.2f}%')
print(f'Validation accuracy: {val_acc:.2f}%')