"""Train a model to classify text into three categories of a real dataset.

The script prints the accuracy on the test split and one example prediction.
"""
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.multiclass import OneVsRestClassifier
# Load a small subset of data for speed: only three of the newsgroups.
categories = ['alt.atheism', 'comp.graphics', 'sci.space']
# The same parts of each post are removed from both splits, so the training
# and test text go through identical preprocessing.
removed_parts = ('headers', 'footers', 'quotes')
data_train = fetch_20newsgroups(subset='train', categories=categories, remove=removed_parts)
data_test = fetch_20newsgroups(subset='test', categories=categories, remove=removed_parts)

# Convert text to numbers: TF-IDF features with English stop words dropped and
# the vocabulary capped at 1000 terms.
vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)
# Fit the vocabulary on the training split only, then reuse it for the test
# split so no information from the test text leaks into the features.
X_train = vectorizer.fit_transform(data_train.data)
X_test = vectorizer.transform(data_test.data)

# Train model: one-vs-rest logistic regression.
# NOTE: LogisticRegression's `multi_class='ovr'` argument is deprecated in
# recent scikit-learn releases; wrapping the estimator in OneVsRestClassifier
# is the documented way to keep the one-vs-rest scheme.
model = OneVsRestClassifier(LogisticRegression(max_iter=1000))
model.fit(X_train, data_train.target)

# Predict on test data
predictions = model.predict(X_test)

# Calculate accuracy against the true test labels
acc = accuracy_score(data_test.target, predictions)
print(f"Accuracy: {acc:.3f}")
# predictions holds integer class indices; map the first one back to its name.
print(f"Sample prediction for first test text: {data_test.target_names[predictions[0]]}")