from sklearn.datasets import load_files
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
# --- Data -------------------------------------------------------------------
# Read the 'pos' and 'neg' category folders found under aclImdb/train/.
# Shuffling is seeded so the document order is the same on every run.
reviews = load_files(
    'aclImdb/train/',
    categories=['pos', 'neg'],
    shuffle=True,
    random_state=42,
)
X = reviews.data
y = reviews.target

# Hold out 20% of the documents for validation (seeded split).
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# --- Model ------------------------------------------------------------------
# TF-IDF features (5000-term vocabulary cap, English stop words removed)
# feeding a linear SVM with default settings.
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
classifier = LinearSVC()
pipeline = Pipeline(steps=[
    ('tfidf', vectorizer),
    ('svm', classifier),
])

# --- Hyperparameter search --------------------------------------------------
# Only the SVM regularisation strength C is tuned; 'svm__C' addresses the
# 'svm' step of the pipeline. 5-fold CV, using every available core.
param_grid = {'svm__C': [0.01, 0.1, 1, 10]}
grid = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
)
grid.fit(X_train, y_train)

# Pipeline selected by the search.
best_model = grid.best_estimator_

# --- Evaluation -------------------------------------------------------------
# Predict on both splits so training and validation accuracy can be compared.
train_preds = best_model.predict(X_train)
val_preds = best_model.predict(X_val)

# Accuracy expressed as a percentage.
train_acc = 100 * accuracy_score(y_train, train_preds)
val_acc = 100 * accuracy_score(y_val, val_preds)

print(f'Training accuracy: {train_acc:.2f}%')
print(f'Validation accuracy: {val_acc:.2f}%')