import nltk
from nltk.corpus import stopwords
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, log_loss
# Fetch the NLTK English stopword list if it is not already cached locally.
nltk.download('stopwords')

# Toy sentiment corpus: label 1 = positive, 0 = negative.
texts = [
    "I love this movie",
    "This movie is terrible",
    "Best film ever",
    "Worst film ever",
    "I enjoyed the movie",
    "I hated the movie",
]
labels = [1, 0, 1, 0, 1, 0]

# Hold out a third of the corpus for validation (seeded for reproducibility).
X_train, X_val, y_train, y_val = train_test_split(
    texts, labels, test_size=0.33, random_state=42
)
class StopwordRemover(BaseEstimator, TransformerMixin):
    """Sklearn-compatible transformer that drops English stopwords.

    Stateless: ``fit`` learns nothing and returns ``self``; ``transform``
    maps an iterable of document strings to a list of strings with the
    stopword tokens removed.
    """

    def __init__(self):
        # NLTK's English stopword list; requires nltk.download('stopwords')
        # to have run at least once on this machine.
        self.stopwords = set(stopwords.words('english'))

    def fit(self, X, y=None):
        """No-op fit; returns self per the sklearn transformer contract."""
        return self

    def transform(self, X):
        """Return X with stopword tokens removed from each document.

        Tokens are compared case-insensitively after stripping leading and
        trailing punctuation, so e.g. "The," matches the stopword "the".
        (Previously such tokens survived removal and TfidfVectorizer's own
        tokenization later reduced them to the bare stopword, leaking
        stopwords into the feature space.) The original token text is
        preserved in the output.
        """
        import string  # local import: keeps this block self-contained
        punct = string.punctuation
        cleaned = []
        for doc in X:
            kept = [
                word
                for word in doc.split()
                if word.strip(punct).lower() not in self.stopwords
            ]
            cleaned.append(' '.join(kept))
        return cleaned
# Assemble the end-to-end model: stopword stripping -> TF-IDF -> classifier.
pipeline = Pipeline(
    [
        ('stopword_removal', StopwordRemover()),
        ('tfidf', TfidfVectorizer()),
        ('clf', LogisticRegression(max_iter=1000)),
    ]
)

# Fit every stage on the training split.
pipeline.fit(X_train, y_train)

# Score both splits: hard labels for accuracy, class probabilities for log-loss.
train_acc = accuracy_score(y_train, pipeline.predict(X_train)) * 100
val_acc = accuracy_score(y_val, pipeline.predict(X_val)) * 100
train_loss = log_loss(y_train, pipeline.predict_proba(X_train))
val_loss = log_loss(y_val, pipeline.predict_proba(X_val))

print(f"Training accuracy: {train_acc:.2f}%")
print(f"Validation accuracy: {val_acc:.2f}%")
print(f"Training loss: {train_loss:.4f}")
print(f"Validation loss: {val_loss:.4f}")