import string
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import nltk
# Fetch the NLTK resources needed at runtime (idempotent: already-present
# resources are skipped). quiet=True suppresses the per-run download chatter.
nltk.download('punkt', quiet=True)      # sentence/word tokenizer models
nltk.download('punkt_tab', quiet=True)  # required by word_tokenize on NLTK >= 3.8.2
nltk.download('stopwords', quiet=True)  # English stop-word list
nltk.download('wordnet', quiet=True)    # lexical database backing WordNetLemmatizer
class TextPreprocessor(BaseEstimator, TransformerMixin):
    """Scikit-learn compatible transformer that normalizes raw text.

    Each document is lowercased, stripped of ASCII punctuation, tokenized
    with NLTK, filtered of English stop words, lemmatized, and re-joined
    into a single whitespace-separated string suitable for TfidfVectorizer.
    """

    def __init__(self):
        # English stop words as a set for O(1) membership tests.
        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()
        # Translation table that deletes every character in string.punctuation.
        self.punct_table = str.maketrans('', '', string.punctuation)

    def preprocess(self, text):
        """Return the cleaned, lemmatized form of a single document."""
        cleaned = text.lower().translate(self.punct_table)
        kept = (tok for tok in word_tokenize(cleaned)
                if tok not in self.stop_words)
        return ' '.join(self.lemmatizer.lemmatize(tok) for tok in kept)

    def fit(self, X, y=None):
        """No-op: the transformer is stateless. Returns self (sklearn API)."""
        return self

    def transform(self, X):
        """Preprocess every document in X; returns a list of cleaned strings."""
        return [self.preprocess(doc) for doc in X]
# --- Demo: train and evaluate a tiny sentiment pipeline -------------------
texts = [
    'I love programming in Python!',
    'Python programming is fun.',
    'I dislike bugs in code.',
    'Debugging code is frustrating.',
    'I enjoy learning new things.'
]
labels = [1, 1, 0, 0, 1]  # 1 = positive sentiment, 0 = negative

# Hold out 40% of the examples for validation; fixed seed for reproducibility.
X_train, X_val, y_train, y_val = train_test_split(
    texts, labels, test_size=0.4, random_state=42
)

# Preprocess -> TF-IDF features -> logistic regression, as one estimator.
pipeline = Pipeline([
    ('preprocessor', TextPreprocessor()),
    ('vectorizer', TfidfVectorizer()),
    ('classifier', LogisticRegression(max_iter=200))
])
pipeline.fit(X_train, y_train)

# Report accuracy on both splits (percentage, two decimals).
for split_name, X_split, y_split in (('Training', X_train, y_train),
                                     ('Validation', X_val, y_val)):
    accuracy = pipeline.score(X_split, y_split) * 100
    print(f'{split_name} accuracy: {accuracy:.2f}%')