import spacy
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn as nn
import torch.optim as optim
# Load spaCy model for preprocessing
nlp = spacy.load('en_core_web_sm')
# Sample data (replace with real dataset)
texts = ["I love this movie", "This film is terrible", "Amazing story and great acting", "Worst movie ever"]
labels = [1, 0, 1, 0]
# Preprocess texts with spaCy: lowercase, drop stop words and non-alphabetic
# tokens, and lemmatize. (Transformer models are usually fed raw text; this
# step is kept to illustrate a spaCy preprocessing pipeline.)
processed_texts = [
    " ".join(
        token.lemma_ for token in nlp(text.lower())
        if not token.is_stop and token.is_alpha
    )
    for text in texts
]
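# Sanity check: inspect the cleaned strings. The exact lemmas depend on the
# en_core_web_sm version, but stop words should be gone, e.g.
# "I love this movie" -> "love movie".
print(processed_texts)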
# Load the Hugging Face tokenizer and DistilBERT encoder; its 768-dim hidden
# states feed the classification head defined below
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
model = AutoModel.from_pretrained('distilbert-base-uncased')
class TextClassifier(nn.Module):
    """DistilBERT encoder with a dropout + linear classification head."""

    def __init__(self, encoder):
        super().__init__()
        self.bert = encoder
        self.dropout = nn.Dropout(0.3)
        # 768 = DistilBERT hidden size; 2 = number of sentiment classes
        self.linear = nn.Linear(768, 2)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Use the hidden state of the first ([CLS]) token as a pooled summary
        pooled_output = outputs.last_hidden_state[:, 0, :]
        return self.linear(self.dropout(pooled_output))
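# Quick shape check (a sketch, not part of the training pipeline): one
# sentence in, one logit per class out, i.e. a tensor of shape (1, 2).
_demo = tokenizer("great film", return_tensors='pt')
with torch.no_grad():
    print(TextClassifier(model)(_demo['input_ids'], _demo['attention_mask']).shape)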
# Prepare data for PyTorch
inputs = tokenizer(processed_texts, padding=True, truncation=True, return_tensors='pt')
labels_tensor = torch.tensor(labels)
# Split data (stratified so each split keeps both classes; with real data
# you would also hold out a separate test set)
train_indices, val_indices = train_test_split(
    list(range(len(labels))), test_size=0.5, random_state=42, stratify=labels
)
train_inputs = {k: v[train_indices] for k, v in inputs.items()}
val_inputs = {k: v[val_indices] for k, v in inputs.items()}
train_labels = labels_tensor[train_indices]
val_labels = labels_tensor[val_indices]
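# With a real dataset you would train in mini-batches rather than one
# full-batch step per epoch; a minimal sketch (left disabled for this toy
# example, names illustrative):
# from torch.utils.data import TensorDataset, DataLoader
# train_ds = TensorDataset(train_inputs['input_ids'],
#                          train_inputs['attention_mask'], train_labels)
# train_loader = DataLoader(train_ds, batch_size=16, shuffle=True)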
# Initialize model, loss, optimizer
classifier = TextClassifier(model)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(classifier.parameters(), lr=1e-4)
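# Optional: freeze the DistilBERT encoder and train only the classification
# head, which is far cheaper. Left disabled here so the full model is
# fine-tuned, as in the loop below:
# for param in classifier.bert.parameters():
#     param.requires_grad = False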
# Training loop with early stopping on validation accuracy
best_val_acc = 0.0
best_state = None
patience = 3
trigger_times = 0
for epoch in range(20):
    # Full-batch gradient step (fine for this toy dataset)
    classifier.train()
    optimizer.zero_grad()
    outputs = classifier(train_inputs['input_ids'], train_inputs['attention_mask'])
    loss = criterion(outputs, train_labels)
    loss.backward()
    optimizer.step()

    # Evaluate on the validation split
    classifier.eval()
    with torch.no_grad():
        val_outputs = classifier(val_inputs['input_ids'], val_inputs['attention_mask'])
        val_loss = criterion(val_outputs, val_labels)
        val_preds = val_outputs.argmax(dim=1)
    val_acc = accuracy_score(val_labels.numpy(), val_preds.numpy())
    print(f'Epoch {epoch + 1}: train_loss={loss.item():.4f}, '
          f'val_loss={val_loss.item():.4f}, val_acc={val_acc:.2f}')

    # Early stopping: keep the best weights, stop after `patience`
    # epochs without improvement
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        best_state = {k: v.clone() for k, v in classifier.state_dict().items()}
        trigger_times = 0
    else:
        trigger_times += 1
        if trigger_times >= patience:
            break

# Restore the best checkpoint found during training
if best_state is not None:
    classifier.load_state_dict(best_state)
# Report final accuracy on both splits
classifier.eval()
with torch.no_grad():
    train_preds = classifier(train_inputs['input_ids'],
                             train_inputs['attention_mask']).argmax(dim=1)
train_acc = accuracy_score(train_labels.numpy(), train_preds.numpy())
print(f'Training accuracy: {train_acc * 100:.2f}%')
print(f'Validation accuracy: {best_val_acc * 100:.2f}%')
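# Inference sketch: classify a new sentence with the trained model. `predict`
# is a hypothetical helper, not part of the original pipeline; it reuses the
# same spaCy preprocessing applied to the training texts.
def predict(text):
    cleaned = " ".join(
        token.lemma_ for token in nlp(text.lower())
        if not token.is_stop and token.is_alpha
    )
    enc = tokenizer(cleaned, return_tensors='pt')
    classifier.eval()
    with torch.no_grad():
        logits = classifier(enc['input_ids'], enc['attention_mask'])
    return 'positive' if logits.argmax(dim=1).item() == 1 else 'negative'

print(predict("An absolutely wonderful film"))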