from transformers import (
    RobertaForSequenceClassification,
    DistilBertForSequenceClassification,
    RobertaTokenizer,
    DistilBertTokenizer,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback,
)
from datasets import load_dataset
import numpy as np
from sklearn.metrics import accuracy_score
# Accuracy metric for Trainer: argmax over the logits vs. the gold labels
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {"accuracy": accuracy_score(labels, predictions)}
# Load the IMDB sentiment dataset (binary: 0 = negative, 1 = positive)
raw_datasets = load_dataset("imdb")
# Load tokenizers
roberta_tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
distilbert_tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
# Tokenize with the RoBERTa tokenizer; fixed-length padding keeps batching simple
def tokenize_function(examples):
    return roberta_tokenizer(examples["text"], padding="max_length", truncation=True, max_length=128)
# Tokenize datasets
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
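# Quick sanity check (a sketch, not part of the original pipeline): each row now
# carries input_ids and attention_mask alongside the original text and label fields
example = tokenized_datasets["train"][0]
print(example["label"], example["input_ids"][:10])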
# Prepare datasets for Trainer: subsample to 2,000 train / 500 validation
# examples so the run finishes quickly
train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(2000))
val_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(500))
# Load models with increased dropout (0.3) for regularization; note the config
# name differs per model: hidden_dropout_prob for RoBERTa, dropout for DistilBERT
roberta_model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=2, hidden_dropout_prob=0.3)
distilbert_model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2, dropout=0.3)
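# Size comparison (a sketch; num_parameters() is the standard PreTrainedModel
# helper): DistilBERT is roughly half the size of RoBERTa-base, which is the
# trade-off behind its faster training and inference
print(f"RoBERTa parameters:    {roberta_model.num_parameters():,}")
print(f"DistilBERT parameters: {distilbert_model.num_parameters():,}")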
# Training arguments: evaluate and checkpoint every epoch so early stopping and
# load_best_model_at_end can track validation accuracy (dropout is set on the
# models above; early stopping is added as a Trainer callback below)
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",  # renamed to eval_strategy in newer transformers releases
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,       # restore the best checkpoint after training
    metric_for_best_model="accuracy",  # Trainer resolves this to "eval_accuracy"
    greater_is_better=True,
    save_total_limit=1,                # keep at most one checkpoint on disk
)
# Trainer for RoBERTa; EarlyStoppingCallback halts training after 2 epochs
# without improvement in validation accuracy
roberta_trainer = Trainer(
    model=roberta_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
)
# Train RoBERTa
roberta_trainer.train()
# Evaluate RoBERTa
roberta_eval = roberta_trainer.evaluate()
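# Note: evaluate() returns a dict like {"eval_loss": ..., "eval_accuracy": ...};
# the "eval_" prefix is why metric_for_best_model="accuracy" maps to eval_accuracy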
# DistilBERT uses its own vocabulary, so the corpus must be re-tokenized
def tokenize_function_distilbert(examples):
    return distilbert_tokenizer(examples["text"], padding="max_length", truncation=True, max_length=128)
# Tokenize datasets for DistilBERT; the same shuffle seed and subset sizes
# ensure both models train and validate on the same examples
tokenized_datasets_distilbert = raw_datasets.map(tokenize_function_distilbert, batched=True)
train_dataset_distilbert = tokenized_datasets_distilbert["train"].shuffle(seed=42).select(range(2000))
val_dataset_distilbert = tokenized_datasets_distilbert["test"].shuffle(seed=42).select(range(500))
# Training arguments for DistilBERT: identical to RoBERTa's except for the
# output directory, keeping the comparison fair
training_args_distilbert = TrainingArguments(
    output_dir="./results_distilbert",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,
    save_total_limit=1,
)
# Trainer for DistilBERT with the same early-stopping policy
distilbert_trainer = Trainer(
    model=distilbert_model,
    args=training_args_distilbert,
    train_dataset=train_dataset_distilbert,
    eval_dataset=val_dataset_distilbert,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
)
# Train DistilBERT
distilbert_trainer.train()
# Evaluate DistilBERT
distilbert_eval = distilbert_trainer.evaluate()
print(f"RoBERTa validation accuracy: {roberta_eval['eval_accuracy']*100:.2f}%")
print(f"DistilBERT validation accuracy: {distilbert_eval['eval_accuracy']*100:.2f}%")