from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
import torch
import numpy as np
from sklearn.metrics import accuracy_score
from datasets import load_dataset
# Load general pretrained model and tokenizer
model_name = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)
# num_labels=2 attaches a fresh binary-classification head on top of the
# pretrained encoder; the head's weights are randomly initialized until trained.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
# Load domain-specific dataset (example: medical texts)
# NOTE(review): assumes both CSVs have 'text' and 'label' columns — that is
# what tokenize_function reads below; confirm against the actual files.
dataset = load_dataset('csv', data_files={'train': 'domain_train.csv', 'validation': 'domain_val.csv'})
# Tokenize function
def tokenize_function(examples):
    """Tokenize a batch of examples and attach their labels.

    Args:
        examples: Batch dict from ``Dataset.map(batched=True)`` containing a
            'text' list and a parallel 'label' list.

    Returns:
        dict: Tokenizer output (input_ids, attention_mask, ...) plus a
        'labels' key — the field name the Trainer expects for loss targets.
    """
    # padding='max_length' pads every example to the model's maximum length;
    # truncation=True clips anything longer to that same limit.
    tokenized = tokenizer(examples['text'], padding='max_length', truncation=True)
    # Rename 'label' -> 'labels' so Trainer can compute the loss.
    tokenized['labels'] = examples['label']
    return tokenized
# Tokenize the whole dataset up front; batched=True hands tokenize_function
# chunks of examples at a time, which is much faster than one-by-one.
dataset = dataset.map(tokenize_function, batched=True)
# Compute metrics
def compute_metrics(eval_pred):
    """Compute validation accuracy for the Trainer.

    Args:
        eval_pred: Pair of (logits, labels) numpy arrays as supplied by
            ``Trainer.evaluate`` — logits of shape (n_examples, n_classes)
            and integer class labels of shape (n_examples,).

    Returns:
        dict: {'accuracy': fraction of correctly classified examples}.
    """
    logits, labels = eval_pred
    # Predicted class = argmax over the last (class) dimension.
    predictions = np.argmax(logits, axis=-1)
    # Mean of exact matches — identical to sklearn.metrics.accuracy_score
    # for integer labels, without pulling in the sklearn dependency.
    accuracy = float(np.mean(predictions == np.asarray(labels)))
    return {"accuracy": accuracy}
# Set training arguments
# Hyperparameters plus the checkpoint/evaluation schedule for the Trainer.
# NOTE(review): 'evaluation_strategy' was renamed 'eval_strategy' in newer
# transformers releases — confirm the installed version accepts this keyword.
training_args = TrainingArguments(
output_dir='./results',  # checkpoints and the final model are written here
num_train_epochs=5,
per_device_train_batch_size=16,
per_device_eval_batch_size=16,
evaluation_strategy='epoch',  # run validation at the end of every epoch
save_strategy='epoch',  # checkpoint on the same schedule (required by load_best_model_at_end)
learning_rate=2e-5,
weight_decay=0.01,
logging_dir='./logs',
load_best_model_at_end=True,  # restore the best checkpoint once training finishes
metric_for_best_model='accuracy',  # 'best' = highest eval accuracy ...
greater_is_better=True  # ... because higher accuracy is better
)
# Define Trainer
# Wire the model, hyperparameters, tokenized splits, and metric together.
trainer = Trainer(
model=model,
args=training_args,
train_dataset=dataset['train'],
eval_dataset=dataset['validation'],
compute_metrics=compute_metrics  # surfaces 'eval_accuracy' in evaluation output
)
# Fine-tune the model
# Fine-tune the model on the domain data, then report validation metrics.
trainer.train()

# evaluate() returns a dict keyed by 'eval_<metric>' names.
eval_result = trainer.evaluate()
accuracy_pct = eval_result['eval_accuracy'] * 100
print(f"Validation Accuracy: {accuracy_pct:.2f}%")
print(f"Validation Loss: {eval_result['eval_loss']:.4f}")