from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
import torch
# Load base model and tokenizer
model_name = 'gpt2'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
# GPT-2 ships without a pad token; reuse EOS so batched tokenization with
# padding=True below does not raise an error
tokenizer.pad_token = tokenizer.eos_token
# Example prompt engineering: adding clear instructions
prompt = "Answer the question precisely and concisely:\nWhat is the capital of France?"
inputs = tokenizer(prompt, return_tensors='pt')
outputs = model.generate(**inputs, max_new_tokens=40, pad_token_id=tokenizer.eos_token_id)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
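# A further prompt-engineering option worth trying before fine-tuning:
# few-shot prompting often helps a small model like GPT-2 more than bare
# instructions. This is a sketch; the example Q/A pairs are illustrative.
few_shot_prompt = (
    "Q: What is the capital of Germany?\nA: Berlin\n"
    "Q: What is the capital of Italy?\nA: Rome\n"
    "Q: What is the capital of France?\nA:"
)
inputs_fs = tokenizer(few_shot_prompt, return_tensors='pt')
outputs_fs = model.generate(**inputs_fs, max_new_tokens=5, pad_token_id=tokenizer.eos_token_id)
print(tokenizer.decode(outputs_fs[0], skip_special_tokens=True))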
# If prompt engineering is not enough, fine-tune on a small dataset
# Each training string packs a question and its answer into one sequence
train_texts = ["What is the capital of France?\tParis", "Who wrote Hamlet?\tShakespeare"]
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
class SimpleDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        return len(self.encodings['input_ids'])

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # Causal LM objective: the inputs are also the labels, but mask out
        # padding positions with -100 so they are ignored by the loss
        item['labels'] = item['input_ids'].clone()
        item['labels'][item['attention_mask'] == 0] = -100
        return item
dataset = SimpleDataset(train_encodings)
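# Optional sanity check (not in the original flow): confirm each item has the
# input_ids / attention_mask / labels tensors the Trainer expects
sample = dataset[0]
print({key: tuple(val.shape) for key, val in sample.items()})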
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=2,
    logging_steps=10,
    save_steps=10,
    save_total_limit=1,
    learning_rate=5e-5,
    weight_decay=0.01,
    logging_dir='./logs',
    no_cuda=True,  # train on CPU for this toy example; drop this on a GPU machine
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
)
trainer.train()
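# Persist the fine-tuned weights so they can be reloaded with
# AutoModelForCausalLM.from_pretrained (the output path is illustrative)
trainer.save_model('./results/fine_tuned_gpt2')
tokenizer.save_pretrained('./results/fine_tuned_gpt2')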
# After fine-tuning, test again
prompt_ft = "What is the capital of France?"
inputs_ft = tokenizer(prompt_ft, return_tensors='pt')
outputs_ft = model.generate(**inputs_ft, max_new_tokens=40, pad_token_id=tokenizer.eos_token_id)
print(tokenizer.decode(outputs_ft[0], skip_special_tokens=True))
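# Optional follow-up: sampled generations can hint at whether the model picked
# up the Q/A pattern or merely memorized the two examples (the temperature
# value here is an assumption, not a tuned setting)
outputs_sampled = model.generate(
    **inputs_ft,
    max_new_tokens=40,
    do_sample=True,
    temperature=0.7,
    pad_token_id=tokenizer.eos_token_id,
)
print(tokenizer.decode(outputs_sampled[0], skip_special_tokens=True))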