import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling
# Load tokenizer and model
model_name = 'gpt2'
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
# GPT-2 has no pad token by default; reuse the EOS token so padding=True works
tokenizer.pad_token = tokenizer.eos_token
model = GPT2LMHeadModel.from_pretrained(model_name)
# Prepare dataset (example with dummy data for illustration)
texts = ["Hello, how are you?", "The weather is nice today.", "I love reading books."]
inputs = tokenizer(texts, return_tensors='pt', padding=True, truncation=True, max_length=50)
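# inputs is a BatchEncoding holding 'input_ids' and 'attention_mask' tensors,
# each of shape (3, longest_sequence_in_batch) after padding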
# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=4,
    learning_rate=5e-5,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    save_steps=10,
)
# Dummy dataset class
class TextDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        return len(self.encodings['input_ids'])

    def __getitem__(self, idx):
        return {key: val[idx] for key, val in self.encodings.items()}

train_dataset = TextDataset(inputs)
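# e.g. train_dataset[0] -> {'input_ids': tensor([...]), 'attention_mask': tensor([...])}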
# Initialize Trainer with a causal-LM collator, which copies input_ids into
# labels (masking pad positions) so the model can compute a training loss
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=data_collator,
)
# Train model
trainer.train()
# After training, generate text
input_text = "Today is a beautiful"
model.eval()  # disable dropout before generation
encoded = tokenizer(input_text, return_tensors='pt')
outputs = model.generate(
    encoded.input_ids,
    attention_mask=encoded.attention_mask,
    max_length=20,
    num_beams=5,
    no_repeat_ngram_size=2,
    pad_token_id=tokenizer.eos_token_id,
)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
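# Optional follow-up: persist the fine-tuned weights with the standard
# Hugging Face save_pretrained/from_pretrained round-trip (the
# './fine_tuned_gpt2' path below is just an illustrative choice)
model.save_pretrained('./fine_tuned_gpt2')
tokenizer.save_pretrained('./fine_tuned_gpt2')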