import numpy as np
from datasets import Dataset
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    Trainer,
    TrainingArguments,
)
# Toy parallel corpus: 10 EN/FR sentence pairs, repeated 10x -> 100 examples.
_pairs = [
    ('Hello, how are you?', 'Bonjour, comment ça va?'),
    ('I love machine learning.', "J'aime l'apprentissage automatique."),
    ('This is a test sentence.', "Ceci est une phrase de test."),
    ('The weather is nice today.', "Il fait beau aujourd'hui."),
    ('Can you help me?', "Pouvez-vous m'aider?"),
    ('What is your name?', "Comment vous appelez-vous?"),
    ('I am learning to translate.', "J'apprends à traduire."),
    ('This is fun!', "C'est amusant!"),
    ('Have a great day.', "Bonne journée."),
    ('See you tomorrow.', "À demain."),
]
data = {
    'en': [en for en, _ in _pairs] * 10,
    'fr': [fr for _, fr in _pairs] * 10,
}
# Wrap the parallel lists in a Hugging Face Dataset (one row per pair).
dataset = Dataset.from_dict(data)

# Pretrained MarianMT English->French checkpoint; the tokenizer and the
# seq2seq model are loaded from the same name so their vocabularies match.
model_name = 'Helsinki-NLP/opus-mt-en-fr'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
# Tokenize function
def preprocess_function(examples):
    """Tokenize a batch of parallel sentences for seq2seq training.

    Args:
        examples: batch dict with 'en' (source) and 'fr' (target) lists of
            strings, as supplied by ``Dataset.map(batched=True)``.

    Returns:
        dict with ``input_ids``/``attention_mask`` for the English inputs
        and ``labels`` holding the French target token ids.
    """
    # text_target= tokenizes the targets with the target-side settings in a
    # single call and stores them under 'labels'; it replaces the deprecated
    # `with tokenizer.as_target_tokenizer():` context manager.
    return tokenizer(
        examples['en'],
        text_target=examples['fr'],
        max_length=40,
        truncation=True,
    )
# Tokenize the whole dataset. Drop the raw text columns so only tensor
# fields (input_ids / attention_mask / labels) are left for collation.
tokenized_dataset = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=['en', 'fr'],
)
# Training hyperparameters. No eval dataset is provided, so evaluation is
# left at its default (disabled) instead of passing `evaluation_strategy`,
# which was renamed to `eval_strategy` in transformers 4.46 and is deprecated.
training_args = TrainingArguments(
    output_dir='./results',          # where checkpoints are written
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=1,              # keep only the most recent checkpoint
    logging_steps=10,
)
# Trainer. A seq2seq data collator is required: the tokenized examples are
# variable-length and unpadded, so the default collator cannot stack them
# into batches. DataCollatorForSeq2Seq dynamically pads input_ids and
# attention_mask, and pads labels with -100 so padding is ignored by the loss.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
)
# Fine-tune the model.
trainer.train()
# Quick sanity check: translate a few sentences with the fine-tuned model.
test_sentences = [
    'Hello, how are you?',
    'I love machine learning.',
    'Can you help me?',
]
model.eval()  # disable dropout for deterministic generation
# Move the inputs to the model's device: Trainer may have placed the model
# on GPU, and generate() would fail on a CPU/GPU tensor mismatch otherwise.
inputs = tokenizer(
    test_sentences, return_tensors='pt', padding=True, truncation=True
).to(model.device)
outputs = model.generate(**inputs, max_length=40)
translations = tokenizer.batch_decode(outputs, skip_special_tokens=True)
print('Translations:')
for en, fr in zip(test_sentences, translations):
    print(f'EN: {en}')
    print(f'FR: {fr}')
    print('---')