import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
# Tokenizer and pad_sequences are the legacy tf.keras preprocessing utilities;
# this example assumes a TF 2.x install where they are still available
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping
# Sample data (toy example)
english_sentences = ['hello', 'how are you', 'good morning', 'thank you', 'see you']
french_sentences = ['bonjour', 'comment ça va', 'bon matin', 'merci', 'à bientôt']
# Tokenize English
eng_tokenizer = Tokenizer()
eng_tokenizer.fit_on_texts(english_sentences)
eng_seq = eng_tokenizer.texts_to_sequences(english_sentences)
eng_seq = pad_sequences(eng_seq, padding='post')
# Tokenize French
fr_tokenizer = Tokenizer()
fr_tokenizer.fit_on_texts(french_sentences)
fr_seq = fr_tokenizer.texts_to_sequences(french_sentences)
fr_seq = pad_sequences(fr_seq, padding='post')
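# Quick check (optional): inspect the learned vocabulary and padded shapes;
# with this toy data both padded arrays come out as shape (5, 3)
print(eng_tokenizer.word_index)  # e.g. {'you': 1, 'hello': 2, ...}
print(eng_seq.shape, fr_seq.shape)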
# Vocabulary sizes (+1 because index 0 is reserved for padding)
vocab_eng = len(eng_tokenizer.word_index) + 1
vocab_fr = len(fr_tokenizer.word_index) + 1
# Build a small model with dropout regularization (the reduced learning
# rate is set in compile below); an explicit Input replaces the deprecated
# input_length argument, which was removed in Keras 3
model = Sequential([
    tf.keras.Input(shape=(eng_seq.shape[1],)),  # padded English sequence length
    Embedding(vocab_eng, 8),
    LSTM(16, return_sequences=False),
    Dropout(0.3),
    Dense(vocab_fr, activation='softmax')
])
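# Optional sanity check: the explicit Input above means the model is already
# built, so layer shapes and parameter counts can be inspected before training
model.summary()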
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001),
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])
# Prepare target data: the label for each sentence is the index of its first
# French word, so this is a toy classification task, not full translation
y_train = fr_seq[:, 0]
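# Optional: inspect the labels; each entry is a French word index
print(y_train)  # e.g. [1 2 5 7 8] given the fit order above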
# Early stopping callback
early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
# Train model; with only 5 examples, validation_split=0.2 leaves a single
# validation sample, so val_loss will be noisy on this toy dataset
history = model.fit(eng_seq, y_train,
                    epochs=50,
                    batch_size=2,
                    validation_split=0.2,
                    callbacks=[early_stop])
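# Inference sketch (reusing the toy tokenizers above): predict the most likely
# first French word for a new English phrase and decode it via index_word
import numpy as np
test_seq = pad_sequences(eng_tokenizer.texts_to_sequences(['how are you']),
                         maxlen=eng_seq.shape[1], padding='post')
pred_id = int(np.argmax(model.predict(test_seq), axis=-1)[0])
print(fr_tokenizer.index_word.get(pred_id, '?'))  # index 0 is reserved for padding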