# This script shows how to turn sentences into token IDs and pad them so RNNs can process batches.
import torch
from torchtext.vocab import build_vocab_from_iterator
from torch.nn.utils.rnn import pad_sequence
def tokenize(text):
    """Lowercase *text* and split it on whitespace, returning a list of tokens."""
    lowered = text.lower()
    return lowered.split()
# Sample dataset
dataset = ["I love machine learning", "Deep learning is fun", "RNNs handle sequences"]

# Build vocabulary over every token in the dataset. "<pad>" is registered as a
# special token; specials are inserted first by default, so "<pad>" gets id 0.
# FIX: a generator expression passed together with another argument must be
# parenthesized -- the unparenthesized form was a SyntaxError.
vocab = build_vocab_from_iterator(
    (tokenize(text) for text in dataset),
    specials=["<pad>"],
)
# Convert text to token ids
def text_to_ids(text):
    """Look up the vocabulary id of every token in *text*, in order."""
    tokens = tokenize(text)
    return [vocab[tok] for tok in tokens]
# Prepare a mini-batch of raw sentences.
batch = ["I love machine learning", "RNNs handle sequences"]

# Encode each sentence as a 1-D tensor of vocabulary ids.
token_ids = []
for sentence in batch:
    token_ids.append(torch.tensor(text_to_ids(sentence)))

# Right-pad the shorter sequences with the "<pad>" id so the batch is rectangular.
padded_batch = pad_sequence(token_ids, batch_first=True, padding_value=vocab['<pad>'])

print("Vocabulary tokens:", vocab.get_itos())
print("Token IDs for batch:", token_ids)
print("Padded batch tensor:", padded_batch)