Introduction
Text preprocessing converts words into numeric token IDs so that RNNs can process and learn from text.
Jump into concepts and practice - no test required
import torch
from torchtext.vocab import build_vocab_from_iterator
from torch.nn.utils.rnn import pad_sequence


def tokenize(text):
    """Lowercase ``text`` and split it on whitespace into a list of tokens."""
    return text.lower().split()


# Build vocabulary from tokenized texts.
# The generator expression must carry its own parentheses because it is not
# the only argument of the call; without them Python raises a SyntaxError.
# NOTE(review): `dataset` is not defined in this snippet; it is assumed to be
# an iterable of strings supplied by the surrounding code.
vocab = build_vocab_from_iterator(
    (tokenize(text) for text in dataset),
    specials=["<pad>"],
)


def text_to_ids(text):
    """Convert ``text`` to the list of vocabulary ids of its tokens.

    NOTE(review): no default index is configured on ``vocab`` here, so tokens
    that were not seen while building the vocabulary are presumably rejected
    by the lookup; confirm against the torchtext ``Vocab`` documentation.
    """
    return [vocab[token] for token in tokenize(text)]


# Pad sequences to same length, using the id of the "<pad>" token as filler
# so the padding value always matches the vocabulary.
# NOTE(review): `batch` is not defined in this snippet; it is assumed to be a
# list of strings supplied by the surrounding code.
padded_batch = pad_sequence(
    [torch.tensor(text_to_ids(t)) for t in batch],
    batch_first=True,
    padding_value=vocab["<pad>"],
)
# Tokenize a sample sentence with the tutorial's `tokenize` helper
# (lowercase, then split on whitespace) and display the tokens.
text = "Hello world"
tokens = tokenize(text)
print(tokens)
# Look up the vocabulary id of every token in the sample sentence
# via the tutorial's `text_to_ids` helper and display the ids.
ids = text_to_ids("Hello world")
print(ids)
# Two sentences with a different number of tokens.
batch = ["Hello world", "Hi"]

# Encode each sentence as a 1-D tensor of token ids.
id_tensors = []
for t in batch:
    id_tensors.append(torch.tensor(text_to_ids(t)))

# Pad the shorter sequence so both rows have the same length; with
# batch_first=True each row of the result corresponds to one sentence.
padded = pad_sequence(id_tensors, batch_first=True)
print(padded)
import torch
from torchtext.vocab import build_vocab_from_iterator
from torch.nn.utils.rnn import pad_sequence


def tokenize(text):
    """Lowercase ``text`` and split it on whitespace into a list of tokens."""
    return text.lower().split()


# Sample dataset
dataset = [
    "I love machine learning",
    "Deep learning is fun",
    "RNNs handle sequences",
]

# Build vocabulary.
# The generator expression must carry its own parentheses because it is not
# the only argument of the call; without them Python raises a SyntaxError.
vocab = build_vocab_from_iterator(
    (tokenize(text) for text in dataset),
    specials=["<pad>"],
)


def text_to_ids(text):
    """Convert ``text`` to the list of vocabulary ids of its tokens."""
    return [vocab[token] for token in tokenize(text)]


# Prepare batch: every sentence here is drawn from `dataset`, so all of its
# tokens are present in the vocabulary.
batch = ["I love machine learning", "RNNs handle sequences"]
token_ids = [torch.tensor(text_to_ids(text)) for text in batch]

# Pad sequences to a common length, filling with the id of the "<pad>" token.
padded_batch = pad_sequence(token_ids, batch_first=True, padding_value=vocab['<pad>'])

print("Vocabulary tokens:", vocab.get_itos())
print("Token IDs for batch:", token_ids)
print("Padded batch tensor:", padded_batch)
import torch
from torch.nn.utils.rnn import pad_sequence

# Three 1-D integer sequences of lengths 3, 2 and 1.
seq1, seq2, seq3 = (
    torch.tensor(values) for values in ([1, 2, 3], [4, 5], [6])
)

# Combine them into a single tensor, one row per sequence, filling the
# shorter sequences with 0 up to the length of the longest one.
sequences = [seq1, seq2, seq3]
batch = pad_sequence(sequences, batch_first=True, padding_value=0)
print(batch.shape)
import torch
from torch.nn.utils.rnn import pad_sequence

# Token-id sequences of lengths 4, 2 and 1.
sentences = [[1, 2, 3, 4], [5, 6], [7]]
tensors = list(map(torch.tensor, sentences))

# batch_first is deliberately left at its default here, so the printed shape
# is the one produced by pad_sequence's default layout.
padded = pad_sequence(tensors)
print(padded.shape)