import re
class SimpleTokenizer:
    """Regex word tokenizer with a simple set-based vocabulary."""

    # Compiled once at class-definition time instead of on every call.
    # \b\w+\b matches runs of letters, digits, and underscores, so
    # punctuation is dropped ("Let's" -> ["let", "s"]).
    _WORD_RE = re.compile(r"\b\w+\b")

    def __init__(self):
        # Known tokens; populated by build_vocab().
        self.vocab = set()

    def tokenize(self, text):
        """Return the lowercased word tokens of *text*, in order.

        Note: .strip() is unnecessary — surrounding whitespace cannot
        affect \\b\\w+\\b matches — so it was removed.
        """
        return self._WORD_RE.findall(text.lower())

    def build_vocab(self, texts):
        """Tokenize each text in *texts*, store the union of tokens as
        self.vocab, and return that set."""
        vocab = set()
        for text in texts:
            vocab.update(self.tokenize(text))
        self.vocab = vocab
        return vocab
# Example usage
# Three short sample sentences, used both to build the vocabulary and
# as the evaluation corpus below.
texts = [
"Hello, world! This is a test.",
"Tokenization is important for NLP.",
"Let's improve the tokenizer."
]
# Initialize tokenizer
tokenizer = SimpleTokenizer()
# Build vocabulary
vocab = tokenizer.build_vocab(texts)
# Test tokenization accuracy and vocab coverage
# Hand-written ground-truth token lists, one per entry in `texts`
# (note: "Let's" is expected to split into "let" / "s" under \b\w+\b).
reference_tokens = [
['hello', 'world', 'this', 'is', 'a', 'test'],
['tokenization', 'is', 'important', 'for', 'nlp'],
['let', 's', 'improve', 'the', 'tokenizer']
]
def tokenization_accuracy(tokenizer, texts, reference_tokens):
    """Return the percentage of reference tokens reproduced position-for-position.

    Each text is tokenized and compared pairwise against its reference list.
    Extra predicted tokens beyond the reference length are ignored, and
    missing predictions simply earn no credit (zip truncates to the shorter
    sequence, and the denominator counts only reference tokens).
    """
    correct = 0
    total = 0
    for text, ref in zip(texts, reference_tokens):
        pred = tokenizer.tokenize(text)
        total += len(ref)
        correct += sum(1 for p, r in zip(pred, ref) if p == r)
    if total == 0:
        # No reference tokens at all: treat as vacuously perfect instead
        # of raising ZeroDivisionError.
        return 100.0
    return correct / total * 100
def vocab_coverage(tokenizer, texts):
    """Return the percentage of tokens across *texts* found in tokenizer.vocab."""
    total_words = 0
    known_words = 0
    for text in texts:
        tokens = tokenizer.tokenize(text)
        total_words += len(tokens)
        known_words += sum(1 for t in tokens if t in tokenizer.vocab)
    if total_words == 0:
        # Nothing to cover: treat as vacuously full coverage instead of
        # raising ZeroDivisionError.
        return 100.0
    return known_words / total_words * 100
# Evaluate the tokenizer against the hand-written references, then report
# both metrics as percentages with one decimal place.
accuracy = tokenization_accuracy(tokenizer, texts, reference_tokens)
coverage = vocab_coverage(tokenizer, texts)
print(f"Tokenization accuracy: {accuracy:.1f}%")
print(f"Vocabulary coverage: {coverage:.1f}%")