Introduction
Controlling vocabulary size helps models focus on important words and run faster by ignoring rare or unimportant words.
Jump into concepts and practice - no test required
# Template: build a bag-of-words matrix whose vocabulary is capped at
# VOCAB_SIZE terms. `VOCAB_SIZE` (an int) and `texts` (an iterable of
# strings) are placeholders that must be defined before this runs.
from sklearn.feature_extraction.text import CountVectorizer

# max_features keeps only the most frequent terms across the corpus.
vectorizer = CountVectorizer(max_features=VOCAB_SIZE)

# Learn the vocabulary from `texts` and encode them in a single step.
X = vectorizer.fit_transform(texts)
# Two alternative configurations (use one or the other; the second
# assignment replaces the first if both are executed).

# Variant 1: keep only the 1000 most frequent terms in the corpus.
vectorizer = CountVectorizer(max_features=1000)

# Variant 2: first ignore terms that appear in fewer than 5 documents
# (min_df=5), then keep at most the 500 most frequent remaining terms.
vectorizer = CountVectorizer(max_features=500, min_df=5)
# Drop scikit-learn's built-in English stop words, then cap the vocabulary
# at the 2000 most frequent remaining terms.
vectorizer = CountVectorizer(
    max_features=2000,
    stop_words='english',
)
# Worked example: limit the vocabulary of a tiny corpus to VOCAB_SIZE terms
# and inspect the resulting vocabulary and document-term matrix.
from sklearn.feature_extraction.text import CountVectorizer

# Four short documents forming the training corpus.
texts = [
    'I love machine learning',
    'Machine learning is fun',
    'I love coding in Python',
    'Python coding is great for machine learning',
]

# Upper bound on the number of distinct terms kept in the vocabulary.
VOCAB_SIZE = 5

vectorizer = CountVectorizer(max_features=VOCAB_SIZE)

# fit_transform learns the capped vocabulary and returns a sparse count
# matrix with one row per document and one column per kept term.
X = vectorizer.fit_transform(texts)

print('Vocabulary:', vectorizer.get_feature_names_out())
print('Transformed shape:', X.shape)
# toarray() densifies the sparse matrix so the counts are readable.
print('Feature matrix (dense):\n', X.toarray())
# Exercise: how many terms survive when the vocabulary is capped at 2?
from sklearn.feature_extraction.text import CountVectorizer

# Three documents containing three distinct words between them.
texts = [
    'apple banana apple',
    'banana orange',
    'apple orange orange',
]

# Only the vocabulary is needed here, so fit() is used without transform.
vectorizer = CountVectorizer(max_features=2)
vectorizer.fit(texts)

# The learned vocabulary cannot exceed max_features entries.
vocab = vectorizer.get_feature_names_out()
print(len(vocab))
# Exercise: fit a vectorizer whose max_features is given as a string.
from sklearn.feature_extraction.text import CountVectorizer

texts = [
    'cat dog',
    'dog mouse',
    'cat mouse',
]

# NOTE(review): max_features is the string '3', not the integer 3.
# scikit-learn documents this parameter as int or None, so this snippet is
# expected to fail with a parameter-validation error rather than print a
# vocabulary. This looks like an intentional "spot the error" example, so
# the value is left as written; use max_features=3 for a working version.
vectorizer = CountVectorizer(max_features='3')
vectorizer.fit(texts)

vocab = vectorizer.get_feature_names_out()
print(vocab)