Introduction
Latent Dirichlet Allocation (LDA) helps find hidden topics in a collection of texts. It groups words that often appear together, so that each group points to one of the main themes.
Jump into concepts and practice - no test required
from sklearn.decomposition import LatentDirichletAllocation

# Syntax template: number_of_topics, seed and document_term_matrix are
# placeholders -- substitute your own values before running.
lda = LatentDirichletAllocation(n_components=number_of_topics, random_state=seed)
lda.fit(document_term_matrix)
from sklearn.decomposition import LatentDirichletAllocation

# X is not defined in this snippet; it is presumably a document-term
# count matrix built beforehand (e.g. with CountVectorizer).
lda = LatentDirichletAllocation(n_components=3, random_state=42)
lda.fit(X)
# Assumes LatentDirichletAllocation has already been imported from
# sklearn.decomposition and that X is an existing document-term matrix.
# max_iter caps the number of passes made over the training data.
lda = LatentDirichletAllocation(n_components=5, max_iter=10, random_state=0)
lda.fit(X)
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

# Sample documents
texts = [
    'I love reading books about science and technology',
    'The new movie was exciting and full of action',
    'Technology advances help science progress',
    'Action movies are thrilling and fun to watch',
    'Books on science explain complex ideas clearly',
]

# Convert texts to a matrix of token counts
vectorizer = CountVectorizer(stop_words='english')
X = vectorizer.fit_transform(texts)

# Create LDA model to find 2 topics
lda = LatentDirichletAllocation(n_components=2, random_state=0)
lda.fit(X)

# Show top words for each topic
n_top_words = 3
feature_names = vectorizer.get_feature_names_out()
for topic_idx, topic in enumerate(lda.components_):
    # argsort is ascending, so reverse it and keep the first n_top_words
    # indices to get the highest-weighted words first.
    top_indices = topic.argsort()[::-1][:n_top_words]
    top_words = [feature_names[i] for i in top_indices]
    print(f"Topic {topic_idx + 1}: {', '.join(top_words)}")
topic_distribution?
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

docs = ["apple banana apple", "banana orange banana", "apple orange orange"]

# Build the document-term count matrix
vectorizer = CountVectorizer()
dtm = vectorizer.fit_transform(docs)

# Fit a 2-topic model
lda = LatentDirichletAllocation(n_components=2, random_state=0)
lda.fit(dtm)

# Document-topic distribution for the same documents the model was fitted on
topic_distribution = lda.transform(dtm)
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

docs = ["cat dog", "dog mouse", "cat mouse"]

# Build the document-term count matrix
vectorizer = CountVectorizer()
dtm = vectorizer.fit_transform(docs)

# NOTE: no random_state is passed here, so the printed values can differ
# from run to run.
lda = LatentDirichletAllocation(n_components=2)

# fit_transform fits the model; its return value is not kept here.
lda.fit_transform(dtm)

# Topic-word weights learned by the fitted model
print(lda.components_)