"""Find 2 topics in 5 short texts.

The program prints the top 3 words of each topic to show what that topic is about.
"""
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
# Tiny corpus: five one-sentence documents.
texts = [
    'I love reading books about science and technology',
    'The new movie was exciting and full of action',
    'Technology advances help science progress',
    'Action movies are thrilling and fun to watch',
    'Books on science explain complex ideas clearly',
]

# Build the document-term count matrix, dropping English stop words.
vectorizer = CountVectorizer(stop_words='english')
X = vectorizer.fit_transform(texts)

# Fit a two-topic LDA model; fit() returns the fitted estimator itself.
# The seed is fixed via random_state.
lda = LatentDirichletAllocation(n_components=2, random_state=0).fit(X)

# Report the highest-weighted vocabulary terms of every topic.
n_top_words = 3
feature_names = vectorizer.get_feature_names_out()
for topic_number, weights in enumerate(lda.components_, start=1):
    # argsort() is ascending, so reverse it and keep the leading entries
    # to obtain the indices of the largest weights, biggest first.
    descending = weights.argsort()[::-1]
    best_indices = descending[:n_top_words]
    best_words = []
    for word_index in best_indices:
        best_words.append(feature_names[word_index])
    print(f"Topic {topic_number}: {', '.join(best_words)}")