import gensim
from gensim import corpora
from gensim.models import LdaModel, CoherenceModel
# Toy corpus: every document is already tokenized into a list of terms.
documents = [
    ['human', 'interface', 'computer'],
    ['survey', 'user', 'computer', 'system', 'response', 'time'],
    ['eps', 'user', 'interface', 'system'],
    ['system', 'human', 'system', 'eps'],
    ['user', 'response', 'time'],
    ['trees'],
    ['graph', 'trees'],
    ['graph', 'minors', 'trees'],
    ['graph', 'minors', 'survey']
]

# Build the token <-> id mapping from the documents, then encode each
# document as a bag-of-words using that mapping.
id2word = corpora.Dictionary(documents)
corpus = list(map(id2word.doc2bow, documents))
def pick_best_num_topics(scores):
    """Return the ``(num_topics, score)`` pair with the highest coherence.

    Args:
        scores: Mapping of topic count to coherence score. Entries are
            scanned in the mapping's iteration order.

    Returns:
        A tuple ``(best_num_topics, best_score)``. On ties the earliest
        entry wins. If ``scores`` is empty, or no score compares greater
        than negative infinity (e.g. every score is NaN), the result is
        ``(None, float("-inf"))``.
    """
    # Start from -inf rather than a fixed value such as -1, so that a
    # winner is still found when every score is at or below that value.
    best_k, best_score = None, float("-inf")
    for k, score in scores.items():
        if score > best_score:
            best_k, best_score = k, score
    return best_k, best_score


def score_topic_counts(corpus, id2word, texts, topic_counts, coherence='c_v'):
    """Train one LDA model per topic count and score its coherence.

    Args:
        corpus: Bag-of-words corpus passed to ``LdaModel``.
        id2word: Dictionary passed to both ``LdaModel`` and ``CoherenceModel``.
        texts: Tokenized documents passed to ``CoherenceModel``.
        topic_counts: Iterable of ``num_topics`` values to try.
        coherence: Name of the coherence measure passed to ``CoherenceModel``.

    Returns:
        Dict mapping each tried topic count to its coherence score, in the
        order the counts were tried.
    """
    scores = {}
    for num_topics in topic_counts:
        # Fixed random_state keeps repeated runs comparable.
        lda_model = LdaModel(
            corpus=corpus,
            id2word=id2word,
            num_topics=num_topics,
            random_state=42,
            passes=10,
        )
        coherence_model = CoherenceModel(
            model=lda_model,
            texts=texts,
            dictionary=id2word,
            coherence=coherence,
        )
        scores[num_topics] = coherence_model.get_coherence()
    return scores


# Keep the expensive search out of import time. CoherenceModel accepts a
# `processes` argument and uses worker processes by default; on platforms
# where workers are started by spawning, each worker re-imports this module,
# so the search must only run when the file is executed as a script.
if __name__ == "__main__":
    coherence_scores = score_topic_counts(corpus, id2word, documents, range(5, 16))
    best_num_topics, best_coherence = pick_best_num_topics(coherence_scores)

    print(f"Best number of topics: {best_num_topics}")
    print(f"Best coherence score: {best_coherence:.4f}")
    print("Coherence scores for all tested topic numbers:")
    for k, v in coherence_scores.items():
        print(f"{k}: {v:.4f}")