import gensim
from gensim import corpora
from gensim.models import CoherenceModel
from gensim.utils import simple_preprocess
from gensim.models.phrases import Phrases, Phraser
# Toy corpus: ten short English sentences about cats, dogs, birds and fish.
texts = [
    "Cats are small animals that like to climb trees.",
    "Dogs are loyal and friendly pets.",
    "Birds can fly and sing beautiful songs.",
    "Fish swim in water and have scales.",
    "Cats and dogs can live together peacefully.",
    "Birds build nests to lay eggs.",
    "Fish live in oceans, rivers, and lakes.",
    "Dogs need walks and exercise daily.",
    "Cats like to chase mice and birds.",
    "Birds migrate during winter to warmer places.",
]
# Preprocessing function
def preprocess(texts):
    """Tokenize each document with gensim's ``simple_preprocess``.

    Args:
        texts: Iterable of raw document strings.

    Returns:
        A list with one entry per input document, in input order; each entry
        is the list of tokens ``simple_preprocess`` produced for that
        document. ``deacc=True`` asks gensim to strip accent marks from the
        tokens.
    """
    return [simple_preprocess(doc, deacc=True) for doc in texts]
# Run the pipeline only when this file is executed as a script.
# CoherenceModel defaults to processes=-1, so computing 'c_v' coherence may
# start worker processes. On platforms whose multiprocessing start method is
# "spawn" (e.g. Windows, macOS) every worker re-imports the main module; with
# unguarded top-level code each worker would re-run the whole pipeline and
# multiprocessing would abort with a RuntimeError.
if __name__ == "__main__":
    # Preprocess texts
    processed_texts = preprocess(texts)

    # Build bigrams: learn phrase statistics, then apply the frozen phraser
    # to every tokenized document.
    bigram = Phrases(processed_texts, min_count=1, threshold=2)
    bigram_mod = Phraser(bigram)
    texts_bigrams = [bigram_mod[doc] for doc in processed_texts]

    # Create dictionary and corpus
    id2word = corpora.Dictionary(texts_bigrams)
    # no_below=1 keeps every token that occurs in at least one document, so
    # rare tokens are NOT removed here; in effect only tokens present in more
    # than 80% of the documents are dropped.
    id2word.filter_extremes(no_below=1, no_above=0.8)
    # Bag-of-words representation of each document over the filtered
    # dictionary.
    corpus = [id2word.doc2bow(text) for text in texts_bigrams]

    # Train LDA model; random_state is fixed so repeated runs are comparable.
    lda_model = gensim.models.LdaModel(
        corpus=corpus,
        id2word=id2word,
        num_topics=5,
        random_state=100,
        update_every=1,
        chunksize=10,
        passes=20,
        alpha='auto',
        per_word_topics=True,
    )

    # Compute coherence score ('c_v' is evaluated on the tokenized texts,
    # not on the bag-of-words corpus).
    coherence_model_lda = CoherenceModel(
        model=lda_model,
        texts=texts_bigrams,
        dictionary=id2word,
        coherence='c_v',
    )
    coherence_lda = coherence_model_lda.get_coherence()
    print(f'Coherence Score: {coherence_lda:.2f}')