from gensim.models import FastText
# Sample larger training data: each training example is a pre-tokenized
# sentence (a list of lowercase word strings), as gensim expects.
_raw_corpus = [
    "machine learning is fun",
    "fasttext embeddings capture subword information",
    "word vectors help in nlp",
    "deep learning models require lots of data",
    "natural language processing is exciting",
    "fasttext can handle rare words",
    "subword information improves embedding quality",
    "more data helps reduce overfitting",
    "training for more epochs improves results",
    "gensim makes training fasttext easy",
]
# Whitespace tokenization reproduces the hand-written token lists exactly.
sentences = [line.split() for line in _raw_corpus]
# Train a FastText model with improved hyperparameters.
# Collected in a dict so the configuration reads as one unit.
_fasttext_params = dict(
    vector_size=50,  # increased embedding dimensionality
    window=3,        # context window size
    min_count=1,     # keep every word (tiny corpus, so no pruning)
    sg=1,            # skip-gram training objective
    epochs=20,       # extra passes over the small corpus
    min_n=3,         # minimum character n-gram length
    max_n=6,         # maximum character n-gram length
)
model = FastText(sentences, **_fasttext_params)

# Example: look up the learned embedding for a single word.
vector = model.wv['fasttext']
# Evaluate on a small word-similarity set
# (word pairs and human scores)
word_pairs = [('machine', 'learning'), ('fasttext', 'embedding'), ('deep', 'model'), ('rare', 'words')]
human_scores = [0.9, 0.8, 0.7, 0.6]
from scipy.stats import spearmanr


def _cosine_similarity(v1, v2):
    """Return the cosine similarity of two equal-length numeric vectors.

    Returns 0.0 when either vector has zero norm, avoiding the
    ZeroDivisionError the naive dot/(|v1||v2|) formula would raise
    for a degenerate (all-zero) embedding.
    """
    dot = sum(a * b for a, b in zip(v1, v2))
    norm1 = sum(a * a for a in v1) ** 0.5
    norm2 = sum(b * b for b in v2) ** 0.5
    if norm1 == 0.0 or norm2 == 0.0:
        return 0.0
    return dot / (norm1 * norm2)


# Model-predicted similarity for each evaluation pair.
# FastText can score OOV words (e.g. 'embedding') via character n-grams.
model_scores = [_cosine_similarity(model.wv[w1], model.wv[w2]) for w1, w2 in word_pairs]
# Spearman rank correlation between human judgments and model scores.
correlation, _ = spearmanr(human_scores, model_scores)
print(f"Spearman correlation: {correlation:.2f}")