import numpy as np
from numpy.linalg import norm
from gensim.models import Word2Vec
# Sample small corpus
sentences = [
    ['king', 'queen', 'man', 'woman'],
    ['apple', 'orange', 'fruit', 'banana'],
    ['car', 'bus', 'train', 'vehicle'],
    ['dog', 'cat', 'animal', 'pet'],
    ['king', 'man', 'royal', 'crown'],
    ['queen', 'woman', 'royal', 'crown'],
    ['apple', 'fruit', 'sweet'],
    ['dog', 'pet', 'loyal'],
    ['car', 'vehicle', 'fast'],
    ['bus', 'vehicle', 'public']
]
# Train a Word2Vec skip-gram model (sg=1); min_count=1 keeps every word,
# since each appears only a handful of times in this tiny corpus
model = Word2Vec(sentences, vector_size=30, window=3, min_count=1, sg=1, epochs=100)
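# Quick sanity check: nearest neighbours of 'king'. With a corpus this small
# and random initialization, the neighbours and their scores will vary from
# run to run (pass a fixed seed and workers=1 to Word2Vec for reproducibility).
print("Most similar to 'king':", model.wv.most_similar('king', topn=3))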
# Cosine similarity: dot product divided by the product of the vector norms
def cosine_similarity(vec1, vec2):
    return np.dot(vec1, vec2) / (norm(vec1) * norm(vec2))
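# gensim exposes the same computation as model.wv.similarity; the hand-rolled
# version above should agree with it up to floating-point error.
print('gensim similarity:', model.wv.similarity('king', 'queen'))
print('ours             :', cosine_similarity(model.wv['king'], model.wv['queen']))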
# Normalize an embedding to unit length
def get_normalized_vector(word):
    vec = model.wv[word]
    return vec / norm(vec)
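# Sanity check: a normalized vector has unit length (its norm is ~1). In
# gensim 4.x, model.wv.get_vector(word, norm=True) returns the unit-normalized
# vector directly and should match this helper.
v = get_normalized_vector('king')
print('norm after normalization:', norm(v))  # expect ~1.0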
# Similar word pairs
similar_pairs = [
('king', 'queen'),
('apple', 'banana'),
('car', 'bus'),
('dog', 'cat')
]
# Unrelated word pairs
unrelated_pairs = [
('king', 'apple'),
('car', 'dog'),
('queen', 'banana'),
('bus', 'cat')
]
# Compute similarities (the inputs are already unit length, so the norm terms
# inside cosine_similarity are ~1 and it reduces to a plain dot product)
similar_sim = [cosine_similarity(get_normalized_vector(w1), get_normalized_vector(w2)) for w1, w2 in similar_pairs]
unrelated_sim = [cosine_similarity(get_normalized_vector(w1), get_normalized_vector(w2)) for w1, w2 in unrelated_pairs]
print('Similar word pairs cosine similarities:', similar_sim)
print('Unrelated word pairs cosine similarities:', unrelated_sim)
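# On a well-trained model the similar pairs should score clearly higher on
# average than the unrelated ones; with a 10-sentence corpus the gap can be
# small or noisy, so treat these numbers as illustrative.
print('Mean similar similarity  :', np.mean(similar_sim))
print('Mean unrelated similarity:', np.mean(unrelated_sim))
# The classic vector arithmetic king - man + woman ~ queen can be probed with
# most_similar; again, this corpus is far too small for a reliable result.
print(model.wv.most_similar(positive=['king', 'woman'], negative=['man'], topn=3))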