import numpy as np
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# Evaluation data: semantically related pairs should score higher than
# unrelated ones once sentence embeddings work well.
similar_pairs = [
("The cat sits on the mat.", "A cat is sitting on a mat."),
("Dogs are great pets.", "I love my dog as a pet."),
("He is reading a book.", "He reads books every day.")
]
dissimilar_pairs = [
("The cat sits on the mat.", "The weather is sunny today."),
("Dogs are great pets.", "I am cooking dinner."),
("He is reading a book.", "She is playing football.")
]
# Pre-trained embeddings dictionary (mock example with random vectors for demo)
# In real case, load embeddings like GloVe or Word2Vec.
# NOTE: the vectors are drawn in mock_vocab order from a seeded RNG, so both
# the seed AND the vocabulary order determine the exact vectors produced.
embedding_dim = 50
np.random.seed(0)
mock_vocab = ["the", "cat", "sits", "on", "mat", "a", "is", "sitting", "dogs", "are", "great", "pets", "i", "love", "my", "dog", "as", "he", "reading", "book", "reads", "books", "every", "day", "weather", "sunny", "today", "am", "cooking", "dinner", "she", "playing", "football"]
embeddings_index = {word: np.random.rand(embedding_dim) for word in mock_vocab}
# Function to preprocess a sentence and build its embedding vector.
def sentence_embedding(sentence, embeddings, tfidf_weights=None):
    """Return a fixed-size vector for *sentence* by averaging word vectors.

    The sentence is lowercased, stripped of punctuation, and split on
    whitespace; words missing from *embeddings* are ignored.

    Parameters
    ----------
    sentence : str
        Raw input sentence.
    embeddings : dict[str, np.ndarray]
        Word -> vector lookup table (all vectors same length).
    tfidf_weights : dict[str, float] | None
        Optional word -> weight map. When given, the result is the
        weight-normalized sum of word vectors; words absent from the map
        get weight 0.

    Returns
    -------
    np.ndarray
        The sentence vector, or a zero vector when no word is known.
    """
    words = sentence.lower().translate(str.maketrans('', '', string.punctuation)).split()
    valid_words = [w for w in words if w in embeddings]
    if not valid_words:
        # Infer the dimensionality from the embedding table itself rather
        # than relying on the module-level `embedding_dim` global, so the
        # function works with any embedding table passed in.
        dim = len(next(iter(embeddings.values()))) if embeddings else 0
        return np.zeros(dim)
    vecs = np.array([embeddings[w] for w in valid_words])
    if tfidf_weights is None:
        return vecs.mean(axis=0)
    weights = np.array([tfidf_weights.get(w, 0.0) for w in valid_words])
    total = weights.sum()
    if total == 0:
        # BUG FIX: the previous fallback averaged the *already zero-weighted*
        # vectors, which always produced an all-zero embedding. Fall back to
        # the plain unweighted mean instead.
        return vecs.mean(axis=0)
    return (vecs * weights[:, None]).sum(axis=0) / total
# Flatten every pair into a single corpus so the TF-IDF model sees all
# sentences from both the similar and dissimilar sets.
corpus = [sentence for pair in similar_pairs + dissimilar_pairs for sentence in pair]
vectorizer = TfidfVectorizer(stop_words='english')
vectorizer.fit(corpus)
# Word -> inverse-document-frequency map; English stop words are absent
# from the vocabulary, so they implicitly get zero weight downstream.
idf = {term: score for term, score in zip(vectorizer.get_feature_names_out(), vectorizer.idf_)}
# Average pairwise cosine similarity over a set of sentence pairs.
def compute_avg_similarities(pairs, embeddings, tfidf_weights):
    """Return the mean cosine similarity across *pairs*.

    Each sentence is reduced to one vector via ``sentence_embedding``
    (optionally TF-IDF weighted), then the per-pair cosine similarities
    are averaged.
    """
    scores = [
        cosine_similarity(
            [sentence_embedding(first, embeddings, tfidf_weights)],
            [sentence_embedding(second, embeddings, tfidf_weights)],
        )[0][0]
        for first, second in pairs
    ]
    return np.mean(scores)
# Baseline: uniform averaging of word vectors (no TF-IDF weighting).
baseline_similar = compute_avg_similarities(similar_pairs, embeddings_index, None)
baseline_dissimilar = compute_avg_similarities(dissimilar_pairs, embeddings_index, None)
# Improved: IDF-weighted averaging down-weights ubiquitous, uninformative words.
weighted_similar = compute_avg_similarities(similar_pairs, embeddings_index, idf)
weighted_dissimilar = compute_avg_similarities(dissimilar_pairs, embeddings_index, idf)
print(f"Before improvement - Similar pairs avg similarity: {baseline_similar:.2f}")
print(f"Before improvement - Dissimilar pairs avg similarity: {baseline_dissimilar:.2f}")
print(f"After improvement - Similar pairs avg similarity: {weighted_similar:.2f}")
print(f"After improvement - Dissimilar pairs avg similarity: {weighted_dissimilar:.2f}")