import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Lambda
from tensorflow.keras.models import Model
import tensorflow.keras.backend as K
# Toy corpus: four sentences grouped into two topics (cats vs rain).
# A label of 1 marks the "cats" topic, 0 the "rain" topic; pairs drawn
# from the same topic are later treated as related.
sentences = ["I love cats", "Cats are great", "I hate rain", "Rain is annoying"]
labels = [1, 1, 0, 0]

# Hand-built word->id table. Case variants ("cats"/"Cats", "rain"/"Rain")
# deliberately share an id; 0 is reserved for padding / out-of-vocabulary.
vocab = {"I":1, "love":2, "cats":3, "Cats":3, "are":4, "great":5, "hate":6, "rain":7, "Rain":7, "is":8, "annoying":9}

# Fixed sequence length every encoded sentence is padded to.
max_len = 4
def tokenize(sentence, word_index=None, pad_to=None):
    """Encode a whitespace-split sentence as a fixed-length list of word ids.

    Args:
        sentence: input text; tokens are produced by ``str.split``.
        word_index: mapping from token to integer id; defaults to the
            module-level ``vocab``.
        pad_to: output length; defaults to the module-level ``max_len``.

    Returns:
        A list of exactly ``pad_to`` ints. Sentences longer than ``pad_to``
        are truncated (the original version padded with a *negative* count,
        silently returning an over-long sequence that would break the
        fixed-shape Input layer). Unknown tokens and padding both map to 0.

    NOTE(review): OOV tokens and padding share id 0, so the model cannot
    distinguish them — acceptable for this toy example.
    """
    if word_index is None:
        word_index = vocab
    if pad_to is None:
        pad_to = max_len
    # Truncate first so the pad count below can never go negative.
    tokens = sentence.split()[:pad_to]
    ids = [word_index.get(t, 0) for t in tokens]
    return ids + [0] * (pad_to - len(ids))
# Encode the corpus: one fixed-length id row per sentence, plus its label.
X = np.stack([tokenize(text) for text in sentences])
y = np.asarray(labels)

# Dimensionality of the learned word embeddings (kept under 100).
dim_embedding = 50
# Encoder network: token ids -> unit-norm 32-d sentence embedding.
# L2-normalising the output keeps every embedding on the unit sphere, so
# the Euclidean distances fed to the contrastive loss stay bounded.
input_text = Input(shape=(max_len,))
h = Embedding(input_dim=len(vocab)+1, output_dim=dim_embedding, input_length=max_len)(input_text)
h = LSTM(32)(h)
h = Dense(32, activation='relu')(h)
normalized = Lambda(lambda t: K.l2_normalize(t, axis=1))(h)
model = Model(inputs=input_text, outputs=normalized)
def contrastive_loss(y_true, y_pred):
    """Contrastive loss (Hadsell, Chopra & LeCun 2006) over pairwise distances.

    ``y_pred`` is the predicted distance between the two embeddings and
    ``y_true`` is 1 for a related pair, 0 for an unrelated one. Related
    pairs are pulled together (squared distance); unrelated pairs are
    pushed apart until they are at least ``margin`` away.
    """
    margin = 1.0
    pull_term = y_true * K.square(y_pred)
    push_term = (1 - y_true) * K.square(K.maximum(margin - y_pred, 0))
    return K.mean(pull_term + push_term)
# Build every unordered sentence pair (i < j) for training.
# A pair is labelled 1 when both sentences carry the same topic label,
# 0 otherwise — matching the convention expected by contrastive_loss.
pair_indices = [(i, j) for i in range(len(X)) for j in range(i + 1, len(X))]
pairs = np.array([[X[i], X[j]] for i, j in pair_indices])
pair_labels = np.array([1 if y[i] == y[j] else 0 for i, j in pair_indices])
# Siamese wrapper: two token-id sequences in, scalar embedding distance out.
# Both branches share the single encoder `model`, so they share weights.
input_a = Input(shape=(max_len,))
input_b = Input(shape=(max_len,))

def euclidean_distance(vects):
    """Row-wise Euclidean distance between two batches of embeddings.

    The sum of squares is clamped to K.epsilon() before the sqrt so the
    gradient stays finite when the two embeddings coincide.
    """
    left, right = vects
    squared = K.sum(K.square(left - right), axis=1, keepdims=True)
    return K.sqrt(K.maximum(squared, K.epsilon()))

distance = Lambda(euclidean_distance)([model(input_a), model(input_b)])
siamese_net = Model([input_a, input_b], distance)
siamese_net.compile(loss=contrastive_loss, optimizer='adam')

# Train on all sentence pairs; verbose=0 keeps the toy run quiet.
siamese_net.fit([pairs[:,0], pairs[:,1]], pair_labels, epochs=20, batch_size=2, verbose=0)
# Sanity-check the trained encoder: related sentences should end up with
# higher cosine similarity than unrelated ones.
embeddings = model.predict(X)
from sklearn.metrics.pairwise import cosine_similarity
sim_matrix = cosine_similarity(embeddings)

# Split the upper-triangle similarities by whether the pair shares a label.
related_sims = []
unrelated_sims = []
for i in range(len(y)):
    for j in range(i + 1, len(y)):
        bucket = related_sims if y[i] == y[j] else unrelated_sims
        bucket.append(sim_matrix[i, j])

avg_related = np.mean(related_sims)
avg_unrelated = np.mean(unrelated_sims)
print(f"Average related similarity: {avg_related:.2f}")
print(f"Average unrelated similarity: {avg_unrelated:.2f}")