import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, LayerNormalization, Dropout
from tensorflow.keras.models import Model
import numpy as np
# Positional Encoding function
def positional_encoding(max_len, d_model):
    pos = np.arange(max_len)[:, np.newaxis]        # (max_len, 1) token positions
    i = np.arange(d_model)[np.newaxis, :]          # (1, d_model) dimension indices
    angle_rates = 1 / np.power(10000, (2 * (i // 2)) / np.float32(d_model))
    angle_rads = pos * angle_rates                 # (max_len, d_model)
    # Sines on the even angle columns, cosines on the odd ones; the two halves are
    # concatenated rather than interleaved, which is still a valid fixed encoding.
    sines = np.sin(angle_rads[:, 0::2])
    cosines = np.cos(angle_rads[:, 1::2])
    pos_encoding = np.concatenate([sines, cosines], axis=-1)
    pos_encoding = pos_encoding[np.newaxis, ...]   # add a leading batch dimension
    return tf.cast(pos_encoding, dtype=tf.float32)
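# Quick sanity check (illustrative only; the 100/64 values simply mirror the
# max_len and embedding_dim used further down): the encoding has shape
# (1, max_len, d_model), so it broadcasts over a batch of embeddings.
_pe_demo = positional_encoding(100, 64)
print("positional encoding shape:", _pe_demo.shape)  # (1, 100, 64)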
# Simple Multi-Head Self-Attention Layer
class MultiHeadSelfAttention(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads):
        super().__init__()
        self.num_heads = num_heads
        self.d_model = d_model
        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"
        self.depth = d_model // num_heads
        self.wq = Dense(d_model)
        self.wk = Dense(d_model)
        self.wv = Dense(d_model)
        self.dense = Dense(d_model)

    def split_heads(self, x, batch_size):
        # (batch, seq_len, d_model) -> (batch, num_heads, seq_len, depth)
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def call(self, v, k, q):
        batch_size = tf.shape(q)[0]
        q = self.wq(q)
        k = self.wk(k)
        v = self.wv(v)
        q = self.split_heads(q, batch_size)
        k = self.split_heads(k, batch_size)
        v = self.split_heads(v, batch_size)
        # Scaled dot-product attention: softmax(QK^T / sqrt(depth)) V
        matmul_qk = tf.matmul(q, k, transpose_b=True)
        dk = tf.cast(tf.shape(k)[-1], tf.float32)
        scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)
        attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)
        output = tf.matmul(attention_weights, v)
        # Merge heads back: (batch, num_heads, seq_len, depth) -> (batch, seq_len, d_model)
        output = tf.transpose(output, perm=[0, 2, 1, 3])
        concat_attention = tf.reshape(output, (batch_size, -1, self.d_model))
        output = self.dense(concat_attention)
        return output
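# Sanity check (illustrative only, with arbitrary batch/sequence sizes):
# self-attention is shape-preserving, mapping (batch, seq_len, d_model)
# back to (batch, seq_len, d_model).
_mha_demo = MultiHeadSelfAttention(d_model=64, num_heads=4)
_x_demo = tf.random.uniform((2, 10, 64))
print("attention output shape:", _mha_demo(_x_demo, _x_demo, _x_demo).shape)  # (2, 10, 64)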
# Transformer Encoder Layer
class TransformerEncoderLayer(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads, dff, rate=0.1):
        super().__init__()
        self.mha = MultiHeadSelfAttention(d_model, num_heads)
        # Position-wise feed-forward network
        self.ffn = tf.keras.Sequential([
            Dense(dff, activation='relu'),
            Dense(d_model)
        ])
        self.layernorm1 = LayerNormalization(epsilon=1e-6)
        self.layernorm2 = LayerNormalization(epsilon=1e-6)
        self.dropout1 = Dropout(rate)
        self.dropout2 = Dropout(rate)

    def call(self, x, training=None):
        # Self-attention sub-layer with residual connection and layer norm
        attn_output = self.mha(x, x, x)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(x + attn_output)
        # Feed-forward sub-layer with residual connection and layer norm
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        out2 = self.layernorm2(out1 + ffn_output)
        return out2
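# Sanity check (illustrative only): the encoder layer is also shape-preserving,
# so several of these layers could be stacked for a deeper model if desired.
_enc_demo = TransformerEncoderLayer(d_model=64, num_heads=4, dff=128)
print("encoder output shape:", _enc_demo(tf.random.uniform((2, 10, 64))).shape)  # (2, 10, 64)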
# Build Transformer Model for text classification
vocab_size = 10000
max_len = 100
embedding_dim = 64
num_heads = 4
dff = 128
num_classes = 2
inputs = Input(shape=(max_len,), dtype='int32')  # sequences of token ids
embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)(inputs)
# Add the fixed positional encoding to the token embeddings
pos_encoding = positional_encoding(max_len, embedding_dim)
embedding = embedding + pos_encoding[:, :max_len, :]
encoder_layer = TransformerEncoderLayer(embedding_dim, num_heads, dff)
x = encoder_layer(embedding)
x = tf.keras.layers.GlobalAveragePooling1D()(x)
x = Dropout(0.1)(x)
x = Dense(64, activation='relu')(x)
x = Dropout(0.1)(x)
outputs = Dense(num_classes, activation='softmax')(x)
model = Model(inputs=inputs, outputs=outputs)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
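# Optional: print the assembled architecture (embedding, encoder layer, pooling,
# classification head) along with parameter counts before training.
model.summary()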
# Dummy data for demonstration (replace with real dataset)
X_train = np.random.randint(0, vocab_size, size=(1000, max_len))
y_train = np.random.randint(0, num_classes, size=(1000,))
X_val = np.random.randint(0, vocab_size, size=(200, max_len))
y_val = np.random.randint(0, num_classes, size=(200,))
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_val, y_val))
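# Evaluation / inference sketch (the data above is random, so these numbers are
# meaningless and only show the API): evaluate() returns [loss, accuracy], and
# predict() yields per-class probabilities from the softmax head.
val_loss, val_acc = model.evaluate(X_val, y_val, verbose=0)
print(f"validation loss: {val_loss:.4f}, accuracy: {val_acc:.4f}")
probs = model.predict(X_val[:5], verbose=0)
print("predicted classes:", probs.argmax(axis=-1))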