import tensorflow as tf
from tensorflow.keras.layers import Layer, Dense, Dropout, LayerNormalization, Embedding, Input, GlobalAveragePooling1D
from tensorflow.keras.models import Model
class ScaledDotProductAttention(Layer):
    """Scaled dot-product attention with dropout and a residual LayerNorm.

    Computes ``softmax(Q @ K^T / sqrt(d_k)) @ V``, applies dropout to the
    attention weights, then adds a residual connection to the query and
    layer-normalizes the sum.

    NOTE(review): the residual ``output + query`` requires the value depth to
    equal the query depth — confirm if Q and V ever come from different
    projections.
    """

    def __init__(self, dropout_rate=0.1, **kwargs):
        """
        Args:
            dropout_rate: Fraction of attention weights zeroed during training.
            **kwargs: Forwarded to the base ``Layer`` (e.g. ``name``, ``dtype``).
        """
        super().__init__(**kwargs)
        self.dropout_rate = dropout_rate  # stored so get_config() can report it
        self.dropout = Dropout(dropout_rate)
        self.layernorm = LayerNormalization(epsilon=1e-6)

    def call(self, query, key, value, training=None, mask=None):
        """Apply attention.

        Args:
            query: Tensor of shape [batch, seq_len_q, depth].
            key: Tensor of shape [batch, seq_len_k, depth].
            value: Tensor of shape [batch, seq_len_k, depth_v].
            training: Keras training flag; enables dropout when truthy.
            mask: Optional mask broadcastable to [batch, seq_len_q, seq_len_k];
                positions where ``mask == 0`` are suppressed before the softmax.
                ``None`` (the default) preserves the original unmasked behavior.

        Returns:
            Tensor of shape [batch, seq_len_q, depth_v].
        """
        matmul_qk = tf.matmul(query, key, transpose_b=True)  # [batch, seq_len_q, seq_len_k]
        # Scale by sqrt(d_k) to keep the softmax out of its saturated region.
        dk = tf.cast(tf.shape(key)[-1], tf.float32)
        scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)
        if mask is not None:
            # A large negative logit drives the post-softmax weight to ~0
            # for masked positions.
            scaled_attention_logits += (1.0 - tf.cast(mask, tf.float32)) * -1e9
        attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)
        attention_weights = self.dropout(attention_weights, training=training)
        output = tf.matmul(attention_weights, value)  # [batch, seq_len_q, depth_v]
        output = self.layernorm(output + query)  # residual connection + normalization
        return output

    def get_config(self):
        """Return the layer config so the model can be saved and reloaded."""
        config = super().get_config()
        config.update({"dropout_rate": self.dropout_rate})
        return config
# Simple text classification model with attention.
# Hyperparameters (also referenced below when generating dummy data).
vocab_size = 5000    # token vocabulary size
embedding_dim = 64   # embedding dimensionality per token
max_len = 100        # fixed input sequence length
num_classes = 2      # binary classification

# Functional graph: token ids -> embeddings -> self-attention -> pooled probs.
inputs = Input(shape=(max_len,))
token_vectors = Embedding(vocab_size, embedding_dim)(inputs)
# Self-attention: query, key and value all come from the same embeddings.
attended = ScaledDotProductAttention(dropout_rate=0.2)(
    token_vectors, token_vectors, token_vectors
)
pooled = GlobalAveragePooling1D()(attended)
outputs = Dense(num_classes, activation='softmax')(pooled)

model = Model(inputs=inputs, outputs=outputs)
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)
# Dummy data for demonstration
import numpy as np


def _fake_split(n_samples):
    """Return (tokens, labels) of random ints matching the model's input shapes."""
    tokens = np.random.randint(0, vocab_size, size=(n_samples, max_len))
    labels = np.random.randint(0, num_classes, size=(n_samples,))
    return tokens, labels


# Same RNG call order as before: train tokens, train labels, val tokens, val labels.
X_train, y_train = _fake_split(1000)
X_val, y_val = _fake_split(200)

history = model.fit(
    X_train, y_train, epochs=10, batch_size=32, validation_data=(X_val, y_val)
)