import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Dropout, Flatten, Conv2D, MaxPooling2D, Embedding, LSTM, Reshape, concatenate
from tensorflow.keras.models import Model
# Example input shapes
text_input_shape = (100,) # e.g., 100 words encoded as integers
image_input_shape = (64, 64, 3) # 64x64 RGB images
audio_input_shape = (40, 100) # e.g., 40 MFCC features over 100 time steps
# Text model
text_input = Input(shape=text_input_shape, name='text_input')
x_text = Embedding(input_dim=5000, output_dim=64)(x_text := text_input)  # input_length is deprecated in recent Keras versions
x_text = LSTM(32)(x_text)
x_text = Dropout(0.3)(x_text)
# Image model
image_input = Input(shape=image_input_shape, name='image_input')
x_image = Conv2D(32, (3,3), activation='relu')(image_input)
x_image = MaxPooling2D((2,2))(x_image)
x_image = Conv2D(64, (3,3), activation='relu')(x_image)
x_image = MaxPooling2D((2,2))(x_image)
x_image = Flatten()(x_image)
x_image = Dropout(0.3)(x_image)
# Audio model
audio_input = Input(shape=audio_input_shape, name='audio_input')
# Conv2D expects a channel dimension, so reshape the (40, 100) MFCC matrix to (40, 100, 1)
x_audio = Reshape((*audio_input_shape, 1))(audio_input)
x_audio = Conv2D(32, (3,3), activation='relu')(x_audio)
x_audio = MaxPooling2D((2,2))(x_audio)
x_audio = Flatten()(x_audio)
x_audio = Dropout(0.3)(x_audio)
# Fuse the three modality branches into a single classification head
combined = concatenate([x_text, x_image, x_audio])
z = Dense(64, activation='relu')(combined)
z = Dropout(0.4)(z)
z = Dense(1, activation='sigmoid')(z)
model = Model(inputs=[text_input, image_input, audio_input], outputs=z)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
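# Optional: print a summary to confirm the three branches merge into one head as intended
model.summary()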
# Dummy data for demonstration
X_text = np.random.randint(0, 5000, (500, 100))
X_image = np.random.rand(500, 64, 64, 3)
X_audio = np.random.rand(500, 40, 100)
y = np.random.randint(0, 2, 500)
# Train model
history = model.fit(
    {'text_input': X_text, 'image_input': X_image, 'audio_input': X_audio},
    y,
    epochs=20,
    batch_size=32,
    validation_split=0.2
)
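# Quick sanity check (illustrative only, since the inputs above are random):
# evaluate on the dummy data, then run a prediction on one sample from each modality.
loss, acc = model.evaluate(
    {'text_input': X_text, 'image_input': X_image, 'audio_input': X_audio},
    y,
    verbose=0
)
print(f"Dummy-data loss: {loss:.3f}, accuracy: {acc:.3f}")
probs = model.predict({
    'text_input': X_text[:1],
    'image_input': X_image[:1],
    'audio_input': X_audio[:1]
})
print("Predicted probability for the first sample:", float(probs[0, 0]))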