Variational Autoencoder in Computer Vision

A Variational Autoencoder (VAE) learns to generate new data similar to the data it was trained on, such as producing new images that look like real ones.
class VAE(tf.keras.Model):
    def __init__(self, latent_dim):
        super(VAE, self).__init__()
        self.encoder = ...  # define encoder model
        self.decoder = ...  # define decoder model
        self.latent_dim = latent_dim

    def sample(self, eps=None):
        if eps is None:
            eps = tf.random.normal(shape=(100, self.latent_dim))
        return self.decode(eps, apply_sigmoid=True)

    def encode(self, x):
        mean, logvar = self.encoder(x)
        return mean, logvar

    def reparameterize(self, mean, logvar):
        eps = tf.random.normal(shape=tf.shape(mean))
        return eps * tf.exp(logvar * 0.5) + mean

    def decode(self, z, apply_sigmoid=False):
        logits = self.decoder(z)
        if apply_sigmoid:
            probs = tf.sigmoid(logits)
            return probs
        return logits
The encoder learns to represent each input as two vectors, a mean and a log variance, which define a Gaussian distribution over the latent space.
The reparameterization trick allows backpropagation through random sampling.
mean, logvar = vae.encode(input_image)
z = vae.reparameterize(mean, logvar)
reconstructed = vae.decode(z, apply_sigmoid=True)

# Schematic training objective: reconstruction term plus KL regularizer
loss = reconstruction_loss + kl_divergence_loss
optimizer.minimize(loss, vae.trainable_variables)
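The last two lines are schematic: reconstruction_loss, kl_divergence_loss, and the optimizer call stand in for a concrete training step. A minimal sketch of one such step with tf.GradientTape, assuming a vae instance of the class above and a batch x of images in [0, 1] (the helper name train_step, the Adam optimizer, and the cross-entropy reconstruction term are illustrative choices, not from the original), could look like this:

import tensorflow as tf

optimizer = tf.keras.optimizers.Adam(1e-3)

def train_step(vae, x):
    with tf.GradientTape() as tape:
        mean, logvar = vae.encode(x)
        z = vae.reparameterize(mean, logvar)
        logits = vae.decode(z)  # raw logits, so a with-logits loss is used below
        # Reconstruction term: per-pixel cross-entropy, summed per image, averaged over the batch
        recon_loss = tf.reduce_mean(tf.reduce_sum(
            tf.nn.sigmoid_cross_entropy_with_logits(labels=x, logits=logits),
            axis=[1, 2, 3]))
        # Closed-form KL divergence between N(mean, exp(logvar)) and N(0, I)
        kl_loss = tf.reduce_mean(-0.5 * tf.reduce_sum(
            1 + logvar - tf.square(mean) - tf.exp(logvar), axis=1))
        loss = recon_loss + kl_loss
    grads = tape.gradient(loss, vae.trainable_variables)
    optimizer.apply_gradients(zip(grads, vae.trainable_variables))
    return loss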
The program below trains a simple VAE on MNIST digits for three epochs, then prints the sum of pixel values of an original image and of its reconstruction as a rough check of how close they are.
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers

class Sampling(layers.Layer):
    """Reparameterization trick: draw z from N(mean, exp(logvar))."""
    def call(self, inputs):
        mean, logvar = inputs
        eps = tf.random.normal(shape=tf.shape(mean))
        return eps * tf.exp(logvar * 0.5) + mean

latent_dim = 2

# Encoder
encoder_inputs = tf.keras.Input(shape=(28, 28, 1))
x = layers.Flatten()(encoder_inputs)
x = layers.Dense(128, activation='relu')(x)
mean = layers.Dense(latent_dim)(x)
logvar = layers.Dense(latent_dim)(x)
z = Sampling()([mean, logvar])
encoder = tf.keras.Model(encoder_inputs, [mean, logvar, z], name='encoder')

# Decoder
latent_inputs = tf.keras.Input(shape=(latent_dim,))
x = layers.Dense(128, activation='relu')(latent_inputs)
x = layers.Dense(28 * 28, activation='sigmoid')(x)
decoder_outputs = layers.Reshape((28, 28, 1))(x)
decoder = tf.keras.Model(latent_inputs, decoder_outputs, name='decoder')

# VAE Model
class VAE(tf.keras.Model):
    def __init__(self, encoder, decoder):
        super(VAE, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def call(self, x):
        mean, logvar, z = self.encoder(x)
        reconstructed = self.decoder(z)
        # Add the KL term; the compiled MSE loss supplies the reconstruction term
        kl_loss = -0.5 * tf.reduce_mean(1 + logvar - tf.square(mean) - tf.exp(logvar))
        self.add_loss(kl_loss)
        return reconstructed

# Load data
(x_train, _), (x_test, _) = tf.keras.datasets.mnist.load_data()
x_train = x_train.astype('float32') / 255.
x_train = x_train[..., tf.newaxis]
x_test = x_test.astype('float32') / 255.
x_test = x_test[..., tf.newaxis]

vae = VAE(encoder, decoder)
vae.compile(optimizer='adam', loss=tf.keras.losses.MeanSquaredError())

# Train
vae.fit(x_train, x_train, epochs=3, batch_size=64)

# Test reconstruction
sample = x_test[:1]
reconstructed = vae(sample)
print('Original pixel sum:', np.sum(sample))  # sample is already a NumPy array
print('Reconstructed pixel sum:', np.sum(reconstructed.numpy()))
The KL divergence loss helps the model learn a smooth latent space.
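For a diagonal Gaussian posterior N(mean, exp(logvar)) and a standard normal prior, this loss has the closed form used in the code above. As a quick standalone sanity check (illustrative, not part of the original program): when the encoder outputs mean = 0 and logvar = 0, the posterior already equals the prior and the KL term is exactly zero, while nonzero means are penalized.

import tensorflow as tf

def kl_term(mean, logvar):
    # KL(N(mean, exp(logvar)) || N(0, I)) per example, summed over latent dimensions
    return -0.5 * tf.reduce_sum(1 + logvar - tf.square(mean) - tf.exp(logvar), axis=1)

print(kl_term(tf.zeros((1, 2)), tf.zeros((1, 2))).numpy())  # [0.]: posterior matches prior
print(kl_term(tf.ones((1, 2)), tf.zeros((1, 2))).numpy())   # [1.]: nonzero mean is penalized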
Using a small latent dimension (like 2) helps visualize the latent space.
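One way to do that, sketched below under the assumption that the encoder from the program above has already been trained (the matplotlib plotting choices are illustrative): encode the test images and scatter their 2-D latent means, colored by digit label.

import matplotlib.pyplot as plt
import tensorflow as tf

# Reload the test split with labels; preprocessing matches the training program
(_, _), (x_test, y_test) = tf.keras.datasets.mnist.load_data()
x_test = x_test.astype('float32')[..., tf.newaxis] / 255.

# encoder is the trained model from the program above; predict returns [mean, logvar, z]
mean, logvar, z = encoder.predict(x_test)
plt.scatter(mean[:, 0], mean[:, 1], c=y_test, cmap='tab10', s=2)
plt.colorbar(label='digit')
plt.xlabel('z[0]')
plt.ylabel('z[1]')
plt.show()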
Training longer usually improves reconstruction quality.
Variational Autoencoders learn to compress and generate data by encoding inputs into a probabilistic latent space.
The reparameterization trick makes the sampling step differentiable, so the model can be trained end to end with gradient descent.
They are useful for generating new data and understanding data features.
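For example, new images can be generated by sampling latent vectors from the standard normal prior and decoding them; a short sketch, assuming the trained decoder and latent_dim from the program above:

import tensorflow as tf

# Draw 16 latent vectors from the prior N(0, I) and decode them into images
z_new = tf.random.normal(shape=(16, latent_dim))
generated = decoder(z_new)  # shape (16, 28, 28, 1), pixel values in [0, 1]
print(generated.shape)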