File size: 6,656 Bytes

# Deep learning
import tensorflow as tf 

# Methods for loading the weights into the model
import os 
import inspect


_CAP = 3501 # Cap for the number of notes

class Encoder_Z(tf.keras.layers.Layer):
  # Encoder part of the VAE

  def __init__(self, dim_z, name="encoder", **kwargs):
    super(Encoder_Z, self).__init__(name=name, **kwargs)
    self.dim_x = (3, _CAP, 1)
    self.dim_z = dim_z

  def build(self):
    layers = [tf.keras.layers.InputLayer(input_shape=self.dim_x)]

    layers.append(tf.keras.layers.Conv2D(filters=64, kernel_size=3, strides=(2, 2)))
    layers.append(tf.keras.layers.ReLU())
    layers.append(tf.keras.layers.Flatten())

    layers.append(tf.keras.layers.Dense(2000))
    layers.append(tf.keras.layers.ReLU())

    layers.append(tf.keras.layers.Dense(500))
    layers.append(tf.keras.layers.ReLU())

    layers.append(tf.keras.layers.Dense(self.dim_z * 2, activation=None, name="dist_params"))

    return tf.keras.Sequential(layers)


class Decoder_X(tf.keras.layers.Layer):
  # Decoder part of the VAE.

  def __init__(self, dim_z, name="decoder", **kwargs):
    super(Decoder_X, self).__init__(name=name, **kwargs)
    self.dim_z = dim_z

  def build(self):
    # Build architecture
      
    layers = [tf.keras.layers.InputLayer(input_shape=(self.dim_z,))]

    layers.append(tf.keras.layers.Dense(500))
    layers.append(tf.keras.layers.ReLU())

    layers.append(tf.keras.layers.Dense(2000))
    layers.append(tf.keras.layers.ReLU())

    layers.append(tf.keras.layers.Dense((_CAP - 1) / 2 * 32, activation=None))
    layers.append(tf.keras.layers.Reshape((1, int((_CAP - 1) / 2), 32)))

    layers.append(tf.keras.layers.Conv2DTranspose(
        filters=64, kernel_size=3, strides=2, padding='valid'))
    layers.append(tf.keras.layers.ReLU())

    layers.append(tf.keras.layers.Conv2DTranspose(
        filters=1, kernel_size=3, strides=1, padding='same'))

    return tf.keras.Sequential(layers)

kl_weight = tf.keras.backend.variable(0.125)



class VAECost:
  """
  VAE cost with a schedule based on the Microsoft Research Blog's article
  "Less pain, more gain: A simple method for VAE training with less of that KL-vanishing agony"
    
  The KL weight increases linearly, until it meets a certain threshold and keeps constant
  for the same number of epochs. After that, it decreases abruptly to zero again, and the
  cycle repeats.
  """

  def __init__(self, model):
    self.model = model
    self.kl_weight_increasing = True
    self.epoch = 1


  # The loss should have the form loss(y_true, y_pred), but in this
  # case y_pred is computed in the cost function

  @tf.function()
  def __call__(self, x_true):
    x_true = tf.cast(x_true, tf.float32)
      
    # Encode "song map" to get its latent representation and the parameters
    # of the distribution
    z_sample, mu, sd = self.model.encode(x_true)

    # Decode the latent representation. Due to the VAE architecture, we should
    # ideally get a reconstructed song map similar to the input.
    x_recons = self.model.decoder(z_sample)

    # Compute mean squared error, where our ground truth is the song map
    # we pass as input, so we "compare" the reconstruction to it.
      
    recons_error = tf.cast(
        tf.reduce_mean((x_true - x_recons) ** 2, axis=[1, 2, 3]),
        tf.float32)

    # Compute reverse KL divergence
    kl_divergence = -0.5 * tf.math.reduce_sum(
          1 + tf.math.log(tf.math.square(sd)) - tf.math.square(mu) - tf.math.square(sd),
          axis=1) # shape=(batch_size,)

    # Return metrics
    elbo = tf.reduce_mean(-kl_weight * kl_divergence - recons_error)
    mean_kl_divergence = tf.reduce_mean(kl_divergence)
    mean_recons_error = tf.reduce_mean(recons_error)

    return -elbo, mean_kl_divergence, mean_recons_error


class VAE(tf.keras.Model):
  # Main architecture, which connects the encoder with the decoder.

  def __init__(self, name="variational autoencoder", **kwargs):
    super(VAE, self).__init__(name=name, **kwargs)
    self.dim_x = (3, _CAP, 1)
    self.encoder = Encoder_Z(dim_z=120).build()
    self.decoder = Decoder_X(dim_z=120).build()
    self.cost_func = VAECost(self)

    # Get the path of the script that defines this method
    script_path = inspect.getfile(inspect.currentframe())

    # Get the directory containing the script
    script_dir = os.path.dirname(os.path.abspath(script_path))

    # Construct the path to the weights folder
    weights_dir = os.path.join(script_dir, 'weights') + os.sep
    
    # Load pretrained weights
    self.load_weights(weights_dir)

  @tf.function()
  def train_step(self, data):
    # Gradient descent
      
    with tf.GradientTape() as tape:
      neg_elbo, mean_kl_divergence, mean_recons_error = self.cost_func(data)

    gradients = tape.gradient(neg_elbo, self.trainable_variables)
    self.optimizer.apply_gradients(zip(gradients, self.trainable_variables))

    return {"abs ELBO": neg_elbo, "mean KL": mean_kl_divergence,
            "mean recons": mean_recons_error,
            "kl weight": kl_weight}

  def encode(self, x_input: tf.Tensor) -> tuple[tf.Tensor]:
    """
    Make a forward pass through the encoder for a given song map, in order
    to return the latent representation and the distribution's parameters.

    Parameters
    ----------
    x_input : tf.Tensor
        Song map to be encoded by the VAE.

    Returns
    -------
    z_sample: tf.Tensor
        A sampled latent representation from the distribution which encodes the song.
    mu: tf.Tensor
        The mean parameter of the distribution.
    sd: tf.Tensor
        The standard deviation parameter of the distribution.
    """
    x_input = tf.expand_dims(x_input, axis=-1) # Add channel dimension

    if tf.rank(x_input) == 3: # If there's no batch dimension, add it
        x_input = tf.expand_dims(x_input, axis=0)

    mu, rho = tf.split(self.encoder(x_input), num_or_size_splits=2, axis=1)
    sd = tf.math.log(1 + tf.math.exp(rho))
    z_sample = mu + sd * tf.random.normal(shape=(120,))
    return z_sample, mu, sd

  def decode(self, z_sample: tf.Tensor=None) -> tf.Tensor:
    """
    Decode a latent representation of a song.

    Parameters
    ----------
    z_sample : tf.Tensor
    
    Song encoding outputed by the encoder.
    Default ``None``, for which the sampling is done over an unit Gaussian distribution.

    Returns
    -------
    song_map: tf.Tensor
        Song map corresponding to the encoding.
    """
      
    if z_sample == None:
      z_sample = tf.expand_dims(tf.random.normal(shape=(120,)), axis=0)
                               
    song_map = self.decoder(z_sample)
    return song_map