{ "cells": [ { "cell_type": "markdown", "metadata": { "id": "dgxD_ljQ23dE" }, "source": [ "# Music generation with Variational AutoEncoder" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "jRRn77kD4EiT" }, "outputs": [], "source": [ "!pip install -q git+https://github.com/tensorflow/docs" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "jqomLmiK4RY4", "outputId": "f26beba1-c06d-48db-a3cf-df5bc83efcb0" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", "Requirement already satisfied: tensorflow-addons in /usr/local/lib/python3.7/dist-packages (0.17.0)\n", "Requirement already satisfied: packaging in /usr/local/lib/python3.7/dist-packages (from tensorflow-addons) (21.3)\n", "Requirement already satisfied: typeguard>=2.7 in /usr/local/lib/python3.7/dist-packages (from tensorflow-addons) (2.7.1)\n", "Requirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from packaging->tensorflow-addons) (3.0.9)\n" ] } ], "source": [ "!pip install tensorflow-addons" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "7DZkI3Fg2yUQ" }, "outputs": [], "source": [ "import librosa\n", "\n", "import numpy as np\n", "import pandas as pd\n", "import os\n", "\n", "import tensorflow as tf\n", "import tensorflow_addons as tfa\n", "from tensorflow.keras import layers \n", "\n", "import matplotlib.pyplot as plt\n", "from IPython import display\n", "from IPython.display import clear_output\n", "\n", "import glob\n", "import imageio\n", "import time\n", "import IPython.display as ipd\n", "\n", "AUTOTUNE = tf.data.experimental.AUTOTUNE" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "AWcjKTUD37z-" }, "outputs": [], "source": [ "seed=123\n", "tf.compat.v1.set_random_seed(seed)\n", 
"session_conf = tf.compat.v1.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)\n", "sess = tf.compat.v1.Session(graph=tf.compat.v1.get_default_graph(), config=session_conf)\n", "tf.compat.v1.keras.backend.set_session(sess)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "hsaYjzBi4BPv" }, "outputs": [], "source": [ "train_size = 60000\n", "BATCH_SIZE = 10\n", "test_size = 10000\n", "epochs = 20\n", "# set the dimensionality of the latent space to a plane for visualization later\n", "latent_dim = 2\n", "num_examples_to_generate = 10\n", "\n", "BASE_PATH = 'drive/MyDrive/music_generation/auto_encoder/genres_original'" ] }, { "cell_type": "markdown", "metadata": { "id": "PM0_HbJd45tx" }, "source": [ "Data preprocessing" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "ChJ3XdWt40ap" }, "outputs": [], "source": [ "def DatasetLoader(class_):\n", " music_list = np.array(sorted(os.listdir(BASE_PATH+'/'+class_)))\n", " train_music_1 = list(music_list[[0,52,19,39,71,12,75,85,3,45,24,46,88]]) #99,10,66,76,41\n", " train_music_2 = list(music_list[[4,43,56,55,45,31,11,13,70,37,21,78]]) #65,32,53,22,19,80,89,\n", " TrackSet_1 = [(BASE_PATH)+'/'+class_+'/%s'%(x) for x in train_music_1]\n", " TrackSet_2 = [(BASE_PATH)+'/'+class_+'/%s'%(x) for x in train_music_2]\n", "\n", " return TrackSet_1, TrackSet_2" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "Z1sM3p7h42Af" }, "outputs": [], "source": [ "def load(file_):\n", " data_, sampling_rate = librosa.load(file_,sr=3000, offset=0.0, duration=30)\n", " data_ = data_.reshape(1,90001)\n", " return data_\n", "map_data = lambda filename: tf.compat.v1.py_func(load, [filename], [tf.float32])" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "h6ipB6js43ME" }, "outputs": [], "source": [ "TrackSet_1, TrackSet_2 = DatasetLoader('jazz')" ] }, { "cell_type": "markdown", "metadata": { "id": "6Cglv0Yc472G" }, "source": [ 
"sample original music" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 75 }, "id": "yMbqcybe49YH", "outputId": "7cd4c8e8-0b46-4d86-c1fc-961673e9a44b" }, "outputs": [ { "data": { "text/html": [ "\n", " \n", " " ], "text/plain": [ "" ] }, "execution_count": 33, "metadata": {}, "output_type": "execute_result" } ], "source": [ "sample = TrackSet_1[1]\n", "sample_, sampling_rate = librosa.load(sample,sr=3000, offset=0.0, duration=30)\n", "ipd.Audio(sample_,rate=3000)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 232 }, "id": "Z-Nnpz3o4-p_", "outputId": "22194601-ef5d-499a-f4f2-40d3ea8adc82" }, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "import librosa.display\n", "plt.figure(figsize=(18,15))\n", "for i in range(4):\n", " plt.subplot(4, 4, i + 1)\n", " j = load(TrackSet_1[i])\n", " librosa.display.waveplot(j[0], sr=3000)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "jjiK-fSw5ArS" }, "outputs": [], "source": [ "train_dataset = (\n", " tf.data.Dataset\n", " .from_tensor_slices((TrackSet_1))\n", " .map(map_data, num_parallel_calls=AUTOTUNE)\n", " .shuffle(3)\n", " .batch(BATCH_SIZE)\n", ")\n", "test_dataset = (\n", " tf.data.Dataset\n", " .from_tensor_slices((TrackSet_2))\n", " .map(map_data, num_parallel_calls=AUTOTUNE)\n", " .shuffle(3)\n", " .batch(BATCH_SIZE)\n", ")" ] }, { "cell_type": "markdown", "metadata": { "id": "siBpQeyR5CwS" }, "source": [ "Network architecture" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "saK3Cvmc5E-x" }, "outputs": [], "source": [ "class Resnet1DBlock(tf.keras.Model):\n", " def __init__(self, kernel_size, filters,type='encode'):\n", " super(Resnet1DBlock, self).__init__()\n", " \n", " if type=='encode':\n", " self.conv1a = layers.Conv1D(filters, kernel_size, 2,padding=\"same\")\n", " self.conv1b = layers.Conv1D(filters, kernel_size, 1,padding=\"same\")\n", " self.norm1a = tfa.layers.InstanceNormalization()\n", " self.norm1b = tfa.layers.InstanceNormalization()\n", " if type=='decode':\n", " self.conv1a = layers.Conv1DTranspose(filters, kernel_size, 1,padding=\"same\")\n", " self.conv1b = layers.Conv1DTranspose(filters, kernel_size, 1,padding=\"same\")\n", " self.norm1a = tf.keras.layers.BatchNormalization()\n", " self.norm1b = tf.keras.layers.BatchNormalization()\n", " else:\n", " return None\n", "\n", " def call(self, input_tensor):\n", " x = tf.nn.relu(input_tensor)\n", " x = self.conv1a(x)\n", " x = self.norm1a(x)\n", " x = layers.LeakyReLU(0.4)(x)\n", "\n", " x = self.conv1b(x)\n", " x = self.norm1b(x)\n", " x = 
layers.LeakyReLU(0.4)(x)\n", "\n", " x += input_tensor\n", " return tf.nn.relu(x)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "BslaIFfO5HXS" }, "outputs": [], "source": [ "class CVAE(tf.keras.Model):\n", " \"\"\"Convolutional variational autoencoder.\"\"\"\n", "\n", " def __init__(self, latent_dim):\n", " super(CVAE, self).__init__()\n", " self.latent_dim = latent_dim\n", " self.encoder = tf.keras.Sequential(\n", " [\n", " tf.keras.layers.InputLayer(input_shape=(1,90001)),\n", " layers.Conv1D(64,1,2),\n", " Resnet1DBlock(64,1),\n", " layers.Conv1D(128,1,2),\n", " Resnet1DBlock(128,1),\n", " layers.Conv1D(128,1,2),\n", " Resnet1DBlock(128,1),\n", " layers.Conv1D(256,1,2),\n", " Resnet1DBlock(256,1),\n", " # No activation\n", " layers.Flatten(),\n", " layers.Dense(latent_dim+latent_dim)\n", "\n", " ]\n", " )\n", " self.decoder = tf.keras.Sequential(\n", " [\n", " tf.keras.layers.InputLayer(input_shape=(latent_dim,)),\n", " layers.Reshape(target_shape=(1,latent_dim)),\n", " Resnet1DBlock(512,1,'decode'),\n", " layers.Conv1DTranspose(512,1,1),\n", " Resnet1DBlock(256,1,'decode'),\n", " layers.Conv1DTranspose(256,1,1),\n", " Resnet1DBlock(128,1,'decode'),\n", " layers.Conv1DTranspose(128,1,1),\n", " Resnet1DBlock(64,1,'decode'),\n", " layers.Conv1DTranspose(64,1,1),\n", " # No activation\n", " layers.Conv1DTranspose(90001,1,1),\n", " ]\n", " )\n", " @tf.function\n", " def sample(self, eps=None):\n", " if eps is None:\n", " eps = tf.random.normal(shape=(200, self.latent_dim))\n", " return self.decode(eps, apply_sigmoid=True)\n", " @tf.function\n", " def encode(self, x):\n", " mean, logvar = tf.split(self.encoder(x), num_or_size_splits=2, axis=1)\n", " return mean, logvar\n", " @tf.function\n", " def reparameterize(self, mean, logvar):\n", " eps = tf.random.normal(shape=mean.shape)\n", " return eps * tf.exp(logvar * .5) + mean\n", " @tf.function\n", " def decode(self, z, apply_sigmoid=False):\n", " logits = self.decoder(z)\n", " if 
@tf.function
def train_step(model, x, optimizer):
    """Executes one training step and returns the loss.

    This function computes the loss and gradients, and uses the latter to
    update the model's parameters.

    Args:
        model: Object exposing ``encode``, ``reparameterize``, ``decode`` and
            ``trainable_variables`` (the notebook passes its ``CVAE``).
        x: Batch of inputs; it is used both as the encoder input and as the
            reconstruction target.
        optimizer: Optimizer whose ``apply_gradients`` performs the update.

    Returns:
        The scalar total loss (negative ELBO plus the extra cross-entropy
        reconstruction term) for this batch.
    """
    with tf.GradientTape() as tape:
        mean, logvar = model.encode(x)
        z = model.reparameterize(mean, logvar)
        x_logit = model.decode(z)

        # Single-sample Monte Carlo estimate of the negative ELBO.
        cross_ent = tf.nn.sigmoid_cross_entropy_with_logits(logits=x_logit, labels=x)
        logpx_z = -tf.reduce_sum(cross_ent, axis=[1, 2])
        logpz = log_normal_pdf(z, 0., 0.)
        logqz_x = log_normal_pdf(z, mean, logvar)
        loss_KL = -tf.reduce_mean(logpx_z + logpz - logqz_x)

        # decode() returns raw logits here (apply_sigmoid defaults to False),
        # so the Keras loss has to be told; otherwise it treats them as
        # probabilities.
        # NOTE(review): x is loaded via librosa in this notebook and is
        # presumably not confined to [0, 1]; confirm a cross-entropy target
        # is appropriate for it.
        reconstruction_loss = tf.reduce_mean(
            tf.keras.losses.binary_crossentropy(x, x_logit, from_logits=True)
        )
        total_loss = reconstruction_loss + loss_KL

    gradients = tape.gradient(total_loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    return total_loss
"code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 662 }, "id": "CZ1TEYXC5ZEF", "outputId": "9d494d7b-4477-48bb-b056-f333c22bf3fe" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Epoch: 20, Test set ELBO: -18109.1796875, time elapse for current epoch: 6.8476269245147705\n" ] }, { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "generate_and_save_images(model, 0, test_sample, 'jazz')\n", "def train(train_dataset, test_dataset, model, save):\n", " for epoch in range(1, epochs + 1):\n", " start_time = time.time()\n", " for train_x in train_dataset:\n", " train_x = np.asarray(train_x)[0]\n", " train_step(model, train_x, optimizer)\n", " end_time = time.time()\n", "\n", " loss = tf.keras.metrics.Mean()\n", " for test_x in test_dataset:\n", " test_x = np.asarray(test_x)[0]\n", " loss(compute_loss(model, test_x))\n", " display.clear_output(wait=False)\n", " elbo = -loss.result()\n", " print('Epoch: {}, Test set ELBO: {}, time elapse for current epoch: {}'.format(epoch, \n", " elbo, \n", " end_time - start_time\n", " ))\n", " generate_and_save_images(model,\n", " epoch, \n", " test_sample,\n", " save)\n", "train(train_dataset, test_dataset, model, 'jazz')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "JHK2LPk55bQh" }, "outputs": [], "source": [ "anim_file_1 = 'jazz_cvae.gif'\n", "\n", "with imageio.get_writer(anim_file_1, mode='I') as writer:\n", " filenames = glob.glob('jazz*.png')\n", " filenames = sorted(filenames)\n", " for filename in filenames:\n", " image = imageio.imread(filename)\n", " writer.append_data(image)\n", " image = imageio.imread(filename)\n", " writer.append_data(image)" ] }, { "cell_type": "markdown", "metadata": { "id": "pTEd1Aqs5c7_" }, "source": [ "Visualization" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 1000 }, "id": "JmxhsMzB5eYN", "outputId": "8d995986-a245-467c-db16-759e41b50762" }, "outputs": [ { "data": { "text/html": [ "" ], "text/plain": [ "" ] }, "execution_count": 53, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import tensorflow_docs.vis.embed as embed\n", "embed.embed_file(anim_file_1)" ] }, { "cell_type": "markdown", "metadata": { 
"id": "kca5NC685gg8" }, "source": [ "Generated Music - Jazz" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "9UJPYwRD5iIT" }, "outputs": [], "source": [ "def inference(test_dataset, model): \n", " save_music = []\n", " for test in test_dataset:\n", " mean, logvar = model.encode(test)\n", " z = model.reparameterize(mean, logvar)\n", " predictions = model.sample(z)\n", " for pred in predictions:\n", " wave = np.asarray(pred)\n", " save_music.append(wave)\n", " return save_music\n", "\n", "saved_musics = inference(test_dataset, model)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 75 }, "id": "Hkx8tzHxJskG", "outputId": "575fe3f9-5736-4596-9026-2748717c26aa" }, "outputs": [ { "data": { "text/html": [ "\n", " \n", " " ], "text/plain": [ "" ] }, "execution_count": 55, "metadata": {}, "output_type": "execute_result" } ], "source": [ "music1=saved_musics[0][0]\n", "ipd.Audio(music1,rate=3000)" ] } ], "metadata": { "colab": { "collapsed_sections": [], "name": "music_generation_with_Variational_AE.ipynb", "provenance": [] }, "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.12" } }, "nbformat": 4, "nbformat_minor": 1 }