diff --git "a/notebooks/test_model.ipynb" "b/notebooks/test_model.ipynb" --- "a/notebooks/test_model.ipynb" +++ "b/notebooks/test_model.ipynb" @@ -10,7 +10,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": null, "id": "6c7800a6", "metadata": {}, "outputs": [], @@ -27,20 +27,19 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": null, "id": "b447e2c4", "metadata": {}, "outputs": [], "source": [ "import os\n", - "os.environ['CUDA_VISIBLE_DEVICES'] = ''\n", "import sys\n", "sys.path.insert(0, os.path.dirname(os.path.abspath(\"\")))" ] }, { "cell_type": "code", - "execution_count": 12, + "execution_count": null, "id": "c2fc0e7a", "metadata": {}, "outputs": [], @@ -64,7 +63,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": null, "id": "97f24046", "metadata": {}, "outputs": [], @@ -75,12 +74,12 @@ "\n", "#@markdown teticio/audio-diffusion-instrumental-hiphop-256 - trained on instrumental hiphop\n", "\n", - "model_id = \"../models/ddim-ema-audio-256/\" #@param [\"teticio/audio-diffusion-256\", \"teticio/audio-diffusion-breaks-256\", \"audio-diffusion-instrumenal-hiphop-256\"]" + "model_id = \"teticio/audio-diffusion-256\" #@param [\"teticio/audio-diffusion-256\", \"teticio/audio-diffusion-breaks-256\", \"audio-diffusion-instrumenal-hiphop-256\"]" ] }, { "cell_type": "code", - "execution_count": 14, + "execution_count": null, "id": "a3d45c36", "metadata": {}, "outputs": [], @@ -100,185 +99,10 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": null, "id": "b809fed5", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Seed = 13546217519375750400\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "6d9d0ebd24d74ff49fe214bca3602ee1", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - " 0%| | 0/500 [00:00" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Seed = 18230467057955732031\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "f69cd2bf68fe41aa93aabc7f4fa2dab8", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - " 0%| | 0/500 [00:00" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Seed = 17187483613161389671\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "d1164cceb7c74550bbfd0c21ec5b6d1c", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - " 0%| | 0/500 [00:00\u001b[0;34m()\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mSeed = \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mseed\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m'\u001b[39m)\n\u001b[1;32m 4\u001b[0m 
+   "outputs": [],
    "source": [
     "for _ in range(10):\n",
     "    seed = generator.seed()\n",
@@ -286,7 +110,6 @@
     "    generator.manual_seed(seed)\n",
     "    image, (sample_rate,\n",
     "            audio) = audio_diffusion.generate_spectrogram_and_audio(\n",
-    "                steps=500,\n",
     "                generator=generator)\n",
     "    display(image)\n",
     "    display(Audio(audio, rate=sample_rate))\n",
@@ -315,117 +138,30 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": null,
    "id": "a7e637e5",
    "metadata": {},
-   "outputs": [
-    … (cleared outputs: tqdm progress-bar widget, generated spectrogram image, HTML audio player) …
-   ],
+   "outputs": [],
    "source": [
     "seed = 16183389798189209330 #@param {type:\"integer\"}\n",
     "generator.manual_seed(seed)\n",
-    "image, (sample_rate,\n",
-    "        audio) = audio_diffusion.generate_spectrogram_and_audio(\n",
-    "            generator=generator)\n",
+    "image, (sample_rate, audio) = audio_diffusion.generate_spectrogram_and_audio(\n",
+    "    generator=generator)\n",
     "display(image)\n",
     "display(Audio(audio, rate=sample_rate))"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": null,
    "id": "5074ec11",
    "metadata": {},
-   "outputs": [
-    … (cleared outputs: tqdm progress-bar widget, generated spectrogram image, HTML audio player) …
-   ],
+   "outputs": [],
    "source": [
     "seed = 16183389798189209330 #@param {type:\"integer\"}\n",
     "generator.manual_seed(seed)\n",
-    "image, (sample_rate,\n",
-    "        audio) = audio_diffusion.generate_spectrogram_and_audio(\n",
-    "            steps=100,\n",
-    "            generator=generator)\n",
+    "image, (sample_rate, audio) = audio_diffusion.generate_spectrogram_and_audio(\n",
+    "    generator=generator)\n",
     "display(image)\n",
"display(Audio(audio, rate=sample_rate))" ] @@ -443,13 +179,13 @@ "track = AudioDiffusion.loop_it(audio, sample_rate, loops=1)\n", "for variation in range(12):\n", " image2, (\n", - " sample_rate, audio2\n", - " ) = audio_diffusion.generate_spectrogram_and_audio_from_audio(\n", - " raw_audio=audio,\n", - " start_step=start_steps)\n", + " sample_rate,\n", + " audio2) = audio_diffusion.generate_spectrogram_and_audio_from_audio(\n", + " raw_audio=audio, start_step=start_steps)\n", " display(image2)\n", " display(Audio(audio2, rate=sample_rate))\n", - " track = np.concatenate([track, AudioDiffusion.loop_it(audio2, sample_rate, loops=1)])\n", + " track = np.concatenate(\n", + " [track, AudioDiffusion.loop_it(audio2, sample_rate, loops=1)])\n", "display(Audio(track, rate=sample_rate))" ] }, @@ -474,11 +210,11 @@ "track = audio\n", "for variation in range(12):\n", " image2, (\n", - " sample_rate, audio2\n", - " ) = audio_diffusion.generate_spectrogram_and_audio_from_audio(\n", - " raw_audio=audio[-overlap_samples:],\n", - " start_step=start_step,\n", - " mask_start_secs=overlap_secs)\n", + " sample_rate,\n", + " audio2) = audio_diffusion.generate_spectrogram_and_audio_from_audio(\n", + " raw_audio=audio[-overlap_samples:],\n", + " start_step=start_step,\n", + " mask_start_secs=overlap_secs)\n", " display(image2)\n", " display(Audio(audio2, rate=sample_rate))\n", " track = np.concatenate([track, audio2[overlap_samples:]])\n",