{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "0fd939b0",
   "metadata": {},
   "source": [
    "\"Open"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "6c7800a6",
   "metadata": {},
   "outputs": [],
   "source": [
    "try:\n",
    "    # are we running on Google Colab?\n",
    "    import google.colab\n",
    "except ImportError:\n",
    "    # not on Colab: nothing to set up here\n",
    "    pass\n",
    "else:\n",
    "    # Colab only: fetch the repository and install its requirements into the kernel's environment\n",
    "    !git clone -q https://github.com/teticio/audio-diffusion.git\n",
    "    %cd audio-diffusion\n",
    "    %pip install -q -r requirements.txt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "b447e2c4",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import sys\n",
    "# put the parent of the working directory first on sys.path\n",
    "# (presumably so that the `src` package imported below resolves - confirm against the repo layout)\n",
    "sys.path.insert(0, os.path.dirname(os.path.abspath(\"\")))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "c2fc0e7a",
   "metadata": {},
   "outputs": [],
   "source": [
    "import random\n",
    "from PIL import Image\n",
    "from src.mel import Mel\n",
    "from IPython.display import Audio\n",
    "from datasets import load_dataset\n",
    "from diffusers import DDPMPipeline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "a3d45c36",
   "metadata": {},
   "outputs": [],
   "source": [
    "mel = Mel(x_res=256, y_res=256)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "011fb5a1",
   "metadata": {},
   "source": [
    "### Run model inference to generate mel spectrogram"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "b809fed5",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "136b125789ff43548d369bb9062c328e",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       " 0%| | 0/1000 [00:00"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "image = Image.fromarray(images[0][0])\n",
    "image"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7230c280",
   "metadata": {},
   "source": [
    "### Transform mel spectrogram to audio"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "id": "5f8a149d",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "\n",
       " \n",
       " "
      ],
      "text/plain": [
       ""
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "audio = mel.image_to_audio(image)\n",
    "Audio(data=audio, rate=mel.get_sample_rate())"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ef54cef3",
   "metadata": {},
   "source": [
    "### Compare results with random sample from training set"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "269ee816",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Using custom data configuration teticio--audio-difussion-data-256-67e42008226ba2a6\n",
      "Reusing dataset parquet (/home/teticio/.cache/huggingface/datasets/teticio___parquet/teticio--audio-difussion-data-256-67e42008226ba2a6/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "bf938940f955409ba6d3d6bbf685c3a7",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       " 0%| | 0/1 [00:00"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "image = random.choice(ds['train'])['image']\n",
    "image"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "492e2334",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "\n",
       " \n",
       " "
      ],
      "text/plain": [
       ""
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "audio = mel.image_to_audio(image)\n",
    "Audio(data=audio, rate=mel.get_sample_rate())"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "946fdb4d",
   "metadata": {},
   "source": [
    "### Push model to hub"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "37c0564e",
   "metadata": {},
   "outputs": [],
   "source": [
    "from diffusers.hub_utils import init_git_repo, push_to_hub\n",
    "\n",
    "\n",
    "class AttributeDict(dict):\n",
    "    \"\"\"Dictionary whose items can also be read and written as attributes.\"\"\"\n",
    "\n",
    "    def __getattr__(self, attr):\n",
    "        # raise AttributeError rather than KeyError so that hasattr() and\n",
    "        # getattr(obj, name, default) behave as they do for ordinary objects\n",
    "        try:\n",
    "            return self[attr]\n",
    "        except KeyError:\n",
    "            raise AttributeError(attr) from None\n",
    "\n",
    "    def __setattr__(self, attr, value):\n",
    "        self[attr] = value\n",
    "\n",
    "\n",
    "# read the stored token; the with-block closes the file and strip() drops any trailing newline\n",
    "token_path = os.path.join(os.path.expanduser(\"~\"), \".huggingface\", \"token\")\n",
    "with open(token_path, \"rt\") as token_file:\n",
    "    hub_token = token_file.read().strip()\n",
    "\n",
    "args = AttributeDict({\n",
    "    \"hub_model_id\":\n",
    "    \"teticio/audio-diffusion-256\",\n",
    "    \"output_dir\":\n",
    "    \"../ddpm-ema-audio-256-repo\",\n",
    "    \"local_rank\":\n",
    "    -1,\n",
    "    \"hub_token\":\n",
    "    hub_token,\n",
    "    \"hub_private_repo\":\n",
    "    False,\n",
    "    \"overwrite_output_dir\":\n",
    "    False\n",
    "})\n",
    "\n",
    "repo = init_git_repo(args, at_init=True)\n",
    "ddpm = DDPMPipeline.from_pretrained('../ddpm-ema-audio-256')\n",
    "push_to_hub(args, ddpm, repo)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8c8261a0",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "huggingface",
   "language": "python",
   "name": "huggingface"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.13"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}