{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "3de63898",
   "metadata": {},
   "source": [
    "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/teticio/audio-diffusion)\n",
    "\n",
    "This notebook converts a slice of an audio file to a mel spectrogram image and back to audio, then converts a random spectrogram image from the `teticio/audio-diffusion-256` dataset to audio."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "81fbd495",
   "metadata": {},
   "outputs": [],
   "source": [
    "try:\n",
    "    # are we running on Google Colab?\n",
    "    import google.colab\n",
    "    !git clone -q https://github.com/teticio/audio-diffusion.git\n",
    "    %cd audio-diffusion\n",
    "    %pip install -q -r requirements.txt\n",
    "except ImportError:\n",
    "    # not on Colab: nothing to set up here (only the missing google.colab module is ignored,\n",
    "    # any other error is allowed to surface instead of being silently swallowed)\n",
    "    pass"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "218fcdf1",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "from datasets import load_dataset\n",
    "from IPython.display import Audio\n",
    "from diffusers import Mel"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5e4f8ee5",
   "metadata": {},
   "outputs": [],
   "source": [
    "# These are the default parameters. If you change any of them, you may have to adjust the others.\n",
    "mel = Mel(x_res=256,\n",
    "          y_res=256,\n",
    "          hop_length=512,\n",
    "          sample_rate=22050,\n",
    "          n_fft=2048,\n",
    "          n_iter=32)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b2178c3f",
   "metadata": {},
   "source": [
    "### Transform slice of audio to mel spectrogram"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f32bb35e",
   "metadata": {},
   "outputs": [],
   "source": [
    "try:\n",
    "    # are we running on Google Colab?\n",
    "    from google.colab import files\n",
    "except ImportError:\n",
    "    # running locally: set the AUDIO_FILE environment variable to point at your own audio file,\n",
    "    # otherwise the original author's local path is used as the default\n",
    "    audio_file = os.environ.get(\n",
    "        \"AUDIO_FILE\",\n",
    "        \"/home/teticio/Music/Music/A Tribe Called Quest/The Anthology/08 Can I Kick It_.mp3\",\n",
    "    )\n",
    "else:\n",
    "    # on Colab: use the first file uploaded through the browser dialog; errors raised here\n",
    "    # (e.g. nothing was uploaded) are reported rather than masked by the local fallback\n",
    "    audio_file = list(files.upload().keys())[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "61dbcd2e",
   "metadata": {},
   "outputs": [],
   "source": [
    "mel.load_audio(audio_file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ccadcc0f",
   "metadata": {},
   "outputs": [],
   "source": [
    "image = mel.audio_slice_to_image(15)\n",
    "image"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8cec79c6",
   "metadata": {},
   "outputs": [],
   "source": [
    "image.width, image.height"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "fe112fef",
   "metadata": {},
   "source": [
    "### Transform mel spectrogram back to audio"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0b268a54",
   "metadata": {},
   "outputs": [],
   "source": [
    "audio = mel.image_to_audio(image)\n",
    "Audio(data=audio, rate=mel.get_sample_rate())"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0f1f2006",
   "metadata": {},
   "source": [
    "### Select a random image from the training set"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1f29f025",
   "metadata": {},
   "outputs": [],
   "source": [
    "ds = load_dataset('teticio/audio-diffusion-256')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e002482d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# shuffle() is unseeded, so a different image is picked on every run;\n",
    "# pass seed=... to shuffle() if you need a reproducible choice\n",
    "image = ds['train'].shuffle().select(range(1))['image'][0]\n",
    "image"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6a801fc5",
   "metadata": {},
   "source": [
    "### Convert to audio"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a2421827",
   "metadata": {},
   "outputs": [],
   "source": [
    "audio = mel.image_to_audio(image)\n",
    "Audio(data=audio, rate=mel.get_sample_rate())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2281fb55",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "huggingface",
   "language": "python",
   "name": "huggingface"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.6"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}