"Open

In [None]:
try:
 # are we running on Google Colab?
 import google.colab
 !git clone -q https://github.com/teticio/audio-diffusion.git
 %cd audio-diffusion
 !pip install -q -r requirements.txt
except:
 pass

In [None]:
import os
import sys
# os.path.abspath("") resolves to the current working directory, so this puts
# the *parent* of the working directory at the front of the module search path.
# NOTE(review): presumably so that `audiodiffusion` can be imported when the
# notebook is run from a sub-folder of the repository — confirm.
parent_dir = os.path.dirname(os.path.abspath(""))
sys.path.insert(0, parent_dir)

In [None]:
import torch
import random
import numpy as np
from datasets import load_dataset
from IPython.display import Audio
from audiodiffusion.mel import Mel
from audiodiffusion import AudioDiffusion

In [None]:
# Objects shared by the cells below.

# Random generator that is seeded / re-seeded before each generation so that
# results can be reproduced from the printed seed.
generator = torch.Generator()

# Mel helper used to load audio, cut it into slices and convert spectrogram
# images back to audio.
# NOTE(review): 256 presumably matches the resolution of the "-256" models —
# confirm before changing it.
mel_resolution = 256
mel = Mel(x_res=mel_resolution, y_res=mel_resolution)

## DDPM (De-noising Diffusion Probabilistic Models)

### Select model

In [None]:
#@markdown teticio/audio-diffusion-256 - trained on my Spotify "liked" playlist

#@markdown teticio/audio-diffusion-breaks-256 - trained on samples used in music

#@markdown teticio/audio-diffusion-instrumental-hiphop-256 - trained on instrumental hiphop

# Model id passed to AudioDiffusion in the next cell. Every dropdown option is
# a full "teticio/..." id; the instrumental hiphop entry previously was
# misspelled and lacked the "teticio/" prefix, so it did not match the id
# documented above.
model_id = "teticio/audio-diffusion-256" #@param ["teticio/audio-diffusion-256", "teticio/audio-diffusion-breaks-256", "teticio/audio-diffusion-instrumental-hiphop-256", "teticio/audio-diffusion-ddim-256"]

In [None]:
# Build the AudioDiffusion wrapper for the model id selected in the form cell.
# NOTE(review): presumably this downloads/loads the pretrained weights for
# `model_id` — confirm against the audiodiffusion package.
audio_diffusion = AudioDiffusion(model_id=model_id)

### Run model inference to generate mel spectrograms, audio and loops

In [None]:
# Generate ten samples with the currently loaded model. For each one, show the
# mel spectrogram image, play the audio and, when loop points can be found,
# play a looped version as well.
for _ in range(10):
    # Draw a fresh seed and print it so an interesting result can be
    # reproduced later by re-seeding the generator with the same value.
    seed = generator.seed()
    print(f'Seed = {seed}')
    generator.manual_seed(seed)

    result = audio_diffusion.generate_spectrogram_and_audio(generator=generator)
    image, (sample_rate, audio) = result
    display(image)
    display(Audio(audio, rate=sample_rate))

    # loop_it returns None when it cannot work out where to loop.
    loop = AudioDiffusion.loop_it(audio, sample_rate)
    if loop is None:
        print("Unable to determine loop points")
    else:
        display(Audio(loop, rate=sample_rate))

### Generate variations of an audio clip

Try playing around with `start_steps`. Values closer to zero will produce new samples, while values closer to 1,000 will produce samples more faithful to the original.

In [None]:
seed = 16183389798189209330 #@param {type:"integer"}
# Re-seed the shared generator with the chosen value so this cell produces the
# same clip every time it is run; the clip is the starting point for the
# variation and continuation cells that follow.
generator.manual_seed(seed)
result = audio_diffusion.generate_spectrogram_and_audio(generator=generator)
image, (sample_rate, audio) = result
display(image)
display(Audio(audio, rate=sample_rate))

In [None]:
start_steps = 500 #@param {type:"slider", min:0, max:1000, step:10}


def _loop_or_clip(clip, rate):
    """Return a single loop cut from ``clip``, or ``clip`` itself.

    ``AudioDiffusion.loop_it`` returns ``None`` when it cannot determine loop
    points; passing that ``None`` to ``np.concatenate`` would raise, so fall
    back to the unlooped clip instead.
    """
    looped = AudioDiffusion.loop_it(clip, rate, loops=1)
    return clip if looped is None else looped


# Start the track with the original clip, then append twelve variations of it.
# Each variation is generated from the same source `audio`, starting the
# diffusion process at step `start_steps`.
track = _loop_or_clip(audio, sample_rate)
for variation in range(12):
    image2, (sample_rate, audio2) = (
        audio_diffusion.generate_spectrogram_and_audio_from_audio(
            raw_audio=audio, start_step=start_steps))
    display(image2)
    display(Audio(audio2, rate=sample_rate))
    track = np.concatenate([track, _loop_or_clip(audio2, sample_rate)])
# Play the original followed by all of its variations.
display(Audio(track, rate=sample_rate))

### Generate continuations ("out-painting")

In [None]:
overlap_secs = 2 #@param {type:"integer"}
start_step = 0 #@param {type:"slider", min:0, max:1000, step:10}
# Length of the overlap expressed in samples rather than seconds.
overlap_samples = overlap_secs * sample_rate

# Grow a track by repeatedly generating a new clip from the tail of the
# previous one.
# NOTE(review): presumably `mask_start_secs` keeps the first `overlap_secs`
# seconds fixed so that each new clip continues the previous one — confirm.
track = audio
for variation in range(12):
    tail = audio[-overlap_samples:]
    image2, (sample_rate, audio2) = (
        audio_diffusion.generate_spectrogram_and_audio_from_audio(
            raw_audio=tail,
            start_step=start_step,
            mask_start_secs=overlap_secs))
    display(image2)
    display(Audio(audio2, rate=sample_rate))
    # Skip the first `overlap_samples` of the new clip when appending: that
    # part corresponds to the overlap taken from the previous clip.
    continuation = audio2[overlap_samples:]
    track = np.concatenate([track, continuation])
    # The next iteration continues from the clip just generated.
    audio = audio2
display(Audio(track, rate=sample_rate))

### Remix (style transfer)

Alternatively, you can start from a different audio clip altogether, resulting in a kind of style transfer. Maintaining the same seed during generation fixes the style, while masking helps stitch consecutive segments together more smoothly.

In [None]:
# Choose the audio file to remix: upload one when running on Google Colab,
# otherwise fall back to a file on the local machine.
try:
    # are we running on Google Colab?
    from google.colab import files
except ImportError:
    # Not on Colab. This path is specific to the author's machine — point it
    # at a local audio file of your own before running the next cell.
    audio_file = "/home/teticio/Music/liked/El Michels Affair - Glaciers Of Ice.mp3"
else:
    # Upload problems (for example, no file chosen) now surface as errors
    # instead of silently falling back to a path that does not exist on Colab.
    audio_file = list(files.upload().keys())[0]

In [None]:
start_step = 500 #@param {type:"slider", min:0, max:1000, step:10}
overlap_secs = 2 #@param {type:"integer"}
mel.load_audio(audio_file)
overlap_samples = overlap_secs * mel.get_sample_rate()
# Number of audio samples in one slice, and how far to advance between slices
# so that consecutive slices share `overlap_samples` samples.
slice_size = mel.x_res * mel.hop_length
stride = slice_size - overlap_samples

# One seed is drawn up front and re-applied before every slice.
generator = torch.Generator()
seed = generator.seed()
print(f'Seed = {seed}')

track = np.array([])
# 0 for the first slice, 1 afterwards; used both as a flag and as a multiplier
# that switches the overlap handling off for the first slice.
not_first = 0
for sample in range(len(mel.audio) // stride):
    generator.manual_seed(seed)
    offset = sample * stride
    audio = np.array(mel.audio[offset:offset + slice_size])
    if not_first:
        # Overwrite the start of this slice with the end of the previously
        # generated clip, rescaled by the ratio of the two maxima.
        generated_tail = audio2[-overlap_samples:]
        source_peak = np.max(audio[:overlap_samples])
        generated_peak = np.max(generated_tail)
        audio[:overlap_samples] = generated_tail * source_peak / generated_peak
    _, (sample_rate, audio2) = (
        audio_diffusion.generate_spectrogram_and_audio_from_audio(
            raw_audio=audio,
            start_step=start_step,
            generator=generator,
            mask_start_secs=overlap_secs * not_first))
    # For every slice after the first, drop the overlapping start before
    # appending it to the track.
    track = np.concatenate([track, audio2[overlap_samples * not_first:]])
    not_first = 1
    # Show the track so far after each slice.
    display(Audio(track, rate=sample_rate))

### Fill the gap ("in-painting")

In [None]:
# Index of the audio slice to in-paint. Named `slice_index` rather than `slice`
# so that the Python built-in `slice` is not shadowed for the rest of the
# kernel session.
slice_index = 3 #@param {type:"integer"}
# Uses the audio already loaded into `mel` by the remix cell above.
audio = mel.get_audio_slice(slice_index)
# Regenerate the clip with a one-second mask at the start and at the end.
# NOTE(review): presumably the masked second at each end is kept from the
# original and only the middle is regenerated — confirm.
_, (sample_rate, audio2) = (
    audio_diffusion.generate_spectrogram_and_audio_from_audio(
        raw_audio=mel.get_audio_slice(slice_index),
        mask_start_secs=1,
        mask_end_secs=1,
        step_generator=torch.Generator()))
# Original slice first, then the in-painted version.
display(Audio(audio, rate=sample_rate))
display(Audio(audio2, rate=sample_rate))

## DDIM (De-noising Diffusion Implicit Models)

In [None]:
# Switch to the DDIM model. This rebinds `audio_diffusion`, so every cell below
# uses this model instead of the one selected earlier in the notebook.
audio_diffusion = AudioDiffusion(model_id='teticio/audio-diffusion-ddim-256')

### Generation can be done in far fewer steps with DDIMs

In [None]:
# Generate ten samples with the DDIM model: show the spectrogram, play the
# audio and, if loop points can be found, play a looped version too.
for _ in range(10):
    # Print the freshly drawn seed so a sample can be regenerated later.
    seed = generator.seed()
    print(f'Seed = {seed}')
    generator.manual_seed(seed)

    generated = audio_diffusion.generate_spectrogram_and_audio(
        generator=generator)
    image, (sample_rate, audio) = generated
    display(image)
    display(Audio(audio, rate=sample_rate))

    loop = AudioDiffusion.loop_it(audio, sample_rate)
    if loop is None:
        # loop_it signals failure by returning None.
        print("Unable to determine loop points")
    else:
        display(Audio(loop, rate=sample_rate))

The parameter eta controls the variance:
* 0 - DDIM (deterministic)
* 1 - DDPM (De-noising Diffusion Probabilistic Model)

In [None]:
# Generate one sample with eta=1 and 1000 steps, i.e. the DDPM-like setting
# described above, and show its spectrogram and audio.
generated = audio_diffusion.generate_spectrogram_and_audio(
    steps=1000,
    generator=generator,
    eta=1,
)
image, (sample_rate, audio) = generated
display(image)
display(Audio(audio, rate=sample_rate))

### DDIMs can be used as encoders...

In [None]:
# Load the dataset that the example spectrogram images below are taken from.
# The audio does not have to come from the training dataset; it is used here
# only for convenience.
ds = load_dataset('teticio/audio-diffusion-256')

In [None]:
# Take one spectrogram image from the training split and play the audio it
# converts to.
train_example = ds['train'][264]
image = train_example['image']
image_audio = mel.image_to_audio(image)
display(Audio(image_audio, rate=mel.get_sample_rate()))

In [None]:
# Encode the spectrogram image with the DDIM pipeline; the image is passed as a
# one-element list. The result is used as the `noise` input of the generation
# cells below.
# NOTE(review): `steps` is left at the encoder's default here — confirm what
# that default is.
noise = audio_diffusion.pipe.encode([image])

In [None]:
# Run generation from the encoded noise to get back (an approximation of) the
# audio that was encoded; the spectrogram image is not needed here.
reconstruction = audio_diffusion.generate_spectrogram_and_audio(
    noise=noise,
    generator=generator,
)
_, (sample_rate, audio) = reconstruction
display(Audio(audio, rate=sample_rate))

### ...or to interpolate between audio clips

In [None]:
# Second spectrogram image from the training split, used as the other endpoint
# of the interpolation; play the audio it converts to.
second_example = ds['train'][15978]
image2 = second_example['image']
image2_audio = mel.image_to_audio(image2)
display(Audio(image2_audio, rate=mel.get_sample_rate()))

In [None]:
# Encode the second spectrogram image (passed as a one-element list) into noise
# for the interpolation below.
# NOTE(review): this call passes steps=1000 while the encode call for `image`
# uses the default number of steps — confirm the mismatch is intentional, since
# the two results are interpolated with each other.
noise2 = audio_diffusion.pipe.encode([image2], steps=1000)

In [None]:
alpha = 0.5 #@param {type:"slider", min:0, max:1, step:0.1}
# Blend the two encoded noises with slerp, weighted by `alpha`, and generate
# audio from the blend.
blended_noise = audio_diffusion.pipe.slerp(noise, noise2, alpha)
_, (sample_rate, audio) = audio_diffusion.generate_spectrogram_and_audio(
    noise=blended_noise, generator=generator)
# Play the two source clips first, then the interpolated result.
for endpoint in (image, image2):
    display(Audio(mel.image_to_audio(endpoint), rate=mel.get_sample_rate()))
display(Audio(audio, rate=sample_rate))