In [27]:
from audio_diffusion_pytorch import AudioDiffusionModel
import torch
from IPython.display import Audio
import matplotlib.pyplot as plt
from tqdm import tqdm
import numpy as np

In [28]:
# Seed PyTorch's random number generators before anything random happens
# (weight initialisation, training batches, sampling noise) so that
# Restart & Run All starts from the same random state every time.
# GPU kernels may still introduce small run-to-run differences.
SEED = 0
torch.manual_seed(SEED)

# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [29]:
# Constructor arguments for the diffusion model, operating on single-channel
# audio (in_channels=1). The meaning of each list is defined by
# audio_diffusion_pytorch; note that `multipliers` has seven entries while
# `factors`, `num_blocks` and `attentions` each have six.
model_kwargs = dict(
    in_channels=1,
    patch_size=1,
    multipliers=[1, 2, 4, 4, 4, 4, 4],
    factors=[2, 2, 2, 2, 2, 2],
    num_blocks=[2, 2, 2, 2, 2, 2],
    attentions=[0, 0, 0, 0, 0, 0],
)

# Build the model and place it on the selected device in one step.
model = AudioDiffusionModel(**model_kwargs).to(device)

In [30]:
# --- Signal and training configuration used by the cells below ---
fs = 22050                   # sample rate; `torch.arange(t) / fs` gives time stamps
t = 32768                    # number of samples in each generated signal
fc_min, fc_max = 220, 440    # bounds passed to the random frequency draw
batch_size = 8               # signals per training step
n_iters = 1000               # number of training steps

# Time axis with a leading broadcast dimension, shape (1, t), so it can be
# multiplied against a (batch_size, 1) column of frequencies.
samples = (torch.arange(t) / fs).unsqueeze(0)
print(samples.shape)

# Adam optimiser over all model parameters; hyper-parameters are spelled out
# by keyword so each value is easy to find and tune.
lr = 1e-4
optimizer = torch.optim.Adam(
    params=model.parameters(),
    lr=lr,
    betas=(0.95, 0.999),
    eps=1e-6,
    weight_decay=1e-3,
)

torch.Size([1, 32768])


In [31]:
# Train the diffusion model on batches of randomly pitched sine waves.
losses = []       # per-step loss values, kept for later inspection
loss_total = 0.0  # running sum, so the displayed mean needs no re-scan of `losses`
pbar = tqdm(range(n_iters))
for i in pbar:
    optimizer.zero_grad()

    # Draw one integer frequency per batch item. `torch.randint` excludes its
    # upper bound, hence the +1 so that fc_max itself can be sampled.
    f = torch.randint(fc_min, fc_max + 1, [batch_size, 1])
    # Broadcast the (batch_size, 1) frequencies against the (1, t) time axis.
    signals = torch.sin(2 * torch.pi * f * samples)
    # Insert the channel dimension -> (batch_size, 1, t) and move to the device.
    signals = signals.view(batch_size, 1, -1).to(device)

    # Calling the model on a batch yields the loss that is back-propagated.
    loss = model(signals)
    loss.backward()
    optimizer.step()

    # Read the scalar once per step; every .item() copies the value to the host.
    step_loss = loss.item()
    losses.append(step_loss)
    loss_total += step_loss
    pbar.set_description(f"{i} - loss step: {step_loss:0.4f} loss mean: {loss_total / len(losses):0.4f}")

999 - loss step: 0.0457 loss mean: 0.1161: 100%|██████████████████████████████████████████████████████████████████████| 1000/1000 [09:38<00:00, 1.73it/s]


In [38]:
# Draw one sample from the model, starting from Gaussian noise.
# The noise has shape [1, 1, t]: a batch of one single-channel signal.
noise = torch.randn(1, 1, t).to(device)

# Sampling needs no gradients; no_grad guarantees that no autograd graph is
# kept, whatever the library does internally.
with torch.no_grad():
    sampled = model.sample(
        noise=noise,
        num_steps=50,  # Suggested range: 2-50
    )  # presumably the same shape as `noise`, [1, 1, t] -- confirm against the library

In [39]:
# Take the first item of the sampled batch for playback.
z = sampled[0]
# Play it at the configured sample rate `fs` instead of repeating the literal,
# and hand IPython a detached NumPy array rather than a raw tensor.
Audio(z.detach().cpu().numpy(), rate=fs)