| from math import pi |
| from random import randint |
| from typing import Any, Optional, Sequence, Tuple, Union |
|
|
| import torch |
| from einops import rearrange |
| from torch import Tensor, nn |
| from tqdm import tqdm |
|
|
| from .utils import * |
| from .sampler import * |
|
|
| """ |
| Diffusion Classes (generic for 1d data) |
| """ |
|
|
|
|
class Model1d(nn.Module):
    """Generic diffusion model wrapper for 1d data (UNet + diffusion process).

    NOTE(review): in this visible code ``self.unet`` and ``self.diffusion``
    are assigned ``None`` and never replaced, so ``forward``/``sample`` will
    raise ``TypeError`` unless a subclass or external code installs real
    modules — presumably the original constructed them here; confirm against
    the rest of the package.
    """

    def __init__(self, unet_type: str = "base", **kwargs):
        super().__init__()
        # Split kwargs prefixed with "diffusion_" from the remaining (UNet)
        # kwargs. NOTE(review): ``diffusion_kwargs`` and ``unet_type`` are
        # unused in this visible code — verify against the full project.
        diffusion_kwargs, kwargs = groupby("diffusion_", kwargs)
        self.unet = None
        self.diffusion = None

    def forward(self, x: Tensor, **kwargs) -> Tensor:
        # Delegates the training/forward pass to the diffusion wrapper.
        return self.diffusion(x, **kwargs)

    def sample(self, *args, **kwargs) -> Tensor:
        # Delegates sampling (reverse diffusion) to the diffusion wrapper.
        return self.diffusion.sample(*args, **kwargs)
|
|
|
|
| """ |
| Audio Diffusion Classes (specific for 1d audio data) |
| """ |
|
|
|
|
def get_default_model_kwargs():
    """Return the default UNet/diffusion hyperparameters for audio models."""
    defaults = {
        "channels": 128,
        "patch_size": 16,
        "multipliers": [1, 2, 4, 4, 4, 4, 4],
        "factors": [4, 4, 4, 2, 2, 2],
        "num_blocks": [2, 2, 2, 2, 2, 2],
        "attentions": [0, 0, 0, 1, 1, 1, 1],
        "attention_heads": 8,
        "attention_features": 64,
        "attention_multiplier": 2,
        "attention_use_rel_pos": False,
        "diffusion_type": "v",
        "diffusion_sigma_distribution": UniformDistribution(),
    }
    return defaults
|
|
|
|
def get_default_sampling_kwargs():
    """Return the default sampling configuration: linear schedule, v-sampler, clamping."""
    return {"sigma_schedule": LinearSchedule(), "sampler": VSampler(), "clamp": True}
|
|
|
|
class AudioDiffusionModel(Model1d):
    """Unconditional audio diffusion model preconfigured with audio defaults."""

    def __init__(self, **kwargs):
        # Caller-supplied kwargs take precedence over the defaults.
        merged = dict(get_default_model_kwargs())
        merged.update(kwargs)
        super().__init__(**merged)

    def sample(self, *args, **kwargs):
        # Caller-supplied kwargs override the default sampling configuration.
        merged = dict(get_default_sampling_kwargs())
        merged.update(kwargs)
        return super().sample(*args, **merged)
|
|
|
|
class AudioDiffusionConditional(Model1d):
    """Audio diffusion model conditioned on an embedding sequence.

    Configures the underlying model for classifier-free guidance ("cfg")
    with the given context-embedding dimensions.
    """

    def __init__(
        self,
        embedding_features: int,
        embedding_max_length: int,
        embedding_mask_proba: float = 0.1,
        **kwargs,
    ):
        default_kwargs = dict(
            **get_default_model_kwargs(),
            unet_type="cfg",
            context_embedding_features=embedding_features,
            context_embedding_max_length=embedding_max_length,
        )
        # Caller kwargs override the defaults.
        super().__init__(**{**default_kwargs, **kwargs})
        # Assign AFTER super().__init__(): setting attributes on an nn.Module
        # before Module.__init__ has run is fragile, since Module.__setattr__
        # special-cases Parameters/Modules/Tensors and expects the internal
        # registries created by Module.__init__ to exist.
        self.embedding_mask_proba = embedding_mask_proba

    def forward(self, *args, **kwargs):
        # Pass the conditioning-dropout probability (for classifier-free
        # guidance training) unless the caller overrides it.
        default_kwargs = dict(embedding_mask_proba=self.embedding_mask_proba)
        return super().forward(*args, **{**default_kwargs, **kwargs})

    def sample(self, *args, **kwargs):
        # Default guidance scale of 5.0; the caller may override it.
        default_kwargs = dict(
            **get_default_sampling_kwargs(),
            embedding_scale=5.0,
        )
        return super().sample(*args, **{**default_kwargs, **kwargs})
|
|
|
|
|
|