| import math |
| import torch |
| import torch.nn as nn |
| import torch.nn.functional as F |
| from typing import Optional |
| from conformer import ConformerBlock |
| from diffusers.models.activations import get_activation |
|
|
|
|
class SinusoidalPosEmb(torch.nn.Module):
    """Sinusoidal embedding of a 1-D batch of scalar positions / timesteps.

    Input:  tensor of shape ``[a]`` (a 0-dim scalar is treated as ``[1]``).
    Output: tensor of shape ``[a, dim]``; the first ``dim // 2`` columns hold
    the sines and the remaining ``dim // 2`` columns the cosines.
    """

    def __init__(self, dim):
        """Store the embedding width ``dim``, which must be even."""
        super().__init__()
        self.dim = dim
        assert self.dim % 2 == 0, "SinusoidalPosEmb requires dim to be even"

    def forward(self, x, scale=1000):
        """Embed ``x``.

        Args:
            x: 0-dim or 1-D tensor of positions.
            scale: multiplier applied to ``x`` before the sin/cos.

        Returns:
            Tensor of shape ``[len(x), dim]`` on the same device as ``x``.
        """
        if x.ndim < 1:
            x = x.unsqueeze(0)
        half_dim = self.dim // 2
        # Frequencies decay geometrically from 1 down to 1/10000 over half_dim
        # steps. With dim == 2 there is a single frequency and the original
        # divisor (half_dim - 1) was zero; clamping it to 1 leaves that lone
        # frequency at 1 and does not change any dim >= 4 result.
        log_step = math.log(10000) / max(half_dim - 1, 1)
        freqs = torch.exp(
            torch.arange(half_dim, device=x.device).float() * -log_step
        )
        # Outer product: one row per position, one column per frequency.
        angles = scale * x.unsqueeze(1) * freqs.unsqueeze(0)
        return torch.cat((angles.sin(), angles.cos()), dim=-1)
|
|
|
|
class Block1D(torch.nn.Module):
    """Conv1d (kernel 3, padding 1) -> GroupNorm -> Mish, applied in that order."""

    def __init__(self, dim, dim_out, groups=8):
        """Build the three layers.

        Args:
            dim: input channel count of the convolution.
            dim_out: output channel count (also the GroupNorm channel count).
            groups: number of groups for GroupNorm.
        """
        super().__init__()
        # Created in application order; stored as a Sequential under `block`
        # so parameter names stay `block.0.*` / `block.1.*`.
        conv = nn.Conv1d(dim, dim_out, kernel_size=3, padding=1)
        norm = nn.GroupNorm(num_groups=groups, num_channels=dim_out)
        activation = nn.Mish()
        self.block = nn.Sequential(conv, norm, activation)

    def forward(self, x):
        """Run ``x`` through conv, norm and activation."""
        result = self.block(x)
        return result
|
|
|
|
class ResnetBlock1D(torch.nn.Module):
    """Two ``Block1D`` stages with a time-embedding bias and a 1x1-conv residual.

    When ``film_dim`` is given, an extra Mish + Linear head produces a
    per-channel scale/shift pair that modulates the output of the first stage.
    """

    def __init__(self, dim, dim_out, time_emb_dim, groups=8, film_dim=None):
        """Create the sub-layers.

        Args:
            dim: input channel count.
            dim_out: output channel count.
            time_emb_dim: feature size of the time embedding fed to ``mlp``.
            groups: GroupNorm group count forwarded to both ``Block1D`` stages.
            film_dim: feature size of the optional FiLM conditioning vector;
                ``None`` disables the FiLM head.
        """
        super().__init__()
        self.mlp = nn.Sequential(nn.Mish(), nn.Linear(time_emb_dim, dim_out))
        self.block1 = Block1D(dim, dim_out, groups=groups)
        self.block2 = Block1D(dim_out, dim_out, groups=groups)
        # 1x1 convolution that maps the input to dim_out channels for the skip path.
        self.res_conv = nn.Conv1d(dim, dim_out, kernel_size=1)

        if film_dim is None:
            self.film = None
        else:
            # Emits 2 * dim_out values: first half is gamma, second half is beta.
            self.film = nn.Sequential(
                nn.Mish(),
                nn.Linear(film_dim, 2 * dim_out),
            )

    def forward(self, x, time_emb, film_cond=None):
        """Apply the block.

        Args:
            x: input passed to ``block1`` and ``res_conv``.
            time_emb: time embedding; projected by ``mlp`` and broadcast over
                the last axis.
            film_cond: optional conditioning vector; ignored unless the block
                was built with ``film_dim``.

        Returns:
            ``block2(...)`` output plus the ``res_conv`` skip connection.
        """
        hidden = self.block1(x)

        film_active = self.film is not None and film_cond is not None
        if film_active:
            scale_shift = self.film(film_cond).unsqueeze(-1)
            gamma, beta = scale_shift.chunk(2, dim=1)
            hidden = (1 + gamma) * hidden + beta

        time_bias = self.mlp(time_emb).unsqueeze(-1)
        hidden += time_bias  # in-place add, as in the original
        return self.block2(hidden) + self.res_conv(x)
|
|
|
|
class Downsample1D(nn.Module):
    """Channel-preserving Conv1d with kernel 3, stride 2 and padding 1."""

    def __init__(self, dim):
        """Create the strided convolution for ``dim`` input/output channels."""
        super().__init__()
        self.conv = nn.Conv1d(
            in_channels=dim,
            out_channels=dim,
            kernel_size=3,
            stride=2,
            padding=1,
        )

    def forward(self, x):
        """Return ``x`` after the stride-2 convolution."""
        downsampled = self.conv(x)
        return downsampled
|
|
|
|
class TimestepEmbedding(nn.Module):
    """Linear -> activation -> Linear projection with optional extras.

    Optionally adds a bias-free projection of a conditioning tensor to the
    input, and optionally applies a second activation to the output.
    """

    def __init__(
        self,
        in_channels: int,
        time_embed_dim: int,
        act_fn: str = "silu",
        out_dim: int = None,
        post_act_fn: Optional[str] = None,
        cond_proj_dim=None,
    ):
        """Create the projection layers.

        Args:
            in_channels: feature size of the incoming sample.
            time_embed_dim: hidden size after the first linear layer.
            act_fn: activation name resolved through ``get_activation``.
            out_dim: output feature size; falls back to ``time_embed_dim``.
            post_act_fn: optional activation name applied after the second
                linear layer.
            cond_proj_dim: feature size of the optional conditioning input;
                ``None`` means no conditioning projection is created.
        """
        super().__init__()

        self.linear_1 = nn.Linear(in_channels, time_embed_dim)

        # Projects the condition into the sample's feature space (no bias).
        self.cond_proj = (
            nn.Linear(cond_proj_dim, in_channels, bias=False)
            if cond_proj_dim is not None
            else None
        )

        self.act = get_activation(act_fn)

        final_dim = time_embed_dim if out_dim is None else out_dim
        self.linear_2 = nn.Linear(time_embed_dim, final_dim)

        self.post_act = (
            None if post_act_fn is None else get_activation(post_act_fn)
        )

    def forward(self, sample, condition=None):
        """Project ``sample``, adding ``cond_proj(condition)`` first when given."""
        if condition is not None:
            sample = sample + self.cond_proj(condition)

        hidden = self.linear_1(sample)
        if self.act is not None:
            hidden = self.act(hidden)
        hidden = self.linear_2(hidden)

        if self.post_act is None:
            return hidden
        return self.post_act(hidden)
|
|
|
|
class Upsample1D(nn.Module):
    """A 1D 2x upsampling layer.

    Parameters:
        channels (`int`):
            number of channels expected in the inputs.
        use_conv (`bool`, default `False`):
            apply a kernel-3 convolution after nearest-neighbour upsampling.
            Only consulted when `use_conv_transpose` is false.
        use_conv_transpose (`bool`, default `True`):
            upsample with a transposed convolution (kernel 4, stride 2,
            padding 1) instead of interpolation.
        out_channels (`int`, optional):
            number of output channels. Defaults to `channels`.
        name (`str`, default `"conv"`):
            stored on the instance as `self.name`.
    """

    def __init__(
        self,
        channels,
        use_conv=False,
        use_conv_transpose=True,
        out_channels=None,
        name="conv",
    ):
        super().__init__()
        self.channels = channels
        self.out_channels = out_channels or channels
        self.use_conv = use_conv
        self.use_conv_transpose = use_conv_transpose
        self.name = name

        # The transposed convolution takes precedence over the plain one;
        # with neither flag set there is no learnable layer at all.
        if use_conv_transpose:
            conv = nn.ConvTranspose1d(
                channels, self.out_channels, kernel_size=4, stride=2, padding=1
            )
        elif use_conv:
            conv = nn.Conv1d(
                self.channels, self.out_channels, kernel_size=3, padding=1
            )
        else:
            conv = None
        self.conv = conv

    def forward(self, inputs):
        """Upsample `inputs` by 2x along the last axis; axis 1 must equal `channels`."""
        assert inputs.shape[1] == self.channels

        if not self.use_conv_transpose:
            upsampled = F.interpolate(inputs, scale_factor=2.0, mode="nearest")
            return self.conv(upsampled) if self.use_conv else upsampled

        return self.conv(inputs)
|
|
|
|
class ConformerWrapper(ConformerBlock):
    """``ConformerBlock`` with a ``hidden_states`` / ``attention_mask`` call signature.

    Construction arguments are forwarded unchanged to ``ConformerBlock``.
    """

    def __init__(
        self,
        *,
        dim,
        dim_head=64,
        heads=8,
        ff_mult=4,
        conv_expansion_factor=2,
        conv_kernel_size=31,
        attn_dropout=0,
        ff_dropout=0,
        conv_dropout=0,
        conv_causal=False,
    ):
        # Collected once so the pass-through to the base class is explicit.
        block_kwargs = {
            "dim": dim,
            "dim_head": dim_head,
            "heads": heads,
            "ff_mult": ff_mult,
            "conv_expansion_factor": conv_expansion_factor,
            "conv_kernel_size": conv_kernel_size,
            "attn_dropout": attn_dropout,
            "ff_dropout": ff_dropout,
            "conv_dropout": conv_dropout,
            "conv_causal": conv_causal,
        }
        super().__init__(**block_kwargs)

    def forward(
        self,
        hidden_states,
        attention_mask,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        timestep=None,
    ):
        """Run the base block on ``hidden_states`` with a boolean mask.

        ``encoder_hidden_states``, ``encoder_attention_mask`` and ``timestep``
        are accepted but not used.
        """
        bool_mask = attention_mask.bool()
        return super().forward(x=hidden_states, mask=bool_mask)
|
|