Spaces:
				
			
			
	
			
			
					
		Running
		
	
	
	
			
			
	
	
	
	
		
		
					
		Running
		
	File size: 3,479 Bytes
			
			| 99bbd30 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 | import torch
import torch.nn as nn
import torch.nn.functional as F
from einops import rearrange
from mmaudio.ext.autoencoder.edm2_utils import (MPConv1D, mp_silu, mp_sum, normalize)
def nonlinearity(x):
    """Apply the activation used throughout this file to ``x``.

    This is a thin alias for ``mp_silu`` (imported from ``edm2_utils``); the
    original code labels it "swish". Whatever ``mp_silu`` returns for ``x``
    is returned unchanged.
    """
    activated = mp_silu(x)
    return activated
class ResnetBlock1D(nn.Module):
    """Residual block: two ``MPConv1D`` layers merged with the input via ``mp_sum``.

    Keyword-only args:
        in_dim: channel count passed as the input size of ``conv1``.
        out_dim: channel count produced by both convolutions; falls back to
            ``in_dim`` when ``None``.
        conv_shortcut: when the channel counts differ, selects a shortcut
            conv with the same ``kernel_size`` as the main branch (True) or a
            ``kernel_size=1`` shortcut (False).
        kernel_size: kernel size handed to ``conv1`` / ``conv2`` (and to the
            conv shortcut, if used).
        use_norm: if true, ``forward`` first applies ``normalize(x, dim=1)``.
    """

    def __init__(self, *, in_dim, out_dim=None, conv_shortcut=False, kernel_size=3, use_norm=True):
        super().__init__()
        if out_dim is None:
            out_dim = in_dim

        self.in_dim = in_dim
        self.out_dim = out_dim
        self.use_conv_shortcut = conv_shortcut
        self.use_norm = use_norm

        # Main branch: in_dim -> out_dim -> out_dim.
        self.conv1 = MPConv1D(in_dim, out_dim, kernel_size=kernel_size)
        self.conv2 = MPConv1D(out_dim, out_dim, kernel_size=kernel_size)

        # A projection on the skip path is only created when the channel
        # counts differ; otherwise the input is passed to mp_sum as-is.
        if in_dim != out_dim:
            if conv_shortcut:
                self.conv_shortcut = MPConv1D(in_dim, out_dim, kernel_size=kernel_size)
            else:
                self.nin_shortcut = MPConv1D(in_dim, out_dim, kernel_size=1)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Run the block on ``x`` and return ``mp_sum(skip, branch, t=0.3)``.

        Note that when ``use_norm`` is set, the normalized tensor (not the raw
        input) feeds both the conv branch and the skip path.
        """
        # "pixel norm" in the original code: normalize over dim 1.
        skip = normalize(x, dim=1) if self.use_norm else x

        # activation -> conv1 -> activation -> conv2
        branch = self.conv1(nonlinearity(skip))
        branch = self.conv2(nonlinearity(branch))

        if self.in_dim != self.out_dim:
            project = self.conv_shortcut if self.use_conv_shortcut else self.nin_shortcut
            skip = project(skip)

        return mp_sum(skip, branch, t=0.3)
class AttnBlock1D(nn.Module):
    """Self-attention block over the last axis, combined with its input via ``mp_sum``.

    Args:
        in_channels: channel count of the input; the ``qkv`` projection emits
            ``3 * in_channels`` channels and ``proj_out`` maps back to
            ``in_channels``.
        num_heads: number of heads the projected channels are split into by
            the reshape in ``forward``.
    """

    def __init__(self, in_channels, num_heads=1):
        super().__init__()
        self.in_channels = in_channels
        self.num_heads = num_heads

        # Both projections are kernel_size=1 convolutions.
        self.qkv = MPConv1D(in_channels, in_channels * 3, kernel_size=1)
        self.proj_out = MPConv1D(in_channels, in_channels, kernel_size=1)

    def forward(self, x):
        """Attend over the last axis of ``x`` and return ``mp_sum(x, out, t=0.3)``."""
        packed = self.qkv(x)
        batch, length = packed.shape[0], packed.shape[-1]

        # Split channels into (heads, per-head channels, 3) with q/k/v
        # interleaved on axis 3, then normalize along the per-head channel axis.
        packed = packed.reshape(batch, self.num_heads, -1, 3, length)
        packed = normalize(packed, dim=2)

        # Move the sequence axis ahead of the channel axis for attention.
        q, k, v = [rearrange(part, 'b h c l -> b h l c') for part in packed.unbind(3)]

        attended = F.scaled_dot_product_attention(q, k, v)
        # Merge heads back into a single channel axis.
        attended = rearrange(attended, 'b h l c -> b (h c) l')
        projected = self.proj_out(attended)
        return mp_sum(x, projected, t=0.3)
class Upsample1D(nn.Module):
    """Double the last axis with nearest-exact interpolation, then optionally convolve.

    Args:
        in_channels: channel count used for both input and output of the
            optional ``MPConv1D`` (``kernel_size=3``).
        with_conv: when truthy, a conv is created and applied after the
            interpolation; otherwise the interpolated tensor is returned.
    """

    def __init__(self, in_channels, with_conv):
        super().__init__()
        self.with_conv = with_conv
        if with_conv:
            self.conv = MPConv1D(in_channels, in_channels, kernel_size=3)

    def forward(self, x):
        """Return ``x`` upsampled by a factor of 2 (and convolved if enabled)."""
        # 'nearest-exact' accepts 3D (B, C, T) input, per the original note.
        upsampled = F.interpolate(x, scale_factor=2.0, mode='nearest-exact')
        if not self.with_conv:
            return upsampled
        return self.conv(upsampled)
class Downsample1D(nn.Module):
    """Halve the last axis with average pooling, optionally wrapped by two convs.

    Args:
        in_channels: channel count used for both input and output of the
            optional ``MPConv1D`` layers.
        with_conv: when truthy, a ``kernel_size=1`` conv is applied before
            the pooling and another one after it; otherwise only the pooling
            runs.
    """

    def __init__(self, in_channels, with_conv):
        super().__init__()
        self.with_conv = with_conv
        if with_conv:
            # Both convs are pointwise (kernel_size=1); no padding is applied
            # anywhere in this block.
            self.conv1 = MPConv1D(in_channels, in_channels, kernel_size=1)
            self.conv2 = MPConv1D(in_channels, in_channels, kernel_size=1)

    def forward(self, x):
        """Return ``x`` average-pooled with window 2 and stride 2 along the last axis."""
        if not self.with_conv:
            return F.avg_pool1d(x, kernel_size=2, stride=2)

        pooled = F.avg_pool1d(self.conv1(x), kernel_size=2, stride=2)
        return self.conv2(pooled)
 |