audio-diffusion / audiodiffusion /audio_encoder.py
yoinked's picture
repo
9f76d9a
import numpy as np
import torch
from diffusers import ConfigMixin, Mel, ModelMixin
from torch import nn
class SeparableConv2d(nn.Module):
def __init__(self, in_channels, out_channels, kernel_size):
super(SeparableConv2d, self).__init__()
self.depthwise = nn.Conv2d(
in_channels,
in_channels,
kernel_size=kernel_size,
groups=in_channels,
bias=False,
padding=1,
)
self.pointwise = nn.Conv2d(in_channels, out_channels, kernel_size=1, bias=True)
def forward(self, x):
out = self.depthwise(x)
out = self.pointwise(out)
return out
class ConvBlock(nn.Module):
def __init__(self, in_channels, out_channels, dropout_rate):
super(ConvBlock, self).__init__()
self.sep_conv = SeparableConv2d(in_channels, out_channels, (3, 3))
self.leaky_relu = nn.LeakyReLU(0.2)
self.batch_norm = nn.BatchNorm2d(out_channels, eps=0.001, momentum=0.01)
self.max_pool = nn.MaxPool2d((2, 2))
self.dropout = nn.Dropout(dropout_rate)
def forward(self, x):
x = self.sep_conv(x)
x = self.leaky_relu(x)
x = self.batch_norm(x)
x = self.max_pool(x)
x = self.dropout(x)
return x
class DenseBlock(nn.Module):
def __init__(self, in_features, out_features, dropout_rate):
super(DenseBlock, self).__init__()
self.flatten = nn.Flatten()
self.dense = nn.Linear(in_features, out_features)
self.leaky_relu = nn.LeakyReLU(0.2)
self.batch_norm = nn.BatchNorm1d(out_features, eps=0.001, momentum=0.01)
self.dropout = nn.Dropout(dropout_rate)
def forward(self, x):
x = self.flatten(x.permute(0, 2, 3, 1))
x = self.dense(x)
x = self.leaky_relu(x)
x = self.batch_norm(x)
x = self.dropout(x)
return x
class AudioEncoder(ModelMixin, ConfigMixin):
def __init__(self):
super().__init__()
self.mel = Mel(
x_res=216,
y_res=96,
sample_rate=22050,
n_fft=2048,
hop_length=512,
top_db=80,
)
self.conv_blocks = nn.ModuleList([ConvBlock(1, 32, 0.2), ConvBlock(32, 64, 0.3), ConvBlock(64, 128, 0.4)])
self.dense_block = DenseBlock(41472, 1024, 0.5)
self.embedding = nn.Linear(1024, 100)
def forward(self, x):
for conv_block in self.conv_blocks:
x = conv_block(x)
x = self.dense_block(x)
x = self.embedding(x)
return x
@torch.no_grad()
def encode(self, audio_files):
self.eval()
y = []
for audio_file in audio_files:
self.mel.load_audio(audio_file)
x = [
np.expand_dims(
np.frombuffer(self.mel.audio_slice_to_image(slice).tobytes(), dtype="uint8").reshape(
(self.mel.y_res, self.mel.x_res)
)
/ 255,
axis=0,
)
for slice in range(self.mel.get_number_of_slices())
]
y += [torch.mean(self(torch.Tensor(x)), dim=0)]
return torch.stack(y)
# from diffusers import Mel
# from audiodiffusion.audio_encoder import AudioEncoder
# audio_encoder = AudioEncoder.from_pretrained("teticio/audio-encoder")
# audio_encoder.encode(['/home/teticio/Music/liked/Agua Re - Holy Dance - Large Sound Mix.mp3'])