Spaces:
Paused
Paused
File size: 1,792 Bytes
9d3cb0a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 |
import torch
import torch.nn as nn
from .chroma import ChromaExtractor
from .energy import EnergyExtractor
from .voice import VoiceConversionExtractor
from .mbenergy import MultibandEnergyExtractor
class Conditioner(nn.Module):
    """Wrap a single feature extractor and shape its output for a latent.

    The extractor is chosen once, at construction time, from
    ``condition_type``. ``forward`` runs it on a waveform and rearranges the
    result so it can be combined with a latent of the given shape.
    """

    def __init__(self, condition_type: str, **kwargs):
        """Build the extractor selected by ``condition_type``.

        Args:
            condition_type: One of ``'energy'``, ``'chroma'``, ``'vc'`` or
                ``'mb_energy'``.
            **kwargs: Forwarded unchanged to the selected extractor's
                constructor.

        Raises:
            NotImplementedError: If ``condition_type`` is not one of the
                supported names.
        """
        super().__init__()
        if condition_type == 'energy':
            self.conditioner = EnergyExtractor(**kwargs)
        elif condition_type == 'chroma':
            self.conditioner = ChromaExtractor(**kwargs)
        elif condition_type == 'vc':
            self.conditioner = VoiceConversionExtractor(**kwargs)
        elif condition_type == 'mb_energy':
            self.conditioner = MultibandEnergyExtractor(**kwargs)
        else:
            raise NotImplementedError(
                f"Unsupported condition_type {condition_type!r}; expected one "
                "of 'energy', 'chroma', 'vc', 'mb_energy'."
            )

    def forward(self, waveform: torch.Tensor, latent_shape) -> torch.Tensor:
        """Extract the condition and lay it out to match ``latent_shape``.

        Args:
            waveform: Input passed straight to the wrapped extractor.
            latent_shape: Shape of the target latent (anything supporting
                ``len`` and negative indexing). Length 3 is treated as a 1-D
                latent, length 4 as a 2-D spectrogram latent ``(B, C, T, F)``.

        Returns:
            For a 3-element ``latent_shape``: the extractor output with its
            last two axes swapped. For a 4-element ``latent_shape``: that same
            tensor with a trailing axis of size
            ``F * T_cond // T_latent`` added via ``expand`` (a broadcast view,
            not a copy).

        Raises:
            ValueError: If ``latent_shape`` has 4 elements and the condition's
                time length is not a positive multiple of ``latent_shape[-2]``.
            NotImplementedError: If ``latent_shape`` has neither 3 nor 4
                elements.
        """
        # Extractor output is expected as (B, T, C) -- per the original
        # in-code comments; the extractors themselves are defined elsewhere.
        condition = self.conditioner(waveform)
        # Swap to channels-first: (B, C, T).
        condition = condition.permute(0, 2, 1).contiguous()

        rank = len(latent_shape)
        if rank == 3:
            return condition
        if rank != 4:
            raise NotImplementedError(
                f"latent_shape must have 3 or 4 dimensions, got {rank}."
            )

        # 2-D spectrogram latent: (B, C, T, F).
        latent_frames = latent_shape[-2]
        latent_bins = latent_shape[-1]
        cond_frames = condition.shape[-1]
        # Explicit check instead of `assert`, so the precondition is still
        # enforced under `python -O` and a zero time axis gets a clear error.
        if latent_frames <= 0 or cond_frames % latent_frames != 0:
            raise ValueError(
                f"Condition length {cond_frames} must be a positive multiple "
                f"of the latent time dimension {latent_frames}."
            )
        # Scale the frequency extent by the same factor that separates the
        # condition's time resolution from the latent's.
        width = latent_bins * cond_frames // latent_frames
        # Repeat the condition along the new frequency axis.
        return condition.unsqueeze(-1).expand(-1, -1, -1, width)
if __name__ == '__main__':
    # Smoke test: build an energy conditioner and run it against a
    # 4-D (B, C, T, F) latent shape.
    conditioner = Conditioner(condition_type='energy',
                              hop_size=160, window_size=1024, padding='reflect',
                              min_db=-80, norm=True)
    audio = torch.rand(4, 16000)  # Example audio signal
    energy = conditioner(audio, (4, 8, 100, 64))