File size: 2,067 Bytes
36a67ca
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
import torch
from transformers import PreTrainedModel

from config import DinoHuVitsConfig
from src import CAMPPlus, Flow, HiFiGAN, PosteriorHubert


class DinoHuVits(PreTrainedModel):
    """Model combining a reference encoder, posterior encoder, flow and decoder.

    The forward pass derives one style embedding from the content input and
    one from the reference input. The content is encoded and pushed through
    the flow conditioned on its own embedding, pulled back through the
    reversed flow conditioned on the reference embedding, and finally decoded
    with the reference embedding.

    NOTE(review): this looks like a voice-conversion pipeline (content keeps
    what is said, reference supplies the target style) -- confirm against the
    project documentation.
    """

    config_class = DinoHuVitsConfig

    def __init__(self, config: DinoHuVitsConfig):
        """Build the four sub-networks from ``config``.

        Sub-modules are created in the order ``enc_r``, ``enc_q``, ``flow``,
        ``dec``; the attribute names are part of the checkpoint layout and
        must not be renamed.
        """
        super().__init__(config)

        # Sizes shared by several sub-networks.
        style_channels = config.gin_channels
        latent_channels = config.inter_channels

        # Reference encoder: produces the style embedding used as `g`.
        self.enc_r = CAMPPlus(embed_dim=style_channels, pooling_func="TSTP")

        # Posterior encoder over the content input.
        self.enc_q = PosteriorHubert(
            out_channels=latent_channels,
            feature_channels=config.hubert_feature_channels,
            downsample_channels=config.hubert_downsample_channels,
            output_layer=config.hubert_output_layer,
        )

        # Flow conditioned on the style embedding; kernel size, dilation rate
        # and depth are fixed here rather than read from the config.
        self.flow = Flow(
            channels=latent_channels,
            hidden_channels=config.hidden_channels,
            kernel_size=5,
            dilation_rate=1,
            n_layers=4,
            gin_channels=style_channels,
        )

        # Decoder that turns the latent back into the output signal.
        self.dec = HiFiGAN(
            initial_channel=latent_channels,
            resblock=config.resblock,
            resblock_kernel_sizes=config.resblock_kernel_sizes,
            resblock_dilation_sizes=config.resblock_dilation_sizes,
            upsample_rates=config.upsample_rates,
            upsample_initial_channel=config.upsample_initial_channel,
            upsample_kernel_sizes=config.upsample_kernel_sizes,
            gin_channels=style_channels,
        )

    def forward(
        self, content: torch.Tensor, lengths: torch.Tensor, reference: torch.Tensor
    ):
        """Re-synthesize ``content`` conditioned on the style of ``reference``.

        Args:
            content: Input passed to both the reference encoder (for the
                source style) and the posterior encoder.
            lengths: Passed to the posterior encoder alongside ``content``
                (presumably per-item valid lengths -- TODO confirm).
            reference: Input passed to the reference encoder to obtain the
                target style.

        Returns:
            A tuple ``(output, mask)``: the decoder output and the mask
            returned by the posterior encoder.
        """
        source_style = self.__get_style_embedding(content)
        target_style = self.__get_style_embedding(reference)

        # Only the first and last of the four posterior outputs are used.
        latent, _, _, frame_mask = self.enc_q(content, lengths, g=source_style)

        # Forward through the flow with the source style, then invert the
        # flow with the target style.
        neutral_latent = self.flow(latent, frame_mask, g=source_style)
        converted_latent = self.flow(
            neutral_latent, frame_mask, g=target_style, reverse=True
        )

        # Mask the latent before decoding.
        output = self.dec(converted_latent * frame_mask, g=target_style)
        return output, frame_mask

    def __get_style_embedding(self, waveform: torch.Tensor):
        """Return the normalized reference-encoder embedding of ``waveform``.

        The encoder output is L2-normalized along dimension 1 and a trailing
        singleton dimension is appended.

        NOTE(review): the result is presumably shaped [b, h, 1], which would
        require ``enc_r`` to return [b, h] -- confirm against CAMPPlus.
        """
        embedding = self.enc_r(waveform)
        return torch.nn.functional.normalize(embedding, dim=1).unsqueeze(-1)