|
import math |
|
import torch |
|
from torch import nn |
|
|
|
import audiosr.latent_diffusion.modules.phoneme_encoder.commons as commons |
|
import audiosr.latent_diffusion.modules.phoneme_encoder.attentions as attentions |
|
|
|
|
|
class TextEncoder(nn.Module): |
|
def __init__( |
|
self, |
|
n_vocab, |
|
out_channels=192, |
|
hidden_channels=192, |
|
filter_channels=768, |
|
n_heads=2, |
|
n_layers=6, |
|
kernel_size=3, |
|
p_dropout=0.1, |
|
): |
|
super().__init__() |
|
self.n_vocab = n_vocab |
|
self.out_channels = out_channels |
|
self.hidden_channels = hidden_channels |
|
self.filter_channels = filter_channels |
|
self.n_heads = n_heads |
|
self.n_layers = n_layers |
|
self.kernel_size = kernel_size |
|
self.p_dropout = p_dropout |
|
|
|
self.emb = nn.Embedding(n_vocab, hidden_channels) |
|
nn.init.normal_(self.emb.weight, 0.0, hidden_channels**-0.5) |
|
|
|
self.encoder = attentions.Encoder( |
|
hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout |
|
) |
|
self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) |
|
|
|
def forward(self, x, x_lengths): |
|
x = self.emb(x) * math.sqrt(self.hidden_channels) |
|
x = torch.transpose(x, 1, -1) |
|
x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to( |
|
x.dtype |
|
) |
|
|
|
x = self.encoder(x * x_mask, x_mask) |
|
stats = self.proj(x) * x_mask |
|
|
|
m, logs = torch.split(stats, self.out_channels, dim=1) |
|
return x, m, logs, x_mask |
|
|