import math
from typing import Tuple

import numpy as np
import torch
import torch.nn as nn  # pylint: disable=consider-using-from-import
import torch.nn.functional as F

from TTS.tts.layers.delightful_tts.conv_layers import ConvNorm


def initialize_embeddings(shape: Tuple[int]) -> torch.Tensor:
    assert len(shape) == 2, "Can only initialize 2-D embedding matrices ..."
    # Kaiming initialization
    return torch.randn(shape) * np.sqrt(2 / shape[1])


def positional_encoding(d_model: int, length: int, device: torch.device) -> torch.Tensor:
    pe = torch.zeros(length, d_model, device=device)
    position = torch.arange(0, length, dtype=torch.float, device=device).unsqueeze(1)
    div_term = torch.exp(torch.arange(0, d_model, 2, device=device).float() * -(math.log(10000.0) / d_model))
    pe[:, 0::2] = torch.sin(position * div_term)
    pe[:, 1::2] = torch.cos(position * div_term)
    pe = pe.unsqueeze(0)
    return pe
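
# Illustrative usage sketch (added for clarity, not part of the original module). The sizes
# below are arbitrary examples; `d_model` must be even so the sin/cos halves line up.
def _example_positional_encoding():
    pe = positional_encoding(d_model=256, length=100, device=torch.device("cpu"))
    assert pe.shape == (1, 100, 256)  # [1, length, d_model], broadcastable over a batch
    return pe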


class BottleneckLayer(nn.Module):
    """
    Bottleneck layer for reducing the dimensionality of a tensor.

    Args:
        in_dim: The number of input channels.
        reduction_factor: The factor by which to reduce the number of channels.
        norm: The normalization method to use. Can be "weightnorm" or "instancenorm".
        non_linearity: The non-linearity to use. Can be "relu" or "leakyrelu".
        kernel_size: The size of the convolutional kernel.
        use_partial_padding: Whether to use partial padding with the convolutional kernel.

    Shape:
        - Input: :math:`[N, in_dim, T]` where `N` is the batch size, `in_dim` the number of input channels and `T` the sequence length.
        - Output: :math:`[N, out_dim, T]` where `out_dim` is the number of output channels.
    """

    def __init__(
        self,
        in_dim,
        reduction_factor,
        norm="weightnorm",
        non_linearity="relu",
        kernel_size=3,
        use_partial_padding=False,  # pylint: disable=unused-argument
    ):
        super(BottleneckLayer, self).__init__()  # pylint: disable=super-with-arguments

        self.reduction_factor = reduction_factor
        reduced_dim = int(in_dim / reduction_factor)
        self.out_dim = reduced_dim
        if self.reduction_factor > 1:
            fn = ConvNorm(in_dim, reduced_dim, kernel_size=kernel_size, use_weight_norm=(norm == "weightnorm"))
            if norm == "instancenorm":
                fn = nn.Sequential(fn, nn.InstanceNorm1d(reduced_dim, affine=True))

            self.projection_fn = fn
            self.non_linearity = nn.ReLU()
            if non_linearity == "leakyrelu":
                self.non_linearity = nn.LeakyReLU()

    def forward(self, x):
        if self.reduction_factor > 1:
            x = self.projection_fn(x)
            x = self.non_linearity(x)
        return x
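
# Illustrative usage sketch (not part of the original module). It assumes ConvNorm acts as a
# 1-D convolution over [batch, channels, time], so the bottleneck reduces the channel dimension
# from in_dim to in_dim // reduction_factor. The sizes are arbitrary examples.
def _example_bottleneck_layer():
    layer = BottleneckLayer(in_dim=512, reduction_factor=4)
    x = torch.randn(2, 512, 100)  # [batch, in_dim, time]
    y = layer(x)
    assert y.shape[1] == layer.out_dim == 128  # channels reduced by reduction_factor
    return y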


class GLUActivation(nn.Module):
    """Gated Linear Unit (GLU) style activation.

    The input is split in half along the channel dimension: the second half is passed through a
    LeakyReLU and used as a gate that multiplies the first half. (The standard GLU uses a sigmoid
    gate; this variant uses a LeakyReLU gate instead.)
    """

    def __init__(self, slope: float):
        super().__init__()
        self.lrelu = nn.LeakyReLU(slope)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        out, gate = x.chunk(2, dim=1)
        x = out * self.lrelu(gate)
        return x
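
# Illustrative usage sketch (not part of the original module). The channel dimension must be
# even, since chunk(2, dim=1) splits it into a value half and a gate half.
def _example_glu_activation():
    glu = GLUActivation(slope=0.3)
    x = torch.randn(2, 16, 50)  # [batch, channels, time]
    y = glu(x)
    assert y.shape == (2, 8, 50)  # half the channels remain after gating
    return y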


class StyleEmbedAttention(nn.Module):
    def __init__(self, query_dim: int, key_dim: int, num_units: int, num_heads: int):
        super().__init__()
        self.num_units = num_units
        self.num_heads = num_heads
        self.key_dim = key_dim

        self.W_query = nn.Linear(in_features=query_dim, out_features=num_units, bias=False)
        self.W_key = nn.Linear(in_features=key_dim, out_features=num_units, bias=False)
        self.W_value = nn.Linear(in_features=key_dim, out_features=num_units, bias=False)

    def forward(self, query: torch.Tensor, key_soft: torch.Tensor) -> torch.Tensor:
        values = self.W_value(key_soft)
        split_size = self.num_units // self.num_heads
        values = torch.stack(torch.split(values, split_size, dim=2), dim=0)

        out_soft = scores_soft = None
        querys = self.W_query(query)  # [N, T_q, num_units]
        keys = self.W_key(key_soft)  # [N, T_k, num_units]

        # [h, N, T_q, num_units/h]
        querys = torch.stack(torch.split(querys, split_size, dim=2), dim=0)
        # [h, N, T_k, num_units/h]
        keys = torch.stack(torch.split(keys, split_size, dim=2), dim=0)

        # score = softmax(QK^T / (d_k ** 0.5))
        scores_soft = torch.matmul(querys, keys.transpose(2, 3))  # [h, N, T_q, T_k]
        scores_soft = scores_soft / (self.key_dim**0.5)
        scores_soft = F.softmax(scores_soft, dim=3)

        # out = score * V
        # [h, N, T_q, num_units/h]
        out_soft = torch.matmul(scores_soft, values)
        out_soft = torch.cat(torch.split(out_soft, 1, dim=0), dim=3).squeeze(0)  # [N, T_q, num_units]

        return out_soft  # , scores_soft
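
# Illustrative usage sketch (not part of the original module): multi-head attention from a short
# query sequence over a bank of key/value tokens. num_units must be divisible by num_heads;
# the sizes below are arbitrary examples.
def _example_style_embed_attention():
    attn = StyleEmbedAttention(query_dim=128, key_dim=256, num_units=256, num_heads=4)
    query = torch.randn(2, 1, 128)  # [N, T_q, query_dim]
    keys = torch.randn(2, 10, 256)  # [N, T_k, key_dim]
    out = attn(query, keys)
    assert out.shape == (2, 1, 256)  # [N, T_q, num_units]
    return out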


class EmbeddingPadded(nn.Module):
    def __init__(self, num_embeddings: int, embedding_dim: int, padding_idx: int):
        super().__init__()
        padding_mult = torch.ones((num_embeddings, 1), dtype=torch.int64)
        padding_mult[padding_idx] = 0
        self.register_buffer("padding_mult", padding_mult)
        self.embeddings = nn.parameter.Parameter(initialize_embeddings((num_embeddings, embedding_dim)))

    def forward(self, idx: torch.Tensor) -> torch.Tensor:
        embeddings_zeroed = self.embeddings * self.padding_mult
        x = F.embedding(idx, embeddings_zeroed)
        return x
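
# Illustrative usage sketch (not part of the original module): lookups at padding_idx always
# return zero vectors, because that row of the embedding matrix is masked out on every forward pass.
def _example_embedding_padded():
    emb = EmbeddingPadded(num_embeddings=50, embedding_dim=64, padding_idx=0)
    idx = torch.tensor([[5, 7, 0, 0]])  # trailing zeros act as padding
    out = emb(idx)
    assert out.shape == (1, 4, 64)
    assert torch.all(out[0, 2:] == 0)  # padded positions map to zero vectors
    return out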


class EmbeddingProjBlock(nn.Module):
    def __init__(self, embedding_dim: int):
        super().__init__()
        self.layers = nn.ModuleList(
            [
                nn.Linear(embedding_dim, embedding_dim),
                nn.LeakyReLU(0.3),
                nn.Linear(embedding_dim, embedding_dim),
                nn.LeakyReLU(0.3),
            ]
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        res = x
        for layer in self.layers:
            x = layer(x)
        x = x + res
        return x
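
# Illustrative usage sketch (not part of the original module): a two-layer LeakyReLU MLP with a
# residual connection, so the output keeps the input's shape.
def _example_embedding_proj_block():
    block = EmbeddingProjBlock(embedding_dim=128)
    x = torch.randn(4, 128)
    y = block(x)
    assert y.shape == x.shape  # residual added back after the projection layers
    return y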


class LinearNorm(nn.Module):
    def __init__(self, in_features: int, out_features: int, bias: bool = False):
        super().__init__()
        self.linear = nn.Linear(in_features, out_features, bias)

        nn.init.xavier_uniform_(self.linear.weight)
        if bias:
            nn.init.constant_(self.linear.bias, 0.0)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.linear(x)
        return x


class STL(nn.Module):
    """
    A PyTorch module for the Style Token Layer (STL) as described in
    "Style Tokens: Unsupervised Style Modeling, Control and Transfer in End-to-End Speech Synthesis"
    (https://arxiv.org/abs/1803.09017)

    The STL applies a multi-headed attention mechanism over the learned style tokens,
    using the text input as the query and the style tokens as the keys and values.
    The output of the attention mechanism is used as the text's style embedding.

    Args:
        n_hidden (int): Number of hidden dimensions.
        token_num (int): The number of style tokens.
    """

    def __init__(self, n_hidden: int, token_num: int):
        super(STL, self).__init__()  # pylint: disable=super-with-arguments

        num_heads = 1
        E = n_hidden
        self.token_num = token_num
        self.embed = nn.Parameter(torch.FloatTensor(self.token_num, E // num_heads))
        d_q = E // 2
        d_k = E // num_heads
        self.attention = StyleEmbedAttention(query_dim=d_q, key_dim=d_k, num_units=E, num_heads=num_heads)

        torch.nn.init.normal_(self.embed, mean=0, std=0.5)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        N = x.size(0)
        query = x.unsqueeze(1)  # [N, 1, E//2]

        keys_soft = torch.tanh(self.embed).unsqueeze(0).expand(N, -1, -1)  # [N, token_num, E // num_heads]

        # Weighted sum
        emotion_embed_soft = self.attention(query, keys_soft)

        return emotion_embed_soft
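
# Illustrative usage sketch (not part of the original module): the query is expected to carry
# n_hidden // 2 features per batch item (matching d_q above), and the layer returns one style
# embedding of size n_hidden per item. The sizes are arbitrary examples.
def _example_stl():
    stl = STL(n_hidden=256, token_num=32)
    x = torch.randn(4, 128)  # [N, n_hidden // 2]
    style = stl(x)
    assert style.shape == (4, 1, 256)  # [N, 1, n_hidden]
    return style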
