# TTSInferencingApp / module_classes.py
# (Hugging Face upload header: "ipreetichouhan's picture / Added App files / 108ccb4 verified")
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
class CNNPrenet(torch.nn.Module):
    """Three-layer 1-D convolutional encoder prenet.

    Maps a (batch, time) input to a (batch, 512, time) feature map.
    Each convolution preserves the time length (kernel_size=3, padding=1),
    and a final tanh bounds the output to [-1, 1].
    """

    def __init__(self):
        super(CNNPrenet, self).__init__()
        # Conv -> BatchNorm -> ReLU -> Dropout, repeated three times.
        # The first conv lifts the single input channel to 512 features.
        self.conv_layers = nn.Sequential(
            nn.Conv1d(in_channels=1, out_channels=512, kernel_size=3, padding=1),
            nn.BatchNorm1d(512),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Conv1d(in_channels=512, out_channels=512, kernel_size=3, padding=1),
            nn.BatchNorm1d(512),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Conv1d(in_channels=512, out_channels=512, kernel_size=3, padding=1),
            nn.BatchNorm1d(512),
            nn.ReLU(),
            nn.Dropout(0.1),
        )

    def forward(self, x):
        """Encode the input sequence.

        Arguments
        ---------
        x : torch.Tensor
            (batch, time) tensor; a channel axis is inserted for Conv1d.

        Returns
        -------
        torch.Tensor
            (batch, 512, time) tensor squashed into [-1, 1] by tanh.
        """
        # Insert the channel dimension Conv1d expects: (batch, 1, time).
        x = x.unsqueeze(1)
        x = self.conv_layers(x)
        # NOTE: the original code called x.squeeze(1) here, but after the
        # conv stack dim 1 has size 512, so squeeze(1) was a no-op and has
        # been removed; the channel dimension is intentionally kept.
        # Scale the output to the range [-1, 1].
        return torch.tanh(x)
class CNNDecoderPrenet(nn.Module):
    """Two-layer MLP decoder prenet followed by a linear projection.

    Consumes frames shaped (batch, input_dim, time) and emits
    (batch, final_dim, time); dropout follows each ReLU layer.
    """

    def __init__(self, input_dim=80, hidden_dim=256, output_dim=256, final_dim=512, dropout_rate=0.5):
        super(CNNDecoderPrenet, self).__init__()
        self.layer1 = nn.Linear(input_dim, hidden_dim)
        self.layer2 = nn.Linear(hidden_dim, output_dim)
        # Projects the prenet output up to the model dimension.
        self.linear_projection = nn.Linear(output_dim, final_dim)
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x):
        """Run the prenet.

        Arguments
        ---------
        x : torch.Tensor
            (batch, input_dim, time) tensor.

        Returns
        -------
        torch.Tensor
            (batch, final_dim, time) tensor.
        """
        # Linear layers act on the last axis, so move features there first.
        feats = x.transpose(1, 2)
        for fc in (self.layer1, self.layer2):
            feats = self.dropout(F.relu(fc(feats)))
        feats = self.linear_projection(feats)
        # Restore the (batch, features, time) layout.
        return feats.transpose(1, 2)
class CNNPostNet(torch.nn.Module):
    """Convolutional postnet that refines a predicted spectrogram.

    A pre-convolution lifts the mel channels to the embedding dimension,
    a stack of intermediate convolutions processes them, and a final
    convolution maps back down to mel channels. Each stage is followed
    by LayerNorm (over the feature axis), tanh where applicable, and
    dropout.

    Arguments
    ---------
    n_mel_channels: int
        input feature dimension for convolution layers
    postnet_embedding_dim: int
        output feature dimension for convolution layers
    postnet_kernel_size: int
        postnet convolution kernal size
    postnet_n_convolutions: int
        number of convolution layers
    postnet_dropout: float
        dropout probability fot postnet
    """

    def __init__(
        self,
        n_mel_channels=80,
        postnet_embedding_dim=512,
        postnet_kernel_size=5,
        postnet_n_convolutions=5,
        postnet_dropout=0.1,
    ):
        super(CNNPostNet, self).__init__()
        # Entry conv: mel channels -> embedding dimension.
        self.conv_pre = nn.Conv1d(
            in_channels=n_mel_channels,
            out_channels=postnet_embedding_dim,
            kernel_size=postnet_kernel_size,
            padding="same",
        )
        # Middle stack: postnet_n_convolutions - 2 embedding-sized convs.
        self.convs_intermedite = nn.ModuleList(
            nn.Conv1d(
                in_channels=postnet_embedding_dim,
                out_channels=postnet_embedding_dim,
                kernel_size=postnet_kernel_size,
                padding="same",
            )
            for _ in range(postnet_n_convolutions - 2)
        )
        # Exit conv: embedding dimension -> mel channels.
        self.conv_post = nn.Conv1d(
            in_channels=postnet_embedding_dim,
            out_channels=n_mel_channels,
            kernel_size=postnet_kernel_size,
            padding="same",
        )
        self.tanh = nn.Tanh()
        self.ln1 = nn.LayerNorm(postnet_embedding_dim)
        self.ln2 = nn.LayerNorm(postnet_embedding_dim)
        self.ln3 = nn.LayerNorm(n_mel_channels)
        self.dropout1 = nn.Dropout(postnet_dropout)
        self.dropout2 = nn.Dropout(postnet_dropout)
        self.dropout3 = nn.Dropout(postnet_dropout)

    def forward(self, x):
        """Computes the forward pass
        Arguments
        ---------
        x: torch.Tensor
            a (batch, time_steps, features) input tensor
        Returns
        -------
        output: torch.Tensor (the spectrogram predicted)
        """

        def norm(layer_norm, t):
            # LayerNorm normalizes the last axis; conv tensors are
            # (batch, features, time), so swap, normalize, swap back.
            return layer_norm(t.permute(0, 2, 1)).permute(0, 2, 1)

        out = self.dropout1(self.tanh(norm(self.ln1, self.conv_pre(x))))
        for conv in self.convs_intermedite:
            out = self.dropout2(self.tanh(norm(self.ln2, conv(out))))
        # Final stage: no tanh, just normalize and drop out.
        out = self.dropout3(norm(self.ln3, self.conv_post(out)))
        return out
class ScaledPositionalEncoding(nn.Module):
    """
    This class implements the absolute sinusoidal positional encoding function
    with an adaptive weight parameter alpha.
    PE(pos, 2i) = sin(pos/(10000^(2i/dmodel)))
    PE(pos, 2i+1) = cos(pos/(10000^(2i/dmodel)))
    Note that forward() RETURNS the scaled encoding (shape (1, time, fea));
    it does not add it to the input — the caller performs the addition.
    Arguments
    ---------
    input_size: int
        Embedding dimension.
    max_len : int, optional
        Max length of the input sequences (default 2500).
    Example
    -------
    >>> a = torch.rand((8, 120, 512))
    >>> enc = ScaledPositionalEncoding(input_size=a.shape[-1])
    >>> b = enc(a)
    >>> b.shape
    torch.Size([1, 120, 512])
    """

    def __init__(self, input_size, max_len=2500):
        super().__init__()
        # sin/cos pairs require an even feature dimension.
        if input_size % 2 != 0:
            raise ValueError(
                f"Cannot use sin/cos positional encoding with odd channels (got channels={input_size})"
            )
        self.max_len = max_len
        # Trainable scale applied to the fixed sinusoidal table.
        self.alpha = nn.Parameter(torch.ones(1))
        pe = torch.zeros(self.max_len, input_size)
        positions = torch.arange(0, self.max_len).unsqueeze(1).float()
        # 1 / 10000^(2i/dmodel), computed in log space for stability.
        denominator = torch.exp(
            torch.arange(0, input_size, 2).float()
            * -(math.log(10000.0) / input_size)
        )
        pe[:, 0::2] = torch.sin(positions * denominator)
        pe[:, 1::2] = torch.cos(positions * denominator)
        # Registered as a buffer: moves with .to()/.cuda(), saved in the
        # state dict, and excluded from gradients (no requires_grad needed).
        self.register_buffer("pe", pe.unsqueeze(0))

    def forward(self, x):
        """
        Arguments
        ---------
        x : tensor
            Input feature shape (batch, time, fea)

        Returns
        -------
        tensor
            The encoding table truncated to x's time length and scaled by
            alpha, shape (1, time, fea).
        """
        # pe is a grad-free buffer, so the original clone().detach() was
        # redundant; multiplying by alpha creates a fresh tensor and keeps
        # alpha trainable.
        return self.pe[:, : x.size(1)] * self.alpha