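"""String-name registries for the CosyVoice transformer building blocks
(activation functions, subsampling front-ends, positional encodings and
attention variants)."""
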
import torch

from cosyvoice.transformer.activation import Swish
from cosyvoice.transformer.subsampling import (
    LinearNoSubsampling,
    EmbedinigNoSubsampling,
    Conv1dSubsampling2,
    Conv2dSubsampling4,
    Conv2dSubsampling6,
    Conv2dSubsampling8,
)
from cosyvoice.transformer.embedding import (
    PositionalEncoding,
    RelPositionalEncoding,
    WhisperPositionalEncoding,
    LearnablePositionalEncoding,
    NoPositionalEncoding,
)
from cosyvoice.transformer.attention import (
    MultiHeadedAttention,
    RelPositionMultiHeadedAttention,
    BlockRelPositionMultiHeadedAttention,
)
from cosyvoice.transformer.embedding import EspnetRelPositionalEncoding
from cosyvoice.transformer.subsampling import LegacyLinearNoSubsampling
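
# Activation functions selectable by name. "swish" resolves to torch.nn.SiLU
# when available and falls back to the local Swish implementation otherwise.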
COSYVOICE_ACTIVATION_CLASSES = {
    "hardtanh": torch.nn.Hardtanh,
    "tanh": torch.nn.Tanh,
    "relu": torch.nn.ReLU,
    "selu": torch.nn.SELU,
    "swish": getattr(torch.nn, "SiLU", Swish),
    "gelu": torch.nn.GELU,
}
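
# Subsampling / input projection front-ends selectable by name;
# "paraformer_dummy" is a pass-through torch.nn.Identity.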
COSYVOICE_SUBSAMPLE_CLASSES = {
    "linear": LinearNoSubsampling,
    "linear_legacy": LegacyLinearNoSubsampling,
    "embed": EmbedinigNoSubsampling,
    "conv1d2": Conv1dSubsampling2,
    "conv2d": Conv2dSubsampling4,
    "conv2d6": Conv2dSubsampling6,
    "conv2d8": Conv2dSubsampling8,
    "paraformer_dummy": torch.nn.Identity,
}
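
# Positional encoding variants selectable by name.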
COSYVOICE_EMB_CLASSES = {
    "embed": PositionalEncoding,
    "abs_pos": PositionalEncoding,
    "rel_pos": RelPositionalEncoding,
    "rel_pos_espnet": EspnetRelPositionalEncoding,
    "no_pos": NoPositionalEncoding,
    "abs_pos_whisper": WhisperPositionalEncoding,
    "embed_learnable_pe": LearnablePositionalEncoding,
}
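
# Self-attention variants selectable by name.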
COSYVOICE_ATTENTION_CLASSES = {
    "selfattn": MultiHeadedAttention,
    "rel_selfattn": RelPositionMultiHeadedAttention,
    "block_rel_selfattn": BlockRelPositionMultiHeadedAttention,
}
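
# Illustrative usage sketch (constructor arguments assume the wenet-style
# signatures these classes follow):
#
#     activation = COSYVOICE_ACTIVATION_CLASSES["swish"]()
#     pos_enc = COSYVOICE_EMB_CLASSES["rel_pos"](256, 0.1)  # d_model, dropout_rate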