|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""XYTokenizer model configuration""" |
|
|
|
|
|
from transformers.configuration_utils import PretrainedConfig |
|
|
from transformers.utils import logging |
|
|
|
|
|
# Module-level logger, obtained from the `transformers.utils.logging` helper
# imported above and named after this module.
logger = logging.get_logger(__name__)
|
|
|
|
|
|
|
|
class XYTokenizerConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`XYTokenizerModel`]. It is used to instantiate an
    XY Tokenizer model according to the specified arguments, defining the model architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        input_sample_rate (`int`, *optional*, defaults to 16000):
            The sampling rate of the input audio.
        output_sample_rate (`int`, *optional*, defaults to 16000):
            The sampling rate of the output audio.
        encoder_downsample_rate (`int`, *optional*, defaults to 1280):
            The total downsampling factor of the encoder part.
        decoder_upsample_rate (`int`, *optional*, defaults to 1920):
            The total upsampling factor of the decoder part.
        code_dim (`int`, *optional*, defaults to 1280):
            The dimension of the code embeddings.
        kwargs (*optional*):
            Any further settings, for example `semantic_encoder_d_model` (hidden dimension for the semantic
            encoder) or `num_quantizers` (number of residual quantizers). This class defines no defaults for
            them; whatever the caller supplies is stored unchanged as the `params` dictionary and is also
            forwarded to `PretrainedConfig.__init__`.
    """

    # Identifier string for this configuration class.
    model_type = "xy_tokenizer"

    def __init__(
        self,
        input_sample_rate=16000,
        output_sample_rate=16000,
        encoder_downsample_rate=1280,
        decoder_upsample_rate=1920,
        code_dim=1280,
        **kwargs,
    ):
        # Explicitly named audio / codec settings.
        self.input_sample_rate = input_sample_rate
        self.output_sample_rate = output_sample_rate
        self.encoder_downsample_rate = encoder_downsample_rate
        self.decoder_upsample_rate = decoder_upsample_rate
        self.code_dim = code_dim

        # Keep every remaining keyword argument together in one dictionary so
        # the full set of extra settings can be read from `config.params`.
        # NOTE(review): the same kwargs are also handed to the base class
        # below; how it consumes them is defined by PretrainedConfig, not here.
        self.params = kwargs

        super().__init__(**kwargs)
|
|
|
|
|
|
|
|
# Public names exported by this module.
__all__ = ["XYTokenizerConfig"]