# XY_Tokenizer_TTSD_V0_hf / configuration_xy_tokenizer.py
# Provenance (Hugging Face Hub page scrape): author MCplayer,
# commit afc7015 "support auto model for moss-ttsd", file size 3.49 kB.
# coding=utf-8
# Copyright 2024 Descript and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""XYTokenizer model configuration"""
from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging

# Module-level logger, following the transformers convention of one logger
# per module keyed on its import path.
logger = logging.get_logger(__name__)
class XYTokenizerConfig(PretrainedConfig):
    r"""
    Configuration class for [`XYTokenizerModel`]. Instantiating it with the
    given arguments defines the XY Tokenizer model architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to
    control the model outputs. Read the documentation from [`PretrainedConfig`]
    for more information.

    Args:
        input_sample_rate (`int`, *optional*, defaults to 16000):
            The sampling rate of the input audio.
        output_sample_rate (`int`, *optional*, defaults to 16000):
            The sampling rate of the output audio.
        encoder_downsample_rate (`int`, *optional*, defaults to 1280):
            The total downsampling factor of the encoder part.
        decoder_upsample_rate (`int`, *optional*, defaults to 1920):
            The total upsampling factor of the decoder part.
        code_dim (`int`, *optional*, defaults to 1280):
            The dimension of the code embeddings.

    Every remaining keyword argument from the original YAML/dict config (for
    example `semantic_encoder_d_model`, hidden dimension for the semantic
    encoder, defaulting to 1280; or `num_quantizers`, number of residual
    quantizers, defaulting to 32) is accepted dynamically through ``**kwargs``
    and kept in ``self.params``.
    """

    model_type = "xy_tokenizer"

    # NOTE: a production-ready config would flatten all nested kwargs from the
    # original `generator_params` and declare each parameter explicitly; this
    # simplified version takes the rest through **kwargs instead.
    def __init__(
        self,
        input_sample_rate=16000,
        output_sample_rate=16000,
        encoder_downsample_rate=1280,
        decoder_upsample_rate=1920,
        code_dim=1280,
        **kwargs,
    ):
        # Bind the explicitly declared parameters as instance attributes.
        declared = (
            ("input_sample_rate", input_sample_rate),
            ("output_sample_rate", output_sample_rate),
            ("encoder_downsample_rate", encoder_downsample_rate),
            ("decoder_upsample_rate", decoder_upsample_rate),
            ("code_dim", code_dim),
        )
        for attr_name, attr_value in declared:
            setattr(self, attr_name, attr_value)
        # Shortcut: stash every undeclared parameter in one dict rather than
        # listing each one explicitly.
        self.params = kwargs
        super().__init__(**kwargs)
__all__ = ["XYTokenizerConfig"]