# XY_Tokenizer_TTSD_V0_hf / configuration_xy_tokenizer.py
# Provenance (Hugging Face Hub page scrape): author MCplayer,
# commit afc7015 "support auto model for moss-ttsd", file size 3.49 kB.
# coding=utf-8
# Copyright 2024 Descript and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""XYTokenizer model configuration"""
from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging

# Module-level logger, following the transformers convention of one logger
# per module keyed on its import path.
logger = logging.get_logger(__name__)
class XYTokenizerConfig(PretrainedConfig):
    r"""
    Configuration class for [`XYTokenizerModel`]. Instantiating it with the
    given arguments defines the XY Tokenizer model architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to
    control the model outputs. Read the documentation from [`PretrainedConfig`]
    for more information.

    Args:
        input_sample_rate (`int`, *optional*, defaults to 16000):
            The sampling rate of the input audio.
        output_sample_rate (`int`, *optional*, defaults to 16000):
            The sampling rate of the output audio.
        encoder_downsample_rate (`int`, *optional*, defaults to 1280):
            The total downsampling factor of the encoder part.
        decoder_upsample_rate (`int`, *optional*, defaults to 1920):
            The total upsampling factor of the decoder part.
        code_dim (`int`, *optional*, defaults to 1280):
            The dimension of the code embeddings.

    Every remaining keyword argument from the original YAML/dict config (for
    example `semantic_encoder_d_model`, hidden dimension for the semantic
    encoder, defaulting to 1280; or `num_quantizers`, number of residual
    quantizers, defaulting to 32) is accepted dynamically through ``**kwargs``
    and kept in ``self.params``.
    """

    model_type = "xy_tokenizer"

    # NOTE: a production-ready config would flatten all nested kwargs from the
    # original `generator_params` and declare each parameter explicitly; this
    # simplified version takes the rest through **kwargs instead.
    def __init__(
        self,
        input_sample_rate=16000,
        output_sample_rate=16000,
        encoder_downsample_rate=1280,
        decoder_upsample_rate=1920,
        code_dim=1280,
        **kwargs,
    ):
        # Bind the explicitly declared parameters as instance attributes.
        declared = (
            ("input_sample_rate", input_sample_rate),
            ("output_sample_rate", output_sample_rate),
            ("encoder_downsample_rate", encoder_downsample_rate),
            ("decoder_upsample_rate", decoder_upsample_rate),
            ("code_dim", code_dim),
        )
        for attr_name, attr_value in declared:
            setattr(self, attr_name, attr_value)
        # Shortcut: stash every undeclared parameter in one dict rather than
        # listing each one explicitly.
        self.params = kwargs
        super().__init__(**kwargs)
__all__ = ["XYTokenizerConfig"]