| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | """ MossTTSDelay model configuration """ |
| |
|
| | from typing import Optional, Union |
| | from transformers.configuration_utils import PretrainedConfig |
| | from transformers.utils import logging |
| | from transformers.models.qwen3 import Qwen3Config |
| |
|
| | logger = logging.get_logger(__name__) |
| |
|
| |
|
class MossTTSDelayConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`MossTTSDelayModel`]. It is used to instantiate an
    MossTTSDelay model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the MossTTSDelay [MossTTSDelay-8B](https://huggingface.co/OpenMOSS/mosstts-8b) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        language_config (`Union[Qwen3Config, dict]`, *optional*):
            Configuration for the backbone language model (Qwen3). A `dict` is expanded into a
            `Qwen3Config`; `None` yields a default `Qwen3Config()`.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        n_vq (`int`, *optional*, defaults to 32):
            Number of additional VQ (Vector Quantization) heads/channels for audio.
            Determines the number of codebooks used in the audio representation.
        pad_token_id (`int`, *optional*, defaults to 151643):
            Padding token ID. Forwarded to [`PretrainedConfig`] so that the base class stores it.
        im_start_token_id (`int`, *optional*, defaults to 151644):
            Token ID stored as `im_start_token_id` (presumably the ChatML-style start-of-turn marker;
            confirm against the tokenizer).
        im_end_token_id (`int`, *optional*, defaults to 151645):
            Token ID stored as `im_end_token_id` (presumably the ChatML-style end-of-turn marker;
            confirm against the tokenizer).
        audio_vocab_size (`int`, *optional*, defaults to 1024):
            Vocabulary size for the audio tokens (codebooks 1 to N).
        audio_user_slot_token_id (`int`, *optional*, defaults to 151654):
            The specific token ID used as a placeholder/slot for user-side audio inputs in the prompt.
        audio_assistant_gen_slot_token_id (`int`, *optional*, defaults to 151656):
            The specific token ID representing the generation slot for the assistant's audio output.
            Acting as the trigger for the TTS generation process.
        audio_assistant_delay_slot_token_id (`int`, *optional*, defaults to 151662):
            The token ID used in the 'Delay Pattern' paradigm to represent the delayed/offset positions
            between different VQ channels.
        audio_start_token_id (`int`, *optional*, defaults to 151652):
            Special token ID used to denote the start of an audio sequence in the stream.
        audio_end_token_id (`int`, *optional*, defaults to 151653):
            Special token ID used to denote the end of an audio sequence (EOS for audio).
        audio_pad_code (`int`, *optional*, defaults to 1024):
            The padding value used within the audio VQ codebooks. Typically equals `audio_vocab_size`.
        sampling_rate (`int`, *optional*, defaults to 24000):
            Audio sampling rate (presumably in Hz; confirm against the audio tokenizer).
        **kwargs:
            Remaining keyword arguments, passed through to [`PretrainedConfig`].
    """

    model_type = "moss_tts_delay"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        language_config: Optional[Union[Qwen3Config, dict]] = None,
        initializer_range: float = 0.02,
        n_vq: int = 32,
        pad_token_id: int = 151643,
        im_start_token_id: int = 151644,
        im_end_token_id: int = 151645,
        audio_vocab_size: int = 1024,
        audio_user_slot_token_id: int = 151654,
        audio_assistant_gen_slot_token_id: int = 151656,
        audio_assistant_delay_slot_token_id: int = 151662,
        audio_start_token_id: int = 151652,
        audio_end_token_id: int = 151653,
        audio_pad_code: int = 1024,
        sampling_rate: int = 24000,
        **kwargs,
    ):
        # Normalise the backbone config: accept a ready-made config object,
        # a plain dict (e.g. when re-loading from JSON), or nothing at all.
        if isinstance(language_config, dict):
            self.language_config = Qwen3Config(**language_config)
        elif language_config is None:
            self.language_config = Qwen3Config()
        else:
            self.language_config = language_config

        self.initializer_range = initializer_range
        self.n_vq = n_vq

        # Audio codebook layout and the special token IDs of the audio stream.
        self.audio_vocab_size = audio_vocab_size
        self.audio_user_slot_token_id = audio_user_slot_token_id
        self.audio_assistant_gen_slot_token_id = audio_assistant_gen_slot_token_id
        self.audio_assistant_delay_slot_token_id = audio_assistant_delay_slot_token_id
        self.audio_start_token_id = audio_start_token_id
        self.audio_end_token_id = audio_end_token_id
        self.audio_pad_code = audio_pad_code
        self.sampling_rate = sampling_rate

        # Mirror the backbone dimensions at the top level of this config.
        self.hidden_size = self.language_config.hidden_size
        self.vocab_size = self.language_config.vocab_size

        self.im_start_token_id = im_start_token_id
        self.im_end_token_id = im_end_token_id

        # `pad_token_id` must be forwarded to the base class instead of being
        # assigned beforehand: `PretrainedConfig.__init__` sets
        # `self.pad_token_id` from its own keyword arguments and would
        # otherwise reset a value assigned earlier to `None`.
        super().__init__(pad_token_id=pad_token_id, **kwargs)

    def to_dict(self):
        """Serialize this configuration to a dictionary.

        Returns:
            The dictionary produced by `PretrainedConfig.to_dict`, with the
            `"language_config"` entry replaced by the nested config's own
            `to_dict()` output when it provides one, or by the raw attribute
            value otherwise.
        """
        output = super().to_dict()
        if hasattr(self.language_config, "to_dict"):
            output["language_config"] = self.language_config.to_dict()
        else:
            output["language_config"] = self.language_config
        return output
| |
|