| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| """ MossTTSDelay model configuration """ |
|
|
| from typing import Optional, Union |
| from transformers.configuration_utils import PretrainedConfig |
| from transformers.utils import logging |
| from transformers.models.qwen3 import Qwen3Config |
|
|
| logger = logging.get_logger(__name__) |
|
|
|
|
class MossTTSDelayConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`MossTTSDelayModel`]. It is used to instantiate a
    MossTTSDelay model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the MossTTSDelay [MossTTSDelay-8B](https://huggingface.co/OpenMOSS/mosstts-8b) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        language_config (`Union[Qwen3Config, dict]`, *optional*):
            Configuration for the backbone language model (Qwen3). A `dict` is expanded into a
            [`Qwen3Config`]; `None` falls back to a default-constructed [`Qwen3Config`].
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        n_vq (`int`, *optional*, defaults to 32):
            Number of additional VQ (Vector Quantization) heads/channels for audio.
            Determines the number of codebooks used in the audio representation.
        pad_token_id (`int`, *optional*, defaults to 151643):
            Token ID used for padding. Forwarded to [`PretrainedConfig`] so that the base class
            stores it instead of resetting it.
        im_start_token_id (`int`, *optional*, defaults to 151644):
            Token ID stored as `im_start_token_id` (presumably the chat-template turn-start marker;
            confirm against the tokenizer).
        im_end_token_id (`int`, *optional*, defaults to 151645):
            Token ID stored as `im_end_token_id` (presumably the chat-template turn-end marker;
            confirm against the tokenizer).
        audio_vocab_size (`int`, *optional*, defaults to 1024):
            Vocabulary size for the audio tokens (codebooks 1 to N).
        audio_user_slot_token_id (`int`, *optional*, defaults to 151654):
            The specific token ID used as a placeholder/slot for user-side audio inputs in the prompt.
        audio_assistant_gen_slot_token_id (`int`, *optional*, defaults to 151656):
            The specific token ID representing the generation slot for the assistant's audio output.
            Acting as the trigger for the TTS generation process.
        audio_assistant_delay_slot_token_id (`int`, *optional*, defaults to 151662):
            The token ID used in the 'Delay Pattern' paradigm to represent the delayed/offset positions
            between different VQ channels.
        audio_start_token_id (`int`, *optional*, defaults to 151652):
            Special token ID used to denote the start of an audio sequence in the stream.
        audio_end_token_id (`int`, *optional*, defaults to 151653):
            Special token ID used to denote the end of an audio sequence (EOS for audio).
        audio_pad_code (`int`, *optional*, defaults to 1024):
            The padding value used within the audio VQ codebooks. Typically equals `audio_vocab_size`.
        sampling_rate (`int`, *optional*, defaults to 24000):
            Value stored as `sampling_rate` (presumably the audio sampling rate in Hz; confirm
            against the audio codec in use).
    """

    model_type = "moss_tts_delay"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        language_config: Optional[Union[Qwen3Config, dict]] = None,
        initializer_range: float = 0.02,
        n_vq: int = 32,
        pad_token_id: int = 151643,
        im_start_token_id: int = 151644,
        im_end_token_id: int = 151645,
        audio_vocab_size: int = 1024,
        audio_user_slot_token_id: int = 151654,
        audio_assistant_gen_slot_token_id: int = 151656,
        audio_assistant_delay_slot_token_id: int = 151662,
        audio_start_token_id: int = 151652,
        audio_end_token_id: int = 151653,
        audio_pad_code: int = 1024,
        sampling_rate: int = 24000,
        **kwargs,
    ):
        # Normalise the backbone configuration to a Qwen3Config instance.
        if isinstance(language_config, dict):
            self.language_config = Qwen3Config(**language_config)
        elif language_config is None:
            self.language_config = Qwen3Config()
        else:
            self.language_config = language_config

        self.initializer_range = initializer_range
        self.n_vq = n_vq
        self.audio_vocab_size = audio_vocab_size
        self.audio_user_slot_token_id = audio_user_slot_token_id
        self.audio_assistant_gen_slot_token_id = audio_assistant_gen_slot_token_id
        self.audio_assistant_delay_slot_token_id = audio_assistant_delay_slot_token_id
        self.audio_start_token_id = audio_start_token_id
        self.audio_end_token_id = audio_end_token_id
        self.audio_pad_code = audio_pad_code
        self.sampling_rate = sampling_rate

        # Mirror the backbone dimensions at the top level of this config.
        self.hidden_size = self.language_config.hidden_size
        self.vocab_size = self.language_config.vocab_size
        self.im_start_token_id = im_start_token_id
        self.im_end_token_id = im_end_token_id

        # `pad_token_id` is a named parameter and therefore never lands in
        # `kwargs`. It must be forwarded explicitly: the base initializer
        # otherwise assigns its own default (None) to `self.pad_token_id`,
        # discarding any value set on `self` beforehand.
        super().__init__(pad_token_id=pad_token_id, **kwargs)

    def to_dict(self):
        """Serialize this configuration to a plain dictionary.

        Returns:
            `dict`: The base-class dictionary with `"language_config"` replaced by the
            nested configuration's own `to_dict()` output when available, or by the
            stored object unchanged otherwise.
        """
        output = super().to_dict()
        if hasattr(self.language_config, "to_dict"):
            output["language_config"] = self.language_config.to_dict()
        else:
            output["language_config"] = self.language_config
        return output
|
|