| from transformers import PretrainedConfig, Qwen3Config | |
class MossAudioConfig(PretrainedConfig):
    """Composite configuration for the ``moss_audio`` model type.

    Holds an audio-side configuration next to a ``Qwen3Config`` language
    configuration, together with a few top-level settings that are stored
    on the instance unchanged.

    Args:
        audio_config: Audio-side configuration. Stored exactly as given;
            no type conversion is performed in this class.
        language_config: Language-model configuration. ``None`` produces
            a default ``Qwen3Config()``; a ``dict`` is expanded into
            ``Qwen3Config(**language_config)``; any other object is
            stored as-is.
        adapter_hidden_size: Stored as ``self.adapter_hidden_size``.
            NOTE(review): presumably the hidden width of the
            audio-to-language adapter -- confirm against the model code.
        ignore_index: Stored as ``self.ignore_index``.
            NOTE(review): presumably the label value skipped by the
            loss -- confirm against the model code.
        deepstack_num_inject_layers: Stored as
            ``self.deepstack_num_inject_layers``.
        **kwargs: Forwarded to ``PretrainedConfig.__init__``.
    """

    model_type = "moss_audio"
    is_composition = True

    def __init__(
        self,
        audio_config=None,
        language_config=None,
        adapter_hidden_size=8192,
        ignore_index=-100,
        deepstack_num_inject_layers=None,
        **kwargs,
    ):
        # Normalise the language config: the two special cases are mutually
        # exclusive, and anything else is assumed to be a ready-made config.
        if language_config is None:
            language_config = Qwen3Config()
        elif isinstance(language_config, dict):
            language_config = Qwen3Config(**language_config)

        self.audio_config = audio_config
        self.language_config = language_config
        self.adapter_hidden_size = adapter_hidden_size
        self.ignore_index = ignore_index
        self.deepstack_num_inject_layers = deepstack_num_inject_layers

        # Mirror selected language-config fields into the keyword arguments
        # handed to the base constructor. Values the caller passed
        # explicitly win; fields missing on the language config become None.
        mirrored_fields = (
            "num_hidden_layers",
            "eos_token_id",
            "bos_token_id",
            "vocab_size",
        )
        for field_name in mirrored_fields:
            fallback = getattr(language_config, field_name, None)
            kwargs.setdefault(field_name, fallback)

        super().__init__(**kwargs)