from transformers import PretrainedConfig, Qwen2Config, SiglipVisionConfig


def _as_sub_config(config_cls, value):
    """Return ``value`` as an instance of ``config_cls``.

    Args:
        config_cls: The config class to produce (e.g. ``Qwen2Config``).
        value: ``None`` (use the class defaults), an existing ``config_cls``
            instance (returned unchanged), or a mapping of keyword arguments
            for ``config_cls``.

    Returns:
        An instance of ``config_cls``.
    """
    if value is None:
        return config_cls()
    if isinstance(value, config_cls):
        # Already built by the caller; unpacking it with ``**`` would raise
        # TypeError because a config object is not a mapping.
        return value
    return config_cls(**value)


class DoubutsuNextConfig(PretrainedConfig):
    """Composite configuration holding a text and a vision sub-config.

    Attributes are populated from two optional keyword arguments:

    * ``text_config`` -> stored as a ``Qwen2Config`` on ``self.text_config``
    * ``vision_config`` -> stored as a ``SiglipVisionConfig`` on
      ``self.vision_config``

    Each may be omitted or ``None`` (class defaults are used), a mapping of
    keyword arguments, or an already-constructed config instance. All other
    keyword arguments are forwarded to ``PretrainedConfig.__init__``.
    """

    model_type = "doubutsu_next"

    def __init__(self, **kwargs) -> None:
        """Build the sub-configs, then initialise the base config.

        Args:
            **kwargs: May contain ``text_config`` and/or ``vision_config``
                (see the class docstring for accepted forms); both are
                removed before the remainder is passed to the parent class.
        """
        # Sub-configs are popped so they are not passed to the parent
        # initialiser, and assigned before ``super().__init__`` runs (same
        # order as the original code).
        self.text_config = _as_sub_config(
            Qwen2Config, kwargs.pop("text_config", None)
        )
        self.vision_config = _as_sub_config(
            SiglipVisionConfig, kwargs.pop("vision_config", None)
        )
        super().__init__(**kwargs)