| import copy |
|
|
| from transformers import AutoConfig, Qwen3Config |
| from transformers.configuration_utils import PretrainedConfig |
| from transformers.utils import logging |
|
|
| from .configuration_dinov3_vit import DINOv3ViTConfig |
|
|
# Module-level logger from the transformers logging utilities; VectorLLMConfig uses it to report when a sub-config falls back to defaults.
| logger = logging.get_logger(__name__) |
|
|
class ProjectorConfig(PretrainedConfig):
    """Hyper-parameter container for the projector sub-module.

    The class only stores the values it is given; how they are consumed is
    decided by the projector implementation, which is not part of this file.

    Args:
        visual_hidden_size: Hidden size on the vision side. `VectorLLMConfig`
            fills this from its vision config's ``hidden_size``.
        llm_hidden_size: Hidden size on the language-model side.
            `VectorLLMConfig` fills this from its LLM config's ``hidden_size``.
        depth: Projector depth (presumably the number of layers -- confirm
            against the projector module).
        hidden_act: Name of the activation function (presumably resolved by
            the projector module -- confirm there).
        bias: Bias flag (presumably toggles bias terms in the projector's
            layers -- confirm there).
        **kwargs: Forwarded untouched to `PretrainedConfig.__init__`.
    """

    model_type = "projector"
    _auto_class = "AutoConfig"

    def __init__(
        self,
        visual_hidden_size: int = 4096,
        llm_hidden_size: int = 4096,
        depth: int = 2,
        hidden_act: str = "gelu",
        bias: bool = True,
        **kwargs,
    ):
        # The two hidden sizes are stored verbatim (assigned left to right).
        self.visual_hidden_size, self.llm_hidden_size = visual_hidden_size, llm_hidden_size
        # Remaining projector hyper-parameters, also stored verbatim.
        self.depth, self.hidden_act, self.bias = depth, hidden_act, bias
        # Base-class initialisation runs last, after the projector fields
        # have been set on the instance.
        super().__init__(**kwargs)
|
|
class VectorLLMConfig(PretrainedConfig):
    """Composite configuration bundling a vision, a language-model and a projector config.

    The vision part is a `DINOv3ViTConfig`, the language part a `Qwen3Config`,
    and the projector part a `ProjectorConfig` derived from the hidden sizes
    of the other two.

    Args:
        vision_config: ``None`` (use `DINOv3ViTConfig` defaults), a dict of
            keyword arguments for `DINOv3ViTConfig`, or an already-built
            `PretrainedConfig` instance, which is used as-is.
        llm_config: ``None`` (use `Qwen3Config` defaults), a dict of keyword
            arguments for `Qwen3Config`, or an already-built
            `PretrainedConfig` instance, which is used as-is.
        regression_size: Stored verbatim; its meaning is defined by the model
            code, which is not part of this file.
        projector_depth: ``depth`` passed to the derived `ProjectorConfig`.
        pixel_idx: Stored verbatim; its meaning is defined by the model code,
            which is not part of this file.
        **kwargs: Forwarded to `PretrainedConfig.__init__`.
    """

    model_type = 'vectorllm'
    # Must be a plain string: a trailing comma here would turn the value into
    # a 1-tuple, which is not a usable class name.
    processor_class = "VectorLLMProcessor"
    is_composition = True

    def __init__(
            self,
            vision_config=None,
            llm_config=None,
            regression_size=(128, 128),
            projector_depth=2,
            pixel_idx=0,
            **kwargs):
        # The base class goes first so that any stale derived entries arriving
        # through kwargs (e.g. from a serialised config) are overwritten below.
        super().__init__(**kwargs)

        if vision_config is None:
            vision_config = {}
            logger.info('vision_config is None. Initializing the DinoV3Config with default values.')

        if llm_config is None:
            llm_config = {}
            logger.info('llm_config is None. Initializing the Qwen3 config with default values.')

        self.vision_config = self._as_sub_config(vision_config, DINOv3ViTConfig)
        self.llm_config = self._as_sub_config(llm_config, Qwen3Config)
        # Alias: the same object is exposed under both names.
        self.text_config = self.llm_config

        self.hidden_size = self.llm_config.hidden_size
        self.vision_hidden_size = self.vision_config.hidden_size

        # Kept as an attribute so it is part of the serialised dict and is
        # handed back to __init__ when the config is rebuilt from that dict;
        # otherwise a non-default depth would silently revert to the default.
        self.projector_depth = projector_depth
        self.projector_config = ProjectorConfig(
            visual_hidden_size=self.vision_hidden_size,
            llm_hidden_size=self.hidden_size,
            depth=projector_depth,
        )

        self.regression_size = regression_size
        self.pixel_idx = pixel_idx
        # Always forced to False, regardless of what kwargs carried.
        self.tie_word_embeddings = False
        # One extra token on top of the register tokens (presumably the CLS
        # token, going by the attribute name).
        self.num_cls_register_tokens = 1 + self.vision_config.num_register_tokens

    @staticmethod
    def _as_sub_config(config, config_cls):
        """Return `config` unchanged if it is already a config object, else build `config_cls(**config)`."""
        if isinstance(config, PretrainedConfig):
            return config
        return config_cls(**config)

    def to_dict(self):
        """
        Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`].

        The nested config objects are replaced by their own `to_dict()` output;
        `text_config` refers to the same dictionary object as `llm_config`.

        Returns:
            `Dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance.
        """
        output = copy.deepcopy(self.__dict__)
        output['vision_config'] = self.vision_config.to_dict()
        output['llm_config'] = self.llm_config.to_dict()
        output['text_config'] = output['llm_config']
        output['projector_config'] = self.projector_config.to_dict()
        output['model_type'] = self.__class__.model_type
        return output