vectorllm-hf / configuration_vectorllm.py
insomnia7's picture
Upload folder using huggingface_hub
a04bbbc verified
import copy
from transformers import AutoConfig, Qwen3Config
from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging
from .configuration_dinov3_vit import DINOv3ViTConfig
# Module-level logger from the transformers logging utilities, named after this module.
logger = logging.get_logger(__name__)
class ProjectorConfig(PretrainedConfig):
    """Configuration holding the hyper-parameters of the projector module.

    The constructor only records the values it is given; how they are
    consumed is decided by the projector implementation (not in this file).

    Args:
        visual_hidden_size: Stored as ``self.visual_hidden_size``.
        llm_hidden_size: Stored as ``self.llm_hidden_size``.
        depth: Stored as ``self.depth``.
        hidden_act: Stored as ``self.hidden_act``.
        bias: Stored as ``self.bias``.
        **kwargs: Forwarded untouched to ``PretrainedConfig.__init__``.
    """

    model_type = "projector"
    _auto_class = "AutoConfig"

    def __init__(
        self,
        visual_hidden_size=4096,
        llm_hidden_size=4096,
        depth=2,
        hidden_act="gelu",
        bias=True,
        **kwargs,
    ):
        # Sizes on both sides of the projector.
        self.visual_hidden_size = visual_hidden_size
        self.llm_hidden_size = llm_hidden_size

        # Remaining projector hyper-parameters.
        self.depth = depth
        self.hidden_act = hidden_act
        self.bias = bias

        # The base-class initialiser runs last and receives only the leftover kwargs.
        super().__init__(**kwargs)
class VectorLLMConfig(PretrainedConfig):
    """Composite configuration combining a vision, an LLM and a projector config.

    The instance owns three sub-configurations:

    * ``vision_config``    -- a ``DINOv3ViTConfig``
    * ``llm_config``       -- a ``Qwen3Config`` (also exposed as ``text_config``)
    * ``projector_config`` -- a ``ProjectorConfig`` whose sizes are derived from
      the hidden sizes of the two configs above

    Args:
        vision_config: ``None`` (use ``DINOv3ViTConfig`` defaults), a dict of
            keyword arguments for ``DINOv3ViTConfig``, or an already-built
            config object.
        llm_config: ``None`` (use ``Qwen3Config`` defaults), a dict of keyword
            arguments for ``Qwen3Config``, or an already-built config object.
        regression_size: Stored unchanged as ``self.regression_size``.
        projector_depth: ``depth`` passed to the ``ProjectorConfig``. It is
            also stored on the instance so that it is serialized and restored
            when the configuration is reloaded.
        pixel_idx: Stored unchanged as ``self.pixel_idx``.
        **kwargs: Forwarded to ``PretrainedConfig.__init__``.
    """

    model_type = 'vectorllm'
    # Must be a plain string: a trailing comma here would turn it into a 1-tuple.
    processor_class = "VectorLLMProcessor"
    is_composition = True

    def __init__(
            self,
            vision_config=None,
            llm_config=None,
            regression_size=(128, 128),
            projector_depth=2,
            pixel_idx=0,
            **kwargs):
        super().__init__(**kwargs)

        if vision_config is None:
            vision_config = {}
            logger.info('vision_config is None. Initializing the DinoV3Config with default values.')

        if llm_config is None:
            llm_config = {}
            logger.info('llm_config is None. Initializing the Qwen3 config with default values.')

        # Already-built config objects are used directly; anything else is
        # treated as a mapping of constructor keyword arguments, as before.
        if isinstance(vision_config, PretrainedConfig):
            self.vision_config = vision_config
        else:
            self.vision_config = DINOv3ViTConfig(**vision_config)

        if isinstance(llm_config, PretrainedConfig):
            self.llm_config = llm_config
        else:
            self.llm_config = Qwen3Config(**llm_config)

        # Alias: ``text_config`` refers to the very same object as ``llm_config``.
        self.text_config = self.llm_config

        self.hidden_size = self.llm_config.hidden_size
        self.vision_hidden_size = self.vision_config.hidden_size

        # Persist the depth itself. ``projector_config`` is always rebuilt from
        # this argument, so without storing it a reloaded configuration would
        # silently fall back to the default depth.
        self.projector_depth = projector_depth
        self.projector_config = ProjectorConfig(
            visual_hidden_size=self.vision_hidden_size,
            llm_hidden_size=self.hidden_size,
            depth=projector_depth
        )

        self.regression_size = regression_size
        self.pixel_idx = pixel_idx
        # Always forced to False, overriding any value received through kwargs.
        self.tie_word_embeddings = False
        # One extra token on top of the vision register tokens
        # (presumably the CLS token -- confirm against the vision model).
        self.num_cls_register_tokens = 1 + self.vision_config.num_register_tokens

    def to_dict(self):
        """
        Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`].

        Sub-configurations are replaced by their own ``to_dict()`` output, and
        ``text_config`` refers to the same dictionary object as ``llm_config``.

        Returns:
            `Dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance.
        """
        output = copy.deepcopy(self.__dict__)
        output['vision_config'] = self.vision_config.to_dict()
        output['llm_config'] = self.llm_config.to_dict()
        output['text_config'] = output['llm_config']
        output['projector_config'] = self.projector_config.to_dict()
        output['model_type'] = self.__class__.model_type

        return output