File size: 4,974 Bytes
bcc6605
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
import copy

from transformers import PretrainedConfig, Qwen3Config

from .adaptor_base import *  # noqa: F401,F403
from .adaptor_generic import *  # noqa: F401,F403
from .adaptor_mlp import *  # noqa: F401,F403
from .adaptor_registry import *  # noqa: F401,F403
from .cls_token import *  # noqa: F401,F403
from .common import *  # noqa: F401,F403
from .dinov2_arch import *  # noqa: F401,F403
from .dual_hybrid_vit import *  # noqa: F401,F403
from .enable_cpe_support import *  # noqa: F401,F403
from .enable_spectral_reparam import *  # noqa: F401,F403
from .eradio_model import *  # noqa: F401,F403
from .extra_models import *  # noqa: F401,F403
from .extra_timm_models import *  # noqa: F401,F403
from .feature_normalizer import *  # noqa: F401,F403
from .forward_intermediates import *  # noqa: F401,F403
from .hf_model import RADIOConfig as HFRADIOConfig
from .input_conditioner import *  # noqa: F401,F403
from .open_clip_adaptor import *  # noqa: F401,F403
from .radio_model import *  # noqa: F401,F403
from .vit_patch_generator import *  # noqa: F401,F403
from .vitdet import *  # noqa: F401,F403


class ProjectorConfig(PretrainedConfig):
    """Hyper-parameter container for the projector module.

    The class only records the values it is given; it performs no validation.
    NOTE(review): the field names suggest a projector that maps visual
    features of width ``visual_hidden_size`` to LLM width ``llm_hidden_size``
    -- confirm against the projector implementation.

    Args:
        visual_hidden_size: Stored as ``self.visual_hidden_size``.
        llm_hidden_size: Stored as ``self.llm_hidden_size``.
        depth: Stored as ``self.depth``.
        hidden_act: Stored as ``self.hidden_act``.
        bias: Stored as ``self.bias``.
        **kwargs: Forwarded untouched to ``PretrainedConfig.__init__``.
    """

    model_type = "vectorllm_0407_projector"
    _auto_class = "AutoConfig"

    def __init__(
        self,
        visual_hidden_size=1024,
        llm_hidden_size=1024,
        depth=2,
        hidden_act="gelu",
        bias=True,
        **kwargs,
    ):
        # Projector-specific fields are attached first, in declaration order,
        # and only then is the base class initialised with the leftovers.
        projector_fields = (
            ("visual_hidden_size", visual_hidden_size),
            ("llm_hidden_size", llm_hidden_size),
            ("depth", depth),
            ("hidden_act", hidden_act),
            ("bias", bias),
        )
        for field_name, field_value in projector_fields:
            setattr(self, field_name, field_value)
        super().__init__(**kwargs)


class VectorLLMConfig(PretrainedConfig):
    """Composite configuration holding vision, LLM and projector settings.

    The vision and LLM sub-configurations are kept on the instance as plain
    dicts (``vision_config`` / ``llm_config``); the LLM one is additionally
    materialised as a ``Qwen3Config`` object in ``text_config``.  The
    projector settings are kept as a plain dict in ``projector_config``.
    """

    model_type = "vectorllm_hf_0407"
    processor_class = "VectorLLMProcessor"
    is_composition = True

    @staticmethod
    def _as_plain_dict(config):
        """Return ``config`` as an independent plain value.

        ``None`` becomes ``{}``, config objects are converted through their
        ``to_dict()`` method, and anything else (normally a dict) is
        deep-copied so later mutation by the caller cannot leak in.
        """
        if config is None:
            return {}
        if isinstance(config, (PretrainedConfig, HFRADIOConfig, Qwen3Config)):
            return config.to_dict()
        return copy.deepcopy(config)

    def __init__(
        self,
        vision_config=None,
        llm_config=None,
        regression_size=(128, 128),
        projector_depth=2,
        visual_hidden_size=None,
        pixel_idx=0,
        pre_resize_size=432,
        resized_size=128,
        patch_size=16,
        do_normalize=False,
        vision_model_name_or_path="",
        llm_name_or_path="",
        visual_peft_config=None,
        vision_torch_dtype="bfloat16",
        **kwargs,
    ):
        """Build the composite configuration.

        Args:
            vision_config: ``None``, a dict, or a config object; normalised to
                a dict and used to instantiate ``HFRADIOConfig``.
            llm_config: ``None``, a dict, or a config object; normalised to a
                dict and used to instantiate ``Qwen3Config``.
            regression_size: Iterable stored as a tuple.
            projector_depth: ``depth`` of the default ``ProjectorConfig``;
                ignored when a ``projector_config`` kwarg is supplied.
            visual_hidden_size: Explicit value for ``self.vision_hidden_size``.
                When ``None`` the value falls back, in order, to the
                ``vision_hidden_size`` kwarg, the vision config's
                ``args["mlp_hidden_size"]``, and the LLM hidden size.
            pixel_idx, pre_resize_size, resized_size, patch_size,
            do_normalize, vision_model_name_or_path, llm_name_or_path,
            vision_torch_dtype: Stored unchanged on the instance.
            visual_peft_config: Deep-copied and stored on the instance.
            **kwargs: Forwarded to ``PretrainedConfig.__init__``.  The keys
                ``vision_hidden_size`` and ``projector_config`` are also read
                here; they are not named parameters, so they arrive through
                ``kwargs`` when the config is rebuilt from ``to_dict()`` output.
        """
        # Read the serialised values before handing kwargs to the base class.
        serialized_visual_hidden_size = kwargs.get("vision_hidden_size", None)
        serialized_projector_config = kwargs.get("projector_config", None)
        super().__init__(**kwargs)

        # Any config object (not only the two exact classes) is turned into a
        # dict; otherwise the ``**`` unpacking below would raise TypeError.
        vision_config = self._as_plain_dict(vision_config)
        llm_config = self._as_plain_dict(llm_config)

        self.vision_config = vision_config
        self.llm_config = llm_config

        qwen3_config = Qwen3Config(**llm_config)
        radio_config = HFRADIOConfig(**vision_config)
        self.text_config = qwen3_config
        self.hidden_size = qwen3_config.hidden_size
        # ``args`` may be None/empty on the vision config; treat that as {}.
        radio_args = radio_config.args or {}
        if visual_hidden_size is None and serialized_visual_hidden_size is not None:
            visual_hidden_size = serialized_visual_hidden_size
        self.vision_hidden_size = (
            visual_hidden_size
            if visual_hidden_size is not None
            else radio_args.get("mlp_hidden_size", qwen3_config.hidden_size)
        )

        if serialized_projector_config is not None:
            # Keep ``projector_config`` a plain dict on every path, matching
            # the default branch below, even if a config object was passed.
            self.projector_config = self._as_plain_dict(serialized_projector_config)
        else:
            self.projector_config = ProjectorConfig(
                visual_hidden_size=self.vision_hidden_size,
                llm_hidden_size=self.hidden_size,
                depth=projector_depth,
            ).to_dict()

        self.regression_size = tuple(regression_size)
        self.pixel_idx = pixel_idx
        # Always forced to False, regardless of what kwargs carried.
        self.tie_word_embeddings = False
        self.num_cls_register_tokens = 1 + radio_args.get("register_multiple", 0)
        self.pre_resize_size = pre_resize_size
        self.resized_size = resized_size
        self.patch_size = patch_size
        self.do_normalize = do_normalize
        self.vision_model_name_or_path = vision_model_name_or_path
        self.llm_name_or_path = llm_name_or_path
        self.visual_peft_config = copy.deepcopy(visual_peft_config)
        self.vision_torch_dtype = vision_torch_dtype

    def to_dict(self):
        """Return a deep-copied dict of the instance attributes.

        ``text_config`` (held as a ``Qwen3Config`` object) is replaced by its
        own ``to_dict()`` output, and the class-level ``model_type`` is added.

        NOTE(review): this override does not call ``super().to_dict()``, so
        any post-processing the base class performs there is skipped --
        confirm that is intended.
        """
        output = copy.deepcopy(self.__dict__)
        output["vision_config"] = copy.deepcopy(self.vision_config)
        output["llm_config"] = copy.deepcopy(self.llm_config)
        output["text_config"] = self.text_config.to_dict()
        output["projector_config"] = copy.deepcopy(self.projector_config)
        output["model_type"] = self.__class__.model_type
        return output