File size: 4,974 Bytes
bcc6605 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 | import copy
from transformers import PretrainedConfig, Qwen3Config
from .adaptor_base import * # noqa: F401,F403
from .adaptor_generic import * # noqa: F401,F403
from .adaptor_mlp import * # noqa: F401,F403
from .adaptor_registry import * # noqa: F401,F403
from .cls_token import * # noqa: F401,F403
from .common import * # noqa: F401,F403
from .dinov2_arch import * # noqa: F401,F403
from .dual_hybrid_vit import * # noqa: F401,F403
from .enable_cpe_support import * # noqa: F401,F403
from .enable_spectral_reparam import * # noqa: F401,F403
from .eradio_model import * # noqa: F401,F403
from .extra_models import * # noqa: F401,F403
from .extra_timm_models import * # noqa: F401,F403
from .feature_normalizer import * # noqa: F401,F403
from .forward_intermediates import * # noqa: F401,F403
from .hf_model import RADIOConfig as HFRADIOConfig
from .input_conditioner import * # noqa: F401,F403
from .open_clip_adaptor import * # noqa: F401,F403
from .radio_model import * # noqa: F401,F403
from .vit_patch_generator import * # noqa: F401,F403
from .vitdet import * # noqa: F401,F403
class ProjectorConfig(PretrainedConfig):
    """Configuration holder for the vision-to-LLM projector.

    The constructor records five values as instance attributes and passes
    every remaining keyword argument on to ``PretrainedConfig``.
    """

    model_type = "vectorllm_0407_projector"
    _auto_class = "AutoConfig"

    def __init__(
        self,
        visual_hidden_size=1024,
        llm_hidden_size=1024,
        depth=2,
        hidden_act="gelu",
        bias=True,
        **kwargs,
    ):
        """Store the projector settings on the instance.

        Args:
            visual_hidden_size: Stored as ``self.visual_hidden_size``;
                presumably the feature width on the vision side.
            llm_hidden_size: Stored as ``self.llm_hidden_size``;
                presumably the hidden width of the language model.
            depth: Stored as ``self.depth``; presumably the number of
                projector layers (confirm against the projector module).
            hidden_act: Stored as ``self.hidden_act``; an activation name.
            bias: Stored as ``self.bias``.
            **kwargs: Forwarded unchanged to ``PretrainedConfig.__init__``.
        """
        # Projector-specific fields are assigned first, in declaration
        # order, before the base class processes the remaining kwargs.
        self.visual_hidden_size = visual_hidden_size
        self.llm_hidden_size = llm_hidden_size
        self.depth = depth
        self.hidden_act = hidden_act
        self.bias = bias

        super().__init__(**kwargs)
class VectorLLMConfig(PretrainedConfig):
    """Composite configuration combining vision, LLM and projector settings.

    The vision and LLM sub-configurations are kept on the instance as plain
    dicts (``vision_config`` / ``llm_config``). A ``Qwen3Config`` built from
    the LLM dict is additionally exposed as ``text_config``. The projector
    settings are kept as a dict in ``projector_config``.
    """

    model_type = "vectorllm_hf_0407"
    processor_class = "VectorLLMProcessor"
    is_composition = True

    def __init__(
        self,
        vision_config=None,
        llm_config=None,
        regression_size=(128, 128),
        projector_depth=2,
        visual_hidden_size=None,
        pixel_idx=0,
        pre_resize_size=432,
        resized_size=128,
        patch_size=16,
        do_normalize=False,
        vision_model_name_or_path="",
        llm_name_or_path="",
        visual_peft_config=None,
        vision_torch_dtype="bfloat16",
        **kwargs,
    ):
        """Build the composite configuration.

        Args:
            vision_config: ``None``, a dict, or an ``HFRADIOConfig``. It is
                stored as a dict (converted via ``to_dict()`` or deep-copied).
            llm_config: ``None``, a dict, or a ``Qwen3Config``. It is stored
                as a dict (converted via ``to_dict()`` or deep-copied).
            regression_size: Iterable stored as a tuple.
            projector_depth: ``depth`` of the default ``ProjectorConfig``;
                only used when no ``projector_config`` keyword is supplied.
            visual_hidden_size: Explicit vision hidden size. When ``None``
                the value falls back, in order, to the ``vision_hidden_size``
                keyword, the vision args entry ``mlp_hidden_size``, and the
                LLM hidden size.
            pixel_idx: Stored unchanged.
            pre_resize_size: Stored unchanged.
            resized_size: Stored unchanged.
            patch_size: Stored unchanged.
            do_normalize: Stored unchanged.
            vision_model_name_or_path: Stored unchanged.
            llm_name_or_path: Stored unchanged.
            visual_peft_config: Deep-copied before being stored.
            vision_torch_dtype: Stored unchanged (a dtype name string by
                default).
            **kwargs: Forwarded to ``PretrainedConfig.__init__``. The keys
                ``vision_hidden_size`` and ``projector_config`` are also
                read here so that a previously serialized configuration can
                be restored.
        """
        # Captured before the base-class call; both values are re-applied
        # explicitly further down.
        serialized_visual_hidden_size = kwargs.get("vision_hidden_size", None)
        serialized_projector_config = kwargs.get("projector_config", None)
        super().__init__(**kwargs)

        vision_config = self._config_as_dict(vision_config, HFRADIOConfig)
        llm_config = self._config_as_dict(llm_config, Qwen3Config)
        self.vision_config = vision_config
        self.llm_config = llm_config

        qwen3_config = Qwen3Config(**llm_config)
        radio_config = HFRADIOConfig(**vision_config)
        self.text_config = qwen3_config
        self.hidden_size = qwen3_config.hidden_size
        radio_args = radio_config.args or {}

        if visual_hidden_size is None and serialized_visual_hidden_size is not None:
            visual_hidden_size = serialized_visual_hidden_size
        if visual_hidden_size is None:
            # ``dict.get(key, default)`` only uses the default for a missing
            # key, so an explicit ``None`` entry must be handled separately
            # to still reach the LLM hidden-size fallback.
            visual_hidden_size = radio_args.get("mlp_hidden_size")
            if visual_hidden_size is None:
                visual_hidden_size = qwen3_config.hidden_size
        self.vision_hidden_size = visual_hidden_size

        if serialized_projector_config is None:
            projector_config = ProjectorConfig(
                visual_hidden_size=self.vision_hidden_size,
                llm_hidden_size=self.hidden_size,
                depth=projector_depth,
            ).to_dict()
        elif isinstance(serialized_projector_config, PretrainedConfig):
            # Keep the stored form a plain dict, consistent with
            # ``vision_config`` / ``llm_config`` and with what ``to_dict``
            # emits.
            projector_config = serialized_projector_config.to_dict()
        else:
            projector_config = copy.deepcopy(serialized_projector_config)
        self.projector_config = projector_config

        self.regression_size = tuple(regression_size)
        self.pixel_idx = pixel_idx
        self.tie_word_embeddings = False

        # An explicit ``None`` for ``register_multiple`` is treated like a
        # missing entry; ``1 + None`` would otherwise raise ``TypeError``.
        register_multiple = radio_args.get("register_multiple")
        if register_multiple is None:
            register_multiple = 0
        self.num_cls_register_tokens = 1 + register_multiple

        self.pre_resize_size = pre_resize_size
        self.resized_size = resized_size
        self.patch_size = patch_size
        self.do_normalize = do_normalize
        self.vision_model_name_or_path = vision_model_name_or_path
        self.llm_name_or_path = llm_name_or_path
        self.visual_peft_config = copy.deepcopy(visual_peft_config)
        self.vision_torch_dtype = vision_torch_dtype

    @staticmethod
    def _config_as_dict(config, config_cls):
        """Return ``config`` as an independent dict.

        ``None`` becomes an empty dict, an instance of ``config_cls`` is
        converted with ``to_dict()``, and anything else is deep-copied so
        the caller's object is never shared with this configuration.
        """
        if config is None:
            return {}
        if isinstance(config, config_cls):
            return config.to_dict()
        return copy.deepcopy(config)

    def to_dict(self):
        """Return the instance attributes as a deep-copied dict.

        ``text_config`` is replaced by its own ``to_dict()`` output and
        ``model_type`` is set from the class attribute.
        """
        # The deep copy already yields independent copies of
        # ``vision_config``, ``llm_config`` and ``projector_config``, so no
        # further copying of those entries is needed.
        output = copy.deepcopy(self.__dict__)
        output["text_config"] = self.text_config.to_dict()
        output["model_type"] = self.__class__.model_type
        return output
|