Gengzigang committed
Commit 3831a1e • Parent(s): 9d797ea
add

modeling_clip.py  CHANGED  (+108 -60)
@@ -39,7 +39,6 @@ from transformers.utils import (
 )
 from .configuration_clip import CLIPConfig, CLIPTextConfig, CLIPVisionConfig
 
-
 if is_flash_attn_2_available():
     from transformers.modeling_flash_attention_utils import _flash_attention_forward
 
@@ -603,16 +602,15 @@ class CLIPPreTrainedModel(PreTrainedModel):
             fc_std = (2 * module.config.hidden_size) ** -0.5 * factor
             nn.init.normal_(module.fc1.weight, std=fc_std)
             nn.init.normal_(module.fc2.weight, std=in_proj_std)
-        elif isinstance(module,
-            pass
+        elif isinstance(module, LLM2CLIPModel):
             # nn.init.normal_(
             #     module.text_projection.weight,
             #     std=module.text_embed_dim**-0.5 * self.config.initializer_factor,
             # )
-
-
-
-
+            nn.init.normal_(
+                module.visual_projection.weight,
+                std=module.vision_embed_dim**-0.5 * self.config.initializer_factor,
+            )
         elif isinstance(module, CLIPVisionModelWithProjection):
             nn.init.normal_(
                 module.visual_projection.weight,
@@ -1112,80 +1110,97 @@ class CLIPVisionModel(CLIPPreTrainedModel):
 
 
 @add_start_docstrings(CLIP_START_DOCSTRING)
-class
+class LLM2CLIPModel(CLIPPreTrainedModel):
     config_class = CLIPConfig
     _no_split_modules = ["CLIPTextEmbeddings", "CLIPEncoderLayer", "CLIPVisionEmbeddings"]
 
     def __init__(self, config: CLIPConfig):
         super().__init__(config)
+        # if not isinstance(config.text_config, CLIPTextConfig):
+        #     raise TypeError(
+        #         "config.text_config is expected to be of type CLIPTextConfig but is of type"
+        #         f" {type(config.text_config)}."
+        #     )
+
         if not isinstance(config.vision_config, CLIPVisionConfig):
             raise TypeError(
                 "config.vision_config is expected to be of type CLIPVisionConfig but is of type"
                 f" {type(config.vision_config)}."
             )
 
+        # text_config = config.text_config
         vision_config = config.vision_config
 
         self.projection_dim = config.projection_dim
+        # self.text_embed_dim = text_config.hidden_size
         self.vision_embed_dim = vision_config.hidden_size
+
+        adapter = LLM2CLIP_Adapter()
+        self.text_adapter = adapter
+
+        # text_model = CLIPTextModel._from_config(text_config, attn_implementation=config._attn_implementation)
+        # self.text_model = text_model.text_model
 
         vision_model = CLIPVisionModel._from_config(vision_config, attn_implementation=config._attn_implementation)
         self.vision_model = vision_model.vision_model
 
-
-
-        self.visual_projection = nn.Parameter(scale * torch.randn(self.vision_embed_dim, self.projection_dim))
+        self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim, bias=False)
+        # self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False)
         self.logit_scale = nn.Parameter(torch.tensor(self.config.logit_scale_init_value))
 
         # Initialize weights and apply final processing
         self.post_init()
-
-    self
[… further removed lines of the old class body are not recoverable from this view …]
+
+    def get_text_features(self, inputs):
+        #TODO: make this more flexible and configurable
+        return self.text_adapter(inputs)
+
+    # @add_start_docstrings_to_model_forward(CLIP_TEXT_INPUTS_DOCSTRING)
+    # def get_text_features(
+    #     self,
+    #     input_ids: Optional[torch.Tensor] = None,
+    #     attention_mask: Optional[torch.Tensor] = None,
+    #     position_ids: Optional[torch.Tensor] = None,
+    #     output_attentions: Optional[bool] = None,
+    #     output_hidden_states: Optional[bool] = None,
+    #     return_dict: Optional[bool] = None,
+    # ) -> torch.FloatTensor:
+    #     r"""
+    #     Returns:
+    #         text_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The text embeddings obtained by
+    #         applying the projection layer to the pooled output of [`CLIPTextModel`].
+
+    #     Examples:
+
+    #     ```python
+    #     >>> from transformers import AutoTokenizer, CLIPModel
+
+    #     >>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
+    #     >>> tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32")
+
+    #     >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
+    #     >>> text_features = model.get_text_features(**inputs)
+    #     ```"""
+    #     # Use CLIP model's config for some fields (if specified) instead of those of vision & text components.
+    #     output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+    #     output_hidden_states = (
+    #         output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+    #     )
+    #     return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+    #     text_outputs = self.text_model(
+    #         input_ids=input_ids,
+    #         attention_mask=attention_mask,
+    #         position_ids=position_ids,
+    #         output_attentions=output_attentions,
+    #         output_hidden_states=output_hidden_states,
+    #         return_dict=return_dict,
+    #     )
+
+    #     pooled_output = text_outputs[1]
+    #     text_features = self.text_projection(pooled_output)
+
+    #     return text_features
 
     @add_start_docstrings_to_model_forward(CLIP_VISION_INPUTS_DOCSTRING)
     def get_image_features(
@@ -1232,7 +1247,7 @@ class CLIPModel(CLIPPreTrainedModel):
         )
 
         pooled_output = vision_outputs[1]  # pooled_output
-        image_features =
+        image_features = self.visual_projection(pooled_output)
 
         return image_features
 
@@ -1413,7 +1428,40 @@ class CLIPTextModelWithProjection(CLIPPreTrainedModel):
             attentions=text_outputs.attentions,
         )
 
-
+class LinearBlock(nn.Module):
+    def __init__(self, dim, expansion_factor=4, dropout=0.,norm_layer=nn.LayerNorm):
+        super().__init__()
+        self.fn = nn.Sequential(
+            nn.Linear(dim, int(expansion_factor * dim)),
+            nn.GELU(),
+            nn.Dropout(dropout),
+            nn.Linear(int(expansion_factor * dim), dim),
+        )
+        self.ln = norm_layer(dim)
+
+    def forward(self, x):
+        return x + self.fn(self.ln(x))
+
+class LLM2CLIP_Adapter(nn.Module):
+    def __init__(self):
+        super().__init__()
+        #TODO: make this more flexible and configurable
+        # hard-coded values from the LLM2CLIP model
+        text_embedding_dim = 4096
+        expansion_factor = 2
+        adaptor_num_layers = 4
+        proj_bias = True
+        output_dim = 1280
+        self.adaptor = nn.Sequential(
+            *[LinearBlock(text_embedding_dim, expansion_factor) for _ in range(adaptor_num_layers)],
+            nn.LayerNorm(text_embedding_dim),
+            nn.Linear(text_embedding_dim, output_dim, bias=proj_bias),
+        )
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        hidden_states = torch.nn.functional.normalize(hidden_states, p=2, dim=1)
+        hidden_states = self.adaptor(hidden_states)
+        return hidden_states
+
 @add_start_docstrings(
     """
     CLIP Vision Model with a projection layer on top (a linear layer on top of the pooled output).
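The two classes added in the last hunk implement the new text-side adapter: LinearBlock is a pre-norm residual MLP (x + fn(LayerNorm(x)) with fn = Linear, GELU, Dropout, Linear), and LLM2CLIP_Adapter L2-normalizes incoming LLM sentence embeddings, applies four such blocks, a final LayerNorm, and a 4096-to-1280 projection. The following shape check is a minimal sketch and not part of the commit; it assumes LinearBlock and LLM2CLIP_Adapter as defined above (plus torch) are already in scope, and the dimensions in the comments simply restate the values hard-coded in LLM2CLIP_Adapter.__init__.

import torch

# Assumes LinearBlock and LLM2CLIP_Adapter from the diff above are defined in the current scope.
block = LinearBlock(dim=4096, expansion_factor=2)
x = torch.randn(2, 4096)
print(block(x).shape)                  # torch.Size([2, 4096]); the residual MLP preserves the width

adapter = LLM2CLIP_Adapter()           # dims hard-coded in __init__: 4096-dim input, 1280-dim output
llm_embeddings = torch.randn(2, 4096)  # stand-in for pre-computed LLM sentence embeddings
print(adapter(llm_embeddings).shape)   # torch.Size([2, 1280])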
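Taken together, the commit defines LLM2CLIPModel in place of the previous model class: image features still come from the CLIP vision tower followed by a linear visual_projection, while get_text_features no longer runs a CLIP text tower and instead passes pre-computed LLM sentence embeddings through the adapter. The sketch below shows how the two entry points are called after this change; it is not part of the commit. The checkpoint id is a placeholder, loading via AutoModel with trust_remote_code assumes the repository's config maps to LLM2CLIPModel, and the resolution and batch sizes are illustrative.

import torch
from transformers import AutoModel

# "<llm2clip-checkpoint>" is a placeholder repository id; the real name is not given in this diff.
model = AutoModel.from_pretrained("<llm2clip-checkpoint>", trust_remote_code=True)

# Image side: pixel_values -> CLIP vision tower -> pooled output -> visual_projection (nn.Linear).
pixel_values = torch.randn(1, 3, 224, 224)   # use the resolution expected by the vision config
with torch.no_grad():
    image_features = model.get_image_features(pixel_values=pixel_values)

# Text side: get_text_features takes 4096-dim LLM sentence embeddings and returns the adapter output.
llm_embeddings = torch.randn(2, 4096)
with torch.no_grad():
    text_features = model.get_text_features(llm_embeddings)

# Both should share the projection width (1280 with the hard-coded adapter), so they can be
# compared via normalized dot products scaled by logit_scale, as in standard CLIP.
print(image_features.shape, text_features.shape)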
|