matveymih committed on
Commit
8ea3dd9
1 Parent(s): ba07ab9

API update

Browse files
Files changed (1) hide show
  1. models.py +84 -0
models.py ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from torch import nn
3
+ from transformers import CLIPVisionModel, CLIPImageProcessor
4
+
5
class VisualToGPTMapping(nn.Module):
    """Project visual embeddings into the GPT embedding space.

    The input is passed through a single transformer encoder layer, then a
    linear layer changes the feature size from ``visual_emb_dim`` to
    ``gpt_emb_dim``, and the result is reshaped to
    ``(-1, num_gpt_embs, gpt_emb_dim)``.

    Args:
        visual_emb_dim: Feature size of the incoming visual embeddings; used
            as ``d_model`` of the encoder layer and as the input size of the
            linear projection.
        gpt_emb_dim: Feature size of the produced embeddings.
        num_gpt_embs: Size of the middle dimension of the reshaped output.
        num_heads: Number of attention heads of the encoder layer.
    """

    def __init__(self, visual_emb_dim, gpt_emb_dim, num_gpt_embs, num_heads):
        super().__init__()
        # Both layers are referenced through ``nn`` because only ``nn`` itself
        # is imported from torch; the bare class names are not in scope.
        self.transformer_layer = nn.TransformerEncoderLayer(
            d_model=visual_emb_dim,
            nhead=num_heads,
            batch_first=True,
            norm_first=False,
        )
        self.linear = nn.Linear(visual_emb_dim, gpt_emb_dim)
        self.n_embeddings = num_gpt_embs
        self.embedding_dim = gpt_emb_dim

    def forward(self, visual_embs):
        """Return the projected embeddings.

        Args:
            visual_embs: Tensor whose last dimension is ``visual_emb_dim``.
                Presumably shaped ``(batch, num_gpt_embs, visual_emb_dim)``
                given ``batch_first=True`` -- TODO confirm against callers.

        Returns:
            Tensor of shape ``(-1, num_gpt_embs, gpt_emb_dim)``. The leading
            dimension is inferred by ``view``, so it equals the batch size
            only when the input sequence length is ``num_gpt_embs``.
        """
        encoded = self.transformer_layer(visual_embs)
        return self.linear(encoded).view(-1, self.n_embeddings, self.embedding_dim)
16
+
17
class CLIPVisionTower(nn.Module):
    """Frozen CLIP vision encoder that exposes intermediate hidden states.

    Args:
        vision_tower: Name or path passed to the ``from_pretrained`` loaders.
        delay_load: When false (default) the model and image processor are
            loaded immediately. When true only the configuration is fetched
            and ``load_model`` must be called before ``forward``.
    """

    def __init__(self, vision_tower, delay_load=False):
        super().__init__()

        self.is_loaded = False

        self.vision_tower_name = vision_tower
        # Index into the model's ``hidden_states``: -2 is the second-to-last.
        self.select_layer = -2
        # 'patch' drops the first token of every sequence, 'cls_patch' keeps it.
        self.select_feature = 'patch'

        if not delay_load:
            self.load_model()
        else:
            # Imported locally: the module-level import only provides
            # CLIPVisionModel and CLIPImageProcessor.
            from transformers import CLIPVisionConfig

            self.cfg_only = CLIPVisionConfig.from_pretrained(self.vision_tower_name)

    def load_model(self):
        """Load the image processor and the vision model, then freeze the model."""
        self.image_processor = CLIPImageProcessor.from_pretrained(self.vision_tower_name)
        self.vision_tower = CLIPVisionModel.from_pretrained(self.vision_tower_name)
        # The tower is used as a fixed feature extractor: no gradients.
        self.vision_tower.requires_grad_(False)

        self.is_loaded = True

    def feature_select(self, image_forward_outs):
        """Pick the configured hidden state and token subset.

        Args:
            image_forward_outs: Model output exposing ``hidden_states``.

        Returns:
            ``hidden_states[self.select_layer]``; with ``select_feature ==
            'patch'`` the first position along dimension 1 is removed
            (presumably the CLS token -- TODO confirm).

        Raises:
            ValueError: If ``select_feature`` is neither ``'patch'`` nor
                ``'cls_patch'``.
        """
        image_features = image_forward_outs.hidden_states[self.select_layer]
        if self.select_feature == 'patch':
            return image_features[:, 1:]
        if self.select_feature == 'cls_patch':
            return image_features
        raise ValueError(f'Unexpected select feature: {self.select_feature}')

    def _encode(self, pixels, out_dtype):
        """Run the tower on one batch and cast the selected features to ``out_dtype``."""
        outputs = self.vision_tower(
            pixels.to(device=self.device, dtype=self.dtype),
            output_hidden_states=True,
        )
        return self.feature_select(outputs).to(out_dtype)

    @torch.no_grad()
    def forward(self, images):
        """Encode images without tracking gradients.

        Args:
            images: Either a single tensor forwarded as one batch, or a list
                of tensors; each list item gets a leading dimension added
                and is encoded on its own.

        Returns:
            A tensor for tensor input, or a list of tensors (one per item)
            for list input. Features are cast back to the dtype of the
            corresponding input.
        """
        if isinstance(images, list):
            return [self._encode(image.unsqueeze(0), image.dtype) for image in images]
        return self._encode(images, images.dtype)

    @property
    def dummy_feature(self):
        """All-zero tensor of shape ``(1, hidden_size)`` on the tower's device and dtype."""
        return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype)

    @property
    def dtype(self):
        """dtype of the loaded vision model (requires ``load_model`` to have run)."""
        return self.vision_tower.dtype

    @property
    def device(self):
        """Device of the loaded vision model (requires ``load_model`` to have run)."""
        return self.vision_tower.device

    @property
    def config(self):
        """Config of the loaded model, or the config fetched for delayed loading."""
        if self.is_loaded:
            return self.vision_tower.config
        return self.cfg_only

    @property
    def hidden_size(self):
        """``hidden_size`` taken from ``config``."""
        return self.config.hidden_size