morpheushoc committed (verified)
Commit a9e49d0 · Parent: f3f535c

Upload InternVideo2_Classification_test

config.json CHANGED
@@ -5,49 +5,50 @@
   "auto_map": {
     "AutoModel": "modeling_videochat2_classification.InternVideo2_Classification_test"
   },
-  "bridge": {
-    "extra_num_query_token": 64,
-    "name": "qformer",
-    "num_query_token": 32,
-    "qformer_attention_probs_dropout_prob": 0.1,
-    "qformer_drop_path_rate": 0.2,
-    "qformer_hidden_dropout_prob": 0.1
+  "model_config": {
+    "bridge": {
+      "extra_num_query_token": 64,
+      "name": "qformer",
+      "num_query_token": 32,
+      "qformer_attention_probs_dropout_prob": 0.1,
+      "qformer_drop_path_rate": 0.2,
+      "qformer_hidden_dropout_prob": 0.1
+    },
+    "freeze_bridge": false,
+    "freeze_llm": false,
+    "freeze_vision_encoder": false,
+    "llm": {
+      "lora_alpha": 32,
+      "lora_dropout": 0.1,
+      "lora_r": 16,
+      "name": "mistral_7b",
+      "pretrained_llm_path": "mistralai/Mistral-7B-Instruct-v0.3",
+      "use_lora": true
+    },
+    "loss": {
+      "use_vision_regression_loss": false
+    },
+    "pretrained_paths": {},
+    "use_flash_attention": true,
+    "vision_encoder": {
+      "checkpoint_num": 48,
+      "d_model": 1408,
+      "encoder_embed_dim": 1408,
+      "img_size": 224,
+      "name": "internvideo2-1B",
+      "num_frames": 8,
+      "origin_num_frames": 4,
+      "patch_size": 14,
+      "pretrained": null,
+      "sep_image_video_pos_embed": true,
+      "tubelet_size": 1,
+      "use_checkpoint": true,
+      "vit_add_ln": true,
+      "x_vis_only": true,
+      "x_vis_return_idx": -2
+    }
   },
-  "freeze_bridge": false,
-  "freeze_llm": false,
-  "freeze_vision_encoder": false,
-  "llm": {
-    "lora_alpha": 32,
-    "lora_dropout": 0.1,
-    "lora_r": 16,
-    "name": "mistral_7b",
-    "pretrained_llm_path": "mistralai/Mistral-7B-Instruct-v0.3",
-    "use_lora": true
-  },
-  "loss": {
-    "use_vision_regression_loss": false
-  },
-  "model_config": {},
   "model_type": "InternVideo2_Classification_test",
-  "pretrained_paths": {},
   "torch_dtype": "float32",
-  "transformers_version": "4.46.1",
-  "use_flash_attention": true,
-  "vision_encoder": {
-    "checkpoint_num": 48,
-    "d_model": 1408,
-    "encoder_embed_dim": 1408,
-    "img_size": 224,
-    "name": "internvideo2-1B",
-    "num_frames": 8,
-    "origin_num_frames": 4,
-    "patch_size": 14,
-    "pretrained": null,
-    "sep_image_video_pos_embed": true,
-    "tubelet_size": 1,
-    "use_checkpoint": true,
-    "vit_add_ln": true,
-    "x_vis_only": true,
-    "x_vis_return_idx": -2
-  }
+  "transformers_version": "4.46.1"
 }
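
The config change nests every sub-config (bridge, llm, loss, vision_encoder, and the freeze/flash-attention flags) under a single model_config key, replacing the old flat layout where model_config was an empty placeholder. A minimal sketch of how the nested block can be read back, assuming an attribute-style dict wrapper such as easydict (the modeling code below accesses it as self.model_config.bridge.name and calls .get(), which any such wrapper provides):

```python
import json
from easydict import EasyDict  # assumption: any attribute-access dict wrapper works

with open("config.json") as f:
    config = EasyDict(json.load(f))

# All model hyperparameters now live under one nested key.
bridge = config.model_config.bridge
print(bridge.name, bridge.num_query_token)               # qformer 32
print(config.model_config.llm.pretrained_llm_path)       # mistralai/Mistral-7B-Instruct-v0.3
print(config.model_config.get("freeze_bridge", False))   # False
```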
model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:c2e92eec0623bf8e345a2310b4baff5fd2ecb0897a3b6eb94e5de89951a2de3c
-size 42488
+oid sha256:dad2c8637e76385541187f8578c922050cc7b9c704e774f1fdab50b623f3b517
+size 743024240
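
The weights change swaps the Git LFS pointer: a 42 kB stub is replaced by a ~743 MB checkpoint. Since the pointer records the blob's SHA-256 and byte size, a downloaded file can be checked against it; a small verification sketch:

```python
import hashlib
import os

# Values copied from the new LFS pointer above.
EXPECTED_OID = "dad2c8637e76385541187f8578c922050cc7b9c704e774f1fdab50b623f3b517"
EXPECTED_SIZE = 743024240

path = "model.safetensors"
digest = hashlib.sha256()
with open(path, "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):  # hash in 1 MiB chunks
        digest.update(chunk)

assert os.path.getsize(path) == EXPECTED_SIZE, "size mismatch"
assert digest.hexdigest() == EXPECTED_OID, "sha256 mismatch"
print("model.safetensors matches the LFS pointer")
```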
modeling_videochat2_classification.py CHANGED
@@ -391,6 +391,9 @@ class InternVideo2_Classification_test(PreTrainedModel):
         super().__init__(config)
         self.conv1 = nn.Conv2d(1, 20, 5)
         self.conv2 = nn.Conv2d(20, 20, 5)
+        self.model_config = config.model_config
+        self.build_bridge()
+
 
     def forward(self, x):
         x = self.conv1(x)
@@ -399,6 +402,35 @@ class InternVideo2_Classification_test(PreTrainedModel):
     def test_lol(self, x):
         return x
 
+    def build_bridge(self):
+
+        if 'qformer' in self.model_config.bridge.name.lower():
+            from transformers import BertTokenizer
+            self.qformer_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", truncation_side="left")
+            self.qformer_tokenizer.add_special_tokens({"bos_token": "[DEC]"})
+            self.qformer_tokenizer.padding_side = "left"
+            if self.model_config.bridge.name == 'qformer':
+                self.qformer, self.query_tokens = build_qformer(
+                    self.model_config.bridge.num_query_token, self.model_config.vision_encoder.encoder_embed_dim,
+                    qformer_hidden_dropout_prob=self.model_config.bridge.qformer_hidden_dropout_prob,
+                    qformer_attention_probs_dropout_prob=self.model_config.bridge.qformer_attention_probs_dropout_prob,
+                    qformer_drop_path_rate=self.model_config.bridge.qformer_drop_path_rate,
+                )
+                self.qformer.resize_token_embeddings(len(self.qformer_tokenizer))
+                self.qformer.cls = None
+                self.extra_num_query_token = self.model_config.bridge.extra_num_query_token
+                if self.model_config.bridge.extra_num_query_token > 0:
+                    logger.info(f"Add extra {self.model_config.bridge.extra_num_query_token} tokens in QFormer")
+                    self.extra_query_tokens = nn.Parameter(
+                        torch.zeros(1, self.model_config.bridge.extra_num_query_token, self.query_tokens.shape[-1])
+                    )
+
+        self.freeze_bridge = self.model_config.get("freeze_bridge", False)
+        if self.freeze_bridge:
+            logger.info("freeze bridge")
+            freeze_module(self.qformer)
+            self.query_tokens.requires_grad = False
+
 if __name__ == "__main__":
 
     tokenizer = AutoTokenizer.from_pretrained('OpenGVLab/InternVideo2-Chat-8B',trust_remote_code=True,use_fast=False)
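
With auto_map pointing AutoModel at the custom class, loading the repo with trust_remote_code=True now exercises the new __init__ path: it stores config.model_config and calls build_bridge(), so the Q-Former, its query tokens, and the BERT tokenizer are constructed at load time. A hedged usage sketch follows; the repo id is a guess inferred from the committer and commit message, since the diff itself does not name the repository:

```python
from transformers import AutoModel

# Hypothetical repo id; substitute the real one.
model = AutoModel.from_pretrained(
    "morpheushoc/InternVideo2_Classification_test",
    trust_remote_code=True,  # needed: auto_map resolves to modeling_videochat2_classification.py
)

# build_bridge() has already run inside __init__, so these attributes exist:
print(type(model.qformer).__name__)
print(model.query_tokens.shape)        # (1, num_query_token, hidden)
print(model.extra_query_tokens.shape)  # (1, extra_num_query_token, hidden)
```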