DongfuJiang committed on
Commit
98cf109
1 Parent(s): a5a9bfc
app_regression.py CHANGED
@@ -17,7 +17,7 @@ from typing import List
17
  processor = AutoProcessor.from_pretrained("Mantis-VL/mantis-8b-idefics2-video-eval-20k-mantis-2epoch_4096_regression")
18
  model = Idefics2ForSequenceClassification.from_pretrained("Mantis-VL/mantis-8b-idefics2-video-eval-20k-mantis-2epoch_4096_regression", torch_dtype=torch.bfloat16).eval()
19
 
20
- MAX_NUM_FRAMES = 16
21
  conv_template = conv_templates["idefics_2"]
22
 
23
  with open("./examples/all_subsets.json", 'r') as f:
 
17
  processor = AutoProcessor.from_pretrained("Mantis-VL/mantis-8b-idefics2-video-eval-20k-mantis-2epoch_4096_regression")
18
  model = Idefics2ForSequenceClassification.from_pretrained("Mantis-VL/mantis-8b-idefics2-video-eval-20k-mantis-2epoch_4096_regression", torch_dtype=torch.bfloat16).eval()
19
 
20
+ MAX_NUM_FRAMES = 24
21
  conv_template = conv_templates["idefics_2"]
22
 
23
  with open("./examples/all_subsets.json", 'r') as f:
models/idefics2/modeling_idefics2.py CHANGED
@@ -1658,15 +1658,33 @@ class Idefics2Model(Idefics2PreTrainedModel):
1658
  patch_attention_mask = (patches_subgrid.sum(dim=(-1, -2)) > 0).bool()
1659
 
1660
  # Get sequence from the vision encoder
1661
- image_hidden_states = self.vision_model(
1662
- pixel_values=pixel_values,
1663
- patch_attention_mask=patch_attention_mask,
1664
- ).last_hidden_state
1665
-
1666
- # Modality projection & resampling
1667
- image_hidden_states = self.connector(
1668
- image_hidden_states, attention_mask=patch_attention_mask.view(pixel_values.size(0), -1)
1669
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1670
 
1671
  elif image_hidden_states is not None:
1672
  image_hidden_states = image_hidden_states.to(dtype=self.dtype, device=input_ids.device)
 
1658
  patch_attention_mask = (patches_subgrid.sum(dim=(-1, -2)) > 0).bool()
1659
 
1660
  # Get sequence from the vision encoder
1661
+ pixel_batch_size = 4
1662
+ all_image_hidden_states = []
1663
+ for i in range(0, pixel_values.size(0), pixel_batch_size):
1664
+ batch_pixel_values = pixel_values[i : i + pixel_batch_size]
1665
+ batch_patch_attention_mask = patch_attention_mask[i : i + pixel_batch_size]
1666
+
1667
+ batch_image_hidden_states = self.vision_model(
1668
+ pixel_values=batch_pixel_values,
1669
+ patch_attention_mask=batch_patch_attention_mask,
1670
+ ).last_hidden_state
1671
+
1672
+ batch_image_hidden_states = self.connector(
1673
+ batch_image_hidden_states, attention_mask=batch_patch_attention_mask.view(batch_pixel_values.size(0), -1)
1674
+ )
1675
+ all_image_hidden_states.append(batch_image_hidden_states)
1676
+
1677
+ image_hidden_states = torch.cat(all_image_hidden_states, dim=0)
1678
+
1679
+ # image_hidden_states = self.vision_model(
1680
+ # pixel_values=pixel_values,
1681
+ # patch_attention_mask=patch_attention_mask,
1682
+ # ).last_hidden_state
1683
+
1684
+ # # Modality projection & resampling
1685
+ # image_hidden_states = self.connector(
1686
+ # image_hidden_states, attention_mask=patch_attention_mask.view(pixel_values.size(0), -1)
1687
+ # )
1688
 
1689
  elif image_hidden_states is not None:
1690
  image_hidden_states = image_hidden_states.to(dtype=self.dtype, device=input_ids.device)