VictorSanh committed
Commit • b240ec1
Parent(s): cbd1c78

remove unnecessary padding images

modeling_img2html.py +10 -6
modeling_img2html.py CHANGED

@@ -1390,15 +1390,16 @@ class VMistralModel(VMistralPreTrainedModel):
         vision_pipeline_output_seq_len = image_hidden_states.shape[1]
         vision_hidden_size = image_hidden_states.shape[2]
         new_inputs_embeds = inputs_embeds.clone()
-        # Get
-
-
-        )
+        # Get the number of images for each example
+        num_images = (input_ids == self.image_token_id).sum(dim=-1) // self.image_seq_len
+        cum_num_images = num_images.cumsum(dim=-1)
         for batch_idx in range(batch_size):
             # Get the number of images for this particular example
-            example_num_images =
+            example_num_images = num_images[batch_idx]
             # Get the image_hidden_states corresponding to True images for the example, so get rid of the padding images.
-
+            start = 0 if batch_idx == 0 else cum_num_images[batch_idx - 1]
+            end = cum_num_images[batch_idx]
+            example_true_image_hidden_states = image_hidden_states[start:end]
             if (
                 new_inputs_embeds[batch_idx][input_ids[batch_idx] == self.image_token_id].shape[0]
                 != example_num_images * vision_pipeline_output_seq_len
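The replacement lines derive the number of real images per example from the count of image placeholder tokens in input_ids (divided by image_seq_len), then use the running total to slice each example's rows out of the batch-flattened vision-encoder output. A minimal standalone sketch of that indexing pattern, with toy values (image_token_id, image_seq_len, and all tensor sizes here are illustrative assumptions, not the model's real configuration):

import torch

image_token_id = 32000  # hypothetical id of the <image> placeholder token
image_seq_len = 4       # hypothetical number of placeholder tokens per image
vision_seq_len, hidden = 4, 8

# Batch of 2 examples: the first holds 2 images, the second 1 image
# (id 1 stands in for ordinary text tokens).
input_ids = torch.tensor([
    [1] + [image_token_id] * 8 + [1],      # 8 placeholders -> 2 images
    [1] + [image_token_id] * 4 + [1] * 5,  # 4 placeholders -> 1 image
])

# One row per real image across the whole batch, as the vision encoder
# produces once padding images have been removed: (total_images, seq, hidden).
image_hidden_states = torch.randn(3, vision_seq_len, hidden)

num_images = (input_ids == image_token_id).sum(dim=-1) // image_seq_len
cum_num_images = num_images.cumsum(dim=-1)

for batch_idx in range(input_ids.size(0)):
    # Consecutive slices of the flattened tensor belong to consecutive examples.
    start = 0 if batch_idx == 0 else cum_num_images[batch_idx - 1]
    end = cum_num_images[batch_idx]
    example_states = image_hidden_states[start:end]
    print(batch_idx, int(num_images[batch_idx]), tuple(example_states.shape))
# 0 2 (2, 4, 8)
# 1 1 (1, 4, 8)

Because padding images are filtered out before the vision encoder runs (see the second hunk below), image_hidden_states has exactly num_images.sum() rows, so this cumulative-count slicing lines up each example with its own images.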
@@ -1484,6 +1485,9 @@ class VMistralModel(VMistralPreTrainedModel):
         pixel_values = pixel_values.to(dtype=self.dtype, device=input_ids.device)  # fp16 compatibility
         batch_size, num_images = pixel_values.size(0), pixel_values.size(1)
         pixel_values = pixel_values.contiguous().view(batch_size * num_images, *pixel_values.shape[2:])
+        # Remove padding images - padding images are full 0.
+        real_images_inds = pixel_values.sum(dim=(-1, -2, -3)) != 0.0
+        pixel_values = pixel_values[real_images_inds]
         # Get sequence from the vision encoder
         image_hidden_states = self.vision_model(pixel_values=pixel_values).last_hidden_state
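The second hunk is where the padding images actually get dropped: padding slots are all-zero tensors, so any image whose pixel sum over the (channels, height, width) dims is 0.0 is discarded before the vision encoder runs. A small sketch of the filter, assuming toy tensor shapes:

import torch

batch_size, num_images, c, h, w = 2, 3, 3, 4, 4  # illustrative sizes
pixel_values = torch.zeros(batch_size, num_images, c, h, w)
pixel_values[0, 0] = torch.rand(c, h, w)  # example 0: 1 real image, 2 padding
pixel_values[1, 0] = torch.rand(c, h, w)  # example 1: 2 real images, 1 padding
pixel_values[1, 1] = torch.rand(c, h, w)

# Flatten the batch and image dims, as the forward pass above does.
pixel_values = pixel_values.view(batch_size * num_images, c, h, w)

# Keep only the images whose pixels are not all zero.
real_images_inds = pixel_values.sum(dim=(-1, -2, -3)) != 0.0
pixel_values = pixel_values[real_images_inds]
print(pixel_values.shape)  # torch.Size([3, 3, 4, 4])

This relies on padding slots being exact zeros: a real image whose normalized pixels happened to sum to exactly 0.0 would be misclassified as padding, which is practically never the case but worth keeping in mind.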