Adapt the inputs_merger() function to handle scenarios without image input
#5
by
folbaeni
- opened
- modeling_vmistral.py +30 -28
modeling_vmistral.py
CHANGED
@@ -1372,36 +1372,38 @@ class VMistralModel(VMistralPreTrainedModel):
|
|
1372 |
batch_size = input_ids.size(0)
|
1373 |
|
1374 |
if inputs_embeds is not None:
|
1375 |
-
vision_pipeline_output_seq_len = image_hidden_states.shape[1]
|
1376 |
-
vision_hidden_size = image_hidden_states.shape[2]
|
1377 |
new_inputs_embeds = inputs_embeds.clone()
|
1378 |
-
|
1379 |
-
|
1380 |
-
|
1381 |
-
|
1382 |
-
# Get the number of images for each example
|
1383 |
-
|
1384 |
-
|
1385 |
-
|
1386 |
-
|
1387 |
-
|
1388 |
-
|
1389 |
-
|
1390 |
-
|
1391 |
-
|
1392 |
-
|
1393 |
-
|
1394 |
-
|
1395 |
-
|
1396 |
-
|
1397 |
-
|
1398 |
-
|
1399 |
-
|
1400 |
-
|
1401 |
-
|
1402 |
-
|
|
|
|
|
|
|
|
|
|
|
1403 |
)
|
1404 |
-
)
|
1405 |
|
1406 |
return_dict = {}
|
1407 |
if inputs_embeds is not None:
|
|
|
1372 |
batch_size = input_ids.size(0)
|
1373 |
|
1374 |
if inputs_embeds is not None:
|
|
|
|
|
1375 |
new_inputs_embeds = inputs_embeds.clone()
|
1376 |
+
|
1377 |
+
if image_hidden_states is not None:
|
1378 |
+
vision_pipeline_output_seq_len = image_hidden_states.shape[1]
|
1379 |
+
vision_hidden_size = image_hidden_states.shape[2]
|
1380 |
+
# Get the number of images for each example
|
1381 |
+
num_images = (input_ids == self.image_token_id).sum(dim=-1) // self.image_seq_len
|
1382 |
+
cum_num_images = num_images.cumsum(dim=-1)
|
1383 |
+
for batch_idx in range(batch_size):
|
1384 |
+
# Get the number of images for this particular example
|
1385 |
+
example_num_images = num_images[batch_idx]
|
1386 |
+
# Get the image_hidden_states corresponding to True images for the example, so get rid of the padding images.
|
1387 |
+
start = 0 if batch_idx == 0 else cum_num_images[batch_idx - 1]
|
1388 |
+
end = cum_num_images[batch_idx]
|
1389 |
+
example_true_image_hidden_states = image_hidden_states[start:end]
|
1390 |
+
if (
|
1391 |
+
new_inputs_embeds[batch_idx][input_ids[batch_idx] == self.image_token_id].shape[0]
|
1392 |
+
!= example_num_images * vision_pipeline_output_seq_len
|
1393 |
+
):
|
1394 |
+
raise ValueError(
|
1395 |
+
"new_inputs_embeds to replace has shape[0]:"
|
1396 |
+
f" {new_inputs_embeds[batch_idx][input_ids[batch_idx] == self.image_token_id].shape[0]} but"
|
1397 |
+
" should have shape[0]:"
|
1398 |
+
f" {example_num_images}*{vision_pipeline_output_seq_len}={example_num_images * vision_pipeline_output_seq_len} "
|
1399 |
+
)
|
1400 |
+
# Insert the image_hidden_states
|
1401 |
+
new_inputs_embeds[batch_idx][input_ids[batch_idx] == self.image_token_id] = (
|
1402 |
+
example_true_image_hidden_states.view(
|
1403 |
+
example_num_images * vision_pipeline_output_seq_len,
|
1404 |
+
vision_hidden_size,
|
1405 |
+
)
|
1406 |
)
|
|
|
1407 |
|
1408 |
return_dict = {}
|
1409 |
if inputs_embeds is not None:
|