Add task_prefix_attention_mask Argument to _merge_input_ids_with_image_features for Better Padding Handling
This PR adds a `task_prefix_attention_mask=None` argument to the `_merge_input_ids_with_image_features` function. With it, batch processing with padding to the max length produces an attention mask that correctly ignores padding tokens.
Changes Made:
1. Added a `task_prefix_attention_mask=None` argument to `_merge_input_ids_with_image_features`.
2. Updated the function to incorporate the provided attention mask (falling back to an all-ones mask when none is given), so padding tokens are ignored during batch processing; the sketch after this list illustrates the resulting mask.
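To make the mask construction concrete, here is a minimal standalone sketch of the merge logic. The shapes and mask values are illustrative, not taken from the model: image tokens are always attended to, so their mask is all ones, and the task-prefix mask from the tokenizer is concatenated after it.

```python
import torch

# Illustrative values only: Florence-2 uses a fixed number of visual tokens,
# stood in here by num_image_tokens = 4.
batch_size, num_image_tokens = 3, 4

# Mask a tokenizer would produce for a padded batch (1 = real token, 0 = pad).
task_prefix_attention_mask = torch.tensor([
    [1., 1., 1., 0., 0.],
    [1., 1., 1., 1., 0.],
    [1., 1., 1., 1., 1.],
])

# Image tokens are always attended to, so their mask is all ones.
image_attention_mask = torch.ones(batch_size, num_image_tokens)

# The merged mask mirrors the [image embeds, task prefix embeds] concatenation.
attention_mask = torch.cat([image_attention_mask, task_prefix_attention_mask], dim=1)
print(attention_mask)
# tensor([[1., 1., 1., 1., 1., 1., 1., 0., 0.],
#         [1., 1., 1., 1., 1., 1., 1., 1., 0.],
#         [1., 1., 1., 1., 1., 1., 1., 1., 1.]])
```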
Below is an example demonstrating the issue and the improvement. The checkpoint name in the setup is illustrative; any Florence-2 checkpoint applies:
```python
import requests
import torch
from PIL import Image
from transformers import AutoModelForCausalLM, AutoProcessor

model_id = "microsoft/Florence-2-large"  # example checkpoint
model = AutoModelForCausalLM.from_pretrained(
    model_id, torch_dtype=torch.float16, trust_remote_code=True
).to("cuda")
processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)

prompts = ["prompt", "longer prompt", "much much longer prompt"]
url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/car.jpg?download=true"
image = Image.open(requests.get(url, stream=True).raw)
images = [image] * len(prompts)

inputs = processor(text=prompts, images=images, return_tensors="pt", padding=True).to("cuda", torch.float16)
inputs_embeds = model.get_input_embeddings()(inputs.input_ids)
image_features = model._encode_image(inputs.pixel_values)

print(inputs.input_ids)
# tensor([[    0, 12501,  3320,     2,     1,     1],
#         [    0,  3479,   254, 14302,     2,     1],
#         [    0, 28431,   203,  1181, 14302,     2]], device='cuda:0')

# Before the change: every position is attended to, including padding.
inputs_embeds, attention_mask = model._merge_input_ids_with_image_features(image_features, inputs_embeds)
print(attention_mask[:, -10:])
# tensor([[1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
#         [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
#         [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]], device='cuda:0')

# After the change: padding positions are masked out.
inputs_embeds, attention_mask = model._merge_input_ids_with_image_features(
    image_features, inputs_embeds, task_prefix_attention_mask=inputs.attention_mask
)
print(attention_mask[:, -10:])
# tensor([[1., 1., 1., 1., 1., 1., 1., 1., 0., 0.],
#         [1., 1., 1., 1., 1., 1., 1., 1., 1., 0.],
#         [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]], device='cuda:0')
```
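Note how the trailing zeros line up with the pad token id 1 in `input_ids` above: the shortest prompt gets two masked positions, the middle one gets one, and the longest gets none.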
The change to modeling_florence2.py (+7 −5):

```diff
@@ -2643,7 +2643,7 @@ class Florence2ForConditionalGeneration(Florence2PreTrainedModel):
         return x
 
     def _merge_input_ids_with_image_features(
-        self, image_features, inputs_embeds
+        self, image_features, inputs_embeds, task_prefix_attention_mask=None
     ):
         batch_size, image_token_length = image_features.size()[:-1]
         device = image_features.device
@@ -2655,10 +2655,12 @@ class Florence2ForConditionalGeneration(Florence2PreTrainedModel):
             return image_features, image_attention_mask
 
         task_prefix_embeds = inputs_embeds
-        task_prefix_attention_mask = torch.ones(batch_size, task_prefix_embeds.size(1), device=device)
 
-        if len(task_prefix_attention_mask.shape) == 3:
-            task_prefix_attention_mask = task_prefix_attention_mask[:, 0]
+        if task_prefix_attention_mask is None:
+            task_prefix_attention_mask = torch.ones(batch_size, task_prefix_embeds.size(1), device=device)
+
+        if len(task_prefix_attention_mask.shape) == 3:
+            task_prefix_attention_mask = task_prefix_attention_mask[:, 0]
 
         # concat [image embeds, task prefix embeds]
         inputs_embeds = torch.cat([image_features, task_prefix_embeds], dim=1)
@@ -2734,7 +2736,7 @@ class Florence2ForConditionalGeneration(Florence2PreTrainedModel):
             if pixel_values is not None:
                 # (batch_size, num_image_tokens, hidden_size)
                 image_features = self._encode_image(pixel_values)
-                inputs_embeds, attention_mask = self._merge_input_ids_with_image_features(image_features, inputs_embeds)
+                inputs_embeds, attention_mask = self._merge_input_ids_with_image_features(image_features, inputs_embeds, task_prefix_attention_mask=attention_mask)
 
         if inputs_embeds is not None:
             attention_mask = attention_mask.to(inputs_embeds.dtype)
```
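With the last hunk threading the caller's `attention_mask` into the merge, batched generation over padded prompts should respect padding end to end. A minimal usage sketch, reusing `model`, `processor`, and `inputs` from the example above and assuming `generate` accepts and forwards `attention_mask` after this change:

```python
# Sketch only: assumes the padded batch built earlier, and that generate()
# now passes attention_mask on as task_prefix_attention_mask during merging.
generated_ids = model.generate(
    input_ids=inputs.input_ids,
    pixel_values=inputs.pixel_values,
    attention_mask=inputs.attention_mask,
    max_new_tokens=128,
)
print(processor.batch_decode(generated_ids, skip_special_tokens=True))
```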
|