JesusCrist committed on
Commit cf77e31 · 1 parent: 1bb50bc

Update modeling.py


There is currently a bug where manually setting the device of the sentence transformer to a non-zero device triggers an error about tensors not being on the same device. This modification passes the device during the initialization of attention_bias and does not introduce additional risk or inference-time overhead.
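For context, a minimal reproduction sketch of the failure mode, assuming a checkpoint that loads this modeling.py via trust_remote_code; the model name and device index below are illustrative placeholders, not taken from this commit:

from sentence_transformers import SentenceTransformer

# Hypothetical repro sketch: the model name and device index are placeholders.
# Placing the model on any non-default GPU (e.g. cuda:1) is what exposed the
# bug: the attention bias was built on the default device, so the attention
# computation mixed tensors from two devices and raised a
# "tensors not on the same device" style RuntimeError.
model = SentenceTransformer(
    "your-org/your-gte-style-model",  # placeholder for a checkpoint using this modeling.py
    trust_remote_code=True,
    device="cuda:1",
)
embeddings = model.encode(["a sentence long enough to exercise the attention path"])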

Files changed (1)
  1. modeling.py  +2 -2
modeling.py CHANGED
@@ -897,11 +897,11 @@ class NewModel(NewPreTrainedModel):
 
         if unpad_inputs:
             assert self.config.use_memory_efficient_attention
-            attention_bias = xops.fmha.attn_bias.BlockDiagonalMask.from_seqlens(length)
+            attention_bias = xops.fmha.attn_bias.BlockDiagonalMask.from_seqlens(length, device=self.device)
         else:
             # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
             # ourselves in which case we just need to make it broadcastable to all heads.
-            attention_bias = self.get_extended_attention_mask(attention_mask, input_shape)
+            attention_bias = self.get_extended_attention_mask(attention_mask, input_shape, device=self.device)
             if self.config.use_memory_efficient_attention:
                 # Invalid shape for attention bias: torch.Size([48, 1, 1, 512]) (expected (48, 12, 512, 512))
                 attention_bias = attention_bias.expand(-1, self.config.num_attention_heads, seq_length, -1)
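
The underlying idea, independent of xformers, is simply to build auxiliary tensors such as attention masks on the module's own device instead of the framework default. A minimal sketch of that pattern with hypothetical names (ToyAttention is not part of this repository):

import torch
import torch.nn as nn

class ToyAttention(nn.Module):
    """Toy module illustrating the device-aware mask pattern (not the repo's code)."""

    def __init__(self, hidden_size: int = 8):
        super().__init__()
        self.qkv = nn.Linear(hidden_size, hidden_size * 3)

    @property
    def device(self) -> torch.device:
        # Same idea as self.device in the patched modeling.py: derive the
        # device from the module's parameters instead of assuming the default GPU.
        return next(self.parameters()).device

    def forward(self, x: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
        q, k, v = self.qkv(x).chunk(3, dim=-1)
        scores = q @ k.transpose(-2, -1) / q.shape[-1] ** 0.5
        # Build the additive bias on self.device so it always matches the
        # activations, even when the module lives on a non-default device.
        bias = torch.zeros(attention_mask.shape, device=self.device, dtype=scores.dtype)
        bias = bias.masked_fill(attention_mask == 0, float("-inf"))
        scores = scores + bias[:, None, :]  # broadcast the key mask over query positions
        return torch.softmax(scores, dim=-1) @ v

# Usage sketch: move the module to a non-default device and the mask follows it.
# attn = ToyAttention().to("cuda:1")
# out = attn(torch.randn(2, 4, 8, device="cuda:1"), torch.ones(2, 4, device="cuda:1"))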