Jackmin108 committed
Commit b5794c5 · Parent(s): 344bcbc

Allow pytorch<2 to use without passing attn_implementation flag (#4)

- Allow pytorch<2 to use without passing attn_implementation flag (969de83296b491de451557789e9770b9335612bb)
- modeling_bert.py +1 -1
modeling_bert.py CHANGED (+1 -1)

@@ -353,7 +353,7 @@ class JinaBertSelfAttention(nn.Module):
         # if encoder bi-directional self-attention `past_key_value` is always `None`
         past_key_value = (key_layer, value_layer)

-        if self.attn_implementation == 'torch':
+        if self.attn_implementation == 'torch' and scaled_dot_product_attention is not None:
             b, _, s, _ = query_layer.shape
             new_bias = attention_mask + bias
             attn = scaled_dot_product_attention(query_layer, key_layer, value_layer, new_bias)
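For context, the added check only helps if `scaled_dot_product_attention` resolves to `None` on PyTorch releases older than 2.0, where `torch.nn.functional.scaled_dot_product_attention` does not exist. A minimal sketch of the kind of conditional import the guard presumably pairs with elsewhere in modeling_bert.py (assumed placement, not shown in this diff):

    # Sketch only: resolve SDPA availability once at import time.
    try:
        # torch >= 2.0 ships the fused scaled dot-product attention kernel
        from torch.nn.functional import scaled_dot_product_attention
    except ImportError:
        # torch < 2.0: leave the name as None so the `torch` branch above is
        # skipped and attention falls back to the eager matmul + softmax path.
        scaled_dot_product_attention = None

With that fallback, the default attn_implementation='torch' no longer errors on pytorch<2, which matches the commit's intent: users do not have to pass the attn_implementation flag to opt out of SDPA.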