Update modeling_baichuan.py
modeling_baichuan.py  CHANGED  (+8 -6)
@@ -173,12 +173,14 @@ class BaichuanAttention(torch.nn.Module):
         past_key_value = (key_states, value_states) if use_cache else None
         if xops is not None and self.training:
             attn_weights = None
-            query_states = query_states.transpose(1, 2)
-            key_states = key_states.transpose(1, 2)
-            value_states = value_states.transpose(1, 2)
-            attn_output = xops.memory_efficient_attention(
-                query_states, key_states, value_states, attn_bias=attention_mask
-            )
+            # query_states = query_states.transpose(1, 2)
+            # key_states = key_states.transpose(1, 2)
+            # value_states = value_states.transpose(1, 2)
+            # attn_output = xops.memory_efficient_attention(
+            #     query_states, key_states, value_states, attn_bias=attention_mask
+            # )
+            with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=True, enable_mem_efficient=True):
+                attn_output = F.scaled_dot_product_attention(query_states, key_states, value_states, attn_mask = attention_mask)
         else:
             attn_weights = torch.matmul(
                 query_states, key_states.transpose(2, 3)
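For context, the change swaps xformers' `xops.memory_efficient_attention` for PyTorch's built-in `F.scaled_dot_product_attention`. The transposes are dropped because SDPA consumes tensors laid out as `(batch, num_heads, seq_len, head_dim)`, which is the layout this attention module already has, whereas the xformers op expects `(batch, seq_len, num_heads, head_dim)`. The `sdp_kernel` context manager simply allows PyTorch to pick the flash, memory-efficient, or math backend. Below is a minimal, standalone sketch (not part of the commit; tensor sizes and the all-zeros additive mask are illustrative assumptions) showing the expected shapes for the SDPA call:

```python
# Illustrative sketch only: shows the (batch, num_heads, seq_len, head_dim)
# layout that F.scaled_dot_product_attention expects, matching the diff above.
import torch
import torch.nn.functional as F

batch, num_heads, seq_len, head_dim = 2, 4, 16, 32  # assumed toy sizes
q = torch.randn(batch, num_heads, seq_len, head_dim)
k = torch.randn(batch, num_heads, seq_len, head_dim)
v = torch.randn(batch, num_heads, seq_len, head_dim)

# Additive float mask broadcastable to (batch, num_heads, seq_len, seq_len),
# standing in for the `attention_mask` passed in the patched code.
attn_mask = torch.zeros(batch, 1, seq_len, seq_len)

attn_output = F.scaled_dot_product_attention(q, k, v, attn_mask=attn_mask)
print(attn_output.shape)  # torch.Size([2, 4, 16, 32])
```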