microsoft
/

Phi-3-mini-4k-instruct

@@ -546,7 +546,7 @@ class Phi3FlashAttention2(Phi3Attention):
         # This might slow down training & inference, so it is recommended not to cast the LayerNorms
         # in fp32.
-        if query_states.dtype == torch.float32:
             if torch.is_autocast_enabled():
                 target_dtype = torch.get_autocast_gpu_dtype()
             # Handle the case where the model is quantized

         # This might slow down training & inference, so it is recommended not to cast the LayerNorms
         # in fp32.
+        if query_states.dtype == torch.float32 or key_states.dtype == torch.float32:
             if torch.is_autocast_enabled():
                 target_dtype = torch.get_autocast_gpu_dtype()
             # Handle the case where the model is quantized