Synthyra
/

DPLM-3B

@@ -482,7 +482,17 @@ def _try_get_kernels_flash():
     return flash_kernel, flash_kernel_variant
-FLASH_KERNEL, FLASH_KERNEL_VARIANT = _try_get_kernels_flash()
 def _kernels_flash_forward(
@@ -656,6 +666,8 @@ def resolve_attention_backend(requested_backend: str) -> AttentionBackend:
     assert requested_backend in VALID_ATTENTION_BACKENDS, (
         f"Unsupported attention backend: {requested_backend}. Expected one of {VALID_ATTENTION_BACKENDS}."
     )
     if requested_backend == AttentionBackend.AUTO.value:
         if FLASH_KERNEL is not None:
             resolved = AttentionBackend.KERNELS_FLASH
@@ -945,9 +957,6 @@ class ModifiedEsmSelfAttention(EsmSelfAttention):
         flex_block_mask: "BlockMask | None" = None,
     ) -> tuple[torch.Tensor, None]:
         assert flex_attention is not None, "Flex attention is not available in this environment."
-        assert query_BHLD.dtype in (torch.float16, torch.bfloat16), (
-            f"Flex attention requires float16 or bfloat16, got {query_BHLD.dtype}."
-        )
         fn = _get_flex_attention_fn()
         context_BHLD = fn(query_BHLD, key_BHLD, value_BHLD, block_mask=flex_block_mask, scale=1.0)
         return rearrange(context_BHLD, "b h s d -> b s (h d)"), None

     return flash_kernel, flash_kernel_variant
+# Lazy-load state: the flag records that a load has been attempted; both
+# kernel handles remain None until that attempt happens.
+_FLASH_KERNELS_LOADED = False
+FLASH_KERNEL = FLASH_KERNEL_VARIANT = None
+def _ensure_flash_kernels_loaded():
+    """Fill FLASH_KERNEL / FLASH_KERNEL_VARIANT on the first call only.
+
+    The flag is raised before `_try_get_kernels_flash()` is invoked, so the
+    lookup is attempted at most once; every later call is a no-op.
+    """
+    global _FLASH_KERNELS_LOADED, FLASH_KERNEL, FLASH_KERNEL_VARIANT
+    if not _FLASH_KERNELS_LOADED:
+        _FLASH_KERNELS_LOADED = True
+        FLASH_KERNEL, FLASH_KERNEL_VARIANT = _try_get_kernels_flash()
 def _kernels_flash_forward(
     assert requested_backend in VALID_ATTENTION_BACKENDS, (
         f"Unsupported attention backend: {requested_backend}. Expected one of {VALID_ATTENTION_BACKENDS}."
     )
+    if requested_backend in (AttentionBackend.AUTO.value, AttentionBackend.KERNELS_FLASH.value):
+        _ensure_flash_kernels_loaded()
     if requested_backend == AttentionBackend.AUTO.value:
         if FLASH_KERNEL is not None:
             resolved = AttentionBackend.KERNELS_FLASH
         flex_block_mask: "BlockMask | None" = None,
     ) -> tuple[torch.Tensor, None]:
         assert flex_attention is not None, "Flex attention is not available in this environment."
         fn = _get_flex_attention_fn()
         context_BHLD = fn(query_BHLD, key_BHLD, value_BHLD, block_mask=flex_block_mask, scale=1.0)
         return rearrange(context_BHLD, "b h s d -> b s (h d)"), None