lhallee committed on
Commit
cfa15aa
·
verified ·
1 Parent(s): 2429419

Upload modeling_fast_esmfold.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. modeling_fast_esmfold.py +50 -35
modeling_fast_esmfold.py CHANGED
@@ -1,3 +1,5 @@
 
 
1
  import torch
2
  import torch._inductor.config as inductor_config
3
  import torch._dynamo as dynamo
@@ -27,7 +29,8 @@ Contains: AttentionBackend enum, backend resolution, mask creation,
27
  flex attention helpers, flash kernel detection/dispatch, and pad/unpad utilities.
28
  """
29
  from enum import Enum
30
- from typing import Optional
 
31
 
32
  import torch
33
  import torch.nn as nn
@@ -45,7 +48,12 @@ _compiled_flex_attention = None
45
 
46
 
47
  def _get_flex_attention_fn():
48
- """Return flex_attention callable: compiled (fused kernel) by default, or eager when debug flag is set."""
 
 
 
 
 
49
  global _compiled_flex_attention
50
  if flex_attention is None:
51
  return None
@@ -53,12 +61,15 @@ def _get_flex_attention_fn():
53
  if getattr(flex_mod, "_FLEX_ATTENTION_DISABLE_COMPILE_DEBUG", False):
54
  return flex_attention
55
  if _compiled_flex_attention is None:
56
- _compiled_flex_attention = torch.compile(flex_attention)
 
 
 
57
  return _compiled_flex_attention
58
 
59
 
60
  ### Kernels Flash Attention Detection
61
- def _infer_kernels_flash_variant(kernel) -> str | None:
62
  if hasattr(kernel, "fwd") and hasattr(kernel, "varlen_fwd"):
63
  return "flash_attn2"
64
  if hasattr(kernel, "flash_attn_func") and hasattr(kernel, "flash_attn_varlen_func"):
@@ -174,7 +185,7 @@ class IndexFirstAxis(torch.autograd.Function):
174
  ).reshape(-1, *other_shape)
175
 
176
  @staticmethod
177
- def backward(ctx, grad_output) -> tuple[torch.Tensor, None]:
178
  (indices,) = ctx.saved_tensors
179
  assert grad_output.ndim >= 2
180
  other_shape = grad_output.shape[1:]
@@ -197,7 +208,7 @@ class IndexPutFirstAxis(torch.autograd.Function):
197
  return output
198
 
199
  @staticmethod
200
- def backward(ctx, grad_output) -> tuple[torch.Tensor, None, None]:
201
  (indices,) = ctx.saved_tensors
202
  return grad_output[indices], None, None
203
 
@@ -216,7 +227,7 @@ def _unpad_input(
216
  key_layer: torch.Tensor,
217
  value_layer: torch.Tensor,
218
  attention_mask_2d: torch.Tensor,
219
- ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, tuple[torch.Tensor, torch.Tensor], tuple[int, int]]:
220
  batch_size, seq_len, num_heads, head_dim = query_layer.shape
221
  seqlens = attention_mask_2d.sum(dim=1).int()
222
  cu_seqlens = F.pad(seqlens.cumsum(0, dtype=torch.int32), (1, 0))
@@ -232,7 +243,7 @@ def kernels_flash_attention_func(
232
  query_states: torch.Tensor,
233
  key_states: torch.Tensor,
234
  value_states: torch.Tensor,
235
- attention_mask_2d: torch.Tensor | None = None,
236
  causal: bool = False,
237
  ) -> torch.Tensor:
238
  assert FLASH_KERNEL is not None, "Kernel Flash Attention is not available in this environment."
@@ -305,7 +316,7 @@ def get_attention_mask(
305
  seq_len: int,
306
  device: torch.device,
307
  attention_mask: Optional[torch.Tensor] = None,
308
- ) -> tuple[torch.Tensor | None, torch.Tensor | None, "BlockMask | None"]:
309
  """Build padding masks once for all encoder layers.
310
 
311
  Returns (attention_mask_2d, attention_mask_4d, flex_block_mask).
@@ -418,11 +429,11 @@ class EsmSelfAttention(nn.Module):
418
  def forward(
419
  self,
420
  hidden_states: torch.Tensor,
421
- attention_mask_2d: torch.Tensor | None = None,
422
- attention_mask_4d: torch.Tensor | None = None,
423
- flex_block_mask: "BlockMask | None" = None,
424
  output_attentions: bool = False,
425
- ) -> tuple[torch.Tensor, torch.Tensor | None]:
426
  batch_size, seq_length = hidden_states.shape[:-1]
427
  hidden_shape = (batch_size, seq_length, -1, self.attention_head_size)
428
  query_BHLD = self.query(hidden_states).view(hidden_shape).transpose(1, 2)
@@ -448,11 +459,11 @@ class EsmSelfAttention(nn.Module):
448
  query_BHLD: torch.Tensor,
449
  key_BHLD: torch.Tensor,
450
  value_BHLD: torch.Tensor,
451
- attention_mask_2d: torch.Tensor | None = None,
452
- attention_mask_4d: torch.Tensor | None = None,
453
- flex_block_mask: "BlockMask | None" = None,
454
  output_attentions: bool = False,
455
- ) -> tuple[torch.Tensor, torch.Tensor | None]:
456
  if output_attentions:
457
  return self._manual_attn(query_BHLD, key_BHLD, value_BHLD, attention_mask_4d)
458
 
@@ -470,8 +481,8 @@ class EsmSelfAttention(nn.Module):
470
  query_BHLD: torch.Tensor,
471
  key_BHLD: torch.Tensor,
472
  value_BHLD: torch.Tensor,
473
- attention_mask_4d: torch.Tensor | None = None,
474
- ) -> tuple[torch.Tensor, torch.Tensor]:
475
  attn_weights = torch.matmul(query_BHLD, key_BHLD.transpose(-1, -2))
476
  if attention_mask_4d is not None:
477
  attn_weights = attn_weights.masked_fill(attention_mask_4d.logical_not(), float("-inf"))
@@ -487,8 +498,8 @@ class EsmSelfAttention(nn.Module):
487
  query_BHLD: torch.Tensor,
488
  key_BHLD: torch.Tensor,
489
  value_BHLD: torch.Tensor,
490
- attention_mask_2d: torch.Tensor | None = None,
491
- ) -> tuple[torch.Tensor, None]:
492
  query_BLHD = query_BHLD.transpose(1, 2).contiguous()
493
  key_BLHD = key_BHLD.transpose(1, 2).contiguous()
494
  value_BLHD = value_BHLD.transpose(1, 2).contiguous()
@@ -503,8 +514,8 @@ class EsmSelfAttention(nn.Module):
503
  query_BHLD: torch.Tensor,
504
  key_BHLD: torch.Tensor,
505
  value_BHLD: torch.Tensor,
506
- flex_block_mask: "BlockMask | None" = None,
507
- ) -> tuple[torch.Tensor, None]:
508
  assert flex_attention is not None, "Flex attention is not available in this environment."
509
  fn = _get_flex_attention_fn()
510
  context_BHLD = fn(query_BHLD, key_BHLD, value_BHLD, block_mask=flex_block_mask, scale=1.0)
@@ -515,8 +526,8 @@ class EsmSelfAttention(nn.Module):
515
  query_BHLD: torch.Tensor,
516
  key_BHLD: torch.Tensor,
517
  value_BHLD: torch.Tensor,
518
- attention_mask_4d: torch.Tensor | None = None,
519
- ) -> tuple[torch.Tensor, None]:
520
  context_BHLD = F.scaled_dot_product_attention(
521
  query_BHLD, key_BHLD, value_BHLD,
522
  attn_mask=attention_mask_4d,
@@ -536,11 +547,11 @@ class EsmAttention(nn.Module):
536
  def forward(
537
  self,
538
  hidden_states: torch.Tensor,
539
- attention_mask_2d: torch.Tensor | None = None,
540
- attention_mask_4d: torch.Tensor | None = None,
541
- flex_block_mask: "BlockMask | None" = None,
542
  output_attentions: bool = False,
543
- ) -> tuple[torch.Tensor, torch.Tensor | None]:
544
  hidden_states_ln = self.LayerNorm(hidden_states)
545
  attn_output, attn_weights = self.self(
546
  hidden_states_ln,
@@ -564,11 +575,11 @@ class EsmLayer(nn.Module):
564
  def forward(
565
  self,
566
  hidden_states: torch.Tensor,
567
- attention_mask_2d: torch.Tensor | None = None,
568
- attention_mask_4d: torch.Tensor | None = None,
569
- flex_block_mask: "BlockMask | None" = None,
570
  output_attentions: bool = False,
571
- ) -> tuple[torch.Tensor, torch.Tensor | None]:
572
  attention_output, attn_weights = self.attention(
573
  hidden_states,
574
  attention_mask_2d=attention_mask_2d,
@@ -1203,8 +1214,12 @@ class FastEsmForProteinFolding(EsmForProteinFolding):
1203
  with torch.no_grad():
1204
  output = self.infer(sequence)
1205
  plddt = output["plddt"]
1206
- if plddt.dim() >= 2:
1207
- mean_plddt = float(plddt.mean(dim=-1).mean().item())
 
 
 
 
1208
  else:
1209
  mean_plddt = float(plddt.mean().item())
1210
  result = {
 
1
+ from __future__ import annotations
2
+
3
  import torch
4
  import torch._inductor.config as inductor_config
5
  import torch._dynamo as dynamo
 
29
  flex attention helpers, flash kernel detection/dispatch, and pad/unpad utilities.
30
  """
31
  from enum import Enum
32
+ from functools import partial
33
+ from typing import Dict, List, Optional, Tuple
34
 
35
  import torch
36
  import torch.nn as nn
 
48
 
49
 
50
  def _get_flex_attention_fn():
51
+ """Return flex_attention callable: compiled (fused kernel) by default, or eager when debug flag is set.
52
+
53
+ Uses kernel_options={"BACKEND": "FLASH"} to prefer Flash Attention 4 (FA4)
54
+ on Hopper/Blackwell GPUs (PyTorch 2.11+). Automatically falls back to Triton
55
+ on older hardware.
56
+ """
57
  global _compiled_flex_attention
58
  if flex_attention is None:
59
  return None
 
61
  if getattr(flex_mod, "_FLEX_ATTENTION_DISABLE_COMPILE_DEBUG", False):
62
  return flex_attention
63
  if _compiled_flex_attention is None:
64
+ _compiled_flex_attention = torch.compile(
65
+ partial(flex_attention, kernel_options={"BACKEND": "FLASH"}),
66
+ dynamic=False,
67
+ )
68
  return _compiled_flex_attention
69
 
70
 
71
  ### Kernels Flash Attention Detection
72
+ def _infer_kernels_flash_variant(kernel) -> Optional[str]:
73
  if hasattr(kernel, "fwd") and hasattr(kernel, "varlen_fwd"):
74
  return "flash_attn2"
75
  if hasattr(kernel, "flash_attn_func") and hasattr(kernel, "flash_attn_varlen_func"):
 
185
  ).reshape(-1, *other_shape)
186
 
187
  @staticmethod
188
+ def backward(ctx, grad_output) -> Tuple[torch.Tensor, None]:
189
  (indices,) = ctx.saved_tensors
190
  assert grad_output.ndim >= 2
191
  other_shape = grad_output.shape[1:]
 
208
  return output
209
 
210
  @staticmethod
211
+ def backward(ctx, grad_output) -> Tuple[torch.Tensor, None, None]:
212
  (indices,) = ctx.saved_tensors
213
  return grad_output[indices], None, None
214
 
 
227
  key_layer: torch.Tensor,
228
  value_layer: torch.Tensor,
229
  attention_mask_2d: torch.Tensor,
230
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, Tuple[torch.Tensor, torch.Tensor], Tuple[int, int]]:
231
  batch_size, seq_len, num_heads, head_dim = query_layer.shape
232
  seqlens = attention_mask_2d.sum(dim=1).int()
233
  cu_seqlens = F.pad(seqlens.cumsum(0, dtype=torch.int32), (1, 0))
 
243
  query_states: torch.Tensor,
244
  key_states: torch.Tensor,
245
  value_states: torch.Tensor,
246
+ attention_mask_2d: Optional[torch.Tensor] = None,
247
  causal: bool = False,
248
  ) -> torch.Tensor:
249
  assert FLASH_KERNEL is not None, "Kernel Flash Attention is not available in this environment."
 
316
  seq_len: int,
317
  device: torch.device,
318
  attention_mask: Optional[torch.Tensor] = None,
319
+ ) -> Tuple[Optional[torch.Tensor], Optional[torch.Tensor], Optional[BlockMask]]:
320
  """Build padding masks once for all encoder layers.
321
 
322
  Returns (attention_mask_2d, attention_mask_4d, flex_block_mask).
 
429
  def forward(
430
  self,
431
  hidden_states: torch.Tensor,
432
+ attention_mask_2d: Optional[torch.Tensor] = None,
433
+ attention_mask_4d: Optional[torch.Tensor] = None,
434
+ flex_block_mask: Optional[BlockMask] = None,
435
  output_attentions: bool = False,
436
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
437
  batch_size, seq_length = hidden_states.shape[:-1]
438
  hidden_shape = (batch_size, seq_length, -1, self.attention_head_size)
439
  query_BHLD = self.query(hidden_states).view(hidden_shape).transpose(1, 2)
 
459
  query_BHLD: torch.Tensor,
460
  key_BHLD: torch.Tensor,
461
  value_BHLD: torch.Tensor,
462
+ attention_mask_2d: Optional[torch.Tensor] = None,
463
+ attention_mask_4d: Optional[torch.Tensor] = None,
464
+ flex_block_mask: Optional[BlockMask] = None,
465
  output_attentions: bool = False,
466
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
467
  if output_attentions:
468
  return self._manual_attn(query_BHLD, key_BHLD, value_BHLD, attention_mask_4d)
469
 
 
481
  query_BHLD: torch.Tensor,
482
  key_BHLD: torch.Tensor,
483
  value_BHLD: torch.Tensor,
484
+ attention_mask_4d: Optional[torch.Tensor] = None,
485
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
486
  attn_weights = torch.matmul(query_BHLD, key_BHLD.transpose(-1, -2))
487
  if attention_mask_4d is not None:
488
  attn_weights = attn_weights.masked_fill(attention_mask_4d.logical_not(), float("-inf"))
 
498
  query_BHLD: torch.Tensor,
499
  key_BHLD: torch.Tensor,
500
  value_BHLD: torch.Tensor,
501
+ attention_mask_2d: Optional[torch.Tensor] = None,
502
+ ) -> Tuple[torch.Tensor, None]:
503
  query_BLHD = query_BHLD.transpose(1, 2).contiguous()
504
  key_BLHD = key_BHLD.transpose(1, 2).contiguous()
505
  value_BLHD = value_BHLD.transpose(1, 2).contiguous()
 
514
  query_BHLD: torch.Tensor,
515
  key_BHLD: torch.Tensor,
516
  value_BHLD: torch.Tensor,
517
+ flex_block_mask: Optional[BlockMask] = None,
518
+ ) -> Tuple[torch.Tensor, None]:
519
  assert flex_attention is not None, "Flex attention is not available in this environment."
520
  fn = _get_flex_attention_fn()
521
  context_BHLD = fn(query_BHLD, key_BHLD, value_BHLD, block_mask=flex_block_mask, scale=1.0)
 
526
  query_BHLD: torch.Tensor,
527
  key_BHLD: torch.Tensor,
528
  value_BHLD: torch.Tensor,
529
+ attention_mask_4d: Optional[torch.Tensor] = None,
530
+ ) -> Tuple[torch.Tensor, None]:
531
  context_BHLD = F.scaled_dot_product_attention(
532
  query_BHLD, key_BHLD, value_BHLD,
533
  attn_mask=attention_mask_4d,
 
547
  def forward(
548
  self,
549
  hidden_states: torch.Tensor,
550
+ attention_mask_2d: Optional[torch.Tensor] = None,
551
+ attention_mask_4d: Optional[torch.Tensor] = None,
552
+ flex_block_mask: Optional[BlockMask] = None,
553
  output_attentions: bool = False,
554
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
555
  hidden_states_ln = self.LayerNorm(hidden_states)
556
  attn_output, attn_weights = self.self(
557
  hidden_states_ln,
 
575
  def forward(
576
  self,
577
  hidden_states: torch.Tensor,
578
+ attention_mask_2d: Optional[torch.Tensor] = None,
579
+ attention_mask_4d: Optional[torch.Tensor] = None,
580
+ flex_block_mask: Optional[BlockMask] = None,
581
  output_attentions: bool = False,
582
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
583
  attention_output, attn_weights = self.attention(
584
  hidden_states,
585
  attention_mask_2d=attention_mask_2d,
 
1214
  with torch.no_grad():
1215
  output = self.infer(sequence)
1216
  plddt = output["plddt"]
1217
+ # plddt shape is (batch, L, 37) - per-atom across atom37 types.
1218
+ # Use CA atom (index 1) only, matching PDB B-factor output.
1219
+ if plddt.dim() == 3:
1220
+ mean_plddt = float(plddt[:, :, 1].mean().item())
1221
+ elif plddt.dim() == 2:
1222
+ mean_plddt = float(plddt[:, 1].mean().item())
1223
  else:
1224
  mean_plddt = float(plddt.mean().item())
1225
  result = {