Update bert_layers.py
bert_layers.py  CHANGED  (+33, -27)
@@ -410,13 +410,13 @@ class BertEncoder(nn.Module):
         attention_mask: torch.Tensor,
         output_all_encoded_layers: Optional[bool] = True,
         subset_mask: Optional[torch.Tensor] = None,
-    ) -> List[torch.Tensor]:
+    ) -> Tuple[List[torch.Tensor], torch.Tensor]:  # Modify return type to include attention weights
 
         extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
         extended_attention_mask = extended_attention_mask.to(
             dtype=torch.float32)  # fp16 compatibility
         extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
-
+
         attention_mask_bool = attention_mask.bool()
         batch, seqlen = hidden_states.shape[:2]
         # Unpad inputs and mask. It will remove tokens that are padded.
@@ -426,7 +426,7 @@ class BertEncoder(nn.Module):
         # hidden_states[ntokens,hidden] -> hidden_states[ntokens_unpad,hidden]
         hidden_states, indices, cu_seqlens, _ = unpad_input(
             hidden_states, attention_mask_bool)
-
+
         # Add alibi matrix to extended_attention_mask
         if self._current_alibi_size < seqlen:
             # Rebuild the alibi tensor when needed
@@ -440,17 +440,20 @@ class BertEncoder(nn.Module):
         alibi_bias = self.alibi[:, :, :seqlen, :seqlen]
         attn_bias = extended_attention_mask[:, :, :seqlen, :seqlen]
         alibi_attn_mask = attn_bias + alibi_bias
-
+
         all_encoder_layers = []
+        all_attention_weights = []  # List to store attention weights
+
         if subset_mask is None:
             for layer_module in self.layer:
-                hidden_states = layer_module(hidden_states,
-                                             cu_seqlens,
-                                             seqlen,
-                                             None,
-                                             indices,
-                                             attn_mask=attention_mask,
-                                             bias=alibi_attn_mask)
+                hidden_states, attention_weights = layer_module(hidden_states,
+                                                                cu_seqlens,
+                                                                seqlen,
+                                                                None,
+                                                                indices,
+                                                                attn_mask=attention_mask,
+                                                                bias=alibi_attn_mask)
+                all_attention_weights.append(attention_weights)  # Store attention weights
                 if output_all_encoded_layers:
                     all_encoder_layers.append(hidden_states)
             # Pad inputs and mask. It will insert back zero-padded tokens.
@@ -462,28 +465,31 @@ class BertEncoder(nn.Module):
         else:
             for i in range(len(self.layer) - 1):
                 layer_module = self.layer[i]
-                hidden_states = layer_module(hidden_states,
-                                             cu_seqlens,
-                                             seqlen,
-                                             None,
-                                             indices,
-                                             attn_mask=attention_mask,
-                                             bias=alibi_attn_mask)
+                hidden_states, attention_weights = layer_module(hidden_states,
+                                                                cu_seqlens,
+                                                                seqlen,
+                                                                None,
+                                                                indices,
+                                                                attn_mask=attention_mask,
+                                                                bias=alibi_attn_mask)
+                all_attention_weights.append(attention_weights)  # Store attention weights
                 if output_all_encoded_layers:
                     all_encoder_layers.append(hidden_states)
             subset_idx = torch.nonzero(subset_mask[attention_mask_bool],
                                        as_tuple=False).flatten()
-            hidden_states = self.layer[-1](hidden_states,
-                                           cu_seqlens,
-                                           seqlen,
-                                           subset_idx=subset_idx,
-                                           indices=indices,
-                                           attn_mask=attention_mask,
-                                           bias=alibi_attn_mask)
-
+            hidden_states, attention_weights = self.layer[-1](hidden_states,
+                                                              cu_seqlens,
+                                                              seqlen,
+                                                              subset_idx=subset_idx,
+                                                              indices=indices,
+                                                              attn_mask=attention_mask,
+                                                              bias=alibi_attn_mask)
+            all_attention_weights.append(attention_weights)  # Store attention weights
+
         if not output_all_encoded_layers:
             all_encoder_layers.append(hidden_states)
-        return all_encoder_layers
+        return all_encoder_layers, all_attention_weights  # Return both hidden states and attention weights
+
 
 
 class BertPooler(nn.Module):
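For this change to run, every layer in self.layer (including self.layer[-1]) must itself be updated to return a (hidden_states, attention_weights) tuple instead of a bare tensor; that companion change to the per-layer module is not part of this diff. Note also that all_attention_weights is a list with one entry per layer, so Tuple[List[torch.Tensor], List[torch.Tensor]] would describe the new return value more precisely than the annotation used above. A minimal, self-contained sketch of the per-layer contract the encoder loop now assumes, using torch.nn.MultiheadAttention as a stand-in for the fused attention actually used in bert_layers.py (all names below are illustrative, not the repository's own layer code):

import torch
import torch.nn as nn


class ToyLayer(nn.Module):
    """Illustrative stand-in for a per-layer module that also returns attention weights.

    The real per-layer module in bert_layers.py works on unpadded inputs
    (cu_seqlens, indices, ...) and may use a fused kernel that never
    materializes attention probabilities; this toy only demonstrates the
    return contract assumed by the modified BertEncoder.forward.
    """

    def __init__(self, hidden_size: int, num_heads: int):
        super().__init__()
        self.attn = nn.MultiheadAttention(hidden_size, num_heads, batch_first=True)

    def forward(self, hidden_states: torch.Tensor):
        # need_weights=True makes MultiheadAttention return the attention
        # probabilities (averaged over heads by default) next to the output.
        attn_output, attention_weights = self.attn(hidden_states, hidden_states,
                                                   hidden_states, need_weights=True)
        # Return a tuple so the encoder can unpack:
        #     hidden_states, attention_weights = layer_module(...)
        return attn_output, attention_weights


# Quick shape check of the assumed contract.
layer = ToyLayer(hidden_size=16, num_heads=4)
x = torch.randn(2, 8, 16)  # [batch, seqlen, hidden]
out, weights = layer(x)
print(out.shape, weights.shape)  # torch.Size([2, 8, 16]) torch.Size([2, 8, 8])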
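Because the return type of BertEncoder.forward changes, downstream callers (for example a BertModel-style wrapper that currently does encoder_outputs = self.encoder(...)) will also need a two-value unpacking once this lands. Below is a small, self-contained sketch of how the collected weights could be post-processed, assuming each per-layer tensor has the common [batch, num_heads, seqlen, seqlen] layout; the actual shape depends on the attention implementation inside the layers, which this diff does not show:

import torch

# Stand-in for the list built by the modified BertEncoder.forward:
# one attention tensor per transformer layer (random data for illustration).
num_layers, batch, num_heads, seqlen = 2, 3, 4, 8
all_attention_weights = [
    torch.softmax(torch.randn(batch, num_heads, seqlen, seqlen), dim=-1)
    for _ in range(num_layers)
]

# Stack into a single [num_layers, batch, num_heads, seqlen, seqlen] tensor.
stacked = torch.stack(all_attention_weights, dim=0)

# Average over heads to get one [seqlen, seqlen] attention map per layer and example.
per_layer_maps = stacked.mean(dim=2)
print(per_layer_maps.shape)  # torch.Size([2, 3, 8, 8])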