jaandoui committed
Commit 969245c
1 Parent(s): 65dd5c9

Update bert_layers.py

Files changed (1)
  1. bert_layers.py +13 -3
bert_layers.py CHANGED
@@ -169,9 +169,12 @@ class BertUnpadSelfAttention(nn.Module):
                 self.attention_head_size)
             attention_scores = attention_scores + bias
             attention_probs = nn.functional.softmax(attention_scores, dim=-1)
+            print(f'BUSA: attention_probs 1 shape: {attention_probs.shape}')
             attention_probs = self.dropout(attention_probs)
+            print(f'BUSA: attention_probs 2 shape: {attention_probs.shape}')
             attention = torch.matmul(attention_probs, v).permute(0, 2, 1,
                                                                  3)  # b s h d
+            print(f'BUSA: attention shape: {attention.shape}')
         else:
             # Triton implementation only supports 0 attention dropout
             convert_dtype = qkv.dtype not in [torch.float16, torch.bfloat16]
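
Note on the hunk above: the new prints trace the shapes along the non-Triton attention path. A minimal standalone sketch (made-up sizes, not the repo's packed-QKV code) of what those shapes look like: attention_probs is (batch, heads, seq, seq) and the permuted context is (batch, seq, heads, head_dim).

import torch

# Toy shapes only; b/h/s/d stand for batch, heads, sequence length, head dim.
b, h, s, d = 2, 12, 128, 64
q, v = torch.randn(b, h, s, d), torch.randn(b, h, s, d)
k = torch.randn(b, h, d, s)
attention_scores = torch.matmul(q, k) / d ** 0.5                    # (b, h, s, s)
attention_probs = attention_scores.softmax(dim=-1)                  # (b, h, s, s)
attention = torch.matmul(attention_probs, v).permute(0, 2, 1, 3)    # (b, s, h, d)
print(attention_probs.shape, attention.shape)
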
@@ -182,13 +185,16 @@ class BertUnpadSelfAttention(nn.Module):
                 bias_dtype = bias.dtype
                 bias = bias.to(torch.float16)
                 attention = flash_attn_qkvpacked_func(qkv, bias)
+                print(f'BUSA Triton: attention 0 shape: {attention.shape}')
                 attention = attention.to(orig_dtype)
+                print(f'BUSA Triton: attention 1 shape: {attention.shape}')
                 bias = bias.to(bias_dtype)
             else:
                 attention = flash_attn_qkvpacked_func(qkv, bias)
-
+        print(f'BUSA Triton: attention 2 shape: {attention.shape}')
         # attn_mask is 1 for attend and 0 for don't
         attention = unpad_input_only(attention, torch.squeeze(attn_mask) == 1)
+        print(f'BUSA unpadded final attention shape: {attention.shape}')
         return rearrange(attention, 'nnz h d -> nnz (h d)')
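
Note on the hunk above: unpad_input_only keeps only the token positions whose attn_mask entry is 1 and flattens the batch into nnz rows, which is why the final rearrange works on 'nnz h d'. A rough standalone illustration of that idea (not the repo's implementation):

import torch

def unpad_by_mask(x, attn_mask):
    # keep positions where the mask is 1, flattening (batch, seq, ...) into (nnz, ...)
    keep = attn_mask.bool().reshape(-1)
    return x.reshape(-1, *x.shape[2:])[keep]

attn_mask = torch.tensor([[1, 1, 0], [1, 0, 0]])
hidden = torch.randn(2, 3, 4)
print(unpad_by_mask(hidden, attn_mask).shape)   # torch.Size([3, 4]) -> nnz = 3
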
 
@@ -329,7 +335,9 @@ class BertLayer(nn.Module):
         """
         attention_output = self.attention(hidden_states, cu_seqlens, seqlen,
                                           subset_idx, indices, attn_mask, bias)
+        print(f'BertLayer attention_output shape: {attention_output.shape}')
         layer_output = self.mlp(attention_output)
+        print(f'BertLayer layer_output shape: {layer_output.shape}')
         return layer_output, attention_output  # JAANDOUI: this only returns layer_output in the original work.
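
Note on the hunk above: since BertLayer now returns a (layer_output, attention_output) pair instead of a single tensor, every caller has to unpack it, and a loop over layers has to collect the second element separately. A toy sketch of that calling pattern (stand-in modules, not the repo's encoder):

import torch
from torch import nn

class ToyLayer(nn.Module):
    # stand-in for a layer that returns (layer_output, attention_output)
    def forward(self, hidden_states):
        attention_output = hidden_states            # placeholder for self.attention(...)
        layer_output = attention_output * 2.0       # placeholder for self.mlp(...)
        return layer_output, attention_output

layers = nn.ModuleList([ToyLayer() for _ in range(3)])
hidden_states = torch.randn(2, 4, 8)
collected = []
for layer in layers:
    hidden_states, attn_out = layer(hidden_states)  # unpack the new 2-tuple
    collected.append(attn_out)
print(len(collected), hidden_states.shape)
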
 
@@ -350,7 +358,7 @@ class BertEncoder(nn.Module):
             [copy.deepcopy(layer) for _ in range(config.num_hidden_layers)])

         self.num_attention_heads = config.num_attention_heads
-
+        print(f'nbr of attention heads: {self.num_attention_heads}')
         # The alibi mask will be dynamically expanded if it is too small for
         # the input the model receives. But it generally helps to initialize it
         # to a reasonably large size to help pre-allocate CUDA memory.
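
Note on the hunk above: the comment refers to the ALiBi bias used instead of position embeddings: one negative slope per head scaled by the token distance, pre-built once at a generous length and sliced or re-expanded for the inputs actually seen. A rough sketch of that idea (not the repo's exact routine; the slope formula assumes a power-of-two head count):

import torch

def alibi_bias(num_heads, max_len):
    # one negative slope per head, scaled by the relative distance |i - j|
    slopes = torch.tensor([2.0 ** (-8.0 * (h + 1) / num_heads) for h in range(num_heads)])
    pos = torch.arange(max_len)
    distance = (pos[None, :] - pos[:, None]).abs()
    return -slopes[:, None, None] * distance[None, :, :]     # (heads, max_len, max_len)

bias = alibi_bias(num_heads=12, max_len=512)
print(bias.shape, bias[:, :128, :128].shape)                  # pre-allocate large, slice when shorter
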
@@ -937,6 +945,7 @@ class BertForSequenceClassification(BertPreTrainedModel):

         if not return_dict:
             # JAANDOUI TODO maybe.
+            print(f'return_dict is {return_dict}')
             output = (logits,) + outputs[2:]
             return ((loss,) + output) if loss is not None else output
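
Note on the hunk above: with the encoder in this fork also handing back the per-layer attention tensors (see the next hunk's comment), outputs[2:] in this tuple path would carry them too, so the non-return_dict result looks like (loss, logits, attentions). A toy check of the tuple-building pattern (stand-in tensors only):

import torch

all_attention_weights = tuple(torch.randn(2, 12, 128, 128) for _ in range(6))
outputs = (torch.randn(2, 128, 768), torch.randn(2, 768), all_attention_weights)  # stand-in model outputs
logits, loss = torch.randn(2, 3), torch.tensor(0.41)

output = (logits,) + outputs[2:]
result = ((loss,) + output) if loss is not None else output
print(len(result), len(result[2]))   # 3 6 -> (loss, logits, per-layer attentions)
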
 
@@ -947,6 +956,7 @@ class BertForSequenceClassification(BertPreTrainedModel):
             logits=logits,
             hidden_states=outputs[0],
             # JAANDOUI: returning all_attention_weights here
-            attentions=torch.stack(outputs[2], dim=0),
+            # attentions=torch.stack(outputs[2], dim=0),
+            attentions=torch.stack(outputs[2], dim=0),  # JAANDOUI TODO: should I stack here ????
         )
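
Note on the stacking TODO above: Hugging Face model outputs conventionally keep attentions as a tuple with one (batch, heads, seq, seq) tensor per layer, whereas torch.stack(..., dim=0) folds them into a single (layers, batch, heads, seq, seq) tensor. Both are usable downstream, they just index differently, and stacking additionally requires every layer's tensor to have the same shape. A small sketch of the two access patterns (toy tensors):

import torch

per_layer = tuple(torch.randn(2, 12, 128, 128) for _ in range(6))   # tuple convention: one tensor per layer
stacked = torch.stack(per_layer, dim=0)                             # stacked: (layers, batch, heads, seq, seq)

print(per_layer[3].shape)    # layer 3 via tuple indexing
print(stacked[3].shape)      # the same tensor via the extra leading dimension
print(stacked.shape)
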
 
 