Update bert_layers.py
bert_layers.py (+38 -18)
@@ -1,3 +1,5 @@
+# search for JAANDOUI for the parts I have modified, and for JAANDOUI TODO for the parts that might need to be changed.
+
 # Copyright 2022 MosaicML Examples authors
 # SPDX-License-Identifier: Apache-2.0
 
@@ -328,7 +330,7 @@ class BertLayer(nn.Module):
         attention_output = self.attention(hidden_states, cu_seqlens, seqlen,
                                           subset_idx, indices, attn_mask, bias)
         layer_output = self.mlp(attention_output)
-        return layer_output, attention_output
+        return layer_output, attention_output # JAANDOUI: this only returns layer_output in the original work.


 class BertEncoder(nn.Module):
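Because `BertLayer.forward` now returns a `(layer_output, attention_output)` tuple rather than a single tensor, any call site that still binds one name gets a tuple where it expects a tensor and fails one layer later. A self-contained toy illustrating the changed contract (the `TinyLayer` module below is a stand-in invented for this note, not the real `BertLayer`):

import torch
import torch.nn as nn

class TinyLayer(nn.Module):
    """Stand-in for the modified BertLayer: forward returns two values."""
    def __init__(self, d: int = 8):
        super().__init__()
        self.proj = nn.Linear(d, d)

    def forward(self, x: torch.Tensor):
        attn = torch.softmax(x @ x.transpose(-2, -1), dim=-1)  # toy attention map
        return self.proj(attn @ x), attn  # (layer_output, attention_output)

x = torch.randn(2, 4, 8)
result = TinyLayer()(x)
assert isinstance(result, tuple)   # binding one name yields a tuple,
hidden, attn = result              # so callers must unpack two values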
@@ -343,7 +345,7 @@ class BertEncoder(nn.Module):

     def __init__(self, config):
         super().__init__()
-        layer = BertLayer(config)
+        layer = BertLayer(config) # JAANDOUI: In this line we define the BertLayer, note that now the forward of this class returns attention too!! 2 values instead of 1
         self.layer = nn.ModuleList(
             [copy.deepcopy(layer) for _ in range(config.num_hidden_layers)])

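The `copy.deepcopy` idiom above stamps out `config.num_hidden_layers` independent copies of one template layer: each copy owns its own parameter tensors, so nothing is weight-tied across layers. A quick self-contained check of that property (using `nn.Linear` as a toy template, not the real `BertLayer`):

import copy
import torch.nn as nn

template = nn.Linear(8, 8)  # stands in for layer = BertLayer(config)
layers = nn.ModuleList([copy.deepcopy(template) for _ in range(3)])

# Each deepcopy owns distinct parameter storage, so training one layer
# never updates another (or the template).
assert layers[0].weight.data_ptr() != layers[1].weight.data_ptr()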
@@ -446,6 +448,7 @@

         if subset_mask is None:
             for layer_module in self.layer:
+                # JAANDOUI: Since we get now attention too, we need to unpack 2 elements instead of 1.
                 hidden_states, attention_weights = layer_module(hidden_states,
                                                                 cu_seqlens,
                                                                 seqlen,
@@ -453,8 +456,9 @@
                                                                 indices,
                                                                 attn_mask=attention_mask,
                                                                 bias=alibi_attn_mask)
-                print(f'Inner Attention: {attention_weights}')
-                print(f'Inner Attention shape: {attention_weights.shape}')
+                # JAANDOUI
+                # print(f'Inner Attention: {attention_weights}')
+                # print(f'Inner Attention shape: {attention_weights.shape}')
                 all_attention_weights.append(attention_weights) # Store attention weights
                 if output_all_encoded_layers:
                     all_encoder_layers.append(hidden_states)
@@ -467,6 +471,7 @@
         else:
             for i in range(len(self.layer) - 1):
                 layer_module = self.layer[i]
+                # JAANDOUI: Since we get now attention too, we need to unpack 2 elements instead of 1.
                 hidden_states, attention_weights = layer_module(hidden_states,
                                                                 cu_seqlens,
                                                                 seqlen,
@@ -474,11 +479,12 @@
                                                                 indices,
                                                                 attn_mask=attention_mask,
                                                                 bias=alibi_attn_mask)
-                all_attention_weights.append(attention_weights) # Store attention weights
+                all_attention_weights.append(attention_weights) # JAANDOUI: Store attention weights
                 if output_all_encoded_layers:
                     all_encoder_layers.append(hidden_states)
             subset_idx = torch.nonzero(subset_mask[attention_mask_bool],
                                        as_tuple=False).flatten()
+            # JAANDOUI: Since we get now attention too, we need to unpack 2 elements instead of 1.
             hidden_states, attention_weights = self.layer[-1](hidden_states,
                                                               cu_seqlens,
                                                               seqlen,
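The `subset_idx` computation above packs the tokens kept by `attention_mask_bool` into a flat view and then selects which packed positions the final layer should compute. A small worked example of the `torch.nonzero(...).flatten()` idiom with toy masks (the values are illustrative, not from the model):

import torch

attention_mask_bool = torch.tensor([[True, True, True, False],
                                    [True, True, False, False]])
subset_mask = torch.tensor([[False, True, False, False],
                            [True, False, False, False]])

# Boolean indexing packs the surviving tokens into one flat dimension;
# nonzero then yields the packed positions to keep for the last layer.
packed = subset_mask[attention_mask_bool]   # tensor([False, True, False, True, False])
subset_idx = torch.nonzero(packed, as_tuple=False).flatten()
print(subset_idx)                           # tensor([1, 3])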
@@ -486,14 +492,16 @@
                                                               indices=indices,
                                                               attn_mask=attention_mask,
                                                               bias=alibi_attn_mask)
-            all_attention_weights.append(attention_weights) #
-            print(f'here is the matrix of attentions inside encoder: \n {all_attention_weights}')
-            print(f'and this is the shape inside encoder: \n {all_attention_weights.shape}')
+            all_attention_weights.append(attention_weights) # JAANDOUI: appending the attention of different layers together.
+            # print(f'here is the matrix of attentions inside encoder: \n {all_attention_weights}')
+            # print(f'and this is the shape inside encoder: \n {all_attention_weights.shape}')

         if not output_all_encoded_layers:
             all_encoder_layers.append(hidden_states)
-
-        return all_encoder_layers, all_attention_weights
+
+        # JAANDOUI: Since we now return both, we need to handle them wherever BertEncoder forward is called.
+        return all_encoder_layers, all_attention_weights # Return both hidden states and attention weights
+        # return all_encoder_layers # JAANDOUI: original return.


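With `BertEncoder.forward` returning one attention tensor per layer, a natural consumer step is to stack the list into a single `(num_layers, ...)` tensor for analysis. A hedged sketch: it assumes every `attention_weights` entry is a dense tensor of identical shape, which depends on what this fork's attention module actually materializes (a FlashAttention-style kernel, for instance, does not normally expose the full probability matrix):

import torch

def stack_attentions(all_attention_weights):
    """Stack per-layer attention tensors into one (num_layers, ...) tensor.

    Raises if the per-layer shapes disagree (e.g. when the last layer ran
    on a subset of tokens), instead of letting a ragged stack fail cryptically.
    """
    shapes = {tuple(a.shape) for a in all_attention_weights}
    if len(shapes) != 1:
        raise ValueError(f'per-layer attention shapes differ: {shapes}')
    return torch.stack(all_attention_weights, dim=0)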
@@ -617,7 +625,9 @@ class BertModel(BertPreTrainedModel):
             first_col_mask[:, 0] = True
             subset_mask = masked_tokens_mask | first_col_mask

-        encoder_outputs, all_attention_weights = self.encoder(
+        # JAANDOUI: first part where we call self.encoder (which is the instance of BertEncoder defined here)
+        # JAANDOUI: need to return the attention weights here too.
+        encoder_outputs, all_attention_weights = self.encoder(
             embedding_output,
             attention_mask,
             output_all_encoded_layers=output_all_encoded_layers,
@@ -645,11 +655,13 @@ class BertModel(BertPreTrainedModel):
         if not output_all_encoded_layers:
             encoder_outputs = sequence_output

+        # JAANDOUI: returning all_attention_weights too
         if self.pooler is not None:
-            return encoder_outputs, pooled_output
+            return encoder_outputs, pooled_output, all_attention_weights

-        return encoder_outputs, None
-
+        # JAANDOUI: returning all_attention_weights too
+        return encoder_outputs, None, all_attention_weights
+        # JAANDOUI: need to handle the returned elements wherever BertModel is instantiated.

 ###################
 # Bert Heads
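Every caller of this `BertModel` now receives a 3-tuple, with `None` in the middle slot when the pooler is absent. A self-contained stub with the same return contract, to show the unpacking a caller needs (the stub and its shapes are invented for illustration):

from typing import List, Optional, Tuple
import torch

def fake_bert_model(input_ids: torch.Tensor, pooled: bool = True
                    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], List[torch.Tensor]]:
    """Toy stand-in mirroring (encoder_outputs, pooled_output, all_attention_weights)."""
    hidden = torch.randn(*input_ids.shape, 8)
    attns = [torch.rand(input_ids.shape[0], 2, 4, 4) for _ in range(3)]
    return hidden, (hidden[:, 0] if pooled else None), attns

sequence_output, pooled_output, all_attention_weights = fake_bert_model(
    torch.zeros(2, 4, dtype=torch.long))
if pooled_output is not None:       # None when add_pooling_layer=False
    features = pooled_output
print(len(all_attention_weights))   # one entry per encoder layer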
@@ -705,6 +717,8 @@ class BertForMaskedLM(BertPreTrainedModel):
                 'If you want to use `BertForMaskedLM` make sure `config.is_decoder=False` for '
                 'bi-directional self-attention.')

+        # JAANDOUI: this part is only for the pretraining, I don't think it is called if we finetune
+        # there handle the returned elements (we now get 3 elements) of BertModel if pretraining
         self.bert = BertModel(config, add_pooling_layer=False)
         self.cls = BertOnlyMLMHead(config,
                                    self.bert.embeddings.word_embeddings.weight)
@@ -754,6 +768,7 @@

         return_dict = return_dict if return_dict is not None else self.config.use_return_dict

+        # JAANDOUI: for the pretraining: return handled here.
         outputs = self.bert(
             input_ids,
             attention_mask=attention_mask,
@@ -789,7 +804,7 @@
                                              b=batch)

         if not return_dict:
-            output = (prediction_scores,) + outputs[2:]
+            output = (prediction_scores,) + outputs[2:] # JAANDOUI TODO: might need to handle this part and everywhere where we get outputs (now outputs has 3 elements not 2)
             return ((loss,) + output) if loss is not None else output

         return MaskedLMOutput(
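The TODO above is pure index bookkeeping: once `self.bert` returns three elements, a slice like `outputs[2:]` stops being empty and starts carrying the attention list. A self-contained illustration with plain tuples (no model code):

# Before the change: outputs == (sequence_output, pooled_output)
old_outputs = ('seq', 'pooled')
print(old_outputs[2:])    # () -- the slice used to be empty

# After the change: a third element holds the per-layer attention list.
new_outputs = ('seq', 'pooled', ['attn_layer_0', 'attn_layer_1'])
print(new_outputs[2:])    # (['attn_layer_0', 'attn_layer_1'],)

# Code that assumed outputs[2:] contributed nothing now silently forwards
# the attention list, so every tuple consumer needs auditing.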
@@ -823,7 +838,7 @@
         return {'input_ids': input_ids, 'attention_mask': attention_mask}


-
+# JAANDOUI: this model is the one used for finetuning.
 class BertForSequenceClassification(BertPreTrainedModel):
     """Bert Model transformer with a sequence classification/regression head.

@@ -869,7 +884,7 @@

         return_dict = return_dict if return_dict is not None else self.config.use_return_dict

-        outputs = self.bert(
+        outputs, _, all_attention_weights = self.bert(
             input_ids,
             attention_mask=attention_mask,
             token_type_ids=token_type_ids,
@@ -882,6 +897,9 @@
         )

         pooled_output = outputs[1]
+
+        # JAANDOUI:
+        all_attention_weights = outputs[2]

         pooled_output = self.dropout(pooled_output)
         logits = self.classifier(pooled_output)
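Note a tension this hunk leaves open: the forward unpacked `outputs, _, all_attention_weights = self.bert(...)` a few lines earlier, yet still reads `outputs[1]` and `outputs[2]`. If `self.bert` returns the `(encoder_outputs, pooled_output, all_attention_weights)` triple shown above, `outputs` here is only the first element, so these positional reads index into that tensor instead. A self-contained demonstration of the hazard (toy tensors standing in for the triple):

import torch

# Toy triple mirroring (encoder_outputs, pooled_output, all_attention_weights).
bert_outputs = (torch.randn(2, 4, 8), torch.randn(2, 8), [torch.rand(2, 2, 4, 4)])

# Mixing styles is the trap: after unpacking, `outputs` is only the first
# tensor, so outputs[1] selects a row of it, not the pooled output.
outputs, _, attns = bert_outputs
row = outputs[1]            # shape (4, 8) -- NOT the (2, 8) pooled output

# Consistent alternative: bind each element to its own name exactly once.
sequence_output, pooled_output, all_attention_weights = bert_outputs
assert pooled_output.shape == (2, 8)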
@@ -913,6 +931,7 @@
                 loss = loss_fct(logits, labels)

         if not return_dict:
+            # JAANDOUI TODO maybe.
             output = (logits,) + outputs[2:]
             return ((loss,) + output) if loss is not None else output

@@ -923,6 +942,7 @@
             loss=loss,
             logits=logits,
             hidden_states=outputs[0],
-
+            #JAANDOUI: returning all_attention_weights here
+            attentions=outputs[2],
         )

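After this hunk, the `return_dict` path exposes the attention list on the output object's `attentions` field. Assuming the file keeps returning Hugging Face's `SequenceClassifierOutput` here, as the upstream MosaicML version does, downstream code can read it by name; a minimal sketch with placeholder tensors:

import torch
from transformers.modeling_outputs import SequenceClassifierOutput

# Placeholder tensors standing in for what the modified forward returns.
out = SequenceClassifierOutput(
    loss=None,
    logits=torch.randn(2, 3),
    hidden_states=torch.randn(2, 4, 8),   # outputs[0] in the hunk above
    attentions=[torch.rand(2, 2, 4, 4)],  # outputs[2] in the hunk above
)

# Downstream code reads the per-layer attention list by name.
# (Hugging Face convention expects tuples of tensors in these two fields;
# the diff passes raw values, which the dataclass does not validate.)
print(out.logits.shape, len(out.attentions))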