ccdv committed
Commit a68bf66
1 parent: e2bad81

small fix with torch.finfo
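
The change below splits the additive-mask computation into two statements and reuses torch.finfo(dtype).min as the attention-mask padding value. As a minimal sketch of that general masking pattern (illustrative shapes and values, not code from this repository):

    import torch

    # Sketch only: turn a {0, 1} padding mask into an additive attention mask
    # filled with the most negative value representable in the mask's dtype.
    mask = torch.tensor([[1., 1., 1., 0.]])        # 1 = keep, 0 = padding

    additive_mask = (1. - mask.clamp(0, 1))
    additive_mask *= torch.finfo(additive_mask.dtype).min

    scores = torch.zeros(1, 4)                     # placeholder attention scores
    probs = torch.softmax(scores + additive_mask, dim=-1)   # padded position gets ~0 weight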

Files changed (1): modeling_lsg_bert.py (+88, -193)
modeling_lsg_bert.py CHANGED
@@ -53,10 +53,11 @@ class LSGBertConfig(BertConfig):
         self.sparse_block_size = sparse_block_size
         self.sparsity_factor = sparsity_factor
         self.sparsity_type = sparsity_type
-
+
         if sparsity_type not in [None, "none", "norm", "lsh", "pooling", "stride", "block_stride"]:
             logger.warning(
-                "[WARNING CONFIG]: sparsity_mode not in [None, 'none', 'norm', 'lsh', 'pooling', 'stride', 'block_stride'], setting sparsity_type=None, computation will skip sparse attention")
+                "[WARNING CONFIG]: sparsity_mode not in [None, 'none', 'norm', 'lsh', 'pooling', 'stride', 'block_stride'], \
+                setting sparsity_type=None, computation will skip sparse attention")
             self.sparsity_type = None

         if self.sparsity_type in ["stride", "block_stride"]:
@@ -64,7 +65,7 @@ class LSGBertConfig(BertConfig):
                 logger.warning(
                     "[WARNING CONFIG]: sparsity_factor > encoder_attention_heads is not recommended for stride/block_stride sparsity"
                 )
-
+
         if self.num_global_tokens < 1:
             logger.warning(
                 "[WARNING CONFIG]: num_global_tokens < 1 is not compatible, setting num_global_tokens=1"
@@ -72,13 +73,23 @@ class LSGBertConfig(BertConfig):
             self.num_global_tokens = 1
         elif self.num_global_tokens > 512:
             logger.warning(
-                "[WARNING CONFIG]: num_global_tokens > 512 is not compatible, setting num_global_tokens=512"
+                "[WARNING CONFIG]: num_global_tokens > 512 is not allowed, setting num_global_tokens=512"
             )
             self.num_global_tokens = 512

         if self.sparsity_factor > 0:
             assert self.block_size % self.sparsity_factor == 0, "[ERROR CONFIG]: block_size must be divisible by sparsity_factor"
             assert self.block_size//self.sparsity_factor >= 1, "[ERROR CONFIG]: make sure block_size >= sparsity_factor"
+
+        if self.mask_first_token and not pool_with_global:
+            logger.warning(
+                "[WARNING CONFIG]: pool_with_global==False is not compatible with mask_first_token==True. Setting pool_with_global to True.")
+            self.pool_with_global = True
+
+        if hasattr(self, "position_embedding_type"):
+            if self.position_embedding_type != "absolute":
+                logger.warning(
+                    "[WARNING CONFIG]: LSG Attention is not compatible with relative positional embedding and will skip its computation. Set position_embedding_type='absolute' to remove this warning.")


 class BaseSelfAttention(nn.Module):
@@ -188,7 +199,7 @@ class CausalAttentionProduct(nn.Module):
                 diagonal=-1
             )
             causal_mask = causal_mask.T * torch.finfo(attention_scores.dtype).min
-            attention_scores[..., -causal_shape[0]:, -causal_shape[1]:] = causal_mask
+            attention_scores[..., -causal_shape[0]:, -causal_shape[1] + 1:] = causal_mask[:, 1:]

             del attention_mask

@@ -529,7 +540,8 @@ class LSGSelfAttention(BaseSelfAttention):
         keys = keys.sum(dim=-2) / (mask + 1e-6)
         values = values.sum(dim=-2) / (mask + 1e-6)

-        mask = (1. - mask.clamp(0, 1)) * torch.finfo(mask.dtype).min
+        mask = (1. - mask.clamp(0, 1))
+        mask *= torch.finfo(mask.dtype).min
         return keys.reshape(n, h, -1, d), values.reshape(n, h, -1, d), mask.expand(-1, h, -1, -1).transpose(-1, -2)

     def get_sparse_tokens_with_stride(self, keys, values, mask):
@@ -594,7 +606,8 @@ class LSGSelfAttention(BaseSelfAttention):
         keys /= mask + 1e-8
         values /= mask + 1e-8

-        mask = (1. - mask.clamp(0, 1)) * torch.finfo(mask.dtype).min
+        mask = (1. - mask.clamp(0, 1))
+        mask *= torch.finfo(mask.dtype).min

         return keys.reshape(n, h, -1, d), values.reshape(n, h, -1, d), mask.transpose(-1, -2).reshape(n, h, 1, -1)

@@ -695,8 +708,6 @@ class LSGSelfAttention(BaseSelfAttention):
             output_attentions=output_attentions
         )

-        #if head_mask is not None:
-        #    outputs = (outputs[0] * head_mask[:, :, :1, :1], ) + outputs[1:]
         return outputs

     def causal_forward(
@@ -862,12 +873,6 @@ class LSGSelfAttention(BaseSelfAttention):
         return x.reshape(n, h, -1, chunk_size, d)


-class LSGBertSelfOutput(BertSelfOutput):
-
-    def __init__(self, config):
-        super().__init__(config)
-
-
 class LSGAttention(BertAttention):

     def __init__(self, config):
@@ -875,107 +880,97 @@ class LSGAttention(BertAttention):
         nn.Module.__init__(self)

         self.self = LSGSelfAttention(config)
-        self.output = LSGBertSelfOutput(config)
+        self.output = BertSelfOutput(config)
         self.pruned_heads = set()


-class LSGBertIntermediate(BertIntermediate):
-
-    def __init__(self, config):
-        super().__init__(config)
-
-
-class LSGBertOutput(BertOutput):
-
-    def __init__(self, config):
-        super().__init__(config)
-
-
 class LSGBertLayer(BertLayer):

     def __init__(self, config):

-        nn.Module.__init__(self)
+        super().__init__(config)

-        self.chunk_size_feed_forward = config.chunk_size_feed_forward
-        self.seq_len_dim = 1
         self.attention = LSGAttention(config)
-        self.is_decoder = config.is_decoder
-        self.add_cross_attention = config.add_cross_attention
         if self.add_cross_attention:
             if not self.is_decoder:
                 assert self.is_decoder, f"{self} should be used as a decoder model if cross attention is added"
             self.crossattention = LSGAttention(config)
-        self.intermediate = LSGBertIntermediate(config)
-        self.output = LSGBertOutput(config)


 class LSGBertEncoder(BertEncoder):

     def __init__(self, config):

-        nn.Module.__init__(self)
-
-        self.config = config
-        self.layer = nn.ModuleList([LSGBertLayer(config) for _ in range(config.num_hidden_layers)])
-        self.gradient_checkpointing = False
-
-
-class LSGBertPooler(BertPooler):
-
-    def __init__(self, config):
-        super().__init__(config)
-
-
-class LSGBertPredictionHeadTransform(BertPredictionHeadTransform):
-
-    def __init__(self, config):
         super().__init__(config)

+        self.layer = nn.ModuleList([LSGBertLayer(config) for _ in range(config.num_hidden_layers)])

-class LSGBertLMPredictionHead(BertLMPredictionHead):
+        assert hasattr(config, "num_global_tokens")
+        self.num_global_tokens = config.num_global_tokens
+        self.pad_idx = config.pad_token_id

-    def __init__(self, config):
+        assert hasattr(config, "block_size") and hasattr(config, "adaptive")
+        self.block_size = config.block_size
+        self.adaptive = config.adaptive
+        self.mask_first_token = config.mask_first_token
+        self.pool_with_global = config.pool_with_global

-        nn.Module.__init__(self)
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        encoder_attention_mask: Optional[torch.FloatTensor] = None,
+        past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = False,
+        output_hidden_states: Optional[bool] = False,
+        return_dict: Optional[bool] = True,
+    ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]:
+
+        mask_value = torch.finfo(attention_mask.dtype).min
+        n, _, __, t = attention_mask.size()

-        self.transform = LSGBertPredictionHeadTransform(config)
-
-        # The output weights are the same as the input embeddings, but there is
-        # an output-only bias for each token.
-        self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
-
-        self.bias = nn.Parameter(torch.zeros(config.vocab_size))
-
-        # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
-        self.decoder.bias = self.bias
+        if not (self.config.is_decoder and encoder_hidden_states is not None):

+            b = self.block_size * 2
+            pad = t % self.block_size
+
+            # Check if t is multiple of block_size and pad
+            if self.adaptive and t > b and pad > 0:
+                pad_length = self.block_size - pad
+                hidden_states = torch.nn.functional.pad(hidden_states.transpose(-1, -2), (0, pad_length), value=0.).transpose(-1, -2)
+                attention_mask = torch.nn.functional.pad(attention_mask, (0, pad_length), value=mask_value)

-class LSGBertOnlyMLMHead(BertOnlyMLMHead):
-    """LSG Head for masked language modeling."""
+            if self.mask_first_token:
+                attention_mask[..., 0] = mask_value

-    def __init__(self, config):
+        encoder_outputs = super().forward(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            head_mask=head_mask,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_attention_mask,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict
+        )

-        nn.Module.__init__(self)
-
-        self.predictions = LSGBertLMPredictionHead(config)
-
-
-class LSGBertOnlyNSPHead(BertOnlyNSPHead):
-
-    def __init__(self, config):
-        super().__init__(config)
-
+        sequence_output = encoder_outputs[0]
+        if self.pool_with_global:
+            sequence_output[:, self.num_global_tokens] = sequence_output[:, 0]

-class LSGBertPreTrainingHeads(BertPreTrainingHeads):
+        # Adapt sequence to initial shape
+        sequence_output = sequence_output[..., self.num_global_tokens: t + self.num_global_tokens, :]

-    def __init__(self, config):
+        if not return_dict:
+            return (sequence_output, ) + encoder_outputs[1:]

-        nn.Module.__init__(self)
-
-        self.predictions = BertLMPredictionHead(config)
-        self.seq_relationship = nn.Linear(config.hidden_size, 2)
-
+        encoder_outputs.last_hidden_state = sequence_output
+        return encoder_outputs

 class LSGBertPreTrainedModel(BertPreTrainedModel):
     """
@@ -1003,19 +998,10 @@ class LSGBertModel(LSGBertPreTrainedModel, BertModel):
         LSGBertPreTrainedModel.__init__(self, config)

         self.config = config
-        assert hasattr(config, "num_global_tokens")
-        self.num_global_tokens = config.num_global_tokens
-        self.pad_idx = config.pad_token_id
-
-        assert hasattr(config, "block_size") and hasattr(config, "adaptive")
-        self.block_size = config.block_size
-        self.adaptive = config.adaptive
-        self.mask_first_token = config.mask_first_token
-        self.pool_with_global = config.pool_with_global

         self.embeddings = LSGBertEmbeddings(config)
         self.encoder = LSGBertEncoder(config)
-        self.pooler = LSGBertPooler(config) if add_pooling_layer else None
+        self.pooler = BertPooler(config) if add_pooling_layer else None

         if config.add_cross_attention:
             logger.warning(
@@ -1025,97 +1011,6 @@ class LSGBertModel(LSGBertPreTrainedModel, BertModel):
         # Initialize weights and apply final processing
         self.post_init()

-    def forward(
-        self,
-        input_ids=None,
-        attention_mask=None,
-        token_type_ids=None,
-        position_ids=None,
-        head_mask=None,
-        inputs_embeds=None,
-        encoder_hidden_states=None,
-        encoder_attention_mask=None,
-        past_key_values=None,
-        use_cache=None,
-        output_attentions=None,
-        output_hidden_states=None,
-        return_dict=None
-    ):
-
-        inputs_ = input_ids if input_ids is not None else inputs_embeds
-        n, t = inputs_.size()[:2]
-
-        if attention_mask is None:
-            attention_mask = torch.ones(n, t, device=inputs_.device, dtype=inputs_.dtype)
-        if self.mask_first_token:
-            attention_mask[:,0] = 0
-        if token_type_ids is None:
-            token_type_ids = torch.zeros(n, t, device=inputs_.device).long()
-
-        b = self.block_size * 2
-        pad = t % self.block_size
-
-        # Check if t is multiple of block_size and pad
-        if self.adaptive and t > b and pad > 0:
-            pad_length = self.block_size - pad
-            if input_ids is not None:
-                input_ids = torch.nn.functional.pad(input_ids, (0, pad_length), value=self.pad_idx)
-            else:
-                inputs_embeds = torch.nn.functional.pad(inputs_embeds.transpose(-1, -2), (0, pad_length), value=0.).transpose(-1, -2)
-
-            attention_mask = torch.nn.functional.pad(attention_mask, (0, pad_length), value=0)
-            token_type_ids = torch.nn.functional.pad(token_type_ids, (0, pad_length), value=0)
-
-            if position_ids is not None:
-                position_ids = torch.nn.functional.pad(position_ids, (0, pad_length), value=0)
-
-        n, t_ = attention_mask.size()
-
-        encoder_outputs = super().forward(
-            input_ids=input_ids,
-            attention_mask=attention_mask,
-            token_type_ids=token_type_ids,
-            position_ids=position_ids,
-            head_mask=head_mask,
-            inputs_embeds=inputs_embeds,
-            encoder_hidden_states=encoder_hidden_states,
-            encoder_attention_mask=encoder_attention_mask,
-            past_key_values=past_key_values,
-            use_cache=use_cache,
-            output_attentions=output_attentions,
-            output_hidden_states=output_hidden_states,
-            return_dict=return_dict
-        )
-
-        context = encoder_outputs[0]
-        if self.pool_with_global:
-            context[:, self.num_global_tokens] = context[:, 0]
-
-        diff = t - t_
-        n, _, d = context.size()
-        context = context[..., self.num_global_tokens:, :]
-
-        # Adapt sequence to initial shape
-        if diff < 0:
-            context = context[:, :t]
-
-        encoder_outputs.last_hidden_state = context
-
-        sequence_output = encoder_outputs[0]
-        pooled_output = self.pooler(sequence_output) if self.pooler is not None else None
-
-        if not return_dict:
-            return (sequence_output, pooled_output) + encoder_outputs[1:]
-
-        return BaseModelOutputWithPoolingAndCrossAttentions(
-            last_hidden_state=sequence_output,
-            pooler_output=pooled_output,
-            past_key_values=encoder_outputs.past_key_values,
-            hidden_states=encoder_outputs.hidden_states,
-            attentions=encoder_outputs.attentions,
-            cross_attentions=encoder_outputs.cross_attentions,
-        )
-
     def get_extended_attention_mask(self, attention_mask, input_shape, device=None):

         # Do not rely on original triangular mask from BERT/RoBERTa for causalLM
@@ -1134,33 +1029,33 @@ class LSGBertModel(LSGBertPreTrainedModel, BertModel):
         return extended_attention_mask


-class LSGBertForPreTraining(LSGBertPreTrainedModel):
+class LSGBertForPreTraining(LSGBertPreTrainedModel, BertForPreTraining):

     def __init__(self, config):

-        super().__init__(config)
+        LSGBertPreTrainedModel.__init__(self, config)

         self.bert = LSGBertModel(config)
-        self.cls = LSGBertPreTrainingHeads(config)
+        self.cls = BertPreTrainingHeads(config)

         # Initialize weights and apply final processing
         self.post_init()


-class LSGBertLMHeadModel(BertLMHeadModel):
+class LSGBertLMHeadModel(LSGBertPreTrainedModel, BertLMHeadModel):

     _keys_to_ignore_on_load_unexpected = [r"pooler"]
     _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"]

     def __init__(self, config):

-        BertPreTrainedModel.__init__(self, config)
+        LSGBertPreTrainedModel.__init__(self, config)

         if not config.is_decoder:
             logger.warning("If you want to use `BertLMHeadModel` as a standalone, add `is_decoder=True.`")

         self.bert = LSGBertModel(config, add_pooling_layer=False)
-        self.cls = LSGBertOnlyMLMHead(config)
+        self.cls = BertOnlyMLMHead(config)

         # Initialize weights and apply final processing
         self.post_init()
@@ -1187,7 +1082,7 @@ class LSGBertForMaskedLM(LSGBertPreTrainedModel, BertForMaskedLM):
             )

         self.bert = LSGBertModel(config, add_pooling_layer=False)
-        self.cls = LSGBertOnlyMLMHead(config)
+        self.cls = BertOnlyMLMHead(config)

         # Initialize weights and apply final processing
         self.post_init()
@@ -1200,7 +1095,7 @@ class LSGBertForNextSentencePrediction(LSGBertPreTrainedModel, BertForNextSenten
         LSGBertPreTrainedModel.__init__(self, config)

         self.bert = LSGBertModel(config)
-        self.cls = LSGBertOnlyNSPHead(config)
+        self.cls = BertOnlyNSPHead(config)

         # Initialize weights and apply final processing
         self.post_init()
 
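
Since modeling_lsg_bert.py ships as custom model code on the Hub, a checkpoint that bundles this file is typically loaded through the Auto classes with trust_remote_code=True. A hypothetical usage sketch (the checkpoint name is an assumption, not part of this commit):

    # Hypothetical usage sketch; "ccdv/lsg-bert-base-4096" is an assumed checkpoint name.
    from transformers import AutoModel, AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("ccdv/lsg-bert-base-4096")
    model = AutoModel.from_pretrained("ccdv/lsg-bert-base-4096", trust_remote_code=True)

    inputs = tokenizer("A long input document ...", return_tensors="pt")
    outputs = model(**inputs)                  # runs LSGBertModel from the remote file
    hidden_states = outputs.last_hidden_state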