update

Browse files

Files changed (4) hide show

README.md +21 -13
config.json +3 -3
modeling_lsg_bart.py +85 -480
pytorch_model.bin +1 -1

README.md CHANGED Viewed

@@ -23,16 +23,23 @@ should probably proofread and complete it, then remove this comment. -->
 This model is a fine-tuned version of [ccdv/lsg-bart-base-4096](https://huggingface.co/ccdv/lsg-bart-base-4096) on the scientific_papers arxiv dataset. \
 It achieves the following results on the test set:
-| Length | Sparse Type | Block Size | Sparsity | Connexions | R1    | R2    | RL    | RLsum |
-|:------ |:----------- |:---------- |:-------- | :--------- |:----- |:----- |:----- |:----- |
-| 4096   | -           | 256        | 0        | 768        | 46.29 | 18.71 | 26.77 | 41.85 |
-| 4096   | -           | 128        | 0        | 384        | 45.87 | 18.44 | 26.66 | 41.42 |
-| 4096   | Stride      | 128        | 4        | 644        | 46.07 | 18.51 | 26.61 | 41.58 |
-| 4096   | Pooling     | 128        | 4        | 644        | 46.02 | 18.52 | 26.73 | 41.55 |
-| 4096   | LSH         | 128        | 4        | 644        | 45.78 | 18.48 | 26.70 | 41.40 |
-| 4096   | Norm        | 128        | 4        | 644        | 45.76 | 18.26 | 26.36 | 41.26 |
 ## Model description
 The model relies on Local-Sparse-Global attention to handle long sequences:
@@ -61,7 +68,8 @@ The following hyperparameters were used during training:
 - total_train_batch_size: 32
 - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
 - lr_scheduler_type: linear
-- num_epochs: 5.0
 ### Generate hyperparameters
@@ -69,13 +77,13 @@ The following hyperparameters were used during generation:
 - dataset_name: scientific_papers
 - dataset_config_name: arxiv
 - eval_batch_size: 8
 - early_stopping: True
 - ignore_pad_token_for_loss: True
 - length_penalty: 2.0
 - max_length: 320
-- min_length: 64
 - num_beams: 5
-- num_samples: None
 - no_repeat_ngram_size: None
 - seed: 123

 This model is a fine-tuned version of [ccdv/lsg-bart-base-4096](https://huggingface.co/ccdv/lsg-bart-base-4096) on the scientific_papers arxiv dataset. \
 It achieves the following results on the test set:
+| Length | Sparse Type  | Block Size | Sparsity | Connexions | R1    | R2    | RL    | RLsum |
+|:------ |:------------ |:---------- |:-------- | :--------- |:----- |:----- |:----- |:----- |
+| 4096   | Local        | 256        | 0        | 768        | 46.65 | 18.91 | 26.90 | 42.18 |
+| 4096   | Local        | 128        | 0        | 384        | 46.18 | 18.57 | 26.71 | 41.69 |
+| 4096   | Pooling      | 128        | 4        | 644        | 46.27 | 18.68 | 26.87 | 41.82 |
+| 4096   | Stride       | 128        | 4        | 644        | 46.34 | 18.64 | 26.69 | 41.87 |
+| 4096   | Norm         | 128        | 4        | 644        | 45.96 | 18.46 | 26.52 | 41.51 |
+| 4096   | LSH          | 128        | 4        | 644        | 46.19 | 18.72 | 26.89 | 41.76 |
+With blocks of size 32 (lower resources):
+| Length | Sparse Type  | Block Size | Sparsity | Connexions | R1    | R2    | RL    | RLsum |
+|:------ |:------------ |:---------- |:-------- | :--------- |:----- |:----- |:----- |:----- |
+| 4096   | Pooling      | 32         | 4        | 160        | 42.75 | 16.34 | 25.20 | 38.23 |
+| 4096   | Stride       | 32         | 4        | 160        | 44.23 | 17.21 | 25.71 | 39.72 |
+| 4096   | Block Stride | 32         | 4        | 160        | 44.15 | 17.10 | 25.68 | 39.60 |
+| 4096   | Norm         | 32         | 4        | 160        | 42.02 | 15.65 | 24.56 | 37.45 |
+| 4096   | LSH          | 32         | 4        | 160        | 42.58 | 16.21 | 25.10 | 38.04 |
 ## Model description
 The model relies on Local-Sparse-Global attention to handle long sequences:
 - total_train_batch_size: 32
 - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
 - lr_scheduler_type: linear
+- lr_scheduler_warmup_ratio: 0.1
+- num_epochs: 6.0
 ### Generate hyperparameters
 - dataset_name: scientific_papers
 - dataset_config_name: arxiv
 - eval_batch_size: 8
+- eval_samples: 6440
 - early_stopping: True
 - ignore_pad_token_for_loss: True
 - length_penalty: 2.0
 - max_length: 320
+- min_length: 32
 - num_beams: 5
 - no_repeat_ngram_size: None
 - seed: 123

config.json CHANGED Viewed

@@ -1,5 +1,5 @@
 {
-  "_name_or_path": "ccdv/lsg-bart-base-4096-arxiv",
   "activation_dropout": 0.1,
   "activation_function": "gelu",
   "adaptive": true,
@@ -67,8 +67,8 @@
   "pool_with_global": true,
   "scale_embedding": false,
   "sparse_block_size": 0,
-  "sparsity_factor": 4,
-  "sparsity_type": "norm",
   "task_specific_params": {
     "summarization": {
       "length_penalty": 1.0,

 {
+  "_name_or_path": "/data/ccondevaux/lsg/text-summarization/tmp_final/arxiv/lsg_local",
   "activation_dropout": 0.1,
   "activation_function": "gelu",
   "adaptive": true,
   "pool_with_global": true,
   "scale_embedding": false,
   "sparse_block_size": 0,
+  "sparsity_factor": 2,
+  "sparsity_type": "none",
   "task_specific_params": {
     "summarization": {
       "length_penalty": 1.0,

modeling_lsg_bart.py CHANGED Viewed

@@ -41,8 +41,6 @@ class LSGBartConfig(BartConfig):
         ):
         """Constructs LSGConfig."""
         super().__init__(**kwargs)
-        assert sparsity_type in ["norm", "lsh", "pooling", "stride"], "Sparsity mode must be 'norm', 'lsh' or 'pooling'"
         self.adaptive = adaptive
         self.auto_map = AUTO_MAP
@@ -55,7 +53,33 @@ class LSGBartConfig(BartConfig):
         self.sparse_block_size = sparse_block_size
         self.sparsity_factor = sparsity_factor
         self.sparsity_type = sparsity_type
 def shift_tokens_right(input_ids, pad_token_id, decoder_start_token_id):
     """
@@ -208,8 +232,6 @@ class LSGAttentionProduct(nn.Module):
         # Shape of blocks
         self.local_shapes = (self.block_size*3, self.block_size)
         if self.sparse_block_size and self.sparsity_factor > 0:
-            assert self.block_size % self.sparsity_factor == 0, "block_size must be divisible by sparsity_factor"
-            assert self.block_size//self.sparsity_factor >= 1, "Config is wrong, make sure block_size >= sparsity_factor"
             self.sparse_shapes = (self.sparse_block_size*3, self.block_size//self.sparsity_factor)
         self.attention = BaseAttentionProduct(config)
@@ -393,21 +415,15 @@ class LSGBartEncoderAttention(BaseSelfAttention):
             }
         self.sparsity_type = config.sparsity_type
-        self.get_sparse_elements = sparse_functions[self.sparsity_type]
-        if config.sparsity_type == "stride":
-            if config.sparsity_factor > config.encoder_attention_heads:
-                logger.warning(
-                "Warning: sparsity_factor > encoder_attention_heads is not recommended for stride sparsity"
-            )
         if config.sparsity_type == "lsh":
             self.lsh_num_pre_rounds = config.lsh_num_pre_rounds
     def get_sparse_tokens_with_norm(self, keys, values, mask):
         if self.sparsity_factor == 1:
-            return keys, values, mask
         with torch.no_grad():
@@ -435,7 +451,7 @@ class LSGBartEncoderAttention(BaseSelfAttention):
     def get_sparse_tokens_with_pooling(self, keys, values, mask):
         if self.sparsity_factor == 1:
-            return keys, values, mask
         keys = self.chunk(keys, self.sparsity_factor)
         values = self.chunk(values, self.sparsity_factor)
@@ -457,13 +473,30 @@ class LSGBartEncoderAttention(BaseSelfAttention):
     def get_sparse_tokens_with_stride(self, keys, values, mask):
         if self.sparsity_factor == 1:
-            return keys, values, mask
         n, h, t, d = keys.size()
         sparse_idx = torch.arange(t // self.sparsity_factor, device=keys.device) * self.sparsity_factor
         sparse_idx = sparse_idx.reshape(1, 1, -1, 1) + (torch.arange(h, device=keys.device) % self.sparsity_factor).reshape(1, h, 1, 1)
         sparse_idx = sparse_idx.expand(n, h, -1, 1)
         keys = keys.gather(dim=-2, index=sparse_idx.expand(-1, -1, -1, d))
         values = values.gather(dim=-2, index=sparse_idx.expand(-1, -1, -1, d))
         mask = mask.expand(-1, h, -1, -1).transpose(-1, -2).gather(dim=-2, index=sparse_idx).transpose(-1, -2)
@@ -473,7 +506,7 @@ class LSGBartEncoderAttention(BaseSelfAttention):
     def get_sparse_tokens_with_lsh(self, keys, values, mask):
         if self.sparsity_factor == 1:
-            return keys, values, mask
         block_size = min(self.block_size, self.sparse_block_size)
         keys = self.chunk(keys, block_size)
@@ -490,9 +523,9 @@ class LSGBartEncoderAttention(BaseSelfAttention):
         extra_factor = 1
         for _ in range(self.lsh_num_pre_rounds):
-            keys, values, mask = self.lsg_round(keys, values, mask, t*extra_factor)
-        keys, values, mask = self.lsg_round(keys, values, mask, t//self.sparsity_factor)
         keys /= mask + 1e-8
         values /= mask + 1e-8
@@ -500,7 +533,7 @@ class LSGBartEncoderAttention(BaseSelfAttention):
         return keys.reshape(n, h, -1, d), values.reshape(n, h, -1, d), mask.transpose(-1, -2).reshape(n, h, 1, -1)
-    def lsg_round(self, keys, values, mask, output_size):
         with torch.no_grad():
@@ -1304,6 +1337,7 @@ class LSGBartDecoder(LSGBartPretrainedModel):
         self.padding_idx = config.pad_token_id
         self.max_target_positions = config.max_position_embeddings
         self.embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0
         if embed_tokens is not None:
             self.embed_tokens = embed_tokens
@@ -1346,6 +1380,15 @@ class LSGBartDecoder(LSGBartPretrainedModel):
         return combined_attention_mask
     def forward(
         self,
         input_ids=None,
@@ -1386,12 +1429,14 @@ class LSGBartDecoder(LSGBartPretrainedModel):
         if inputs_embeds is None:
             inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale
-        # Cut
-        if attention_mask is not None:
-            max_len = int(attention_mask.sum(dim=-1).max())
-            inputs_embeds = inputs_embeds[:, :max_len]
-            attention_mask = attention_mask[..., :max_len]
-            input_shape = inputs_embeds.size()[:-1]
         attention_mask = self._prepare_decoder_attention_mask(
             attention_mask, input_shape, inputs_embeds, past_key_values_length
@@ -1485,6 +1530,9 @@ class LSGBartDecoder(LSGBartPretrainedModel):
                 if encoder_hidden_states is not None:
                     all_cross_attentions += (layer_outputs[2],)
         # add hidden states from the last decoder layer
         if output_hidden_states:
             all_hidden_states += (hidden_states,)
@@ -1621,14 +1669,14 @@ class LSGBartModel(LSGBartPretrainedModel):
         )
-class LSGBartForConditionalGeneration(LSGBartPretrainedModel):
     base_model_prefix = "model"
     _keys_to_ignore_on_load_missing = [r"final_logits_bias", r"lm_head\.weight"]
     def __init__(self, config):
-        super().__init__(config)
         self.model = LSGBartModel(config)
         self.register_buffer("final_logits_bias", torch.zeros((1, self.model.shared.num_embeddings)))
         self.lm_head = nn.Linear(config.d_model, self.model.shared.num_embeddings, bias=False)
@@ -1636,157 +1684,12 @@ class LSGBartForConditionalGeneration(LSGBartPretrainedModel):
         # Initialize weights and apply final processing
         self.post_init()
-    def get_encoder(self):
-        return self.model.get_encoder()
-    def get_decoder(self):
-        return self.model.get_decoder()
-    def resize_token_embeddings(self, new_num_tokens):
-        new_embeddings = super().resize_token_embeddings(new_num_tokens)
-        self._resize_final_logits_bias(new_num_tokens)
-        return new_embeddings
-    def _resize_final_logits_bias(self, new_num_tokens):
-        old_num_tokens = self.final_logits_bias.shape[-1]
-        if new_num_tokens <= old_num_tokens:
-            new_bias = self.final_logits_bias[:, :new_num_tokens]
-        else:
-            extra_bias = torch.zeros((1, new_num_tokens - old_num_tokens), device=self.final_logits_bias.device)
-            new_bias = torch.cat([self.final_logits_bias, extra_bias], dim=1)
-        self.register_buffer("final_logits_bias", new_bias)
-    def get_output_embeddings(self):
-        return self.lm_head
-    def set_output_embeddings(self, new_embeddings):
-        self.lm_head = new_embeddings
-    def forward(
-        self,
-        input_ids=None,
-        attention_mask=None,
-        decoder_input_ids=None,
-        decoder_attention_mask=None,
-        head_mask=None,
-        decoder_head_mask=None,
-        cross_attn_head_mask=None,
-        encoder_outputs=None,
-        past_key_values=None,
-        inputs_embeds=None,
-        decoder_inputs_embeds=None,
-        labels=None,
-        use_cache=None,
-        output_attentions=None,
-        output_hidden_states=None,
-        return_dict=None,
-        ):
-        r"""
-        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
-            Labels for computing the masked language modeling loss. Indices should either be in ``[0, ...,
-            config.vocab_size]`` or -100 (see ``input_ids`` docstring). Tokens with indices set to ``-100`` are ignored
-            (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``.
-        Returns:
-        """
-        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-        if labels is not None:
-            if decoder_input_ids is None and decoder_inputs_embeds is None:
-                decoder_input_ids = shift_tokens_right(
-                    labels, self.config.pad_token_id, self.config.decoder_start_token_id
-                )
-        outputs = self.model(
-            input_ids,
-            attention_mask=attention_mask,
-            decoder_input_ids=decoder_input_ids,
-            encoder_outputs=encoder_outputs,
-            decoder_attention_mask=decoder_attention_mask,
-            head_mask=head_mask,
-            decoder_head_mask=decoder_head_mask,
-            cross_attn_head_mask=cross_attn_head_mask,
-            past_key_values=past_key_values,
-            inputs_embeds=inputs_embeds,
-            decoder_inputs_embeds=decoder_inputs_embeds,
-            use_cache=use_cache,
-            output_attentions=output_attentions,
-            output_hidden_states=output_hidden_states,
-            return_dict=return_dict,
-        )
-        lm_logits = self.lm_head(outputs[0]) + self.final_logits_bias
-        masked_lm_loss = None
-        if labels is not None:
-            loss_fct = CrossEntropyLoss()
-            masked_lm_loss = loss_fct(lm_logits.view(-1, self.config.vocab_size), labels.view(-1))
-        if not return_dict:
-            output = (lm_logits,) + outputs[1:]
-            return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
-        return Seq2SeqLMOutput(
-            loss=masked_lm_loss,
-            logits=lm_logits,
-            past_key_values=outputs.past_key_values,
-            decoder_hidden_states=outputs.decoder_hidden_states,
-            decoder_attentions=outputs.decoder_attentions,
-            cross_attentions=outputs.cross_attentions,
-            encoder_last_hidden_state=outputs.encoder_last_hidden_state,
-            encoder_hidden_states=outputs.encoder_hidden_states,
-            encoder_attentions=outputs.encoder_attentions,
-        )
-    def prepare_inputs_for_generation(
-        self,
-        decoder_input_ids,
-        past=None,
-        attention_mask=None,
-        head_mask=None,
-        decoder_head_mask=None,
-        cross_attn_head_mask=None,
-        use_cache=None,
-        encoder_outputs=None,
-        **kwargs
-        ):
-        # cut decoder_input_ids if past is used
-        if past is not None:
-            decoder_input_ids = decoder_input_ids[:, -1:]
-        return {
-            "input_ids": None,  # encoder_outputs is defined. input_ids not needed
-            "encoder_outputs": encoder_outputs,
-            "past_key_values": past,
-            "decoder_input_ids": decoder_input_ids,
-            "attention_mask": attention_mask,
-            "head_mask": head_mask,
-            "decoder_head_mask": decoder_head_mask,
-            "cross_attn_head_mask": cross_attn_head_mask,
-            "use_cache": use_cache,  # change this to avoid caching (presumably for debugging)
-        }
-    def prepare_decoder_input_ids_from_labels(self, labels):
-        return shift_tokens_right(labels, self.config.pad_token_id, self.config.decoder_start_token_id)
-    @staticmethod
-    def _reorder_cache(past, beam_idx):
-        reordered_past = ()
-        for layer_past in past:
-            # cached cross_attention states don't have to be reordered -> they are always the same
-            reordered_past += (
-                tuple(past_state.index_select(0, beam_idx) for past_state in layer_past[:2]) + layer_past[2:],
-            )
-        return reordered_past
-class LSGBartForSequenceClassification(LSGBartPretrainedModel):
-    def __init__(self, config, **kwargs):
-        super().__init__(config, **kwargs)
         self.model = LSGBartModel(config)
         self.classification_head = LSGBartClassificationHead(
             config.d_model,
@@ -1797,115 +1700,12 @@ class LSGBartForSequenceClassification(LSGBartPretrainedModel):
         self.model._init_weights(self.classification_head.dense)
         self.model._init_weights(self.classification_head.out_proj)
-    def forward(
-        self,
-        input_ids=None,
-        attention_mask=None,
-        decoder_input_ids=None,
-        decoder_attention_mask=None,
-        head_mask=None,
-        decoder_head_mask=None,
-        cross_attn_head_mask=None,
-        encoder_outputs=None,
-        inputs_embeds=None,
-        decoder_inputs_embeds=None,
-        labels=None,
-        use_cache=None,
-        output_attentions=None,
-        output_hidden_states=None,
-        return_dict=None,
-        ):
-        r"""
-        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
-            Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ...,
-            config.num_labels - 1]`. If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
-        """
-        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-        if labels is not None:
-            use_cache = False
-        if input_ids is None and inputs_embeds is not None:
-            raise NotImplementedError(
-                f"Passing input embeddings is currently not supported for {self.__class__.__name__}"
-            )
-        outputs = self.model(
-            input_ids,
-            attention_mask=attention_mask,
-            decoder_input_ids=decoder_input_ids,
-            decoder_attention_mask=decoder_attention_mask,
-            head_mask=head_mask,
-            decoder_head_mask=decoder_head_mask,
-            cross_attn_head_mask=cross_attn_head_mask,
-            encoder_outputs=encoder_outputs,
-            inputs_embeds=inputs_embeds,
-            decoder_inputs_embeds=decoder_inputs_embeds,
-            use_cache=use_cache,
-            output_attentions=output_attentions,
-            output_hidden_states=output_hidden_states,
-            return_dict=return_dict,
-        )
-        hidden_states = outputs[0]  # last hidden state
-        eos_mask = input_ids.eq(self.config.eos_token_id)
-        t, t_ = eos_mask.size()[-1], hidden_states.size()[-2]
-        if t > t_:
-            eos_mask = eos_mask[:, :t_]
-        if len(torch.unique_consecutive(eos_mask.sum(1))) > 1:
-            raise ValueError("All examples must have the same number of <eos> tokens.")
-        sentence_representation = hidden_states[eos_mask, :].view(hidden_states.size(0), -1, hidden_states.size(-1))[
-            :, -1, :
-        ]
-        logits = self.classification_head(sentence_representation)
-        loss = None
-        if labels is not None:
-            if self.config.problem_type is None:
-                if self.config.num_labels == 1:
-                    self.config.problem_type = "regression"
-                elif self.config.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
-                    self.config.problem_type = "single_label_classification"
-                else:
-                    self.config.problem_type = "multi_label_classification"
-            if self.config.problem_type == "regression":
-                loss_fct = MSELoss()
-                if self.config.num_labels == 1:
-                    loss = loss_fct(logits.squeeze(), labels.squeeze())
-                else:
-                    loss = loss_fct(logits, labels)
-            elif self.config.problem_type == "single_label_classification":
-                loss_fct = CrossEntropyLoss()
-                loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1))
-            elif self.config.problem_type == "multi_label_classification":
-                loss_fct = BCEWithLogitsLoss()
-                loss = loss_fct(logits, labels)
-        if not return_dict:
-            output = (logits,) + outputs[1:]
-            return ((loss,) + output) if loss is not None else output
-        return Seq2SeqSequenceClassifierOutput(
-            loss=loss,
-            logits=logits,
-            past_key_values=outputs.past_key_values,
-            decoder_hidden_states=outputs.decoder_hidden_states,
-            decoder_attentions=outputs.decoder_attentions,
-            cross_attentions=outputs.cross_attentions,
-            encoder_last_hidden_state=outputs.encoder_last_hidden_state,
-            encoder_hidden_states=outputs.encoder_hidden_states,
-            encoder_attentions=outputs.encoder_attentions,
-        )
-class LSGBartForQuestionAnswering(LSGBartPretrainedModel):
-    def __init__(self, config):
-        super().__init__(config)
         config.num_labels = 2
         self.num_labels = config.num_labels
@@ -1915,102 +1715,6 @@ class LSGBartForQuestionAnswering(LSGBartPretrainedModel):
         self.model._init_weights(self.qa_outputs)
-    def forward(
-        self,
-        input_ids=None,
-        attention_mask=None,
-        decoder_input_ids=None,
-        decoder_attention_mask=None,
-        head_mask=None,
-        decoder_head_mask=None,
-        cross_attn_head_mask=None,
-        encoder_outputs=None,
-        start_positions=None,
-        end_positions=None,
-        inputs_embeds=None,
-        decoder_inputs_embeds=None,
-        use_cache=None,
-        output_attentions=None,
-        output_hidden_states=None,
-        return_dict=None,
-        ):
-        r"""
-        start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
-            Labels for position (index) of the start of the labelled span for computing the token classification loss.
-            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
-            are not taken into account for computing the loss.
-        end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
-            Labels for position (index) of the end of the labelled span for computing the token classification loss.
-            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
-            are not taken into account for computing the loss.
-        """
-        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-        if start_positions is not None and end_positions is not None:
-            use_cache = False
-        outputs = self.model(
-            input_ids,
-            attention_mask=attention_mask,
-            decoder_input_ids=decoder_input_ids,
-            decoder_attention_mask=decoder_attention_mask,
-            head_mask=head_mask,
-            decoder_head_mask=decoder_head_mask,
-            cross_attn_head_mask=cross_attn_head_mask,
-            encoder_outputs=encoder_outputs,
-            inputs_embeds=inputs_embeds,
-            decoder_inputs_embeds=decoder_inputs_embeds,
-            use_cache=use_cache,
-            output_attentions=output_attentions,
-            output_hidden_states=output_hidden_states,
-            return_dict=return_dict,
-        )
-        sequence_output = outputs[0]
-        logits = self.qa_outputs(sequence_output)
-        start_logits, end_logits = logits.split(1, dim=-1)
-        start_logits = start_logits.squeeze(-1).contiguous()
-        end_logits = end_logits.squeeze(-1).contiguous()
-        total_loss = None
-        if start_positions is not None and end_positions is not None:
-            # If we are on multi-GPU, split add a dimension
-            if len(start_positions.size()) > 1:
-                start_positions = start_positions.squeeze(-1)
-            if len(end_positions.size()) > 1:
-                end_positions = end_positions.squeeze(-1)
-            # sometimes the start/end positions are outside our model inputs, we ignore these terms
-            ignored_index = start_logits.size(1)
-            start_positions = start_positions.clamp(0, ignored_index)
-            end_positions = end_positions.clamp(0, ignored_index)
-            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
-            start_loss = loss_fct(start_logits, start_positions)
-            end_loss = loss_fct(end_logits, end_positions)
-            total_loss = (start_loss + end_loss) / 2
-        if not return_dict:
-            output = (
-                start_logits,
-                end_logits,
-            ) + outputs[1:]
-            return ((total_loss,) + output) if total_loss is not None else output
-        return Seq2SeqQuestionAnsweringModelOutput(
-            loss=total_loss,
-            start_logits=start_logits,
-            end_logits=end_logits,
-            past_key_values=outputs.past_key_values,
-            decoder_hidden_states=outputs.decoder_hidden_states,
-            decoder_attentions=outputs.decoder_attentions,
-            cross_attentions=outputs.cross_attentions,
-            encoder_last_hidden_state=outputs.encoder_last_hidden_state,
-            encoder_hidden_states=outputs.encoder_hidden_states,
-            encoder_attentions=outputs.encoder_attentions,
-        )
 class LSGBartDecoderWrapper(LSGBartPretrainedModel):
     """
@@ -2018,7 +1722,7 @@ class LSGBartDecoderWrapper(LSGBartPretrainedModel):
     used in combination with the :class:`~transformers.EncoderDecoderModel` framework.
     """
-    def __init__(self, config):
         super().__init__(config)
         self.decoder = LSGBartDecoder(config)
@@ -2026,14 +1730,14 @@ class LSGBartDecoderWrapper(LSGBartPretrainedModel):
         return self.decoder(*args, **kwargs)
-class LSGBartForCausalLM(LSGBartPretrainedModel):
-    def __init__(self, config):
-        super().__init__(config)
         config = copy.deepcopy(config)
         config.is_decoder = True
         config.is_encoder_decoder = False
         self.model = LSGBartDecoderWrapper(config)
         self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
@@ -2041,105 +1745,6 @@ class LSGBartForCausalLM(LSGBartPretrainedModel):
         # Initialize weights and apply final processing
         self.post_init()
-    def get_input_embeddings(self):
-        return self.model.decoder.embed_tokens
-    def set_input_embeddings(self, value):
-        self.model.decoder.embed_tokens = value
-    def get_output_embeddings(self):
-        return self.lm_head
-    def set_output_embeddings(self, new_embeddings):
-        self.lm_head = new_embeddings
-    def set_decoder(self, decoder):
-        self.model.decoder = decoder
-    def get_decoder(self):
-        return self.model.decoder
-    def forward(
-        self,
-        input_ids=None,
-        attention_mask=None,
-        encoder_hidden_states=None,
-        encoder_attention_mask=None,
-        head_mask=None,
-        cross_attn_head_mask=None,
-        past_key_values=None,
-        inputs_embeds=None,
-        labels=None,
-        use_cache=None,
-        output_attentions=None,
-        output_hidden_states=None,
-        return_dict=None,
-        ):
-        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
-        output_hidden_states = (
-            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
-        )
-        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
-        outputs = self.model.decoder(
-            input_ids=input_ids,
-            attention_mask=attention_mask,
-            encoder_hidden_states=encoder_hidden_states,
-            encoder_attention_mask=encoder_attention_mask,
-            head_mask=head_mask,
-            cross_attn_head_mask=cross_attn_head_mask,
-            past_key_values=past_key_values,
-            inputs_embeds=inputs_embeds,
-            use_cache=use_cache,
-            output_attentions=output_attentions,
-            output_hidden_states=output_hidden_states,
-            return_dict=return_dict,
-        )
-        logits = self.lm_head(outputs[0])
-        loss = None
-        if labels is not None:
-            loss_fct = CrossEntropyLoss()
-            loss = loss_fct(logits.view(-1, self.config.vocab_size), labels.view(-1))
-        if not return_dict:
-            output = (logits,) + outputs[1:]
-            return (loss,) + output if loss is not None else output
-        return CausalLMOutputWithCrossAttentions(
-            loss=loss,
-            logits=logits,
-            past_key_values=outputs.past_key_values,
-            hidden_states=outputs.hidden_states,
-            attentions=outputs.attentions,
-            cross_attentions=outputs.cross_attentions,
-        )
-    def prepare_inputs_for_generation(self, input_ids, past=None, attention_mask=None, use_cache=None, **kwargs):
-        # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly
-        if attention_mask is None:
-            attention_mask = input_ids.new_ones(input_ids.shape)
-        if past:
-            input_ids = input_ids[:, -1:]
-        # first step, decoder_cached_states are empty
-        return {
-            "input_ids": input_ids,  # encoder_outputs is defined. input_ids not needed
-            "attention_mask": attention_mask,
-            "past_key_values": past,
-            "use_cache": use_cache,
-        }
-    @staticmethod
-    def _reorder_cache(past, beam_idx):
-        reordered_past = ()
-        for layer_past in past:
-            reordered_past += (tuple(past_state.index_select(0, beam_idx) for past_state in layer_past),)
-        return reordered_past
 def str_to_class(classname):
     """Return the attribute of this module whose name is ``classname``."""
     this_module = sys.modules[__name__]
     return getattr(this_module, classname)

         ):
         """Constructs LSGConfig."""
         super().__init__(**kwargs)
         self.adaptive = adaptive
         self.auto_map = AUTO_MAP
         self.sparse_block_size = sparse_block_size
         self.sparsity_factor = sparsity_factor
         self.sparsity_type = sparsity_type
+        if sparsity_type not in [None, "none", "norm", "lsh", "pooling", "stride"]:
+            logger.warning(
+                "[WARNING CONFIG]: sparsity_mode not in [None, 'none', 'norm', 'lsh', 'pooling', 'stride'], setting sparsity_type=None, computation will skip sparse attention")
+            self.sparsity_type = None
+        if self.sparsity_type == "stride":
+            if self.sparsity_factor > self.encoder_attention_heads:
+                logger.warning(
+                "[WARNING CONFIG]: sparsity_factor > encoder_attention_heads is not recommended for stride sparsity"
+            )
+        if self.num_global_tokens < 1:
+            logger.warning(
+                "[WARNING CONFIG]: num_global_tokens < 1 is not compatible, setting num_global_tokens=1"
+            )
+            self.num_global_tokens = 1
+        elif self.num_global_tokens > 512:
+            logger.warning(
+                "[WARNING CONFIG]: num_global_tokens > 512 is not compatible, setting num_global_tokens=512"
+            )
+            self.num_global_tokens = 512
+        if self.sparsity_factor > 0:
+            assert self.block_size % self.sparsity_factor == 0, "[ERROR CONFIG]: block_size must be divisible by sparsity_factor"
+            assert self.block_size//self.sparsity_factor >= 1, "[ERROR CONFIG]: make sure block_size >= sparsity_factor"
 def shift_tokens_right(input_ids, pad_token_id, decoder_start_token_id):
     """
         # Shape of blocks
         self.local_shapes = (self.block_size*3, self.block_size)
         if self.sparse_block_size and self.sparsity_factor > 0:
             self.sparse_shapes = (self.sparse_block_size*3, self.block_size//self.sparsity_factor)
         self.attention = BaseAttentionProduct(config)
             }
         self.sparsity_type = config.sparsity_type
+        self.get_sparse_elements = sparse_functions.get(self.sparsity_type, lambda x, y, z: (None, None, None))
         if config.sparsity_type == "lsh":
             self.lsh_num_pre_rounds = config.lsh_num_pre_rounds
     def get_sparse_tokens_with_norm(self, keys, values, mask):
         if self.sparsity_factor == 1:
+            return keys, values, mask.expand(-1, keys.size()[1], -1, -1)
         with torch.no_grad():
     def get_sparse_tokens_with_pooling(self, keys, values, mask):
         if self.sparsity_factor == 1:
+            return keys, values, mask.expand(-1, keys.size()[1], -1, -1)
         keys = self.chunk(keys, self.sparsity_factor)
         values = self.chunk(values, self.sparsity_factor)
     def get_sparse_tokens_with_stride(self, keys, values, mask):
         if self.sparsity_factor == 1:
+            return keys, values, mask.expand(-1, keys.size()[1], -1, -1)
         n, h, t, d = keys.size()
         sparse_idx = torch.arange(t // self.sparsity_factor, device=keys.device) * self.sparsity_factor
         sparse_idx = sparse_idx.reshape(1, 1, -1, 1) + (torch.arange(h, device=keys.device) % self.sparsity_factor).reshape(1, h, 1, 1)
         sparse_idx = sparse_idx.expand(n, h, -1, 1)
+        """
+        t, b = self.block_size, t // self.block_size
+        sparse_idx = torch.arange(t // self.sparsity_factor, device=keys.device) * self.sparsity_factor
+        sparse_idx = sparse_idx.reshape(1, 1, 1, -1, 1) + (torch.arange(h, device=keys.device) % self.sparsity_factor).reshape(1, h, 1, 1, 1)
+        sparse_idx = sparse_idx + torch.arange(b, device=keys.device).reshape(1, 1, -1, 1, 1) * t
+        sparse_idx = sparse_idx.reshape(1, h, -1, 1).expand(n, h, -1, 1)
+        t, b = self.block_size, t // self.block_size
+        sparse_idx = torch.arange(t // self.sparsity_factor, device=keys.device)
+        sparse_idx = sparse_idx.reshape(1, 1, 1, -1, 1) + torch.arange(h, device=keys.device).reshape(1, h, 1, 1, 1) * (t // self.sparsity_factor)
+        sparse_idx = (sparse_idx % t)
+        #sparse_idx[..., -t//2:, :] = (sparse_idx[..., -t//2:, :] + t//2) % t
+        sparse_idx = sparse_idx + torch.arange(b, device=keys.device).reshape(1, 1, -1, 1, 1) * t
+        sparse_idx = sparse_idx.reshape(1, h, -1, 1).expand(n, h, -1, 1)
+        """
         keys = keys.gather(dim=-2, index=sparse_idx.expand(-1, -1, -1, d))
         values = values.gather(dim=-2, index=sparse_idx.expand(-1, -1, -1, d))
         mask = mask.expand(-1, h, -1, -1).transpose(-1, -2).gather(dim=-2, index=sparse_idx).transpose(-1, -2)
     def get_sparse_tokens_with_lsh(self, keys, values, mask):
         if self.sparsity_factor == 1:
+            return keys, values, mask.expand(-1, keys.size()[1], -1, -1)
         block_size = min(self.block_size, self.sparse_block_size)
         keys = self.chunk(keys, block_size)
         extra_factor = 1
         for _ in range(self.lsh_num_pre_rounds):
+            keys, values, mask = self.lsh_round(keys, values, mask, t*extra_factor)
+        keys, values, mask = self.lsh_round(keys, values, mask, t//self.sparsity_factor)
         keys /= mask + 1e-8
         values /= mask + 1e-8
         return keys.reshape(n, h, -1, d), values.reshape(n, h, -1, d), mask.transpose(-1, -2).reshape(n, h, 1, -1)
+    def lsh_round(self, keys, values, mask, output_size):
         with torch.no_grad():
         self.padding_idx = config.pad_token_id
         self.max_target_positions = config.max_position_embeddings
         self.embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0
+        self.adaptive = config.adaptive
         if embed_tokens is not None:
             self.embed_tokens = embed_tokens
         return combined_attention_mask
+    def resize_inputs(self, inputs_embeds, attention_mask):  # trim both tensors to the largest per-row mask sum; returns (pad, inputs_embeds, attention_mask)
+        pad = 0  # NOTE(review): redundant, overwritten two lines below
+        max_len = int(attention_mask.sum(dim=-1).max())  # largest mask sum over the last dim; presumably the longest unpadded length (assumes a 0/1 mask, padding on the right) -- TODO confirm
+        pad = attention_mask.size()[-1] - max_len  # number of trailing positions dropped by the slices below
+        inputs_embeds = inputs_embeds[:, :max_len]  # keep the first max_len positions along dim 1
+        attention_mask = attention_mask[..., :max_len]  # same cut on the mask's last dim
+        return pad, inputs_embeds, attention_mask
     def forward(
         self,
         input_ids=None,
         if inputs_embeds is None:
             inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale
+        # Resize to reduce computation
+        pad = 0
+        if self.adaptive:
+            if attention_mask is not None:
+                pad, inputs_embeds, attention_mask = self.resize_inputs(inputs_embeds, attention_mask)
+                input_shape = inputs_embeds.size()[:-1]
+            if encoder_attention_mask is not None:
+                _, encoder_hidden_states, encoder_attention_mask = self.resize_inputs(encoder_hidden_states, encoder_attention_mask)
         attention_mask = self._prepare_decoder_attention_mask(
             attention_mask, input_shape, inputs_embeds, past_key_values_length
                 if encoder_hidden_states is not None:
                     all_cross_attentions += (layer_outputs[2],)
+        # Resize to original shape
+        hidden_states = torch.nn.functional.pad(hidden_states.transpose(-1, -2), pad=(0, pad), value=0).transpose(-1, -2)
         # add hidden states from the last decoder layer
         if output_hidden_states:
             all_hidden_states += (hidden_states,)
         )
+class LSGBartForConditionalGeneration(BartForConditionalGeneration, LSGBartPretrainedModel):
     base_model_prefix = "model"
     _keys_to_ignore_on_load_missing = [r"final_logits_bias", r"lm_head\.weight"]
     def __init__(self, config):
+        LSGBartPretrainedModel.__init__(self, config)
         self.model = LSGBartModel(config)
         self.register_buffer("final_logits_bias", torch.zeros((1, self.model.shared.num_embeddings)))
         self.lm_head = nn.Linear(config.d_model, self.model.shared.num_embeddings, bias=False)
         # Initialize weights and apply final processing
         self.post_init()
+class LSGBartForSequenceClassification(BartForSequenceClassification, LSGBartPretrainedModel):
+    def __init__(self, config: LSGBartConfig, **kwargs):
+        LSGBartPretrainedModel.__init__(self, config, **kwargs)
         self.model = LSGBartModel(config)
         self.classification_head = LSGBartClassificationHead(
             config.d_model,
         self.model._init_weights(self.classification_head.dense)
         self.model._init_weights(self.classification_head.out_proj)
+class LSGBartForQuestionAnswering(BartForQuestionAnswering, LSGBartPretrainedModel):
+    def __init__(self, config: LSGBartConfig):
+        LSGBartPretrainedModel.__init__(self, config)
         config.num_labels = 2
         self.num_labels = config.num_labels
         self.model._init_weights(self.qa_outputs)
 class LSGBartDecoderWrapper(LSGBartPretrainedModel):
     """
     used in combination with the :class:`~transformers.EncoderDecoderModel` framework.
     """
+    def __init__(self, config: LSGBartConfig):
         super().__init__(config)
         self.decoder = LSGBartDecoder(config)
         return self.decoder(*args, **kwargs)
+class LSGBartForCausalLM(BartForCausalLM, LSGBartPretrainedModel):  # LSGBartDecoderWrapper plus a linear LM head, configured as a standalone decoder
+    def __init__(self, config: LSGBartConfig):
         config = copy.deepcopy(config)  # copy first so the flag changes below do not touch the caller's config
         config.is_decoder = True
         config.is_encoder_decoder = False
+        LSGBartPretrainedModel.__init__(self, config)  # explicit base initialiser, called with the modified copy
         self.model = LSGBartDecoderWrapper(config)
         self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)  # bias-free projection from hidden_size to vocab_size
         # Initialize weights and apply final processing
         self.post_init()
 def str_to_class(classname):
     """Return the attribute of this module whose name is ``classname``."""
     this_module = sys.modules[__name__]
     return getattr(this_module, classname)

pytorch_model.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:12252a53bd8fdbb6daafd2118a44789c0d3d37f01c62ccde0d10a92142e44a72
 size 578416695

 version https://git-lfs.github.com/spec/v1
+oid sha256:88af6fadc19698eaa5d49e63aa969487846fbdfb41852afe199350a98d04801d
 size 578416695