ccdv committed on
Commit e9aa08a
1 Parent(s): 99b8f99
Files changed (4)
  1. README.md +20 -11
  2. config.json +1 -1
  3. modeling_lsg_bart.py +100 -484
  4. pytorch_model.bin +1 -1
README.md CHANGED
@@ -23,15 +23,23 @@ should probably proofread and complete it, then remove this comment. -->
23
  This model is a fine-tuned version of [ccdv/lsg-bart-base-4096](https://huggingface.co/ccdv/lsg-bart-base-4096) on the scientific_papers pubmed dataset. \
24
  It achieves the following results on the test set:
25
 
26
- | Length | Sparse Type | Block Size | Sparsity | Connexions | R1 | R2 | RL | RLsum |
27
- |:------ |:----------- |:---------- |:-------- | :--------- |:----- |:----- |:----- |:----- |
28
- | 4096 | Local | 256 | 0 | 768 | 47.33 | 21.67 | 28.53 | 43.67 |
29
- | 4096 | Local | 128 | 0 | 384 | 46.84 | 21.24 | 28.22 | 43.15 |
30
- | 4096 | Pooling | 128 | 4 | 644 | 47.07 | 21.41 | 28.40 | 43.36 |
31
- | 4096 | Stride | 128 | 4 | 644 | 47.02 | 21.46 | 28.33 | 43.35 |
32
- | 4096 | Norm | 128 | 4 | 644 | 47.01 | 21.32 | 28.26 | 43.33 |
33
- | 4096 | LSH | 128 | 4 | 644 | 46.92 | 21.27 | 28.26 | 43.26 |
34
-
35
 
36
 
37
  ## Model description
@@ -61,7 +69,8 @@ The following hyperparameters were used during training:
61
  - total_train_batch_size: 32
62
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
63
  - lr_scheduler_type: linear
64
- - num_epochs: 7.0
 
65
 
66
  ### Generate hyperparameters
67
 
@@ -69,13 +78,13 @@ The following hyperparameters were used during generation:
69
  - dataset_name: scientific_papers
70
  - dataset_config_name: pubmed
71
  - eval_batch_size: 8
 
72
  - early_stopping: True
73
  - ignore_pad_token_for_loss: True
74
  - length_penalty: 2.0
75
  - max_length: 512
76
  - min_length: 128
77
  - num_beams: 5
78
- - num_samples: None
79
  - no_repeat_ngram_size: None
80
  - seed: 123
81
 
 
23
  This model is a fine-tuned version of [ccdv/lsg-bart-base-4096](https://huggingface.co/ccdv/lsg-bart-base-4096) on the scientific_papers pubmed dataset. \
24
  It achieves the following results on the test set:
25
 
26
+ | Length | Sparse Type | Block Size | Sparsity | Connexions | R1 | R2 | RL | RLsum |
27
+ |:------ |:------------ |:---------- |:-------- | :--------- |:----- |:----- |:----- |:----- |
28
+ | 4096 | Local | 256 | 0 | 768 | 47.37 | 21.74 | 28.59 | 43.67 |
29
+ | 4096 | Local | 128 | 0 | 384 | 47.02 | 21.33 | 28.34 | 43.31 |
30
+ | 4096 | Pooling | 128 | 4 | 644 | 47.11 | 21.42 | 28.43 | 43.40 |
31
+ | 4096 | Stride | 128 | 4 | 644 | 47.16 | 21.49 | 28.38 | 43.44 |
32
+ | 4096 | Norm | 128 | 4 | 644 | 47.09 | 21.44 | 28.40 | 43.36 |
33
+ | 4096 | LSH | 128 | 4 | 644 | 47.11 | 21.41 | 28.41 | 43.42 |
34
+
35
+ With blocks of size 32 (lower resources):
36
+ | Length | Sparse Type | Block Size | Sparsity | Connexions | R1 | R2 | RL | RLsum |
37
+ |:------ |:------------ |:---------- |:-------- | :--------- |:----- |:----- |:----- |:----- |
38
+ | 4096 | Pooling | 32 | 4 | 160 | 44.60 | 19.35 | 26.83 | 40.85 |
39
+ | 4096 | Stride | 32 | 4 | 160 | 45.52 | 20.07 | 27.39 | 41.75 |
40
+ | 4096 | Block Stride | 32 | 4 | 160 | 45.30 | 19.89 | 27.22 | 41.54 |
41
+ | 4096 | Norm | 32 | 4 | 160 | 44.30 | 19.05 | 26.57 | 40.47 |
42
+ | 4096 | LSH | 32 | 4 | 160 | 44.53 | 19.27 | 26.84 | 40.74 |
43
 
44
 
45
  ## Model description
 
69
  - total_train_batch_size: 32
70
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
71
  - lr_scheduler_type: linear
72
+ - lr_scheduler_warmup_ratio: 0.1
73
+ - num_epochs: 8.0
74
 
75
  ### Generate hyperparameters
76
 
 
78
  - dataset_name: scientific_papers
79
  - dataset_config_name: pubmed
80
  - eval_batch_size: 8
81
+ - eval_samples: 6658
82
  - early_stopping: True
83
  - ignore_pad_token_for_loss: True
84
  - length_penalty: 2.0
85
  - max_length: 512
86
  - min_length: 128
87
  - num_beams: 5
 
88
  - no_repeat_ngram_size: None
89
  - seed: 123
90
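The generation settings above map directly onto `model.generate` arguments. A minimal usage sketch, assuming the checkpoint is loaded through the standard `transformers` auto classes with `trust_remote_code=True` (the repository ships custom LSG modeling code) and assuming `ccdv/lsg-bart-base-4096-pubmed` as the repo id:

```python
# Sketch only: the repo id and loading path are assumptions, not confirmed by this commit.
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

model_name = "ccdv/lsg-bart-base-4096-pubmed"  # assumed repo id for this checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name, trust_remote_code=True)

article = "..."  # a long PubMed-style article, up to 4096 tokens
inputs = tokenizer(article, return_tensors="pt", truncation=True, max_length=4096)

# Beam-search settings taken from the "Generate hyperparameters" list above.
summary_ids = model.generate(
    **inputs,
    num_beams=5,
    length_penalty=2.0,
    max_length=512,
    min_length=128,
    early_stopping=True,
)
print(tokenizer.decode(summary_ids[0], skip_special_tokens=True))
```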
 
config.json CHANGED
@@ -68,7 +68,7 @@
68
  "scale_embedding": false,
69
  "sparse_block_size": 0,
70
  "sparsity_factor": 2,
71
- "sparsity_type": "pooling",
72
  "task_specific_params": {
73
  "summarization": {
74
  "length_penalty": 1.0,
 
68
  "scale_embedding": false,
69
  "sparse_block_size": 0,
70
  "sparsity_factor": 2,
71
+ "sparsity_type": "none",
72
  "task_specific_params": {
73
  "summarization": {
74
  "length_penalty": 1.0,
modeling_lsg_bart.py CHANGED
@@ -41,8 +41,6 @@ class LSGBartConfig(BartConfig):
41
  ):
42
  """Constructs LSGConfig."""
43
  super().__init__(**kwargs)
44
-
45
- assert sparsity_type in ["norm", "lsh", "pooling", "stride"], "Sparsity mode must be 'norm', 'lsh' or 'pooling'"
46
 
47
  self.adaptive = adaptive
48
  self.auto_map = AUTO_MAP
@@ -55,7 +53,33 @@ class LSGBartConfig(BartConfig):
55
  self.sparse_block_size = sparse_block_size
56
  self.sparsity_factor = sparsity_factor
57
  self.sparsity_type = sparsity_type
58

59
 
60
  def shift_tokens_right(input_ids, pad_token_id, decoder_start_token_id):
61
  """
@@ -207,7 +231,7 @@ class LSGAttentionProduct(nn.Module):
207
 
208
  # Shape of blocks
209
  self.local_shapes = (self.block_size*3, self.block_size)
210
- if self.sparsity_factor > 0:
211
  self.sparse_shapes = (self.sparse_block_size*3, self.block_size//self.sparsity_factor)
212
 
213
  self.attention = BaseAttentionProduct(config)
@@ -306,9 +330,12 @@ class LSGAttentionProduct(nn.Module):
306
 
307
  size, step = self.sparse_shapes
308

309
  # n, h, t, d*2 + 1
310
  size = size*2
311
- s = (size - step) // 2
312
 
313
  # Pad before block reshaping
314
  if is_attn_mask:
@@ -326,11 +353,16 @@ class LSGAttentionProduct(nn.Module):
326
  # Make blocks
327
  hidden_states = hidden_states.unfold(-2, size=size, step=step).transpose(-1, -2)
328

329
  # Indexes for selection
330
- u = (size - self.block_size * 3 // self.sparsity_factor) // 2
331
  s = self.sparse_block_size
332
 
333
- return torch.cat([hidden_states[..., u-s:u, :], hidden_states[..., -u:-u+s, :]], dim=-2)
 
334
 
335
  def cat_global_sparse_local_tokens(self, x_global, x_sparse=None, x_local=None, dim=-2):
336
 
@@ -383,21 +415,15 @@ class LSGBartEncoderAttention(BaseSelfAttention):
383
  }
384
 
385
  self.sparsity_type = config.sparsity_type
386
- self.get_sparse_elements = sparse_functions[self.sparsity_type]
387
-
388
- if config.sparsity_type == "stride":
389
- if config.sparsity_factor > config.encoder_attention_heads:
390
- logger.warning(
391
- "Warning: sparsity_factor > encoder_attention_heads is not recommended for stride sparsity"
392
- )
393
 
394
  if config.sparsity_type == "lsh":
395
  self.lsh_num_pre_rounds = config.lsh_num_pre_rounds
396
-
397
  def get_sparse_tokens_with_norm(self, keys, values, mask):
398
 
399
  if self.sparsity_factor == 1:
400
- return keys, values, mask
401
 
402
  with torch.no_grad():
403
 
@@ -425,7 +451,7 @@ class LSGBartEncoderAttention(BaseSelfAttention):
425
  def get_sparse_tokens_with_pooling(self, keys, values, mask):
426
 
427
  if self.sparsity_factor == 1:
428
- return keys, values, mask
429
 
430
  keys = self.chunk(keys, self.sparsity_factor)
431
  values = self.chunk(values, self.sparsity_factor)
@@ -447,13 +473,30 @@ class LSGBartEncoderAttention(BaseSelfAttention):
447
  def get_sparse_tokens_with_stride(self, keys, values, mask):
448
 
449
  if self.sparsity_factor == 1:
450
- return keys, values, mask
451
 
452
  n, h, t, d = keys.size()
453
  sparse_idx = torch.arange(t // self.sparsity_factor, device=keys.device) * self.sparsity_factor
454
  sparse_idx = sparse_idx.reshape(1, 1, -1, 1) + (torch.arange(h, device=keys.device) % self.sparsity_factor).reshape(1, h, 1, 1)
455
  sparse_idx = sparse_idx.expand(n, h, -1, 1)
456

457
  keys = keys.gather(dim=-2, index=sparse_idx.expand(-1, -1, -1, d))
458
  values = values.gather(dim=-2, index=sparse_idx.expand(-1, -1, -1, d))
459
  mask = mask.expand(-1, h, -1, -1).transpose(-1, -2).gather(dim=-2, index=sparse_idx).transpose(-1, -2)
@@ -463,7 +506,7 @@ class LSGBartEncoderAttention(BaseSelfAttention):
463
  def get_sparse_tokens_with_lsh(self, keys, values, mask):
464
 
465
  if self.sparsity_factor == 1:
466
- return keys, values, mask
467
 
468
  block_size = min(self.block_size, self.sparse_block_size)
469
  keys = self.chunk(keys, block_size)
@@ -480,9 +523,9 @@ class LSGBartEncoderAttention(BaseSelfAttention):
480
  extra_factor = 1
481
 
482
  for _ in range(self.lsh_num_pre_rounds):
483
- keys, values, mask = self.lsg_round(keys, values, mask, t*extra_factor)
484
 
485
- keys, values, mask = self.lsg_round(keys, values, mask, t//self.sparsity_factor)
486
  keys /= mask + 1e-8
487
  values /= mask + 1e-8
488
 
@@ -490,7 +533,7 @@ class LSGBartEncoderAttention(BaseSelfAttention):
490
 
491
  return keys.reshape(n, h, -1, d), values.reshape(n, h, -1, d), mask.transpose(-1, -2).reshape(n, h, 1, -1)
492
 
493
- def lsg_round(self, keys, values, mask, output_size):
494
 
495
  with torch.no_grad():
496
 
@@ -1130,7 +1173,8 @@ class LSGBartEncoder(LSGBartPretrainedModel):
1130
 
1131
  # else adaptive sequence length
1132
  elif self.adaptive:
1133
- s = int(torch.max(attention_mask.sum(dim=-1)))
 
1134
  if s < t and self.block_size is not None:
1135
  s = max(2, s // self.block_size + 1) * self.block_size if s > b else s
1136
  if input_ids is not None:
@@ -1293,6 +1337,7 @@ class LSGBartDecoder(LSGBartPretrainedModel):
1293
  self.padding_idx = config.pad_token_id
1294
  self.max_target_positions = config.max_position_embeddings
1295
  self.embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0
 
1296
 
1297
  if embed_tokens is not None:
1298
  self.embed_tokens = embed_tokens
@@ -1335,6 +1380,15 @@ class LSGBartDecoder(LSGBartPretrainedModel):
1335
 
1336
  return combined_attention_mask
1337
 
 
 
 
 
 
 
 
 
 
1338
  def forward(
1339
  self,
1340
  input_ids=None,
@@ -1375,12 +1429,14 @@ class LSGBartDecoder(LSGBartPretrainedModel):
1375
  if inputs_embeds is None:
1376
  inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale
1377
 
1378
- # Cut
1379
- if attention_mask is not None:
1380
- max_len = int(attention_mask.sum(dim=-1).max())
1381
- inputs_embeds = inputs_embeds[:, :max_len]
1382
- attention_mask = attention_mask[..., :max_len]
1383
- input_shape = inputs_embeds.size()[:-1]
1384
 
1385
  attention_mask = self._prepare_decoder_attention_mask(
1386
  attention_mask, input_shape, inputs_embeds, past_key_values_length
@@ -1474,6 +1530,9 @@ class LSGBartDecoder(LSGBartPretrainedModel):
1474
  if encoder_hidden_states is not None:
1475
  all_cross_attentions += (layer_outputs[2],)
1476
 
 
 
 
1477
  # add hidden states from the last decoder layer
1478
  if output_hidden_states:
1479
  all_hidden_states += (hidden_states,)
@@ -1610,14 +1669,14 @@ class LSGBartModel(LSGBartPretrainedModel):
1610
  )
1611
 
1612
 
1613
- class LSGBartForConditionalGeneration(LSGBartPretrainedModel):
1614
 
1615
  base_model_prefix = "model"
1616
  _keys_to_ignore_on_load_missing = [r"final_logits_bias", r"lm_head\.weight"]
1617
 
1618
  def __init__(self, config):
1619
 
1620
- super().__init__(config)
1621
  self.model = LSGBartModel(config)
1622
  self.register_buffer("final_logits_bias", torch.zeros((1, self.model.shared.num_embeddings)))
1623
  self.lm_head = nn.Linear(config.d_model, self.model.shared.num_embeddings, bias=False)
@@ -1625,157 +1684,12 @@ class LSGBartForConditionalGeneration(LSGBartPretrainedModel):
1625
  # Initialize weights and apply final processing
1626
  self.post_init()
1627
 
1628
- def get_encoder(self):
1629
- return self.model.get_encoder()
1630
-
1631
- def get_decoder(self):
1632
- return self.model.get_decoder()
1633
 
1634
- def resize_token_embeddings(self, new_num_tokens):
1635
- new_embeddings = super().resize_token_embeddings(new_num_tokens)
1636
- self._resize_final_logits_bias(new_num_tokens)
1637
- return new_embeddings
1638
 
1639
- def _resize_final_logits_bias(self, new_num_tokens):
1640
- old_num_tokens = self.final_logits_bias.shape[-1]
1641
- if new_num_tokens <= old_num_tokens:
1642
- new_bias = self.final_logits_bias[:, :new_num_tokens]
1643
- else:
1644
- extra_bias = torch.zeros((1, new_num_tokens - old_num_tokens), device=self.final_logits_bias.device)
1645
- new_bias = torch.cat([self.final_logits_bias, extra_bias], dim=1)
1646
- self.register_buffer("final_logits_bias", new_bias)
1647
-
1648
- def get_output_embeddings(self):
1649
- return self.lm_head
1650
-
1651
- def set_output_embeddings(self, new_embeddings):
1652
- self.lm_head = new_embeddings
1653
-
1654
- def forward(
1655
- self,
1656
- input_ids=None,
1657
- attention_mask=None,
1658
- decoder_input_ids=None,
1659
- decoder_attention_mask=None,
1660
- head_mask=None,
1661
- decoder_head_mask=None,
1662
- cross_attn_head_mask=None,
1663
- encoder_outputs=None,
1664
- past_key_values=None,
1665
- inputs_embeds=None,
1666
- decoder_inputs_embeds=None,
1667
- labels=None,
1668
- use_cache=None,
1669
- output_attentions=None,
1670
- output_hidden_states=None,
1671
- return_dict=None,
1672
- ):
1673
 
1674
- r"""
1675
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
1676
- Labels for computing the masked language modeling loss. Indices should either be in ``[0, ...,
1677
- config.vocab_size]`` or -100 (see ``input_ids`` docstring). Tokens with indices set to ``-100`` are ignored
1678
- (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``.
1679
- Returns:
1680
- """
1681
-
1682
- return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1683
-
1684
- if labels is not None:
1685
- if decoder_input_ids is None and decoder_inputs_embeds is None:
1686
- decoder_input_ids = shift_tokens_right(
1687
- labels, self.config.pad_token_id, self.config.decoder_start_token_id
1688
- )
1689
-
1690
- outputs = self.model(
1691
- input_ids,
1692
- attention_mask=attention_mask,
1693
- decoder_input_ids=decoder_input_ids,
1694
- encoder_outputs=encoder_outputs,
1695
- decoder_attention_mask=decoder_attention_mask,
1696
- head_mask=head_mask,
1697
- decoder_head_mask=decoder_head_mask,
1698
- cross_attn_head_mask=cross_attn_head_mask,
1699
- past_key_values=past_key_values,
1700
- inputs_embeds=inputs_embeds,
1701
- decoder_inputs_embeds=decoder_inputs_embeds,
1702
- use_cache=use_cache,
1703
- output_attentions=output_attentions,
1704
- output_hidden_states=output_hidden_states,
1705
- return_dict=return_dict,
1706
- )
1707
-
1708
-
1709
- lm_logits = self.lm_head(outputs[0]) + self.final_logits_bias
1710
-
1711
- masked_lm_loss = None
1712
- if labels is not None:
1713
- loss_fct = CrossEntropyLoss()
1714
- masked_lm_loss = loss_fct(lm_logits.view(-1, self.config.vocab_size), labels.view(-1))
1715
-
1716
- if not return_dict:
1717
- output = (lm_logits,) + outputs[1:]
1718
- return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
1719
-
1720
- return Seq2SeqLMOutput(
1721
- loss=masked_lm_loss,
1722
- logits=lm_logits,
1723
- past_key_values=outputs.past_key_values,
1724
- decoder_hidden_states=outputs.decoder_hidden_states,
1725
- decoder_attentions=outputs.decoder_attentions,
1726
- cross_attentions=outputs.cross_attentions,
1727
- encoder_last_hidden_state=outputs.encoder_last_hidden_state,
1728
- encoder_hidden_states=outputs.encoder_hidden_states,
1729
- encoder_attentions=outputs.encoder_attentions,
1730
- )
1731
-
1732
- def prepare_inputs_for_generation(
1733
- self,
1734
- decoder_input_ids,
1735
- past=None,
1736
- attention_mask=None,
1737
- head_mask=None,
1738
- decoder_head_mask=None,
1739
- cross_attn_head_mask=None,
1740
- use_cache=None,
1741
- encoder_outputs=None,
1742
- **kwargs
1743
- ):
1744
- # cut decoder_input_ids if past is used
1745
- if past is not None:
1746
- decoder_input_ids = decoder_input_ids[:, -1:]
1747
-
1748
- return {
1749
- "input_ids": None, # encoder_outputs is defined. input_ids not needed
1750
- "encoder_outputs": encoder_outputs,
1751
- "past_key_values": past,
1752
- "decoder_input_ids": decoder_input_ids,
1753
- "attention_mask": attention_mask,
1754
- "head_mask": head_mask,
1755
- "decoder_head_mask": decoder_head_mask,
1756
- "cross_attn_head_mask": cross_attn_head_mask,
1757
- "use_cache": use_cache, # change this to avoid caching (presumably for debugging)
1758
- }
1759
-
1760
- def prepare_decoder_input_ids_from_labels(self, labels):
1761
- return shift_tokens_right(labels, self.config.pad_token_id, self.config.decoder_start_token_id)
1762
-
1763
- @staticmethod
1764
- def _reorder_cache(past, beam_idx):
1765
- reordered_past = ()
1766
- for layer_past in past:
1767
- # cached cross_attention states don't have to be reordered -> they are always the same
1768
- reordered_past += (
1769
- tuple(past_state.index_select(0, beam_idx) for past_state in layer_past[:2]) + layer_past[2:],
1770
- )
1771
- return reordered_past
1772
-
1773
-
1774
- class LSGBartForSequenceClassification(LSGBartPretrainedModel):
1775
-
1776
- def __init__(self, config, **kwargs):
1777
-
1778
- super().__init__(config, **kwargs)
1779
  self.model = LSGBartModel(config)
1780
  self.classification_head = LSGBartClassificationHead(
1781
  config.d_model,
@@ -1786,115 +1700,12 @@ class LSGBartForSequenceClassification(LSGBartPretrainedModel):
1786
  self.model._init_weights(self.classification_head.dense)
1787
  self.model._init_weights(self.classification_head.out_proj)
1788
 
1789
- def forward(
1790
- self,
1791
- input_ids=None,
1792
- attention_mask=None,
1793
- decoder_input_ids=None,
1794
- decoder_attention_mask=None,
1795
- head_mask=None,
1796
- decoder_head_mask=None,
1797
- cross_attn_head_mask=None,
1798
- encoder_outputs=None,
1799
- inputs_embeds=None,
1800
- decoder_inputs_embeds=None,
1801
- labels=None,
1802
- use_cache=None,
1803
- output_attentions=None,
1804
- output_hidden_states=None,
1805
- return_dict=None,
1806
- ):
1807
-
1808
- r"""
1809
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
1810
- Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ...,
1811
- config.num_labels - 1]`. If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
1812
- """
1813
 
1814
- return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1815
- if labels is not None:
1816
- use_cache = False
1817
 
1818
- if input_ids is None and inputs_embeds is not None:
1819
- raise NotImplementedError(
1820
- f"Passing input embeddings is currently not supported for {self.__class__.__name__}"
1821
- )
1822
 
1823
- outputs = self.model(
1824
- input_ids,
1825
- attention_mask=attention_mask,
1826
- decoder_input_ids=decoder_input_ids,
1827
- decoder_attention_mask=decoder_attention_mask,
1828
- head_mask=head_mask,
1829
- decoder_head_mask=decoder_head_mask,
1830
- cross_attn_head_mask=cross_attn_head_mask,
1831
- encoder_outputs=encoder_outputs,
1832
- inputs_embeds=inputs_embeds,
1833
- decoder_inputs_embeds=decoder_inputs_embeds,
1834
- use_cache=use_cache,
1835
- output_attentions=output_attentions,
1836
- output_hidden_states=output_hidden_states,
1837
- return_dict=return_dict,
1838
- )
1839
- hidden_states = outputs[0] # last hidden state
1840
-
1841
- eos_mask = input_ids.eq(self.config.eos_token_id)
1842
-
1843
- t, t_ = eos_mask.size()[-1], hidden_states.size()[-2]
1844
- if t > t_:
1845
- eos_mask = eos_mask[:, :t_]
1846
-
1847
- if len(torch.unique_consecutive(eos_mask.sum(1))) > 1:
1848
- raise ValueError("All examples must have the same number of <eos> tokens.")
1849
- sentence_representation = hidden_states[eos_mask, :].view(hidden_states.size(0), -1, hidden_states.size(-1))[
1850
- :, -1, :
1851
- ]
1852
- logits = self.classification_head(sentence_representation)
1853
-
1854
- loss = None
1855
- if labels is not None:
1856
- if self.config.problem_type is None:
1857
- if self.config.num_labels == 1:
1858
- self.config.problem_type = "regression"
1859
- elif self.config.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
1860
- self.config.problem_type = "single_label_classification"
1861
- else:
1862
- self.config.problem_type = "multi_label_classification"
1863
-
1864
- if self.config.problem_type == "regression":
1865
- loss_fct = MSELoss()
1866
- if self.config.num_labels == 1:
1867
- loss = loss_fct(logits.squeeze(), labels.squeeze())
1868
- else:
1869
- loss = loss_fct(logits, labels)
1870
- elif self.config.problem_type == "single_label_classification":
1871
- loss_fct = CrossEntropyLoss()
1872
- loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1))
1873
- elif self.config.problem_type == "multi_label_classification":
1874
- loss_fct = BCEWithLogitsLoss()
1875
- loss = loss_fct(logits, labels)
1876
- if not return_dict:
1877
- output = (logits,) + outputs[1:]
1878
- return ((loss,) + output) if loss is not None else output
1879
-
1880
- return Seq2SeqSequenceClassifierOutput(
1881
- loss=loss,
1882
- logits=logits,
1883
- past_key_values=outputs.past_key_values,
1884
- decoder_hidden_states=outputs.decoder_hidden_states,
1885
- decoder_attentions=outputs.decoder_attentions,
1886
- cross_attentions=outputs.cross_attentions,
1887
- encoder_last_hidden_state=outputs.encoder_last_hidden_state,
1888
- encoder_hidden_states=outputs.encoder_hidden_states,
1889
- encoder_attentions=outputs.encoder_attentions,
1890
- )
1891
-
1892
-
1893
- class LSGBartForQuestionAnswering(LSGBartPretrainedModel):
1894
-
1895
- def __init__(self, config):
1896
-
1897
- super().__init__(config)
1898
 
1899
  config.num_labels = 2
1900
  self.num_labels = config.num_labels
@@ -1904,102 +1715,6 @@ class LSGBartForQuestionAnswering(LSGBartPretrainedModel):
1904
 
1905
  self.model._init_weights(self.qa_outputs)
1906
 
1907
- def forward(
1908
- self,
1909
- input_ids=None,
1910
- attention_mask=None,
1911
- decoder_input_ids=None,
1912
- decoder_attention_mask=None,
1913
- head_mask=None,
1914
- decoder_head_mask=None,
1915
- cross_attn_head_mask=None,
1916
- encoder_outputs=None,
1917
- start_positions=None,
1918
- end_positions=None,
1919
- inputs_embeds=None,
1920
- decoder_inputs_embeds=None,
1921
- use_cache=None,
1922
- output_attentions=None,
1923
- output_hidden_states=None,
1924
- return_dict=None,
1925
- ):
1926
-
1927
- r"""
1928
- start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
1929
- Labels for position (index) of the start of the labelled span for computing the token classification loss.
1930
- Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
1931
- are not taken into account for computing the loss.
1932
- end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
1933
- Labels for position (index) of the end of the labelled span for computing the token classification loss.
1934
- Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
1935
- are not taken into account for computing the loss.
1936
- """
1937
-
1938
- return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1939
- if start_positions is not None and end_positions is not None:
1940
- use_cache = False
1941
-
1942
- outputs = self.model(
1943
- input_ids,
1944
- attention_mask=attention_mask,
1945
- decoder_input_ids=decoder_input_ids,
1946
- decoder_attention_mask=decoder_attention_mask,
1947
- head_mask=head_mask,
1948
- decoder_head_mask=decoder_head_mask,
1949
- cross_attn_head_mask=cross_attn_head_mask,
1950
- encoder_outputs=encoder_outputs,
1951
- inputs_embeds=inputs_embeds,
1952
- decoder_inputs_embeds=decoder_inputs_embeds,
1953
- use_cache=use_cache,
1954
- output_attentions=output_attentions,
1955
- output_hidden_states=output_hidden_states,
1956
- return_dict=return_dict,
1957
- )
1958
-
1959
- sequence_output = outputs[0]
1960
-
1961
- logits = self.qa_outputs(sequence_output)
1962
- start_logits, end_logits = logits.split(1, dim=-1)
1963
- start_logits = start_logits.squeeze(-1).contiguous()
1964
- end_logits = end_logits.squeeze(-1).contiguous()
1965
-
1966
- total_loss = None
1967
- if start_positions is not None and end_positions is not None:
1968
- # If we are on multi-GPU, split add a dimension
1969
- if len(start_positions.size()) > 1:
1970
- start_positions = start_positions.squeeze(-1)
1971
- if len(end_positions.size()) > 1:
1972
- end_positions = end_positions.squeeze(-1)
1973
- # sometimes the start/end positions are outside our model inputs, we ignore these terms
1974
- ignored_index = start_logits.size(1)
1975
- start_positions = start_positions.clamp(0, ignored_index)
1976
- end_positions = end_positions.clamp(0, ignored_index)
1977
-
1978
- loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
1979
- start_loss = loss_fct(start_logits, start_positions)
1980
- end_loss = loss_fct(end_logits, end_positions)
1981
- total_loss = (start_loss + end_loss) / 2
1982
-
1983
- if not return_dict:
1984
- output = (
1985
- start_logits,
1986
- end_logits,
1987
- ) + outputs[1:]
1988
- return ((total_loss,) + output) if total_loss is not None else output
1989
-
1990
- return Seq2SeqQuestionAnsweringModelOutput(
1991
- loss=total_loss,
1992
- start_logits=start_logits,
1993
- end_logits=end_logits,
1994
- past_key_values=outputs.past_key_values,
1995
- decoder_hidden_states=outputs.decoder_hidden_states,
1996
- decoder_attentions=outputs.decoder_attentions,
1997
- cross_attentions=outputs.cross_attentions,
1998
- encoder_last_hidden_state=outputs.encoder_last_hidden_state,
1999
- encoder_hidden_states=outputs.encoder_hidden_states,
2000
- encoder_attentions=outputs.encoder_attentions,
2001
- )
2002
-
2003
 
2004
  class LSGBartDecoderWrapper(LSGBartPretrainedModel):
2005
  """
@@ -2007,22 +1722,22 @@ class LSGBartDecoderWrapper(LSGBartPretrainedModel):
2007
  used in combination with the :class:`~transformers.EncoderDecoderModel` framework.
2008
  """
2009
 
2010
- def __init__(self, config):
2011
  super().__init__(config)
2012
- self.decoder = BartDecoder(config)
2013
 
2014
  def forward(self, *args, **kwargs):
2015
  return self.decoder(*args, **kwargs)
2016
 
2017
 
2018
- class LSGBartForCausalLM(LSGBartPretrainedModel):
2019
 
2020
- def __init__(self, config):
2021
 
2022
- super().__init__(config)
2023
  config = copy.deepcopy(config)
2024
  config.is_decoder = True
2025
  config.is_encoder_decoder = False
 
2026
  self.model = LSGBartDecoderWrapper(config)
2027
 
2028
  self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
@@ -2030,105 +1745,6 @@ class LSGBartForCausalLM(LSGBartPretrainedModel):
2030
  # Initialize weights and apply final processing
2031
  self.post_init()
2032
 
2033
- def get_input_embeddings(self):
2034
- return self.model.decoder.embed_tokens
2035
-
2036
- def set_input_embeddings(self, value):
2037
- self.model.decoder.embed_tokens = value
2038
-
2039
- def get_output_embeddings(self):
2040
- return self.lm_head
2041
-
2042
- def set_output_embeddings(self, new_embeddings):
2043
- self.lm_head = new_embeddings
2044
-
2045
- def set_decoder(self, decoder):
2046
- self.model.decoder = decoder
2047
-
2048
- def get_decoder(self):
2049
- return self.model.decoder
2050
-
2051
- def forward(
2052
- self,
2053
- input_ids=None,
2054
- attention_mask=None,
2055
- encoder_hidden_states=None,
2056
- encoder_attention_mask=None,
2057
- head_mask=None,
2058
- cross_attn_head_mask=None,
2059
- past_key_values=None,
2060
- inputs_embeds=None,
2061
- labels=None,
2062
- use_cache=None,
2063
- output_attentions=None,
2064
- output_hidden_states=None,
2065
- return_dict=None,
2066
- ):
2067
-
2068
- output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
2069
- output_hidden_states = (
2070
- output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
2071
- )
2072
- return_dict = return_dict if return_dict is not None else self.config.use_return_dict
2073
-
2074
- # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
2075
- outputs = self.model.decoder(
2076
- input_ids=input_ids,
2077
- attention_mask=attention_mask,
2078
- encoder_hidden_states=encoder_hidden_states,
2079
- encoder_attention_mask=encoder_attention_mask,
2080
- head_mask=head_mask,
2081
- cross_attn_head_mask=cross_attn_head_mask,
2082
- past_key_values=past_key_values,
2083
- inputs_embeds=inputs_embeds,
2084
- use_cache=use_cache,
2085
- output_attentions=output_attentions,
2086
- output_hidden_states=output_hidden_states,
2087
- return_dict=return_dict,
2088
- )
2089
-
2090
- logits = self.lm_head(outputs[0])
2091
-
2092
- loss = None
2093
- if labels is not None:
2094
- loss_fct = CrossEntropyLoss()
2095
- loss = loss_fct(logits.view(-1, self.config.vocab_size), labels.view(-1))
2096
-
2097
- if not return_dict:
2098
- output = (logits,) + outputs[1:]
2099
- return (loss,) + output if loss is not None else output
2100
-
2101
- return CausalLMOutputWithCrossAttentions(
2102
- loss=loss,
2103
- logits=logits,
2104
- past_key_values=outputs.past_key_values,
2105
- hidden_states=outputs.hidden_states,
2106
- attentions=outputs.attentions,
2107
- cross_attentions=outputs.cross_attentions,
2108
- )
2109
-
2110
- def prepare_inputs_for_generation(self, input_ids, past=None, attention_mask=None, use_cache=None, **kwargs):
2111
- # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly
2112
- if attention_mask is None:
2113
- attention_mask = input_ids.new_ones(input_ids.shape)
2114
-
2115
- if past:
2116
- input_ids = input_ids[:, -1:]
2117
- # first step, decoder_cached_states are empty
2118
- return {
2119
- "input_ids": input_ids, # encoder_outputs is defined. input_ids not needed
2120
- "attention_mask": attention_mask,
2121
- "past_key_values": past,
2122
- "use_cache": use_cache,
2123
- }
2124
-
2125
- @staticmethod
2126
- def _reorder_cache(past, beam_idx):
2127
- reordered_past = ()
2128
- for layer_past in past:
2129
- reordered_past += (tuple(past_state.index_select(0, beam_idx) for past_state in layer_past),)
2130
- return reordered_past
2131
-
2132
 
2133
  def str_to_class(classname):
2134
  return getattr(sys.modules[__name__], classname)
 
41
  ):
42
  """Constructs LSGConfig."""
43
  super().__init__(**kwargs)
44
 
45
  self.adaptive = adaptive
46
  self.auto_map = AUTO_MAP
 
53
  self.sparse_block_size = sparse_block_size
54
  self.sparsity_factor = sparsity_factor
55
  self.sparsity_type = sparsity_type
56
+
57
+ if sparsity_type not in [None, "none", "norm", "lsh", "pooling", "stride"]:
58
+ logger.warning(
59
+ "[WARNING CONFIG]: sparsity_mode not in [None, 'none', 'norm', 'lsh', 'pooling', 'stride'], setting sparsity_type=None, computation will skip sparse attention")
60
+ self.sparsity_type = None
61
+
62
+ if self.sparsity_type == "stride":
63
+ if self.sparsity_factor > self.encoder_attention_heads:
64
+ logger.warning(
65
+ "[WARNING CONFIG]: sparsity_factor > encoder_attention_heads is not recommended for stride sparsity"
66
+ )
67
 
68
+ if self.num_global_tokens < 1:
69
+ logger.warning(
70
+ "[WARNING CONFIG]: num_global_tokens < 1 is not compatible, setting num_global_tokens=1"
71
+ )
72
+ self.num_global_tokens = 1
73
+ elif self.num_global_tokens > 512:
74
+ logger.warning(
75
+ "[WARNING CONFIG]: num_global_tokens > 512 is not compatible, setting num_global_tokens=512"
76
+ )
77
+ self.num_global_tokens = 512
78
+
79
+ if self.sparsity_factor > 0:
80
+ assert self.block_size % self.sparsity_factor == 0, "[ERROR CONFIG]: block_size must be divisible by sparsity_factor"
81
+ assert self.block_size//self.sparsity_factor >= 1, "[ERROR CONFIG]: make sure block_size >= sparsity_factor"
82
+
83
 
84
  def shift_tokens_right(input_ids, pad_token_id, decoder_start_token_id):
85
  """
 
231
 
232
  # Shape of blocks
233
  self.local_shapes = (self.block_size*3, self.block_size)
234
+ if self.sparse_block_size and self.sparsity_factor > 0:
235
  self.sparse_shapes = (self.sparse_block_size*3, self.block_size//self.sparsity_factor)
236
 
237
  self.attention = BaseAttentionProduct(config)
 
330
 
331
  size, step = self.sparse_shapes
332
 
333
+ # In case of odd case
334
+ odd_offset = (step % 2)
335
+
336
  # n, h, t, d*2 + 1
337
  size = size*2
338
+ s = (size - step) // 2 + odd_offset
339
 
340
  # Pad before block reshaping
341
  if is_attn_mask:
 
353
  # Make blocks
354
  hidden_states = hidden_states.unfold(-2, size=size, step=step).transpose(-1, -2)
355
 
356
+ # Fix case where block_size == sparsity_factor
357
+ if odd_offset:
358
+ hidden_states = hidden_states[..., :-1, :, :]
359
+
360
  # Indexes for selection
361
+ u = (size - self.block_size * 3 // self.sparsity_factor) // 2 + odd_offset
362
  s = self.sparse_block_size
363
 
364
+ u_ = u + odd_offset
365
+ return torch.cat([hidden_states[..., u-s:u, :], hidden_states[..., -u_:-u_+s, :]], dim=-2)
366
 
367
  def cat_global_sparse_local_tokens(self, x_global, x_sparse=None, x_local=None, dim=-2):
368
 
 
415
  }
416
 
417
  self.sparsity_type = config.sparsity_type
418
+ self.get_sparse_elements = sparse_functions.get(self.sparsity_type, lambda x, y, z: (None, None, None))
419
 
420
  if config.sparsity_type == "lsh":
421
  self.lsh_num_pre_rounds = config.lsh_num_pre_rounds
422
+
423
  def get_sparse_tokens_with_norm(self, keys, values, mask):
424
 
425
  if self.sparsity_factor == 1:
426
+ return keys, values, mask.expand(-1, keys.size()[1], -1, -1)
427
 
428
  with torch.no_grad():
429
 
 
451
  def get_sparse_tokens_with_pooling(self, keys, values, mask):
452
 
453
  if self.sparsity_factor == 1:
454
+ return keys, values, mask.expand(-1, keys.size()[1], -1, -1)
455
 
456
  keys = self.chunk(keys, self.sparsity_factor)
457
  values = self.chunk(values, self.sparsity_factor)
 
473
  def get_sparse_tokens_with_stride(self, keys, values, mask):
474
 
475
  if self.sparsity_factor == 1:
476
+ return keys, values, mask.expand(-1, keys.size()[1], -1, -1)
477
 
478
  n, h, t, d = keys.size()
479
  sparse_idx = torch.arange(t // self.sparsity_factor, device=keys.device) * self.sparsity_factor
480
  sparse_idx = sparse_idx.reshape(1, 1, -1, 1) + (torch.arange(h, device=keys.device) % self.sparsity_factor).reshape(1, h, 1, 1)
481
  sparse_idx = sparse_idx.expand(n, h, -1, 1)
482
 
483
+ """
484
+ t, b = self.block_size, t // self.block_size
485
+ sparse_idx = torch.arange(t // self.sparsity_factor, device=keys.device) * self.sparsity_factor
486
+ sparse_idx = sparse_idx.reshape(1, 1, 1, -1, 1) + (torch.arange(h, device=keys.device) % self.sparsity_factor).reshape(1, h, 1, 1, 1)
487
+ sparse_idx = sparse_idx + torch.arange(b, device=keys.device).reshape(1, 1, -1, 1, 1) * t
488
+ sparse_idx = sparse_idx.reshape(1, h, -1, 1).expand(n, h, -1, 1)
489
+
490
+
491
+ t, b = self.block_size, t // self.block_size
492
+ sparse_idx = torch.arange(t // self.sparsity_factor, device=keys.device)
493
+ sparse_idx = sparse_idx.reshape(1, 1, 1, -1, 1) + torch.arange(h, device=keys.device).reshape(1, h, 1, 1, 1) * (t // self.sparsity_factor)
494
+ sparse_idx = (sparse_idx % t)
495
+ #sparse_idx[..., -t//2:, :] = (sparse_idx[..., -t//2:, :] + t//2) % t
496
+ sparse_idx = sparse_idx + torch.arange(b, device=keys.device).reshape(1, 1, -1, 1, 1) * t
497
+ sparse_idx = sparse_idx.reshape(1, h, -1, 1).expand(n, h, -1, 1)
498
+ """
499
+
500
  keys = keys.gather(dim=-2, index=sparse_idx.expand(-1, -1, -1, d))
501
  values = values.gather(dim=-2, index=sparse_idx.expand(-1, -1, -1, d))
502
  mask = mask.expand(-1, h, -1, -1).transpose(-1, -2).gather(dim=-2, index=sparse_idx).transpose(-1, -2)
 
506
  def get_sparse_tokens_with_lsh(self, keys, values, mask):
507
 
508
  if self.sparsity_factor == 1:
509
+ return keys, values, mask.expand(-1, keys.size()[1], -1, -1)
510
 
511
  block_size = min(self.block_size, self.sparse_block_size)
512
  keys = self.chunk(keys, block_size)
 
523
  extra_factor = 1
524
 
525
  for _ in range(self.lsh_num_pre_rounds):
526
+ keys, values, mask = self.lsh_round(keys, values, mask, t*extra_factor)
527
 
528
+ keys, values, mask = self.lsh_round(keys, values, mask, t//self.sparsity_factor)
529
  keys /= mask + 1e-8
530
  values /= mask + 1e-8
531
 
 
533
 
534
  return keys.reshape(n, h, -1, d), values.reshape(n, h, -1, d), mask.transpose(-1, -2).reshape(n, h, 1, -1)
535
 
536
+ def lsh_round(self, keys, values, mask, output_size):
537
 
538
  with torch.no_grad():
539
 
 
1173
 
1174
  # else adaptive sequence length
1175
  elif self.adaptive:
1176
+ # Get last non zero mask index
1177
+ s = int(attention_mask.cumsum(dim=-1).argmax(dim=-1).max()) + 1
1178
  if s < t and self.block_size is not None:
1179
  s = max(2, s // self.block_size + 1) * self.block_size if s > b else s
1180
  if input_ids is not None:
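The replaced line measured the number of attended tokens (`attention_mask.sum`), which only equals the usable length when every zero in the mask is trailing; the new expression returns the index of the last non-zero entry, so masks with internal gaps no longer cause real tokens to be cut. A standalone illustration in plain PyTorch (no LSG code assumed):

```python
import torch

# Mask with an internal gap: 7 attended tokens, the last one sitting at index 7.
mask = torch.tensor([[1, 1, 1, 0, 1, 1, 1, 1, 0, 0]])

old_s = int(torch.max(mask.sum(dim=-1)))                   # 7 -> would drop the token at index 7
new_s = int(mask.cumsum(dim=-1).argmax(dim=-1).max()) + 1  # 8 -> keeps everything up to the last 1
print(old_s, new_s)
```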
 
1337
  self.padding_idx = config.pad_token_id
1338
  self.max_target_positions = config.max_position_embeddings
1339
  self.embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0
1340
+ self.adaptive = config.adaptive
1341
 
1342
  if embed_tokens is not None:
1343
  self.embed_tokens = embed_tokens
 
1380
 
1381
  return combined_attention_mask
1382
 
1383
+ def resize_inputs(self, inputs_embeds, attention_mask):
1384
+ pad = 0
1385
+
1386
+ max_len = int(attention_mask.sum(dim=-1).max())
1387
+ pad = attention_mask.size()[-1] - max_len
1388
+ inputs_embeds = inputs_embeds[:, :max_len]
1389
+ attention_mask = attention_mask[..., :max_len]
1390
+ return pad, inputs_embeds, attention_mask
1391
+
1392
  def forward(
1393
  self,
1394
  input_ids=None,
 
1429
  if inputs_embeds is None:
1430
  inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale
1431
 
1432
+ # Resize to reduce computation
1433
+ pad = 0
1434
+ if self.adaptive:
1435
+ if attention_mask is not None:
1436
+ pad, inputs_embeds, attention_mask = self.resize_inputs(inputs_embeds, attention_mask)
1437
+ input_shape = inputs_embeds.size()[:-1]
1438
+ if encoder_attention_mask is not None:
1439
+ _, encoder_hidden_states, encoder_attention_mask = self.resize_inputs(encoder_hidden_states, encoder_attention_mask)
1440
 
1441
  attention_mask = self._prepare_decoder_attention_mask(
1442
  attention_mask, input_shape, inputs_embeds, past_key_values_length
 
1530
  if encoder_hidden_states is not None:
1531
  all_cross_attentions += (layer_outputs[2],)
1532
 
1533
+ # Resize to original shape
1534
+ hidden_states = torch.nn.functional.pad(hidden_states.transpose(-1, -2), pad=(0, pad), value=0).transpose(-1, -2)
1535
+
1536
  # add hidden states from the last decoder layer
1537
  if output_hidden_states:
1538
  all_hidden_states += (hidden_states,)
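The decoder now trims `inputs_embeds` and its attention mask to the longest non-padded sequence in the batch before running the layers, then pads the hidden states back to the original length so downstream shapes are unchanged. A self-contained sketch of that round trip in plain PyTorch (independent of the LSG classes):

```python
import torch
import torch.nn.functional as F

def resize_inputs(inputs_embeds, attention_mask):
    # Keep only the columns up to the longest non-padded sequence in the batch.
    max_len = int(attention_mask.sum(dim=-1).max())
    pad = attention_mask.size(-1) - max_len
    return pad, inputs_embeds[:, :max_len], attention_mask[..., :max_len]

embeds = torch.randn(2, 10, 4)
mask = torch.tensor([[1] * 6 + [0] * 4,
                     [1] * 4 + [0] * 6])

pad, short_embeds, short_mask = resize_inputs(embeds, mask)
hidden = short_embeds * 2  # stand-in for the decoder layers

# Pad the time dimension back so callers see the original sequence length.
restored = F.pad(hidden.transpose(-1, -2), pad=(0, pad), value=0).transpose(-1, -2)
assert restored.shape == embeds.shape
```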
 
1669
  )
1670
 
1671
 
1672
+ class LSGBartForConditionalGeneration(BartForConditionalGeneration, LSGBartPretrainedModel):
1673
 
1674
  base_model_prefix = "model"
1675
  _keys_to_ignore_on_load_missing = [r"final_logits_bias", r"lm_head\.weight"]
1676
 
1677
  def __init__(self, config):
1678
 
1679
+ LSGBartPretrainedModel.__init__(self, config)
1680
  self.model = LSGBartModel(config)
1681
  self.register_buffer("final_logits_bias", torch.zeros((1, self.model.shared.num_embeddings)))
1682
  self.lm_head = nn.Linear(config.d_model, self.model.shared.num_embeddings, bias=False)
 
1684
  # Initialize weights and apply final processing
1685
  self.post_init()
1686

1687
 
1688
+ class LSGBartForSequenceClassification(BartForSequenceClassification, LSGBartPretrainedModel):
1689
 
1690
+ def __init__(self, config: LSGBartConfig, **kwargs):
1691
 
1692
+ LSGBartPretrainedModel.__init__(self, config, **kwargs)
1693
  self.model = LSGBartModel(config)
1694
  self.classification_head = LSGBartClassificationHead(
1695
  config.d_model,
 
1700
  self.model._init_weights(self.classification_head.dense)
1701
  self.model._init_weights(self.classification_head.out_proj)
1702

1703
 
1704
+ class LSGBartForQuestionAnswering(BartForQuestionAnswering, LSGBartPretrainedModel):
1705
 
1706
+ def __init__(self, config: LSGBartConfig):
1707
 
1708
+ LSGBartPretrainedModel.__init__(self, config)
1709
 
1710
  config.num_labels = 2
1711
  self.num_labels = config.num_labels
 
1715
 
1716
  self.model._init_weights(self.qa_outputs)
1717

1718
 
1719
  class LSGBartDecoderWrapper(LSGBartPretrainedModel):
1720
  """
 
1722
  used in combination with the :class:`~transformers.EncoderDecoderModel` framework.
1723
  """
1724
 
1725
+ def __init__(self, config: LSGBartConfig):
1726
  super().__init__(config)
1727
+ self.decoder = LSGBartDecoder(config)
1728
 
1729
  def forward(self, *args, **kwargs):
1730
  return self.decoder(*args, **kwargs)
1731
 
1732
 
1733
+ class LSGBartForCausalLM(BartForCausalLM, LSGBartPretrainedModel):
1734
 
1735
+ def __init__(self, config: LSGBartConfig):
1736

1737
  config = copy.deepcopy(config)
1738
  config.is_decoder = True
1739
  config.is_encoder_decoder = False
1740
+ LSGBartPretrainedModel.__init__(self, config)
1741
  self.model = LSGBartDecoderWrapper(config)
1742
 
1743
  self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
 
1745
  # Initialize weights and apply final processing
1746
  self.post_init()
1747

1748
 
1749
  def str_to_class(classname):
1750
  return getattr(sys.modules[__name__], classname)
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0baa2aecffe0dc5c00cb4f23b89134663008055a409645e422850c2e5d78240f
3
  size 578416695
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:933a1e3345672ba1ca8fb2956ca511a720e4a4ae54fe466c80c12c4a30df281b
3
  size 578416695