OpenNLPLab committed on
Commit
460b22e
1 Parent(s): cf95141

Fix issues regarding transformers version

generation_config.json CHANGED
@@ -1,6 +1,9 @@
 {
-  "_from_model_config": true,
+  "pad_token_id": 0,
   "bos_token_id": 1,
   "eos_token_id": 2,
-  "transformers_version": "4.31.0"
+  "max_new_tokens": 2048,
+  "temperature": 1.0,
+  "repetition_penalty": 1.03,
+  "do_sample": true
 }
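With these defaults checked in, a plain `generate()` call samples with the new settings instead of greedy decoding. A minimal usage sketch (the repository id below is a placeholder, not taken from this commit):

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "OpenNLPLab/TransNormerLLM"  # placeholder; substitute the actual repo id
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True)

inputs = tokenizer("Hey, are you conscious?", return_tensors="pt")
# do_sample=True, temperature=1.0, repetition_penalty=1.03 and max_new_tokens=2048
# now come from the checked-in generation_config.json
outputs = model.generate(**inputs)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```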
modeling_transnormer.py CHANGED
@@ -11,8 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
- # coding=utf-8
+# coding=utf-8
 """ PyTorch Transnormer model."""
 import math
 import os
@@ -29,7 +28,6 @@ from transformers.activations import ACT2FN
 from transformers.modeling_outputs import (
     BaseModelOutputWithPast,
     CausalLMOutputWithPast,
-    SequenceClassifierOutputWithPast,
 )
 from transformers.modeling_utils import PreTrainedModel
 from transformers.utils import (
@@ -85,7 +83,6 @@ if not has_lightning_attention:
 ########## start Transnormer
 ##### Linearized Relative Positional Encoding: https://openreview.net/forum?id=xoLyps2qWc&referrer=%5BAuthor%20Console%5D(%2Fgroup%3Fid%3DTMLR%2FAuthors%23your-submissions)
 class Lrpe(nn.Module):
-
     def __init__(
         self,
         num_heads=8,
@@ -95,8 +92,9 @@ class Lrpe(nn.Module):
         d = num_heads * embed_dim

         self.index = torch.empty(0)
-        self.theta = nn.Parameter(10000**(-2 / d * torch.arange(d)).reshape(
-            num_heads, 1, -1))
+        self.theta = nn.Parameter(
+            10000 ** (-2 / d * torch.arange(d)).reshape(num_heads, 1, -1)
+        )

     def extra_repr(self):
         return print_module(self)
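The `theta` parameter reformatted above is unchanged numerically; for reference, a standalone sketch of the same expression and the shape it produces:

```python
import torch

num_heads, embed_dim = 8, 64
d = num_heads * embed_dim

# same expression as in Lrpe.__init__: one bank of frequencies per head
theta = 10000 ** (-2 / d * torch.arange(d)).reshape(num_heads, 1, -1)
print(theta.shape)  # torch.Size([8, 1, 64])
```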
@@ -115,7 +113,6 @@


 class GLU(nn.Module):
-
     def __init__(self, d1, d2, bias=False):
         super().__init__()
         if debug:
@@ -138,7 +135,6 @@ class GLU(nn.Module):


 class NormLinearAttention(nn.Module):
-
     def __init__(
         self,
         embed_dim,
@@ -194,7 +190,7 @@ class NormLinearAttention(nn.Module):
                 output_attentions,
                 past_key_value,
                 use_cache,
-                slope_rate=slope_rate,
+                slope_rate,
             )
         # x: b n d
         n = x.shape[-2]
@@ -202,8 +198,8 @@ class NormLinearAttention(nn.Module):
         q, k, v, u = self.qkvu_proj(x).chunk(4, dim=-1)
         # reshape
         q, k, v = map(
-            lambda x: rearrange(x, "b n (h d) -> b h n d", h=self.num_heads),
-            [q, k, v])
+            lambda x: rearrange(x, "b n (h d) -> b h n d", h=self.num_heads), [q, k, v]
+        )
         # act
         q = self.act(q)
         k = self.act(k)
@@ -211,7 +207,7 @@ class NormLinearAttention(nn.Module):
         q_offset = 0
         # lrpe relys on position, get cache first
         if past_key_value is not None:
-            # reuse k, v, self_attention
+            # reuse k, v, for evaluation only
             k = torch.cat([past_key_value[0], k], dim=-2)
             v = torch.cat([past_key_value[1], v], dim=-2)
             q_offset = past_key_value[0].shape[-2]
@@ -228,17 +224,17 @@ class NormLinearAttention(nn.Module):

         if attn_padding_mask is not None:
             v = v.masked_fill(
-                (1 - attn_padding_mask).unsqueeze(1).unsqueeze(-1).to(
-                    torch.bool), 0)
+                (1 - attn_padding_mask).unsqueeze(1).unsqueeze(-1).to(torch.bool), 0
+            )

         if not has_lightning_attention:
             if slope_rate != None:
                 attn_mask = torch.exp(slope_rate * attn_mask)
-
             output = linear_attention(q, k, v, attn_mask)
         else:
-            output = lightning_attention(q, k, v, True,
-                                          slope_rate.squeeze(-1).squeeze(-1))
+            output = lightning_attention(
+                q, k, v, True, slope_rate.squeeze(-1).squeeze(-1)
+            )

         # reshape
         output = rearrange(output, "b h n d -> b n (h d)")
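When the lightning-attention kernel is unavailable, the forward pass above falls back to `linear_attention(q, k, v, attn_mask)` with `attn_mask = torch.exp(slope_rate * attn_mask)`. A minimal sketch of such a masked linear-attention product, mirroring the einsums used in the `inference` path further down (the real `linear_attention` helper is defined elsewhere in the repo and may differ in detail):

```python
import torch

def masked_linear_attention(q, k, v, attn_mask):
    # q, k, v: (b, h, n, d); attn_mask: broadcastable decay weights, already exp()'d
    energy = torch.einsum("... n d, ... m d -> ... n m", q, k)
    energy = energy * attn_mask  # zero out future positions, decay the past
    return torch.einsum("... n m, ... m d -> ... n d", energy, v)

b, h, n, d = 1, 2, 6, 4
q, k, v = torch.randn(3, b, h, n, d).unbind(0)
# relative-distance mask: 0 on the diagonal, -(i - j) below it, -inf above it
idx = torch.arange(n)
dist = idx.view(-1, 1) - idx.view(1, -1)
mask = torch.where(dist >= 0, -dist.float(), torch.tensor(float("-inf")))
slope_rate = torch.full((h, 1, 1), 0.25)   # one positive slope per head, as in the model
attn_mask = torch.exp(slope_rate * mask)   # (h, n, n)
print(masked_linear_attention(q, k, v, attn_mask).shape)  # torch.Size([1, 2, 6, 4])
```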
@@ -257,14 +253,14 @@ class NormLinearAttention(nn.Module):
         return output, attn_weights, past_key_value

     def inference(
-        self,
-        x,
-        attn_mask: Optional[torch.Tensor] = None, # (b, h, n, m)
-        attn_padding_mask: Optional[torch.Tensor] = None, # (b, m)
-        output_attentions: bool = False,
-        past_key_value: Optional[Tuple[torch.Tensor]] = None,
-        use_cache: bool = False,
-        slope_rate: Optional[torch.Tensor] = None, # (h, 1, 1)
+        self,
+        x,
+        attn_mask: Optional[torch.Tensor] = None,  # (b, h, n, m)
+        attn_padding_mask: Optional[torch.Tensor] = None,  # (b, m)
+        output_attentions: bool = False,
+        past_key_value: Optional[Tuple[torch.Tensor]] = None,
+        use_cache: bool = False,
+        slope_rate: Optional[torch.Tensor] = None,  # (h, 1, 1)
     ):
         # x: b n d
         n = x.shape[-2]
@@ -272,8 +268,8 @@ class NormLinearAttention(nn.Module):
         q, k, v, u = self.qkvu_proj(x).chunk(4, dim=-1)
         # reshape
         q, k, v = map(
-            lambda x: rearrange(x, "b n (h d) -> b h n d", h=self.num_heads),
-            [q, k, v])
+            lambda x: rearrange(x, "b n (h d) -> b h n d", h=self.num_heads), [q, k, v]
+        )
         # act
         q = self.act(q)
         k = self.act(k)
@@ -281,7 +277,7 @@ class NormLinearAttention(nn.Module):
         # rpe
         if self.linear_use_lrpe:
             q = self.lrpe(q, offset=self.offset)
-            k = self.lrpe(k, offset=self.offset)
+            k = self.lrpe(k)

         if past_key_value == None:
             self.offset = q.shape[-2]
@@ -299,8 +295,7 @@ class NormLinearAttention(nn.Module):

             if attn_padding_mask is not None:
                 attn_mask = attn_mask.masked_fill(
-                    (1 - attn_padding_mask).unsqueeze(1).unsqueeze(2).to(
-                        torch.bool),
+                    (1 - attn_padding_mask).unsqueeze(1).unsqueeze(2).to(torch.bool),
                     0,
                 )
             energy = torch.einsum("... n d, ... m d -> ... n m", q, k)
@@ -311,18 +306,17 @@ class NormLinearAttention(nn.Module):
             output = torch.einsum("... n m, ... m d -> ... n d", energy, v)

             eval_and_not_generate = eval(
-                os.environ.get("eval_and_not_generate", default="False"))
+                os.environ.get("eval_and_not_generate", default="False")
+            )
             if eval_and_not_generate:
                 kv = None
             else:
                 # b, h, n, e, d
-                kv_outproduct = torch.einsum("... n e, ... n d -> ... n e d",
-                                             k, v)
+                kv_outproduct = torch.einsum("... n e, ... n d -> ... n e d", k, v)
                 # 1, 1, n, 1, 1
-                index = torch.arange(n - 1, -1, -1).reshape(1, 1, -1, 1,
-                                                            1).to(x)
+                index = torch.arange(n - 1, -1, -1).reshape(1, 1, -1, 1, 1).to(x)
                 # (h, 1, 1) -> (1, h, 1, 1, 1); (1, h, 1, 1, 1), (1, 1, n, 1, 1) -> (1, h, n, 1, 1)
-                decay = ratio.unsqueeze(0).unsqueeze(-1)**index
+                decay = ratio.unsqueeze(0).unsqueeze(-1) ** index

                 kv_outproduct_with_decay = kv_outproduct * decay
                 kv = torch.sum(kv_outproduct_with_decay, dim=-3)
@@ -333,11 +327,12 @@ class NormLinearAttention(nn.Module):
                 for i in range(n):
                     kv = ratio * kv + torch.einsum(
                         "... n d, ... n e -> ... d e",
-                        k[:, :, i:i + 1],
-                        v[:, :, i:i + 1],
+                        k[:, :, i : i + 1],
+                        v[:, :, i : i + 1],
+                    )
+                    qkv = torch.einsum(
+                        "... n e, ... e d -> ... n d", q[:, :, i : i + 1], kv
                     )
-                    qkv = torch.einsum("... n e, ... e d -> ... n d",
-                                       q[:, :, i:i + 1], kv)
                     output.append(qkv)
                 output = torch.concat(output, dim=-2)

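The reformatted decoding loop above keeps a running `kv` state instead of re-attending over the whole prefix. A toy standalone sketch of that recurrence (same einsums and slicing; here `kv` starts from zeros and `ratio` stands in for the per-head decay factor):

```python
import torch

b, h, n, d = 1, 2, 5, 4
q, k, v = torch.randn(3, b, h, n, d).unbind(0)
ratio = torch.full((h, 1, 1), 0.9)      # per-head decay factor (toy value)

kv = torch.zeros(b, h, d, d)            # running sum of decayed k^T v outer products
outputs = []
for i in range(n):
    kv = ratio * kv + torch.einsum(
        "... n d, ... n e -> ... d e", k[:, :, i : i + 1], v[:, :, i : i + 1]
    )
    outputs.append(torch.einsum("... n e, ... e d -> ... n d", q[:, :, i : i + 1], kv))
print(torch.concat(outputs, dim=-2).shape)  # torch.Size([1, 2, 5, 4])
```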
@@ -356,7 +351,6 @@ class NormLinearAttention(nn.Module):


 class TransnormerDecoderLayer(nn.Module):
-
     def __init__(self, config: TransnormerConfig):
         super().__init__()
         self.embed_dim = config.decoder_embed_dim
@@ -395,14 +389,14 @@ class TransnormerDecoderLayer(nn.Module):
         return residual + x

     def forward(
-        self,
-        x,
-        attn_mask: Optional[torch.Tensor] = None,
-        attn_padding_mask: Optional[torch.Tensor] = None,
-        past_key_value: Optional[Tuple[torch.Tensor]] = None,
-        output_attentions: Optional[bool] = False,
-        use_cache: Optional[bool] = False,
-        slope_rate: Optional[torch.Tensor] = None, # (h, 1, 1)
+        self,
+        x,
+        attn_mask: Optional[torch.Tensor] = None,
+        attn_padding_mask: Optional[torch.Tensor] = None,
+        past_key_value: Optional[Tuple[torch.Tensor]] = None,
+        output_attentions: Optional[bool] = False,
+        use_cache: Optional[bool] = False,
+        slope_rate: Optional[torch.Tensor] = None,  # (h, 1, 1)
     ):
         residual = x
         x = self.token_norm(x)
@@ -422,13 +416,13 @@ class TransnormerDecoderLayer(nn.Module):
         x = self.channel_mixer(x)
         x = self.residual_connection(x, residual)

-        outputs = (x, )
+        outputs = (x,)

         if output_attentions:
-            outputs += (self_attn_weights, )
+            outputs += (self_attn_weights,)

         if use_cache:
-            outputs += (present_key_value, )
+            outputs += (present_key_value,)

         return outputs

@@ -450,7 +444,9 @@ TRANSNORMER_START_DOCSTRING = r"""
 """


-@add_start_docstrings(TRANSNORMER_START_DOCSTRING, )
+@add_start_docstrings(
+    TRANSNORMER_START_DOCSTRING,
+)
 class TransnormerPreTrainedModel(PreTrainedModel):
     config_class = TransnormerConfig
     base_model_prefix = "model"
@@ -535,7 +531,9 @@ TRANSNORMER_INPUTS_DOCSTRING = r"""
 """


-@add_start_docstrings(TRANSNORMER_START_DOCSTRING, )
+@add_start_docstrings(
+    TRANSNORMER_START_DOCSTRING,
+)
 class TransnormerModel(TransnormerPreTrainedModel):
     """
     Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`TransnormerDecoderLayer`]
@@ -559,31 +557,29 @@ class TransnormerModel(TransnormerPreTrainedModel):
         self.slopes = self._build_slope_tensor(config.decoder_attention_heads)

         # params
-        self.embed_tokens = nn.Embedding(config.vocab_size,
-                                         config.decoder_embed_dim,
-                                         self.padding_idx)
+        self.embed_tokens = nn.Embedding(
+            config.vocab_size, config.decoder_embed_dim, self.padding_idx
+        )
         self.layers = nn.ModuleList([])
         for i in range(config.decoder_layers):
             if len(self.linear_use_lrpe_list) > 0:
                 config.linear_use_lrpe = self.linear_use_lrpe_list[i]
             self.layers.append(TransnormerDecoderLayer(config))

-        self.final_norm = get_norm_fn(config.norm_type)(
-            config.decoder_embed_dim)
+        self.final_norm = get_norm_fn(config.norm_type)(config.decoder_embed_dim)
         self.embed_dim = config.decoder_embed_dim
-        self.embed_scale = (1.0 if config.no_scale_embedding else math.sqrt(
-            self.embed_dim))
+        self.embed_scale = (
+            1.0 if config.no_scale_embedding else math.sqrt(self.embed_dim)
+        )

         # Initialize weights and apply final processing
         self.post_init()

     @staticmethod
     def _build_slope_tensor(n_attention_heads: int):
-
         def get_slopes(n):
-
             def get_slopes_power_of_2(n):
-                start = 2**(-(2**-(math.log2(n) - 3)))
+                start = 2 ** (-(2 ** -(math.log2(n) - 3)))
                 ratio = start
                 return [start * ratio**i for i in range(n)]

@@ -592,15 +588,18 @@ class TransnormerModel(TransnormerPreTrainedModel):
                     n
                 )  # In the paper, we only train models that have 2^a heads for some a. This function has
             else:  # some good properties that only occur when the input is a power of 2. To maintain that even
-                closest_power_of_2 = 2**math.floor(
+                closest_power_of_2 = 2 ** math.floor(
                     math.log2(n)
                 )  # when the number of heads is not a power of 2, we use this workaround.
-                return (get_slopes_power_of_2(closest_power_of_2) + get_slopes(
-                    2 * closest_power_of_2)[0::2][:n - closest_power_of_2])
+                return (
+                    get_slopes_power_of_2(closest_power_of_2)
+                    + get_slopes(2 * closest_power_of_2)[0::2][: n - closest_power_of_2]
+                )

         # h, 1, 1
         slopes = torch.tensor(get_slopes(n_attention_heads)).reshape(
-            n_attention_heads, 1, 1)
+            n_attention_heads, 1, 1
+        )

         return slopes

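The slope construction above is the standard ALiBi recipe (one geometric decay rate per head); for reference, a self-contained sketch with the same logic:

```python
import math
import torch

def get_slopes(n):
    def get_slopes_power_of_2(n):
        start = 2 ** (-(2 ** -(math.log2(n) - 3)))
        ratio = start
        return [start * ratio**i for i in range(n)]

    if math.log2(n).is_integer():
        return get_slopes_power_of_2(n)
    closest_power_of_2 = 2 ** math.floor(math.log2(n))
    return (
        get_slopes_power_of_2(closest_power_of_2)
        + get_slopes(2 * closest_power_of_2)[0::2][: n - closest_power_of_2]
    )

slopes = torch.tensor(get_slopes(8)).reshape(8, 1, 1)
print(slopes.flatten())  # 0.5, 0.25, 0.125, ..., 1/256 -- one slope per head
```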
@@ -613,26 +612,26 @@ class TransnormerModel(TransnormerPreTrainedModel):
     def set_input_embeddings(self, value):
         self.embed_tokens = value

-    def _prepare_decoder_linear_attn_mask(self, input_shape, inputs_embeds,
-                                          past_key_values_length):
+    def _prepare_decoder_linear_attn_mask(
+        self, input_shape, inputs_embeds, past_key_values_length
+    ):
         bsz, tgt_len = input_shape
         src_len = tgt_len + past_key_values_length

         def power_log(x):
-            return 2**(math.ceil(math.log(x, 2)))
+            return 2 ** (math.ceil(math.log(x, 2)))

         n = power_log(max(tgt_len, src_len))
         if self._linear_attn_mask.shape[-1] < n:

             def get_mask(n):
-                mask = torch.triu(
-                    torch.zeros(n, n).float().fill_(float("-inf")), 1)
+                mask = torch.triu(torch.zeros(n, n).float().fill_(float("-inf")), 1)
                 # no slope version
                 # -n, ..., -2, -1, 0
                 for i in range(n):
                     x = torch.arange(i + 1)
                     y = x
-                    mask[i, :i + 1] = -torch.flip(y, [0])
+                    mask[i, : i + 1] = -torch.flip(y, [0])

                 return mask

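For reference, a small sketch of what the two helpers above produce, using the same definitions: `power_log` rounds the required mask length up to the next power of two, and `get_mask(n)` fills negative relative distances below the diagonal with `-inf` above it (future positions):

```python
import math
import torch

def power_log(x):
    return 2 ** (math.ceil(math.log(x, 2)))

def get_mask(n):
    mask = torch.triu(torch.zeros(n, n).float().fill_(float("-inf")), 1)
    for i in range(n):
        mask[i, : i + 1] = -torch.flip(torch.arange(i + 1), [0])
    return mask

print(power_log(6))   # 8
print(get_mask(4))
# [[ 0., -inf, -inf, -inf],
#  [-1.,   0., -inf, -inf],
#  [-2.,  -1.,   0., -inf],
#  [-3.,  -2.,  -1.,   0.]]
```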
@@ -644,8 +643,7 @@ class TransnormerModel(TransnormerPreTrainedModel):
         linear_attn_mask = self._linear_attn_mask[:, -tgt_len:, -src_len:]
         num_heads = linear_attn_mask.shape[0]

-        return linear_attn_mask[None, :, :, :].expand(bsz, num_heads, tgt_len,
-                                                       src_len)
+        return linear_attn_mask[None, :, :, :].expand(bsz, num_heads, tgt_len, src_len)

     @add_start_docstrings_to_model_forward(TRANSNORMER_INPUTS_DOCSTRING)
     def forward(
@@ -659,15 +657,21 @@ class TransnormerModel(TransnormerPreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
     ) -> Union[Tuple, BaseModelOutputWithPast]:
-        output_attentions = (output_attentions if output_attentions is not None
-                             else self.config.output_attentions)
-        output_hidden_states = (output_hidden_states
-                                if output_hidden_states is not None else
-                                self.config.output_hidden_states)
+        output_attentions = (
+            output_attentions
+            if output_attentions is not None
+            else self.config.output_attentions
+        )
+        output_hidden_states = (
+            output_hidden_states
+            if output_hidden_states is not None
+            else self.config.output_hidden_states
+        )
         use_cache = use_cache if use_cache is not None else self.config.use_cache

-        return_dict = (return_dict if return_dict is not None else
-                       self.config.use_return_dict)
+        return_dict = (
+            return_dict if return_dict is not None else self.config.use_return_dict
+        )

         # retrieve input_ids and inputs_embeds
         if input_ids is not None and inputs_embeds is not None:
@@ -689,7 +693,7 @@ class TransnormerModel(TransnormerPreTrainedModel):
         if past_key_values is not None:
             past_key_values_length = past_key_values[0][0].shape[-2]
             seq_length_with_past = seq_length_with_past + past_key_values_length
-
+
         if inputs_embeds is None:
             # !!! use embed_scale
             inputs_embeds = self.embed_scale * self.embed_tokens(input_ids)
@@ -711,72 +715,54 @@ class TransnormerModel(TransnormerPreTrainedModel):
         ##### norm linear layers
         linear_attn_padding_mask = attn_padding_mask
         linear_attn_mask = self._prepare_decoder_linear_attn_mask(
-            (batch_size, seq_length), inputs_embeds, past_key_values_length)
+            (batch_size, seq_length), inputs_embeds, past_key_values_length
+        )

-        slope_rates = [
-            self.slopes.to(input_ids.device) for _ in range(self.num_layers)
-        ]
+        slope_rates = [self.slopes.to(input_ids.device) for _ in range(self.num_layers)]

         for idx, layer in enumerate(self.layers):
             if output_hidden_states:
-                all_hidden_states += (hidden_states, )
+                all_hidden_states += (hidden_states,)

-            past_key_value = (past_key_values[idx]
-                              if past_key_values is not None else None)
+            past_key_value = (
+                past_key_values[idx] if past_key_values is not None else None
+            )

             slope_rate = slope_rates[idx]
             slope_rate = slope_rate * (1 - idx / (self.num_layers - 1) + 1e-5)
             mask = linear_attn_mask
-
-            if self.gradient_checkpointing and self.training:
-
-                def create_custom_forward(module):
-
-                    def custom_forward(*inputs):
-                        # None for past_key_value
-                        return module(*inputs, output_attentions, None)
-
-                    return custom_forward
-
-                layer_outputs = torch.utils.checkpoint.checkpoint(
-                    create_custom_forward(layer),
-                    hidden_states,
-                    mask,
-                    linear_attn_padding_mask,
-                    None,
-                )
-            else:
-                layer_outputs = layer(
-                    hidden_states,
-                    attn_mask=mask,
-                    attn_padding_mask=linear_attn_padding_mask,
-                    past_key_value=past_key_value,
-                    output_attentions=output_attentions,
-                    use_cache=use_cache,
-                    slope_rate=slope_rate,
-                )
+
+            layer_outputs = layer(
+                hidden_states,
+                attn_mask=mask,
+                attn_padding_mask=linear_attn_padding_mask,
+                past_key_value=past_key_value,
+                output_attentions=output_attentions,
+                use_cache=use_cache,
+                slope_rate=slope_rate,
+            )

             hidden_states = layer_outputs[0]

             if use_cache:
-                next_decoder_cache += (
-                    layer_outputs[2 if output_attentions else 1], )
+                next_decoder_cache += (layer_outputs[2 if output_attentions else 1],)

             if output_attentions:
-                all_self_attns += (layer_outputs[1], )
+                all_self_attns += (layer_outputs[1],)

         hidden_states = self.final_norm(hidden_states)

         # add hidden states from the last decoder layer
         if output_hidden_states:
-            all_hidden_states += (hidden_states, )
+            all_hidden_states += (hidden_states,)

         next_cache = next_decoder_cache if use_cache else None
         if not return_dict:
             return tuple(
-                v for v in
-                [hidden_states, next_cache, all_hidden_states, all_self_attns]
-                if v is not None)
+                v
+                for v in [hidden_states, next_cache, all_hidden_states, all_self_attns]
+                if v is not None
+            )
         return BaseModelOutputWithPast(
             last_hidden_state=hidden_states,
             past_key_values=next_cache,
@@ -786,7 +772,6 @@ class TransnormerModel(TransnormerPreTrainedModel):


 class TransnormerForCausalLM(TransnormerPreTrainedModel):
-
     def __init__(self, config):
         super().__init__(config)
         self.model = TransnormerModel(config)
@@ -794,9 +779,9 @@ class TransnormerForCausalLM(TransnormerPreTrainedModel):
         logging_info(self.model)

         # the lm_head weight is automatically tied to the embed tokens weight
-        self.lm_head = nn.Linear(config.decoder_embed_dim,
-                                 config.vocab_size,
-                                 bias=False)
+        self.lm_head = nn.Linear(
+            config.decoder_embed_dim, config.vocab_size, bias=False
+        )

         # Initialize weights and apply final processing
         self.post_init()
@@ -820,8 +805,9 @@ class TransnormerForCausalLM(TransnormerPreTrainedModel):
         return self.model

     @add_start_docstrings_to_model_forward(TRANSNORMER_INPUTS_DOCSTRING)
-    @replace_return_docstrings(output_type=CausalLMOutputWithPast,
-                               config_class=_CONFIG_FOR_DOC)
+    @replace_return_docstrings(
+        output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC
+    )
     def forward(
         self,
         input_ids: torch.LongTensor = None,
@@ -859,13 +845,19 @@ class TransnormerForCausalLM(TransnormerPreTrainedModel):
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you consciours? Can you talk to me?\nI'm not consciours, but I can talk to you."
        ```"""
-        output_attentions = (output_attentions if output_attentions is not None
-                             else self.config.output_attentions)
-        output_hidden_states = (output_hidden_states
-                                if output_hidden_states is not None else
-                                self.config.output_hidden_states)
-        return_dict = (return_dict if return_dict is not None else
-                       self.config.use_return_dict)
+        output_attentions = (
+            output_attentions
+            if output_attentions is not None
+            else self.config.output_attentions
+        )
+        output_hidden_states = (
+            output_hidden_states
+            if output_hidden_states is not None
+            else self.config.output_hidden_states
+        )
+        return_dict = (
+            return_dict if return_dict is not None else self.config.use_return_dict
+        )

         # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
         outputs = self.model(
@@ -896,8 +888,8 @@ class TransnormerForCausalLM(TransnormerPreTrainedModel):
             loss = loss_fct(shift_logits, shift_labels)

         if not return_dict:
-            output = (logits, ) + outputs[1:]
-            return (loss, ) + output if loss is not None else output
+            output = (logits,) + outputs[1:]
+            return (loss,) + output if loss is not None else output

         return CausalLMOutputWithPast(
             loss=loss,
@@ -924,149 +916,23 @@ class TransnormerForCausalLM(TransnormerPreTrainedModel):
         else:
             model_inputs = {"input_ids": input_ids}

-        model_inputs.update({
-            "past_key_values": past_key_values,
-            "use_cache": kwargs.get("use_cache"),
-            "attention_mask": attention_mask,
-        })
+        model_inputs.update(
+            {
+                "past_key_values": past_key_values,
+                "use_cache": kwargs.get("use_cache"),
+                "attention_mask": attention_mask,
+            }
+        )
         return model_inputs

     @staticmethod
     def _reorder_cache(past_key_values, beam_idx):
         reordered_past = ()
         for layer_past in past_key_values:
-            reordered_past += (tuple(
-                past_state.index_select(0, beam_idx)
-                for past_state in layer_past), )
-        return reordered_past
-
-
-@add_start_docstrings(
-    """
-    The LLaMa Model transformer with a sequence classification head on top (linear layer).
-
-    [`TransnormerForSequenceClassification`] uses the last token in order to do the classification, as other causal models
-    (e.g. GPT-2) do.
-
-    Since it does classification on the last token, it requires to know the position of the last token. If a
-    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
-    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
-    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
-    each row of the batch).
-    """,
-    TRANSNORMER_START_DOCSTRING,
-)
-class TransnormerForSequenceClassification(TransnormerPreTrainedModel):
-    _keys_to_ignore_on_load_missing = [r"lm_head.weight"]
-
-    def __init__(self, config):
-        super().__init__(config)
-        self.num_labels = config.num_labels
-        self.model = TransnormerModel(config)
-        self.score = nn.Linear(config.decoder_embed_dim,
-                               self.num_labels,
-                               bias=False)
-
-        # Initialize weights and apply final processing
-        self.post_init()
-
-    def get_input_embeddings(self):
-        return self.model.embed_tokens
-
-    def set_input_embeddings(self, value):
-        self.model.embed_tokens = value
-
-    @add_start_docstrings_to_model_forward(TRANSNORMER_INPUTS_DOCSTRING)
-    def forward(
-        self,
-        input_ids: torch.LongTensor = None,
-        attn_mask: Optional[torch.Tensor] = None,
-        past_key_values: Optional[List[torch.FloatTensor]] = None,
-        inputs_embeds: Optional[torch.FloatTensor] = None,
-        labels: Optional[torch.LongTensor] = None,
-        use_cache: Optional[bool] = None,
-        output_attentions: Optional[bool] = None,
-        output_hidden_states: Optional[bool] = None,
-        return_dict: Optional[bool] = None,
-    ) -> Union[Tuple, SequenceClassifierOutputWithPast]:
-        r"""
-        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
-            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
-            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
-            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
-        """
-        return_dict = (return_dict if return_dict is not None else
-                       self.config.use_return_dict)
-
-        transformer_outputs = self.model(
-            input_ids,
-            attn_padding_mask=attn_mask,
-            past_key_values=past_key_values,
-            inputs_embeds=inputs_embeds,
-            use_cache=use_cache,
-            output_attentions=output_attentions,
-            output_hidden_states=output_hidden_states,
-            return_dict=return_dict,
-        )
-        hidden_states = transformer_outputs[0]
-
-        logits = self.score(hidden_states)
-
-        if input_ids is not None:
-            batch_size = input_ids.shape[0]
-        else:
-            batch_size = inputs_embeds.shape[0]
-
-        if self.config.pad_token_id is None and batch_size != 1:
-            raise ValueError(
-                "Cannot handle batch sizes > 1 if no padding token is defined."
-            )
-        if self.config.pad_token_id is None:
-            sequence_lengths = -1
-        else:
-            if input_ids is not None:
-                sequence_lengths = (
-                    torch.ne(input_ids, self.config.pad_token_id).sum(-1) -
-                    1).to(logits.device)
-            else:
-                sequence_lengths = -1
-
-        pooled_logits = logits[torch.arange(batch_size, device=logits.device),
-                               sequence_lengths]
-
-        loss = None
-        if labels is not None:
-            labels = labels.to(logits.device)
-            if self.config.problem_type is None:
-                if self.num_labels == 1:
-                    self.config.problem_type = "regression"
-                elif self.num_labels > 1 and (labels.dtype == torch.long
-                                              or labels.dtype == torch.int):
-                    self.config.problem_type = "single_label_classification"
-                else:
-                    self.config.problem_type = "multi_label_classification"
-
-            if self.config.problem_type == "regression":
-                loss_fct = MSELoss()
-                if self.num_labels == 1:
-                    loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
-                else:
-                    loss = loss_fct(pooled_logits, labels)
-            elif self.config.problem_type == "single_label_classification":
-                loss_fct = CrossEntropyLoss()
-                loss = loss_fct(pooled_logits.view(-1, self.num_labels),
-                                labels.view(-1))
-            elif self.config.problem_type == "multi_label_classification":
-                loss_fct = BCEWithLogitsLoss()
-                loss = loss_fct(pooled_logits, labels)
-        if not return_dict:
-            output = (pooled_logits, ) + transformer_outputs[1:]
-            return ((loss, ) + output) if loss is not None else output
-
-        return SequenceClassifierOutputWithPast(
-            loss=loss,
-            logits=pooled_logits,
-            past_key_values=transformer_outputs.past_key_values,
-            hidden_states=transformer_outputs.hidden_states,
-            attentions=transformer_outputs.attentions,
-        )
+            reordered_past += (
+                tuple(
+                    past_state.index_select(0, beam_idx) for past_state in layer_past
+                ),
+            )
+        return reordered_past
tokenization_baichuan.py CHANGED
@@ -73,6 +73,11 @@ class BaiChuanTokenizer(PreTrainedTokenizer):
                      if isinstance(unk_token, str) else unk_token)
         pad_token = (AddedToken(pad_token, lstrip=False, rstrip=False)
                      if isinstance(pad_token, str) else pad_token)
+        self.vocab_file = vocab_file
+        self.add_bos_token = add_bos_token
+        self.add_eos_token = add_eos_token
+        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
+        self.sp_model.Load(vocab_file)
         super().__init__(
             bos_token=bos_token,
             eos_token=eos_token,
@@ -84,11 +89,6 @@ class BaiChuanTokenizer(PreTrainedTokenizer):
             clean_up_tokenization_spaces=clean_up_tokenization_spaces,
             **kwargs,
         )
-        self.vocab_file = vocab_file
-        self.add_bos_token = add_bos_token
-        self.add_eos_token = add_eos_token
-        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
-        self.sp_model.Load(vocab_file)

     def __getstate__(self):
         state = self.__dict__.copy()
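Moving the `sp_model` setup ahead of `super().__init__()` matches the ordering newer `transformers` releases expect: the base `PreTrainedTokenizer.__init__` may query the vocabulary while registering special tokens, so the SentencePiece backend has to exist before that call. A toy illustration of the ordering constraint (not the real transformers base class):

```python
class ToyBaseTokenizer:
    def __init__(self, **kwargs):
        # stands in for PreTrainedTokenizer.__init__, which may look tokens up here
        self.vocab_size_at_init = len(self.get_vocab())

class ToySPTokenizer(ToyBaseTokenizer):
    def __init__(self, vocab):
        self.sp_model = dict(vocab)   # stands in for the loaded SentencePiece model
        super().__init__()            # safe: the backend is already in place

    def get_vocab(self):
        return self.sp_model

print(ToySPTokenizer({"<pad>": 0, "<s>": 1, "</s>": 2}).vocab_size_at_init)  # 3
```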