bys0318 committed on
Commit 6eff855
1 Parent(s): 81b025e

Modify to match the original GLM-4-9B code

Files changed (1)
  1. modeling_chatglm.py +298 -132
modeling_chatglm.py CHANGED
@@ -1,42 +1,39 @@
1
  """ PyTorch ChatGLM model. """
2
 
3
  import math
4
- import copy
5
- import warnings
6
- import re
7
  import sys
8
-
9
  import torch
10
  import torch.utils.checkpoint
11
  import torch.nn.functional as F
12
  from torch import nn
13
- from torch.nn import CrossEntropyLoss, LayerNorm
14
  from torch.nn.utils import skip_init
15
- from typing import Optional, Tuple, Union, List, Callable, Dict, Any
16
 
17
  from transformers.modeling_outputs import (
18
  BaseModelOutputWithPast,
19
  CausalLMOutputWithPast,
 
20
  )
21
  from transformers.modeling_utils import PreTrainedModel
22
- from transformers.utils import logging
23
  from transformers.generation.logits_process import LogitsProcessor
24
- from transformers.generation.utils import LogitsProcessorList, StoppingCriteriaList, GenerationConfig, ModelOutput
25
 
26
  from .configuration_chatglm import ChatGLMConfig
27
- from einops import rearrange
28
  try:
29
- from flash_attn.flash_attn_interface import flash_attn_unpadded_func
30
- except ImportError:
31
- try:
32
- # FlashAttention-2
33
- from flash_attn.flash_attn_interface import flash_attn_varlen_func as flash_attn_unpadded_func
34
- except ImportError:
35
- flash_attn_unpadded_func = None
36
 
37
  # flags required to enable jit fusion kernels
38
 
39
- if sys.platform != 'darwin':
40
  torch._C._jit_set_profiling_mode(False)
41
  torch._C._jit_set_profiling_executor(False)
42
  torch._C._jit_override_can_fuse_on_cpu(True)
@@ -44,13 +41,9 @@ if sys.platform != 'darwin':
44
 
45
  logger = logging.get_logger(__name__)
46
 
47
- _CHECKPOINT_FOR_DOC = "THUDM/ChatGLM2-6B"
48
- _CONFIG_FOR_DOC = "ChatGLM6BConfig"
49
 
50
- CHATGLM_6B_PRETRAINED_MODEL_ARCHIVE_LIST = [
51
- "THUDM/chatglm2-6b",
52
- # See all ChatGLM models at https://huggingface.co/models?filter=chatglm
53
- ]
54
 
55
  def default_init(cls, *args, **kwargs):
56
  return cls(*args, **kwargs)
@@ -60,22 +53,21 @@ class InvalidScoreLogitsProcessor(LogitsProcessor):
60
  def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
61
  if torch.isnan(scores).any() or torch.isinf(scores).any():
62
  scores.zero_()
63
- scores[..., 5] = 5e4
64
  return scores
65
 
 
66
  def split_tensor_along_last_dim(
67
  tensor: torch.Tensor,
68
  num_partitions: int,
69
  contiguous_split_chunks: bool = False,
70
  ) -> List[torch.Tensor]:
71
  """Split a tensor along its last dimension.
72
-
73
  Arguments:
74
  tensor: input tensor.
75
  num_partitions: number of partitions to split the tensor
76
  contiguous_split_chunks: If True, make each chunk contiguous
77
  in memory.
78
-
79
  Returns:
80
  A list of Tensors
81
  """
@@ -104,13 +96,11 @@ class RotaryEmbedding(nn.Module):
104
  self, seq_len: int, n_elem: int, dtype: torch.dtype, device: torch.device, base: int = 10000
105
  ):
106
  """Enhanced Transformer with Rotary Position Embedding.
107
-
108
  Derived from: https://github.com/labmlai/annotated_deep_learning_paper_implementations/blob/master/labml_nn/
109
  transformers/rope/__init__.py. MIT License:
110
  https://github.com/labmlai/annotated_deep_learning_paper_implementations/blob/master/license.
111
  """
112
  # $\Theta = {\theta_i = 10000^{\frac{2(i-1)}{d}}, i \in [1, 2, ..., \frac{d}{2}]}$
113
-
114
  base = base * self.rope_ratio
115
  theta = 1.0 / (base ** (torch.arange(0, n_elem, 2, dtype=torch.float, device=device) / n_elem))
116
 
@@ -135,14 +125,14 @@ class RotaryEmbedding(nn.Module):
135
 
136
  @torch.jit.script
137
  def apply_rotary_pos_emb(x: torch.Tensor, rope_cache: torch.Tensor) -> torch.Tensor:
138
- # x: [sq, b, np, hn]
139
- sq, b, np, hn = x.size(0), x.size(1), x.size(2), x.size(3)
140
  rot_dim = rope_cache.shape[-2] * 2
141
  x, x_pass = x[..., :rot_dim], x[..., rot_dim:]
142
  # truncate to support variable sizes
143
- rope_cache = rope_cache[:sq]
144
- xshaped = x.reshape(sq, -1, np, rot_dim // 2, 2)
145
- rope_cache = rope_cache.view(sq, -1, 1, xshaped.size(3), 2)
146
  x_out2 = torch.stack(
147
  [
148
  xshaped[..., 0] * rope_cache[..., 0] - xshaped[..., 1] * rope_cache[..., 1],
@@ -171,12 +161,13 @@ class RMSNorm(torch.nn.Module):
171
  class CoreAttention(torch.nn.Module):
172
  def __init__(self, config: ChatGLMConfig, layer_number):
173
  super(CoreAttention, self).__init__()
174
-
175
  self.apply_query_key_layer_scaling = config.apply_query_key_layer_scaling
176
  self.attention_softmax_in_fp32 = config.attention_softmax_in_fp32
177
  if self.apply_query_key_layer_scaling:
178
  self.attention_softmax_in_fp32 = True
179
  self.layer_number = max(1, layer_number)
 
180
 
181
  projection_size = config.kv_channels * config.num_attention_heads
182
 
@@ -185,43 +176,213 @@ class CoreAttention(torch.nn.Module):
185
  self.hidden_size_per_attention_head = projection_size // config.num_attention_heads
186
  self.num_attention_heads_per_partition = config.num_attention_heads
187
 
 
188
  self.norm_factor = math.sqrt(self.hidden_size_per_attention_head)
189
- self.attention_dropout = config.attention_dropout
190
 
191
  def forward(self, query_layer, key_layer, value_layer, attention_mask):
192
- seqlen_q, batch_size = query_layer.shape[0], query_layer.shape[1]
193
- seqlen_k = key_layer.shape[0]
194
- query_layer, key_layer, value_layer = [rearrange(x, 's b ... -> (b s) ...') for x in [query_layer, key_layer, value_layer]]
195
- # DO flash_attn_varlen_func
196
- if attention_mask is None or attention_mask.ndim != 1:
197
- cu_seqlens_q = torch.arange(0, (batch_size + 1) * seqlen_q, step=seqlen_q, dtype=torch.int32,
198
- device=query_layer.device)
199
- else:
200
- assert seqlen_q == seqlen_k
201
- cu_seqlens_q = attention_mask
202
- if self.training:
203
- assert seqlen_k == seqlen_q
204
- is_causal = True
205
- cu_seqlens_k = cu_seqlens_q
206
- else:
207
- is_causal = seqlen_q == seqlen_k
208
- cu_seqlens_k = torch.arange(0, (batch_size + 1) * seqlen_k, step=seqlen_k, dtype=torch.int32,
209
- device=query_layer.device) if not is_causal else cu_seqlens_q
210
- self.attention_dropout = 0
211
- context_layer = flash_attn_unpadded_func(
212
- query_layer, key_layer, value_layer, cu_seqlens_q, cu_seqlens_k, seqlen_q, seqlen_k,
213
- self.attention_dropout,
214
- softmax_scale=1.0 / self.norm_factor, causal=is_causal
215
  )
216
- context_layer = rearrange(context_layer, '(b s) ... -> s b ...', b=batch_size)
217
  new_context_layer_shape = context_layer.size()[:-2] + (self.hidden_size_per_partition,)
218
  context_layer = context_layer.reshape(*new_context_layer_shape)
219
  return context_layer
220
 
221
 
222
  class SelfAttention(torch.nn.Module):
223
  """Parallel self-attention layer abstract class.
224
-
225
  Self-attention layer takes input with size [s, b, h]
226
  and returns output of the same size.
227
  """
@@ -248,7 +409,7 @@ class SelfAttention(torch.nn.Module):
248
  device=device, **_config_to_kwargs(config)
249
  )
250
 
251
- self.core_attention = CoreAttention(config, self.layer_number)
252
 
253
  # Output.
254
  self.dense = nn.Linear(self.projection_size, config.hidden_size, bias=config.add_bias_linear,
@@ -272,7 +433,7 @@ class SelfAttention(torch.nn.Module):
272
  def forward(
273
  self, hidden_states, attention_mask, rotary_pos_emb, kv_cache=None, use_cache=True
274
  ):
275
- # hidden_states: [sq, b, h]
276
 
277
  # =================================================
278
  # Pre-allocate memory for key-values for inference.
@@ -281,7 +442,7 @@ class SelfAttention(torch.nn.Module):
281
  # Query, Key, and Value
282
  # =====================
283
 
284
- # Attention heads [sq, b, h] --> [sq, b, (np * 3 * hn)]
285
  mixed_x_layer = self.query_key_value(hidden_states)
286
 
287
  if self.multi_query_attention:
@@ -309,39 +470,45 @@ class SelfAttention(torch.nn.Module):
309
  3 * self.hidden_size_per_attention_head)
310
  mixed_x_layer = mixed_x_layer.view(*new_tensor_shape)
311
 
312
- # [sq, b, np, 3 * hn] --> 3 [sq, b, np, hn]
313
  (query_layer, key_layer, value_layer) = split_tensor_along_last_dim(mixed_x_layer, 3)
314
 
315
  # apply relative positional encoding (rotary embedding)
316
  if rotary_pos_emb is not None:
317
  query_layer = apply_rotary_pos_emb(query_layer, rotary_pos_emb)
318
  key_layer = apply_rotary_pos_emb(key_layer, rotary_pos_emb)
319
 
320
  # adjust key and value for inference
321
  if use_cache:
322
- if kv_cache is not None:
323
- cache_k, cache_v = kv_cache
324
- key_layer = torch.cat((cache_k, key_layer), dim=0)
325
- value_layer = torch.cat((cache_v, value_layer), dim=0)
326
- kv_cache = (key_layer, value_layer)
327
  else:
328
  kv_cache = None
329
-
330
-
331
  if self.multi_query_attention:
332
- key_layer = key_layer.unsqueeze(-2)
333
  key_layer = key_layer.expand(
334
- -1, -1, -1, self.num_attention_heads_per_partition // self.num_multi_query_groups_per_partition, -1
335
  )
336
  key_layer = key_layer.contiguous().view(
337
- key_layer.size()[:2] + (self.num_attention_heads_per_partition, self.hidden_size_per_attention_head)
338
  )
339
- value_layer = value_layer.unsqueeze(-2)
340
  value_layer = value_layer.expand(
341
- -1, -1, -1, self.num_attention_heads_per_partition // self.num_multi_query_groups_per_partition, -1
342
  )
343
  value_layer = value_layer.contiguous().view(
344
- value_layer.size()[:2] + (self.num_attention_heads_per_partition, self.hidden_size_per_attention_head)
345
  )
346
 
347
  # ==================================
@@ -368,7 +535,6 @@ def _config_to_kwargs(args):
368
 
369
  class MLP(torch.nn.Module):
370
  """MLP.
371
-
372
  MLP will take the input with h hidden state, project it to 4*h
373
  hidden dimension, perform nonlinear transformation, and project the
374
  state back into h hidden dimension.
@@ -414,7 +580,6 @@ class MLP(torch.nn.Module):
414
 
415
  class GLMBlock(torch.nn.Module):
416
  """A single transformer layer.
417
-
418
  Transformer layer takes input with size [s, b, h] and returns an
419
  output of the same size.
420
  """
@@ -525,9 +690,9 @@ class GLMTransformer(torch.nn.Module):
525
  presents = () if use_cache else None
526
  if self.gradient_checkpointing and self.training:
527
  if use_cache:
528
- # logger.warning_once(
529
- # "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
530
- # )
531
  use_cache = False
532
 
533
  all_self_attentions = None
@@ -557,7 +722,15 @@ class GLMTransformer(torch.nn.Module):
557
  )
558
  hidden_states, kv_cache = layer_ret
559
  if use_cache:
560
- presents = presents + (kv_cache,)
561
 
562
  if output_hidden_states:
563
  all_hidden_states = all_hidden_states + (hidden_states,)
@@ -580,18 +753,24 @@ class ChatGLMPreTrainedModel(PreTrainedModel):
580
  config_class = ChatGLMConfig
581
  base_model_prefix = "transformer"
582
  _no_split_modules = ["GLMBlock"]
 
 
583
 
584
  def _init_weights(self, module: nn.Module):
585
  """Initialize the weights."""
586
  return
587
 
588
  def get_masks(self, input_ids, past_key_values, padding_mask=None):
589
  batch_size, seq_length = input_ids.shape
590
  full_attention_mask = torch.ones(batch_size, seq_length, seq_length, device=input_ids.device)
591
  full_attention_mask.tril_()
592
  past_length = 0
593
  if past_key_values:
594
- past_length = past_key_values[0][0].shape[0]
595
  if past_length:
596
  full_attention_mask = torch.cat((torch.ones(batch_size, seq_length, past_length,
597
  device=input_ids.device), full_attention_mask), dim=-1)
@@ -608,11 +787,6 @@ class ChatGLMPreTrainedModel(PreTrainedModel):
608
  position_ids = torch.arange(seq_length, dtype=torch.long, device=device).unsqueeze(0).repeat(batch_size, 1)
609
  return position_ids
610
 
611
- def _set_gradient_checkpointing(self, module, value=False):
612
- if isinstance(module, GLMTransformer):
613
- module.gradient_checkpointing = value
614
-
615
-
616
  class Embedding(torch.nn.Module):
617
  """Language model embeddings."""
618
 
@@ -633,8 +807,6 @@ class Embedding(torch.nn.Module):
633
  # Embeddings.
634
  words_embeddings = self.word_embeddings(input_ids)
635
  embeddings = words_embeddings
636
- # Data format change to avoid explicit tranposes : [b s h] --> [s b h].
637
- embeddings = embeddings.transpose(0, 1).contiguous()
638
  # If the input flag for fp32 residual connection is set, convert for float.
639
  if self.fp32_residual_connection:
640
  embeddings = embeddings.float()
@@ -652,6 +824,9 @@ class ChatGLMModel(ChatGLMPreTrainedModel):
652
  if device is not None:
653
  init_kwargs["device"] = device
654
  self.embedding = init_method(Embedding, config, **init_kwargs)
655
 
656
  # Rotary positional embeddings
657
  self.seq_length = config.seq_length
@@ -659,7 +834,8 @@ class ChatGLMModel(ChatGLMPreTrainedModel):
659
  config.hidden_size // config.num_attention_heads if config.kv_channels is None else config.kv_channels
660
  )
661
 
662
- self.rotary_pos_emb = RotaryEmbedding(rotary_dim // 2, rope_ratio=config.rope_ratio, original_impl=config.original_rope,
 
663
  device=device, dtype=config.torch_dtype)
664
  self.encoder = init_method(GLMTransformer, config, **init_kwargs)
665
  self.output_layer = init_method(nn.Linear, config.hidden_size, config.padded_vocab_size, bias=False,
@@ -668,6 +844,9 @@ class ChatGLMModel(ChatGLMPreTrainedModel):
668
  def get_input_embeddings(self):
669
  return self.embedding.word_embeddings
670
 
671
  def forward(
672
  self,
673
  input_ids,
@@ -677,6 +856,7 @@ class ChatGLMModel(ChatGLMPreTrainedModel):
677
  past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None,
678
  inputs_embeds: Optional[torch.Tensor] = None,
679
  use_cache: Optional[bool] = None,
 
680
  output_hidden_states: Optional[bool] = None,
681
  return_dict: Optional[bool] = None,
682
  ):
@@ -691,9 +871,9 @@ class ChatGLMModel(ChatGLMPreTrainedModel):
691
  if inputs_embeds is None:
692
  inputs_embeds = self.embedding(input_ids)
693
 
694
- # if full_attention_mask is None:
695
- # if (attention_mask is not None and not attention_mask.all()) or (past_key_values and seq_length != 1):
696
- # full_attention_mask = self.get_masks(input_ids, past_key_values, padding_mask=attention_mask)
697
 
698
  # Rotary positional embeddings
699
  rotary_pos_emb = self.rotary_pos_emb(self.seq_length)
@@ -701,13 +881,18 @@ class ChatGLMModel(ChatGLMPreTrainedModel):
701
  rotary_pos_emb = rotary_pos_emb[position_ids]
702
  else:
703
  rotary_pos_emb = rotary_pos_emb[None, :seq_length]
704
- rotary_pos_emb = rotary_pos_emb.transpose(0, 1).contiguous()
705
 
706
  # Run encoder.
707
  hidden_states, presents, all_hidden_states, all_self_attentions = self.encoder(
708
- inputs_embeds, attention_mask, rotary_pos_emb=rotary_pos_emb,
709
  kv_caches=past_key_values, use_cache=use_cache, output_hidden_states=output_hidden_states
710
  )
711
 
712
  if not return_dict:
713
  return tuple(v for v in [hidden_states, presents, all_hidden_states, all_self_attentions] if v is not None)
@@ -727,7 +912,6 @@ class ChatGLMForConditionalGeneration(ChatGLMPreTrainedModel):
727
  self.max_sequence_length = config.max_length
728
  self.transformer = ChatGLMModel(config, empty_init=empty_init, device=device)
729
  self.config = config
730
- self.pack_loss = False
731
 
732
  def _update_model_kwargs_for_generation(
733
  self,
@@ -764,6 +948,7 @@ class ChatGLMForConditionalGeneration(ChatGLMPreTrainedModel):
764
  past_key_values: Optional[torch.Tensor] = None,
765
  attention_mask: Optional[torch.Tensor] = None,
766
  position_ids: Optional[torch.Tensor] = None,
 
767
  is_first_forward: bool = True,
768
  **kwargs
769
  ) -> dict:
@@ -771,14 +956,16 @@ class ChatGLMForConditionalGeneration(ChatGLMPreTrainedModel):
771
  if position_ids is None:
772
  position_ids = self.get_position_ids(input_ids, device=input_ids.device)
773
  if not is_first_forward:
774
- position_ids = position_ids[..., -1:]
775
- input_ids = input_ids[:, -1:]
 
776
  return {
777
  "input_ids": input_ids,
778
  "past_key_values": past_key_values,
779
  "position_ids": position_ids,
780
  "attention_mask": attention_mask,
781
- "return_last_logit": True
 
782
  }
783
 
784
  def forward(
@@ -788,7 +975,7 @@ class ChatGLMForConditionalGeneration(ChatGLMPreTrainedModel):
788
  attention_mask: Optional[torch.Tensor] = None,
789
  past_key_values: Optional[Tuple[torch.FloatTensor]] = None,
790
  inputs_embeds: Optional[torch.Tensor] = None,
791
- labels: Optional[Tuple[torch.Tensor]] = None,
792
  use_cache: Optional[bool] = None,
793
  output_attentions: Optional[bool] = None,
794
  output_hidden_states: Optional[bool] = None,
@@ -811,30 +998,19 @@ class ChatGLMForConditionalGeneration(ChatGLMPreTrainedModel):
811
 
812
  hidden_states = transformer_outputs[0]
813
  if return_last_logit:
814
- hidden_states = hidden_states[-1:]
815
  lm_logits = self.transformer.output_layer(hidden_states)
816
- lm_logits = lm_logits.transpose(0, 1).contiguous()
817
 
818
  loss = None
819
  if labels is not None:
820
  lm_logits = lm_logits.to(torch.float32)
 
821
  # Shift so that tokens < n predict n
822
  shift_logits = lm_logits[..., :-1, :].contiguous()
823
- if isinstance(labels, tuple) or isinstance(labels, list):
824
- labels, weights = labels
825
  shift_labels = labels[..., 1:].contiguous()
826
- if self.pack_loss:
827
- loss_fct = CrossEntropyLoss(ignore_index=-100)#, reduction='none')
828
- loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
829
- loss *= weights
830
- # if self.pack_loss:
831
- # shift_weights = weights[..., 1:].contiguous()
832
- # loss_fct = CrossEntropyLoss(ignore_index=-100, reduction='none')
833
- # loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
834
- # loss = (loss * shift_weights).sum()
835
- else:
836
- loss_fct = CrossEntropyLoss(ignore_index=-100)
837
- loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
838
 
839
  lm_logits = lm_logits.to(hidden_states.dtype)
840
  loss = loss.to(hidden_states.dtype)
@@ -859,33 +1035,24 @@ class ChatGLMForConditionalGeneration(ChatGLMPreTrainedModel):
859
  This function is used to re-order the `past_key_values` cache if [`~PreTrainedModel.beam_search`] or
860
  [`~PreTrainedModel.beam_sample`] is called. This is required to match `past_key_values` with the correct
861
  beam_idx at every generation step.
862
-
863
  Output shares the same memory storage as `past`.
864
  """
865
  return tuple(
866
  (
867
- layer_past[0].index_select(1, beam_idx.to(layer_past[0].device)),
868
- layer_past[1].index_select(1, beam_idx.to(layer_past[1].device)),
869
  )
870
  for layer_past in past
871
  )
872
 
873
- def process_response(self, response):
874
- response = response.strip()
875
- response = response.replace("[[训练时间]]", "2023年")
876
- return response
877
-
878
  @torch.inference_mode()
879
  def chat(self, tokenizer, query: str, history: List[Dict] = None, role: str = "user",
880
- max_length: int = 8192, num_beams=1, do_sample=True, top_p=0.8, temperature=0.8, logits_processor=None,
881
  **kwargs):
882
  if history is None:
883
  history = []
884
- if logits_processor is None:
885
- logits_processor = LogitsProcessorList()
886
- logits_processor.append(InvalidScoreLogitsProcessor())
887
  gen_kwargs = {"max_length": max_length, "num_beams": num_beams, "do_sample": do_sample, "top_p": top_p,
888
- "temperature": temperature, "logits_processor": logits_processor, **kwargs}
889
  inputs = tokenizer.build_chat_input(query, history=history, role=role)
890
  inputs = inputs.to(self.device)
891
  eos_token_id = [tokenizer.eos_token_id, tokenizer.get_command("<|user|>"),
@@ -894,5 +1061,4 @@ class ChatGLMForConditionalGeneration(ChatGLMPreTrainedModel):
894
  outputs = outputs.tolist()[0][len(inputs["input_ids"][0]):-1]
895
  response = tokenizer.decode(outputs)
896
  history.append({"role": role, "content": query})
897
- response = self.process_response(response)
898
  return response, history
 
1
  """ PyTorch ChatGLM model. """
2
 
3
  import math
4
  import sys
 
5
  import torch
6
  import torch.utils.checkpoint
7
  import torch.nn.functional as F
8
  from torch import nn
9
+ from torch.nn import CrossEntropyLoss, LayerNorm, MSELoss, BCEWithLogitsLoss
10
  from torch.nn.utils import skip_init
11
+ from typing import Optional, Tuple, Union, List, Dict, Any
12
 
13
  from transformers.modeling_outputs import (
14
  BaseModelOutputWithPast,
15
  CausalLMOutputWithPast,
16
+ SequenceClassifierOutputWithPast,
17
  )
18
  from transformers.modeling_utils import PreTrainedModel
19
+ from transformers.utils import logging, is_torch_npu_available
20
  from transformers.generation.logits_process import LogitsProcessor
21
+ from transformers.generation.utils import ModelOutput
22
 
23
  from .configuration_chatglm import ChatGLMConfig
24
+
25
  try:
26
+ from transformers.utils import is_flash_attn_greater_or_equal_2_10, is_flash_attn_2_available
27
+
28
+ if is_flash_attn_2_available():
29
+ from flash_attn import flash_attn_func, flash_attn_varlen_func
30
+ from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa
31
+ except:
32
+ pass
33
 
34
  # flags required to enable jit fusion kernels
35
 
36
+ if sys.platform != 'darwin' and not is_torch_npu_available():
37
  torch._C._jit_set_profiling_mode(False)
38
  torch._C._jit_set_profiling_executor(False)
39
  torch._C._jit_override_can_fuse_on_cpu(True)
 
41
 
42
  logger = logging.get_logger(__name__)
43
 
44
+ _CHECKPOINT_FOR_DOC = "THUDM/ChatGLM"
45
+ _CONFIG_FOR_DOC = "ChatGLMConfig"
46
 
 
47
 
48
  def default_init(cls, *args, **kwargs):
49
  return cls(*args, **kwargs)
 
53
  def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
54
  if torch.isnan(scores).any() or torch.isinf(scores).any():
55
  scores.zero_()
56
+ scores[..., 198] = 5e4
57
  return scores
58
 
59
+
60
  def split_tensor_along_last_dim(
61
  tensor: torch.Tensor,
62
  num_partitions: int,
63
  contiguous_split_chunks: bool = False,
64
  ) -> List[torch.Tensor]:
65
  """Split a tensor along its last dimension.
 
66
  Arguments:
67
  tensor: input tensor.
68
  num_partitions: number of partitions to split the tensor
69
  contiguous_split_chunks: If True, make each chunk contiguous
70
  in memory.
 
71
  Returns:
72
  A list of Tensors
73
  """
 
96
  self, seq_len: int, n_elem: int, dtype: torch.dtype, device: torch.device, base: int = 10000
97
  ):
98
  """Enhanced Transformer with Rotary Position Embedding.
 
99
  Derived from: https://github.com/labmlai/annotated_deep_learning_paper_implementations/blob/master/labml_nn/
100
  transformers/rope/__init__.py. MIT License:
101
  https://github.com/labmlai/annotated_deep_learning_paper_implementations/blob/master/license.
102
  """
103
  # $\Theta = {\theta_i = 10000^{\frac{2(i-1)}{d}}, i \in [1, 2, ..., \frac{d}{2}]}$
 
104
  base = base * self.rope_ratio
105
  theta = 1.0 / (base ** (torch.arange(0, n_elem, 2, dtype=torch.float, device=device) / n_elem))
106
 
 
125
 
126
  @torch.jit.script
127
  def apply_rotary_pos_emb(x: torch.Tensor, rope_cache: torch.Tensor) -> torch.Tensor:
128
+ # x: [b, np, sq, hn]
129
+ b, np, sq, hn = x.size(0), x.size(1), x.size(2), x.size(3)
130
  rot_dim = rope_cache.shape[-2] * 2
131
  x, x_pass = x[..., :rot_dim], x[..., rot_dim:]
132
  # truncate to support variable sizes
133
+ rope_cache = rope_cache[:, :sq]
134
+ xshaped = x.reshape(b, np, sq, rot_dim // 2, 2)
135
+ rope_cache = rope_cache.view(-1, 1, sq, xshaped.size(3), 2)
136
  x_out2 = torch.stack(
137
  [
138
  xshaped[..., 0] * rope_cache[..., 0] - xshaped[..., 1] * rope_cache[..., 1],
 
161
  class CoreAttention(torch.nn.Module):
162
  def __init__(self, config: ChatGLMConfig, layer_number):
163
  super(CoreAttention, self).__init__()
164
+ self.config = config
165
  self.apply_query_key_layer_scaling = config.apply_query_key_layer_scaling
166
  self.attention_softmax_in_fp32 = config.attention_softmax_in_fp32
167
  if self.apply_query_key_layer_scaling:
168
  self.attention_softmax_in_fp32 = True
169
  self.layer_number = max(1, layer_number)
170
+ self.is_causal = True
171
 
172
  projection_size = config.kv_channels * config.num_attention_heads
173
 
 
176
  self.hidden_size_per_attention_head = projection_size // config.num_attention_heads
177
  self.num_attention_heads_per_partition = config.num_attention_heads
178
 
179
+ coeff = None
180
  self.norm_factor = math.sqrt(self.hidden_size_per_attention_head)
181
+ if self.apply_query_key_layer_scaling:
182
+ coeff = self.layer_number
183
+ self.norm_factor *= coeff
184
+ self.coeff = coeff
185
+
186
+ self.attention_dropout = torch.nn.Dropout(config.attention_dropout)
187
 
188
  def forward(self, query_layer, key_layer, value_layer, attention_mask):
189
+ # [b, np, sq, sk]
190
+ output_size = (query_layer.size(0), query_layer.size(1), query_layer.size(2), key_layer.size(2))
191
+
192
+ # [b, np, sq, hn] -> [b * np, sq, hn]
193
+ query_layer = query_layer.view(output_size[0] * output_size[1], output_size[2], -1)
194
+ # [b, np, sk, hn] -> [b * np, sk, hn]
195
+ key_layer = key_layer.view(output_size[0] * output_size[1], output_size[3], -1)
196
+
197
+ # preallocting input tensor: [b * np, sq, sk]
198
+ matmul_input_buffer = torch.empty(
199
+ output_size[0] * output_size[1], output_size[2], output_size[3], dtype=query_layer.dtype,
200
+ device=query_layer.device
201
+ )
202
+
203
+ # Raw attention scores. [b * np, sq, sk]
204
+ matmul_result = torch.baddbmm(
205
+ matmul_input_buffer,
206
+ query_layer, # [b * np, sq, hn]
207
+ key_layer.transpose(1, 2), # [b * np, hn, sk]
208
+ beta=0.0,
209
+ alpha=(1.0 / self.norm_factor),
 
 
210
  )
211
+
212
+ # change view to [b, np, sq, sk]
213
+ attention_scores = matmul_result.view(*output_size)
214
+
215
+ # ===========================
216
+ # Attention probs and dropout
217
+ # ===========================
218
+
219
+ # attention scores and attention mask [b, np, sq, sk]
220
+ if self.attention_softmax_in_fp32:
221
+ attention_scores = attention_scores.float()
222
+ if self.coeff is not None:
223
+ attention_scores = attention_scores * self.coeff
224
+ if attention_mask is None and attention_scores.shape[2] == attention_scores.shape[3]:
225
+ attention_mask = torch.ones(output_size[0], 1, output_size[2], output_size[3],
226
+ device=attention_scores.device, dtype=torch.bool)
227
+ attention_mask.tril_()
228
+ attention_mask = ~attention_mask
229
+ if attention_mask is not None:
230
+ attention_scores = attention_scores.masked_fill(attention_mask, float("-inf"))
231
+ attention_probs = F.softmax(attention_scores, dim=-1)
232
+ attention_probs = attention_probs.type_as(value_layer)
233
+
234
+ # This is actually dropping out entire tokens to attend to, which might
235
+ # seem a bit unusual, but is taken from the original Transformer paper.
236
+ attention_probs = self.attention_dropout(attention_probs)
237
+
238
+ # query layer shape: [b * np, sq, hn]
239
+ # value layer shape: [b, np, sk, hn]
240
+ # attention shape: [b, np, sq, sk]
241
+ # context layer shape: [b, np, sq, hn]
242
+ output_size = (value_layer.size(0), value_layer.size(1), query_layer.size(1), value_layer.size(3))
243
+ # change view [b * np, sk, hn]
244
+ value_layer = value_layer.view(output_size[0] * output_size[1], value_layer.size(2), -1)
245
+ # change view [b * np, sq, sk]
246
+ attention_probs = attention_probs.view(output_size[0] * output_size[1], output_size[2], -1)
247
+ # matmul: [b * np, sq, hn]
248
+ context_layer = torch.bmm(attention_probs, value_layer)
249
+ # change view [b, np, sq, hn]
250
+ context_layer = context_layer.view(*output_size)
251
+ # [b, np, sq, hn] --> [b, sq, np, hn]
252
+ context_layer = context_layer.transpose(1, 2).contiguous()
253
+ # [b, sq, np, hn] --> [b, sq, hp]
254
+ new_context_layer_shape = context_layer.size()[:-2] + (self.hidden_size_per_partition,)
255
+ context_layer = context_layer.reshape(*new_context_layer_shape)
256
+
257
+ return context_layer
258
+
259
+
260
+ class SdpaAttention(CoreAttention):
261
+ def forward(self, query_layer, key_layer, value_layer, attention_mask):
262
+ if attention_mask is None and query_layer.shape[2] == key_layer.shape[2]:
263
+ context_layer = torch.nn.functional.scaled_dot_product_attention(query_layer, key_layer, value_layer,
264
+ is_causal=True,
265
+ dropout_p=self.config.attention_dropout if self.training else 0.0)
266
+ else:
267
+ if attention_mask is not None:
268
+ attention_mask = ~attention_mask
269
+ context_layer = torch.nn.functional.scaled_dot_product_attention(query_layer, key_layer, value_layer,
270
+ attention_mask,
271
+ dropout_p=self.config.attention_dropout if self.training else 0.0)
272
+ context_layer = context_layer.transpose(1, 2).contiguous()
273
  new_context_layer_shape = context_layer.size()[:-2] + (self.hidden_size_per_partition,)
274
  context_layer = context_layer.reshape(*new_context_layer_shape)
275
  return context_layer
276
 
277
 
278
+ def _get_unpad_data(attention_mask):
279
+ seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
280
+ indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
281
+ max_seqlen_in_batch = seqlens_in_batch.max().item()
282
+ cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
283
+ return (
284
+ indices,
285
+ cu_seqlens,
286
+ max_seqlen_in_batch,
287
+ )
288
+
289
+
290
+ # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2
291
+ class FlashAttention2(CoreAttention):
292
+ def __init__(self, *args, **kwargs):
293
+ super().__init__(*args, **kwargs)
294
+ self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
295
+
296
+ def forward(self, query_states, key_states, value_states, attention_mask):
297
+ query_states = query_states.transpose(1, 2)
298
+ key_states = key_states.transpose(1, 2)
299
+ value_states = value_states.transpose(1, 2)
300
+ batch_size, query_length = query_states.shape[:2]
301
+ if not self._flash_attn_uses_top_left_mask:
302
+ causal = self.is_causal
303
+ else:
304
+ # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__.
305
+ causal = self.is_causal and query_length != 1
306
+ dropout = self.config.attention_dropout if self.training else 0.0
307
+ # Contains at least one padding token in the sequence
308
+ if attention_mask is not None:
309
+ query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input(
310
+ query_states, key_states, value_states, attention_mask, query_length
311
+ )
312
+
313
+ cu_seqlens_q, cu_seqlens_k = cu_seq_lens
314
+ max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens
315
+
316
+ attn_output_unpad = flash_attn_varlen_func(
317
+ query_states,
318
+ key_states,
319
+ value_states,
320
+ cu_seqlens_q=cu_seqlens_q,
321
+ cu_seqlens_k=cu_seqlens_k,
322
+ max_seqlen_q=max_seqlen_in_batch_q,
323
+ max_seqlen_k=max_seqlen_in_batch_k,
324
+ dropout_p=dropout,
325
+ softmax_scale=None,
326
+ causal=causal,
327
+ )
328
+
329
+ attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length)
330
+ else:
331
+ attn_output = flash_attn_func(
332
+ query_states, key_states, value_states, dropout, softmax_scale=None, causal=causal
333
+ )
334
+ attn_output = attn_output.reshape(batch_size, query_length, self.hidden_size_per_partition).contiguous()
335
+ return attn_output
336
+
337
+ def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length):
338
+ indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask)
339
+ batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape
340
+
341
+ key_layer = index_first_axis(
342
+ key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
343
+ )
344
+ value_layer = index_first_axis(
345
+ value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
346
+ )
347
+ if query_length == kv_seq_len:
348
+ query_layer = index_first_axis(
349
+ query_layer.reshape(batch_size * kv_seq_len, self.num_attention_heads_per_partition, head_dim),
350
+ indices_k
351
+ )
352
+ cu_seqlens_q = cu_seqlens_k
353
+ max_seqlen_in_batch_q = max_seqlen_in_batch_k
354
+ indices_q = indices_k
355
+ elif query_length == 1:
356
+ max_seqlen_in_batch_q = 1
357
+ cu_seqlens_q = torch.arange(
358
+ batch_size + 1, dtype=torch.int32, device=query_layer.device
359
+ ) # There is a memcpy here, that is very bad.
360
+ indices_q = cu_seqlens_q[:-1]
361
+ query_layer = query_layer.squeeze(1)
362
+ else:
363
+ # The -q_len: slice assumes left padding.
364
+ attention_mask = attention_mask[:, -query_length:]
365
+ query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask)
366
+
367
+ return (
368
+ query_layer,
369
+ key_layer,
370
+ value_layer,
371
+ indices_q,
372
+ (cu_seqlens_q, cu_seqlens_k),
373
+ (max_seqlen_in_batch_q, max_seqlen_in_batch_k),
374
+ )
375
+
376
+
377
+ CORE_ATTENTION_CLASSES = {
378
+ "eager": CoreAttention,
379
+ "sdpa": SdpaAttention,
380
+ "flash_attention_2": FlashAttention2
381
+ }
382
+
383
+
384
  class SelfAttention(torch.nn.Module):
385
  """Parallel self-attention layer abstract class.
 
386
  Self-attention layer takes input with size [s, b, h]
387
  and returns output of the same size.
388
  """
 
409
  device=device, **_config_to_kwargs(config)
410
  )
411
 
412
+ self.core_attention = CORE_ATTENTION_CLASSES[config._attn_implementation](config, self.layer_number)
413
 
414
  # Output.
415
  self.dense = nn.Linear(self.projection_size, config.hidden_size, bias=config.add_bias_linear,
 
433
  def forward(
434
  self, hidden_states, attention_mask, rotary_pos_emb, kv_cache=None, use_cache=True
435
  ):
436
+ # hidden_states: [b, sq, h]
437
 
438
  # =================================================
439
  # Pre-allocate memory for key-values for inference.
 
442
  # Query, Key, and Value
443
  # =====================
444
 
445
+ # Attention heads [b, sq, h] --> [b, sq, (np * 3 * hn)]
446
  mixed_x_layer = self.query_key_value(hidden_states)
447
 
448
  if self.multi_query_attention:
 
470
  3 * self.hidden_size_per_attention_head)
471
  mixed_x_layer = mixed_x_layer.view(*new_tensor_shape)
472
 
473
+ # [b, sq, np, 3 * hn] --> 3 [b, sq, np, hn]
474
  (query_layer, key_layer, value_layer) = split_tensor_along_last_dim(mixed_x_layer, 3)
475
 
476
+ # [b, sq, np, hn] -> [b, np, sq, hn]
477
+ query_layer, key_layer, value_layer = [k.transpose(1, 2) for k in [query_layer, key_layer, value_layer]]
478
+
479
  # apply relative positional encoding (rotary embedding)
480
  if rotary_pos_emb is not None:
481
  query_layer = apply_rotary_pos_emb(query_layer, rotary_pos_emb)
482
  key_layer = apply_rotary_pos_emb(key_layer, rotary_pos_emb)
483
 
484
  # adjust key and value for inference
485
+ if kv_cache is not None:
486
+ cache_k, cache_v = kv_cache
487
+ key_layer = torch.cat((cache_k, key_layer), dim=2)
488
+ value_layer = torch.cat((cache_v, value_layer), dim=2)
489
  if use_cache:
490
+ if kv_cache is None:
491
+ kv_cache = torch.cat((key_layer.unsqueeze(0).unsqueeze(0), value_layer.unsqueeze(0).unsqueeze(0)),
492
+ dim=1)
493
+ else:
494
+ kv_cache = (key_layer, value_layer)
495
  else:
496
  kv_cache = None
497
+
 
498
  if self.multi_query_attention:
499
+ key_layer = key_layer.unsqueeze(2)
500
  key_layer = key_layer.expand(
501
+ -1, -1, self.num_attention_heads_per_partition // self.num_multi_query_groups_per_partition, -1, -1
502
  )
503
  key_layer = key_layer.contiguous().view(
504
+ key_layer.size()[:1] + (self.num_attention_heads_per_partition,) + key_layer.size()[3:]
505
  )
506
+ value_layer = value_layer.unsqueeze(2)
507
  value_layer = value_layer.expand(
508
+ -1, -1, self.num_attention_heads_per_partition // self.num_multi_query_groups_per_partition, -1, -1
509
  )
510
  value_layer = value_layer.contiguous().view(
511
+ value_layer.size()[:1] + (self.num_attention_heads_per_partition,) + value_layer.size()[3:]
512
  )
513
 
514
  # ==================================
 
535
 
536
  class MLP(torch.nn.Module):
537
  """MLP.
 
538
  MLP will take the input with h hidden state, project it to 4*h
539
  hidden dimension, perform nonlinear transformation, and project the
540
  state back into h hidden dimension.
 
580
 
581
  class GLMBlock(torch.nn.Module):
582
  """A single transformer layer.
 
583
  Transformer layer takes input with size [s, b, h] and returns an
584
  output of the same size.
585
  """
 
690
  presents = () if use_cache else None
691
  if self.gradient_checkpointing and self.training:
692
  if use_cache:
693
+ logger.warning_once(
694
+ "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
695
+ )
696
  use_cache = False
697
 
698
  all_self_attentions = None
 
722
  )
723
  hidden_states, kv_cache = layer_ret
724
  if use_cache:
725
+ # token by token decoding, use tuple format
726
+ if kv_caches[0] is not None:
727
+ presents = presents + (kv_cache,)
728
+ # prefilling in decoding, use tensor format to save cuda memory
729
+ else:
730
+ if len(presents) == 0:
731
+ presents = kv_cache
732
+ else:
733
+ presents = torch.cat((presents, kv_cache.to(presents.device)), dim=0)
734
 
735
  if output_hidden_states:
736
  all_hidden_states = all_hidden_states + (hidden_states,)
 
753
  config_class = ChatGLMConfig
754
  base_model_prefix = "transformer"
755
  _no_split_modules = ["GLMBlock"]
756
+ _supports_flash_attn_2 = True
757
+ _supports_sdpa = True
758
 
759
  def _init_weights(self, module: nn.Module):
760
  """Initialize the weights."""
761
  return
762
 
763
  def get_masks(self, input_ids, past_key_values, padding_mask=None):
764
+ if self.config._attn_implementation == "flash_attention_2":
765
+ if padding_mask is not None and not padding_mask.all():
766
+ return padding_mask
767
+ return None
768
  batch_size, seq_length = input_ids.shape
769
  full_attention_mask = torch.ones(batch_size, seq_length, seq_length, device=input_ids.device)
770
  full_attention_mask.tril_()
771
  past_length = 0
772
  if past_key_values:
773
+ past_length = past_key_values[0][0].shape[2]
774
  if past_length:
775
  full_attention_mask = torch.cat((torch.ones(batch_size, seq_length, past_length,
776
  device=input_ids.device), full_attention_mask), dim=-1)
 
787
  position_ids = torch.arange(seq_length, dtype=torch.long, device=device).unsqueeze(0).repeat(batch_size, 1)
788
  return position_ids
789
 
790
  class Embedding(torch.nn.Module):
791
  """Language model embeddings."""
792
 
 
807
  # Embeddings.
808
  words_embeddings = self.word_embeddings(input_ids)
809
  embeddings = words_embeddings
 
 
810
  # If the input flag for fp32 residual connection is set, convert for float.
811
  if self.fp32_residual_connection:
812
  embeddings = embeddings.float()
 
824
  if device is not None:
825
  init_kwargs["device"] = device
826
  self.embedding = init_method(Embedding, config, **init_kwargs)
827
+ self.num_layers = config.num_layers
828
+ self.multi_query_group_num = config.multi_query_group_num
829
+ self.kv_channels = config.kv_channels
830
 
831
  # Rotary positional embeddings
832
  self.seq_length = config.seq_length
 
834
  config.hidden_size // config.num_attention_heads if config.kv_channels is None else config.kv_channels
835
  )
836
 
837
+ self.rotary_pos_emb = RotaryEmbedding(rotary_dim // 2, rope_ratio=config.rope_ratio,
838
+ original_impl=config.original_rope,
839
  device=device, dtype=config.torch_dtype)
840
  self.encoder = init_method(GLMTransformer, config, **init_kwargs)
841
  self.output_layer = init_method(nn.Linear, config.hidden_size, config.padded_vocab_size, bias=False,
 
844
  def get_input_embeddings(self):
845
  return self.embedding.word_embeddings
846
 
847
+ def set_input_embeddings(self, value):
848
+ self.embedding.word_embeddings = value
849
+
850
  def forward(
851
  self,
852
  input_ids,
 
856
  past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None,
857
  inputs_embeds: Optional[torch.Tensor] = None,
858
  use_cache: Optional[bool] = None,
859
+ output_attentions: Optional[bool] = None,
860
  output_hidden_states: Optional[bool] = None,
861
  return_dict: Optional[bool] = None,
862
  ):
 
871
  if inputs_embeds is None:
872
  inputs_embeds = self.embedding(input_ids)
873
 
874
+ if full_attention_mask is None:
875
+ if (attention_mask is not None and not attention_mask.all()) or (past_key_values and seq_length != 1):
876
+ full_attention_mask = self.get_masks(input_ids, past_key_values, padding_mask=attention_mask)
877
 
878
  # Rotary positional embeddings
879
  rotary_pos_emb = self.rotary_pos_emb(self.seq_length)
 
881
  rotary_pos_emb = rotary_pos_emb[position_ids]
882
  else:
883
  rotary_pos_emb = rotary_pos_emb[None, :seq_length]
 
884
 
885
  # Run encoder.
886
  hidden_states, presents, all_hidden_states, all_self_attentions = self.encoder(
887
+ inputs_embeds, full_attention_mask, rotary_pos_emb=rotary_pos_emb,
888
  kv_caches=past_key_values, use_cache=use_cache, output_hidden_states=output_hidden_states
889
  )
890
+ if presents is not None and type(presents) is torch.Tensor:
891
+ presents = presents.split(1, dim=0)
892
+ presents = list(presents)
893
+ presents = [list(x.squeeze(0).split(1, dim=0)) for x in presents]
894
+ presents = [tuple([x.squeeze(0) for x in y]) for y in presents]
895
+ presents = tuple(presents)
896
 
897
  if not return_dict:
898
  return tuple(v for v in [hidden_states, presents, all_hidden_states, all_self_attentions] if v is not None)
 
912
  self.max_sequence_length = config.max_length
913
  self.transformer = ChatGLMModel(config, empty_init=empty_init, device=device)
914
  self.config = config
 
915
 
916
  def _update_model_kwargs_for_generation(
917
  self,
 
948
  past_key_values: Optional[torch.Tensor] = None,
949
  attention_mask: Optional[torch.Tensor] = None,
950
  position_ids: Optional[torch.Tensor] = None,
951
+ use_cache: Optional[bool] = None,
952
  is_first_forward: bool = True,
953
  **kwargs
954
  ) -> dict:
 
956
  if position_ids is None:
957
  position_ids = self.get_position_ids(input_ids, device=input_ids.device)
958
  if not is_first_forward:
959
+ if past_key_values is not None:
960
+ position_ids = position_ids[..., -1:]
961
+ input_ids = input_ids[:, -1:]
962
  return {
963
  "input_ids": input_ids,
964
  "past_key_values": past_key_values,
965
  "position_ids": position_ids,
966
  "attention_mask": attention_mask,
967
+ "return_last_logit": True,
968
+ "use_cache": use_cache
969
  }
970
 
971
  def forward(
 
975
  attention_mask: Optional[torch.Tensor] = None,
976
  past_key_values: Optional[Tuple[torch.FloatTensor]] = None,
977
  inputs_embeds: Optional[torch.Tensor] = None,
978
+ labels: Optional[torch.Tensor] = None,
979
  use_cache: Optional[bool] = None,
980
  output_attentions: Optional[bool] = None,
981
  output_hidden_states: Optional[bool] = None,
 
998
 
999
  hidden_states = transformer_outputs[0]
1000
  if return_last_logit:
1001
+ hidden_states = hidden_states[:, -1:]
1002
  lm_logits = self.transformer.output_layer(hidden_states)
 
1003
 
1004
  loss = None
1005
  if labels is not None:
1006
  lm_logits = lm_logits.to(torch.float32)
1007
+
1008
  # Shift so that tokens < n predict n
1009
  shift_logits = lm_logits[..., :-1, :].contiguous()
 
 
1010
  shift_labels = labels[..., 1:].contiguous()
1011
+ # Flatten the tokens
1012
+ loss_fct = CrossEntropyLoss(ignore_index=-100)
1013
+ loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
1014
 
1015
  lm_logits = lm_logits.to(hidden_states.dtype)
1016
  loss = loss.to(hidden_states.dtype)
 
1035
  This function is used to re-order the `past_key_values` cache if [`~PreTrainedModel.beam_search`] or
1036
  [`~PreTrainedModel.beam_sample`] is called. This is required to match `past_key_values` with the correct
1037
  beam_idx at every generation step.
 
1038
  Output shares the same memory storage as `past`.
1039
  """
1040
  return tuple(
1041
  (
1042
+ layer_past[0].index_select(0, beam_idx.to(layer_past[0].device)),
1043
+ layer_past[1].index_select(0, beam_idx.to(layer_past[1].device)),
1044
  )
1045
  for layer_past in past
1046
  )
1047
 
 
1048
  @torch.inference_mode()
1049
  def chat(self, tokenizer, query: str, history: List[Dict] = None, role: str = "user",
1050
+ max_length: int = 8192, num_beams=1, do_sample=True, top_p=0.8, temperature=0.8,
1051
  **kwargs):
1052
  if history is None:
1053
  history = []
1054
  gen_kwargs = {"max_length": max_length, "num_beams": num_beams, "do_sample": do_sample, "top_p": top_p,
1055
+ "temperature": temperature, **kwargs}
1056
  inputs = tokenizer.build_chat_input(query, history=history, role=role)
1057
  inputs = inputs.to(self.device)
1058
  eos_token_id = [tokenizer.eos_token_id, tokenizer.get_command("<|user|>"),
 
1061
  outputs = outputs.tolist()[0][len(inputs["input_ids"][0]):-1]
1062
  response = tokenizer.decode(outputs)
1063
  history.append({"role": role, "content": query})
 
1064
  return response, history
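For reviewers who want to exercise the new code end to end, here is a minimal usage sketch, not part of this commit. It assumes a transformers release that accepts the `attn_implementation` argument, a checkpoint published with this remote code (the repo id below is a placeholder), and a tokenizer that provides `build_chat_input`, which the `chat()` method above relies on. `CORE_ATTENTION_CLASSES` in this diff maps "eager", "sdpa", and "flash_attention_2" to the attention backends added by the commit.

# Minimal sketch (assumptions noted above): load the remote-code model and pick
# one of the attention backends wired up through CORE_ATTENTION_CLASSES.
import torch
from transformers import AutoModel, AutoTokenizer

repo = "THUDM/glm-4-9b-chat"  # placeholder repo id; substitute the actual checkpoint
device = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = AutoTokenizer.from_pretrained(repo, trust_remote_code=True)
model = AutoModel.from_pretrained(
    repo,
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    attn_implementation="sdpa",  # or "eager" / "flash_attention_2"
).to(device).eval()

# chat() returns (response, history); history is a list of {"role", "content"} dicts.
response, history = model.chat(tokenizer, "Hello", history=[], do_sample=True,
                               top_p=0.8, temperature=0.8, max_length=8192)
print(response)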