FreedomIntelligence
/

HuatuoGPT2-13B

@@ -1,6 +1,6 @@
 {
   "_from_model_config": true,
-  "_name_or_path": "/mntcephfs/data/med/zhanghongbo/yaojishi/cjy/ckpts/huatuo2_13B_v3_final/checkpoint-0-8706/tfmr",
   "architectures": [
     "BaichuanForCausalLM"
   ],

 {
   "_from_model_config": true,
+  "_name_or_path": "HuatuoGPT2-13B",
   "architectures": [
     "BaichuanForCausalLM"
   ],

generation_utils.py CHANGED Viewed

@@ -3,51 +3,6 @@ from queue import Queue
 import torch
-# def build_chat_input(model, tokenizer, messages: List[dict], max_new_tokens: int=0):
-#     def _parse_messages(messages, split_role="user"):
-#         system, rounds = "", []
-#         round = []
-#         for i, message in enumerate(messages):
-#             if message["role"] == "system":
-#                 assert i == 0
-#                 system = message["content"]
-#                 continue
-#             if message["role"] == split_role and round:
-#                 rounds.append(round)
-#                 round = []
-#             round.append(message)
-#         if round:
-#             rounds.append(round)
-#         return system, rounds
-#     max_new_tokens = max_new_tokens or model.generation_config.max_new_tokens
-#     max_input_tokens = model.config.model_max_length - max_new_tokens
-#     system, rounds = _parse_messages(messages, split_role="user")
-#     system_tokens = tokenizer.encode(system)
-#     max_history_tokens = max_input_tokens - len(system_tokens)
-#     history_tokens = []
-#     for round in rounds[::-1]:
-#         round_tokens = []
-#         for message in round:
-#             if message["role"] == "user":
-#                 round_tokens.append(model.generation_config.user_token_id)
-#             else:
-#                 round_tokens.append(model.generation_config.assistant_token_id)
-#             round_tokens.extend(tokenizer.encode(message["content"]))
-#         if len(history_tokens) == 0 or len(history_tokens) + len(round_tokens) <= max_history_tokens:
-#             history_tokens = round_tokens + history_tokens  # concat left
-#             if len(history_tokens) < max_history_tokens:
-#                 continue
-#         break
-#     input_tokens = system_tokens + history_tokens
-#     if messages[-1]["role"] != "assistant":
-#         input_tokens.append(model.generation_config.assistant_token_id)
-#     input_tokens = input_tokens[-max_input_tokens:]  # truncate left
-#     return torch.LongTensor([input_tokens]).to(model.device)
 # for HuatuoGPT2
 def build_chat_input(model, tokenizer, messages: List[dict], max_new_tokens: int=0):
     def _parse_messages(messages, split_role="user"):

 import torch
 # for HuatuoGPT2
 def build_chat_input(model, tokenizer, messages: List[dict], max_new_tokens: int=0):
     def _parse_messages(messages, split_role="user"):

modeling_baichuan.py CHANGED Viewed

@@ -1,45 +1,26 @@
-# Copyright 2023 Baichuan Inc. All Rights Reserved.
-# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
-#
-# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
-# and OPT implementations in this library. It has been modified from its
-# original forms to accommodate minor architectural differences compared
-# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
 from .configuration_baichuan import BaichuanConfig
 from .generation_utils import build_chat_input, TextIterStreamer
 import math
-from typing import List, Optional, Tuple, Union
 from threading import Thread
 import torch
-import torch.utils.checkpoint
 from torch import nn
-from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
 from torch.nn import functional as F
 from transformers import PreTrainedModel, PretrainedConfig
 from transformers.activations import ACT2FN
-from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
 from transformers.generation.utils import GenerationConfig
 from transformers.utils import logging, ContextManagers
 import os
 from contextlib import contextmanager
 logger = logging.get_logger(__name__)
 try:
@@ -51,169 +32,138 @@ except ImportError:
     )
-# Copied from transformers.models.bart.modeling_bart._make_causal_mask
-def _make_causal_mask(
-        input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0
-):
-    """
-    Make causal mask used for bi-directional self-attention.
-    """
-    bsz, tgt_len = input_ids_shape
-    mask = torch.full((tgt_len, tgt_len), torch.tensor(torch.finfo(dtype).min, device=device), device=device)
-    mask_cond = torch.arange(mask.size(-1), device=device)
-    mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0)
-    mask = mask.to(dtype)
-    if past_key_values_length > 0:
-        mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype, device=device), mask], dim=-1)
-    return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length)
-def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
-    """
-    Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
-    """
-    if len(mask.size()) == 3:
-        bsz, src_len, _ = mask.size()
-        tgt_len = tgt_len if tgt_len is not None else src_len
-        expanded_mask = mask[:,None,:,:].expand(bsz, 1, tgt_len, src_len).to(dtype)
     else:
-        bsz, src_len = mask.size()
-        tgt_len = tgt_len if tgt_len is not None else src_len
-        expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)
-    inverted_mask = 1.0 - expanded_mask
-    return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)
-class RMSNorm(nn.Module):
-    def __init__(self, hidden_size, eps=1e-6):
-        """
-        RMSNorm is equivalent to T5LayerNorm
-        """
         super().__init__()
-        self.weight = nn.Parameter(torch.ones(hidden_size))
-        self.variance_epsilon = eps
     def forward(self, hidden_states):
         variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True)
-        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
-        # convert into half-precision if necessary
         if self.weight.dtype in [torch.float16, torch.bfloat16]:
             hidden_states = hidden_states.to(self.weight.dtype)
         return self.weight * hidden_states
-class RotaryEmbedding(torch.nn.Module):
-    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
-        super().__init__()
-        self.inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float().to(device) / dim))
-        self.max_seq_len_cached = max_position_embeddings
-        t = torch.arange(self.max_seq_len_cached, device=self.inv_freq.device, dtype=torch.float32)
-        freqs = torch.outer(t, self.inv_freq)
-        emb = torch.cat((freqs, freqs), dim=-1)
-        self.cos_cached = emb.cos()[None, None, :, :].to(torch.float32)
-        self.sin_cached = emb.sin()[None, None, :, :].to(torch.float32)
-    def forward(self, x, seq_len=None):
-        # x: [bs, num_attention_heads, seq_len, head_size]
-        # This `if` block is unlikely to be run after we build sin/cos in `__init__`. Keep the logic here just in case.
-        if seq_len > self.max_seq_len_cached:
-            self.max_seq_len_cached = seq_len
-            t = torch.arange(self.max_seq_len_cached, device=self.inv_freq.device, dtype=torch.float32)
-            freqs = torch.outer(t, self.inv_freq)
-            emb = torch.cat((freqs, freqs), dim=-1)
-            self.cos_cached = emb.cos()[None, None, :, :].to(torch.float32).to(x.device)
-            self.sin_cached = emb.sin()[None, None, :, :].to(torch.float32).to(x.device)
-        elif self.cos_cached.device != x.device:
-            self.cos_cached = self.cos_cached.to(x.device)
-            self.sin_cached = self.sin_cached.to(x.device)
-        return (
-            self.cos_cached[:, :, :seq_len, ...],
-            self.sin_cached[:, :, :seq_len, ...],
-        )
-def rotate_half(x):
-    """Rotates half the hidden dims of the input."""
-    x1 = x[..., : x.shape[-1] // 2]
-    x2 = x[..., x.shape[-1] // 2:]
-    return torch.cat((-x2, x1), dim=-1)
-def apply_rotary_pos_emb(q, k, cos_, sin_, position_ids):
-    cos = cos_.squeeze(1).squeeze(0)  # [seq_len, dim]
-    sin = sin_.squeeze(1).squeeze(0)  # [seq_len, dim]
-    cos = cos[position_ids].unsqueeze(1)  # [bs, 1, seq_len, dim]
-    sin = sin[position_ids].unsqueeze(1)  # [bs, 1, seq_len, dim]
-    q_embed = (q.float() * cos) + (rotate_half(q.float()) * sin)
-    k_embed = (k.float() * cos) + (rotate_half(k.float()) * sin)
-    return q_embed.to(q.dtype), k_embed.to(k.dtype)
-class MLP(nn.Module):
     def __init__(
-            self,
-            hidden_size: int,
-            intermediate_size: int,
-            hidden_act: str,
     ):
         super().__init__()
-        self.gate_proj = nn.Linear(hidden_size, intermediate_size, bias=False)
-        self.down_proj = nn.Linear(intermediate_size, hidden_size, bias=False)
-        self.up_proj = nn.Linear(hidden_size, intermediate_size, bias=False)
         self.act_fn = ACT2FN[hidden_act]
     def forward(self, x):
         return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
-class Attention(nn.Module):
-    """Multi-headed attention from 'Attention Is All You Need' paper"""
     def __init__(self, config: BaichuanConfig):
         super().__init__()
         self.config = config
         self.hidden_size = config.hidden_size
         self.num_heads = config.num_attention_heads
         self.head_dim = self.hidden_size // self.num_heads
-        self.max_position_embeddings = config.max_position_embeddings
         if (self.head_dim * self.num_heads) != self.hidden_size:
             raise ValueError(
-                f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
-                f" and `num_heads`: {self.num_heads})."
             )
-        self.W_pack = nn.Linear(self.hidden_size, 3 * self.hidden_size, bias=False)
-        self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)
-        self.rotary_emb = RotaryEmbedding(self.head_dim, max_position_embeddings=self.max_position_embeddings)
     def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
-        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
     def forward(
-            self,
-            hidden_states: torch.Tensor,
-            attention_mask: Optional[torch.Tensor] = None,
-            position_ids: Optional[torch.LongTensor] = None,
-            past_key_value: Optional[Tuple[torch.Tensor]] = None,
-            output_attentions: bool = False,
-            use_cache: bool = False,
     ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
         bsz, q_len, _ = hidden_states.size()
         proj = self.W_pack(hidden_states)
-        proj = proj.unflatten(-1, (3, self.hidden_size)).unsqueeze(0).transpose(0, -2).squeeze(-2)
-        query_states = proj[0].view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
-        key_states = proj[1].view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
-        value_states = proj[2].view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
         kv_seq_len = key_states.shape[-2]
         if past_key_value is not None:
             kv_seq_len += past_key_value[0].shape[-2]
-        cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
-        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
-        # [bsz, nh, t, hd]
         if past_key_value is not None:
             # reuse k, v, self_attention
@@ -223,16 +173,35 @@ class Attention(nn.Module):
         past_key_value = (key_states, value_states) if use_cache else None
         if xops is not None and self.training:
             attn_weights = None
-            query_states = query_states.transpose(1, 2)
-            key_states = key_states.transpose(1, 2)
-            value_states = value_states.transpose(1, 2)
-            attn_output = xops.memory_efficient_attention(
-                query_states, key_states, value_states, attn_bias=xops.LowerTriangularMask()
-            )
-        else:
             with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=True, enable_mem_efficient=True):
                 attn_output = F.scaled_dot_product_attention(query_states, key_states, value_states, attn_mask = attention_mask)
             attn_output = attn_output.transpose(1, 2)
         attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
         attn_output = self.o_proj(attn_output)
@@ -242,29 +211,31 @@ class Attention(nn.Module):
         return attn_output, attn_weights, past_key_value
-class DecoderLayer(nn.Module):
     def __init__(self, config: BaichuanConfig):
         super().__init__()
         self.hidden_size = config.hidden_size
-        self.self_attn = Attention(config=config)
         self.mlp = MLP(
             hidden_size=self.hidden_size,
             intermediate_size=config.intermediate_size,
             hidden_act=config.hidden_act,
         )
-        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
-        self.post_attention_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
     def forward(
-            self,
-            hidden_states: torch.Tensor,
-            attention_mask: Optional[torch.Tensor] = None,
-            position_ids: Optional[torch.LongTensor] = None,
-            past_key_value: Optional[Tuple[torch.Tensor]] = None,
-            output_attentions: Optional[bool] = False,
-            use_cache: Optional[bool] = False,
-    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
         residual = hidden_states
         hidden_states = self.input_layernorm(hidden_states)
@@ -273,7 +244,6 @@ class DecoderLayer(nn.Module):
         hidden_states, self_attn_weights, present_key_value = self.self_attn(
             hidden_states=hidden_states,
             attention_mask=attention_mask,
-            position_ids=position_ids,
             past_key_value=past_key_value,
             output_attentions=output_attentions,
             use_cache=use_cache,
@@ -288,9 +258,6 @@ class DecoderLayer(nn.Module):
         outputs = (hidden_states,)
-        if output_attentions:
-            outputs += (self_attn_weights,)
         if use_cache:
             outputs += (present_key_value,)
@@ -301,16 +268,16 @@ class BaichuanPreTrainedModel(PreTrainedModel):
     config_class = BaichuanConfig
     base_model_prefix = "model"
     supports_gradient_checkpointing = True
-    _no_split_modules = ["DecoderLayer"]
     _keys_to_ignore_on_load_unexpected = [r"decoder\.version"]
     def _init_weights(self, module):
         std = self.config.initializer_range
-        if isinstance(module, nn.Linear):
             module.weight.data.normal_(mean=0.0, std=std)
             if module.bias is not None:
                 module.bias.data.zero_()
-        elif isinstance(module, nn.Embedding):
             module.weight.data.normal_(mean=0.0, std=std)
             if module.padding_idx is not None:
                 module.weight.data[module.padding_idx].zero_()
@@ -325,14 +292,20 @@ class BaichuanModel(BaichuanPreTrainedModel):
         super().__init__(config)
         self.padding_idx = config.pad_token_id
         self.vocab_size = config.vocab_size
-        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
-        self.layers = nn.ModuleList([DecoderLayer(config) for _ in range(config.num_hidden_layers)])
-        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
-        self.gradient_checkpointing = False
-        # Initialize weights and apply final processing
         self.post_init()
     def get_input_embeddings(self):
         return self.embed_tokens
@@ -340,86 +313,118 @@ class BaichuanModel(BaichuanPreTrainedModel):
     def set_input_embeddings(self, value):
         self.embed_tokens = value
-    # Copied from transformers.models.bart.modeling_bart.BartDecoder._prepare_decoder_attention_mask
-    def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length):
-        # create causal mask
-        # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
-        combined_attention_mask = None
-        if input_shape[-1] > 1:
-            combined_attention_mask = _make_causal_mask(
-                input_shape,
-                inputs_embeds.dtype,
-                device=inputs_embeds.device,
-                past_key_values_length=past_key_values_length,
             )
-        if attention_mask is not None:
-            # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
-            expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]).to(
-                inputs_embeds.device
             )
-            combined_attention_mask = (
-                expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask
             )
-        return combined_attention_mask
     def forward(
-            self,
-            input_ids: torch.LongTensor = None,
-            attention_mask: Optional[torch.Tensor] = None,
-            position_ids: Optional[torch.LongTensor] = None,
-            past_key_values: Optional[List[torch.FloatTensor]] = None,
-            inputs_embeds: Optional[torch.FloatTensor] = None,
-            use_cache: Optional[bool] = None,
-            output_attentions: Optional[bool] = None,
-            output_hidden_states: Optional[bool] = None,
-            return_dict: Optional[bool] = None,
     ) -> Union[Tuple, BaseModelOutputWithPast]:
-        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
-        output_hidden_states = (
-            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
-        )
-        use_cache = use_cache if use_cache is not None else self.config.use_cache
-        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-        # retrieve input_ids and inputs_embeds
         if input_ids is not None and inputs_embeds is not None:
-            raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
         elif input_ids is not None:
             batch_size, seq_length = input_ids.shape
         elif inputs_embeds is not None:
             batch_size, seq_length, _ = inputs_embeds.shape
         else:
-            raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")
         seq_length_with_past = seq_length
-        past_key_values_length = 0
         if past_key_values is not None:
             past_key_values_length = past_key_values[0][0].shape[2]
             seq_length_with_past = seq_length_with_past + past_key_values_length
-        if position_ids is None:
-            device = input_ids.device if input_ids is not None else inputs_embeds.device
-            position_ids = torch.arange(
-                past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device
-            )
-            position_ids = position_ids.unsqueeze(0).view(-1, seq_length)
-        else:
-            position_ids = position_ids.view(-1, seq_length).long()
         if inputs_embeds is None:
             inputs_embeds = self.embed_tokens(input_ids)
-        # embed positions
-        if attention_mask is None:
-            attention_mask = torch.ones(
-                (batch_size, seq_length_with_past), dtype=torch.bool, device=inputs_embeds.device
             )
-        attention_mask = self._prepare_decoder_attention_mask(
-            attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length
-        )
         hidden_states = inputs_embeds
@@ -439,7 +444,9 @@ class BaichuanModel(BaichuanPreTrainedModel):
             if output_hidden_states:
                 all_hidden_states += (hidden_states,)
-            past_key_value = past_key_values[idx] if past_key_values is not None else None
             if self.gradient_checkpointing and self.training:
@@ -454,14 +461,12 @@ class BaichuanModel(BaichuanPreTrainedModel):
                     create_custom_forward(decoder_layer),
                     hidden_states,
                     attention_mask,
-                    position_ids,
                     None,
                 )
             else:
                 layer_outputs = decoder_layer(
                     hidden_states,
                     attention_mask=attention_mask,
-                    position_ids=position_ids,
                     past_key_value=past_key_value,
                     output_attentions=output_attentions,
                     use_cache=use_cache,
@@ -483,7 +488,11 @@ class BaichuanModel(BaichuanPreTrainedModel):
         next_cache = next_decoder_cache if use_cache else None
         if not return_dict:
-            return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
         return BaseModelOutputWithPast(
             last_hidden_state=hidden_states,
             past_key_values=next_cache,
@@ -505,7 +514,7 @@ class NormHead(nn.Module):
             self.first_flag = True
         elif self.first_flag:
             self.first_flag = False
-            self.weight.data = nn.functional.normalize(self.weight)
             norm_weight = self.weight
         else:
             norm_weight = self.weight
@@ -523,17 +532,18 @@ def no_init_weights(_enable=True):
     finally:
         _init_weights = old_init_weights
 class BaichuanForCausalLM(BaichuanPreTrainedModel):
     def __init__(self, config, *model_args, **model_kwargs):
         super().__init__(config, *model_args, **model_kwargs)
         self.model = BaichuanModel(config)
         self.lm_head = NormHead(config.hidden_size, config.vocab_size, bias=False)
         if hasattr(config, "quantization_config") and isinstance(config.quantization_config, dict) and config.quantization_config.get('load_in_4bit', False):
             try:
                 from .quantizer import quantize_offline, init_model_weight_int4
             except ImportError:
-                raise ImportError(f"Needs QLinear to run quantize.")
             quantize_offline(self, 4)
         # Initialize weights and apply final processing
         self.post_init()
@@ -571,6 +581,7 @@ class BaichuanForCausalLM(BaichuanPreTrainedModel):
         use_safetensors: bool = None,
         **kwargs,
     ):
         # Load config if we don't provide a configuration
         if not isinstance(config, PretrainedConfig):
             config_path = config if config is not None else pretrained_model_name_or_path
@@ -591,36 +602,97 @@ class BaichuanForCausalLM(BaichuanPreTrainedModel):
             )
         else:
             model_kwargs = kwargs
         return super(BaichuanForCausalLM, cls).from_pretrained(pretrained_model_name_or_path, *model_args,
                 config=config, cache_dir=cache_dir, ignore_mismatched_sizes=ignore_mismatched_sizes,
                 force_download=force_download, local_files_only=local_files_only, token=token, revision=revision,
-                use_safetensors=use_safetensors, **kwargs)
     def forward(
-            self,
-            input_ids: torch.LongTensor = None,
-            attention_mask: Optional[torch.Tensor] = None,
-            position_ids: Optional[torch.LongTensor] = None,
-            past_key_values: Optional[List[torch.FloatTensor]] = None,
-            inputs_embeds: Optional[torch.FloatTensor] = None,
-            labels: Optional[torch.LongTensor] = None,
-            use_cache: Optional[bool] = None,
-            output_attentions: Optional[bool] = None,
-            output_hidden_states: Optional[bool] = None,
-            return_dict: Optional[bool] = None,
     ) -> Union[Tuple, CausalLMOutputWithPast]:
-        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
-        output_hidden_states = (
-            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
         )
-        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
         # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
         outputs = self.model(
             input_ids=input_ids,
             attention_mask=attention_mask,
-            position_ids=position_ids,
             past_key_values=past_key_values,
             inputs_embeds=inputs_embeds,
             use_cache=use_cache,
@@ -658,20 +730,24 @@ class BaichuanForCausalLM(BaichuanPreTrainedModel):
             attentions=outputs.attentions,
         )
     def prepare_inputs_for_generation(
-            self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
     ):
         if past_key_values:
             input_ids = input_ids[:, -1:]
-        position_ids = kwargs.get("position_ids", None)
-        if attention_mask is not None and position_ids is None:
-            # create position_ids on the fly for batch generation
-            position_ids = attention_mask.long().cumsum(-1) - 1
-            position_ids.masked_fill_(attention_mask == 0, 1)
-            if past_key_values:
-                position_ids = position_ids[:, -1].unsqueeze(-1)
         # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
         if inputs_embeds is not None and past_key_values is None:
             model_inputs = {"inputs_embeds": inputs_embeds}
@@ -680,7 +756,6 @@ class BaichuanForCausalLM(BaichuanPreTrainedModel):
         model_inputs.update(
             {
-                "position_ids": position_ids,
                 "past_key_values": past_key_values,
                 "use_cache": kwargs.get("use_cache"),
                 "attention_mask": attention_mask,
@@ -690,22 +765,71 @@ class BaichuanForCausalLM(BaichuanPreTrainedModel):
     @staticmethod
     def _reorder_cache(past_key_values, beam_idx):
-        reordered_past = ()
-        for layer_past in past_key_values:
-            reordered_past += (tuple(past_state.index_select(0, beam_idx) for past_state in layer_past),)
-        return reordered_past
-    def quantize(self, bits: int):
-        try:
-            from .quantizer import quantize_online
-        except ImportError:
-            raise ImportError(f"Needs QLinear to run quantize.")
-        return quantize_online(self, bits)
     def chat(self, tokenizer, messages: List[dict], stream=False,
              generation_config: Optional[GenerationConfig]=None):
         generation_config = generation_config or self.generation_config
         input_ids = build_chat_input(self, tokenizer, messages, generation_config.max_new_tokens)
         if stream:
             streamer = TextIterStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
             Thread(target=self.generate, kwargs=dict(

+# Copyright (c) 2023, Baichuan Intelligent Technology. All rights reserved.
 from .configuration_baichuan import BaichuanConfig
 from .generation_utils import build_chat_input, TextIterStreamer
 import math
 from threading import Thread
+from typing import List, Optional, Tuple, Union
 import torch
 from torch import nn
+from torch.nn import CrossEntropyLoss
 from torch.nn import functional as F
 from transformers import PreTrainedModel, PretrainedConfig
 from transformers.activations import ACT2FN
 from transformers.generation.utils import GenerationConfig
+from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
 from transformers.utils import logging, ContextManagers
 import os
 from contextlib import contextmanager
+from accelerate import init_empty_weights
 logger = logging.get_logger(__name__)
 try:
     )
+def _get_interleave(n):
+    """Return the list of ``n`` slope values that ``_gen_alibi_mask`` multiplies into its bias.
+    When ``n`` is a power of two the slopes are a geometric sequence whose first
+    term and ratio are both ``2 ** (-(2 ** -(log2(n) - 3)))``. Otherwise the
+    sequence for the largest power of two not above ``n`` is extended with the
+    even-indexed slopes of the sequence for twice that power of two.
+    """
+    def _power_of_2_slopes(count):
+        # First term and common ratio are the same value.
+        base = 2 ** (-(2 ** -(math.log2(count) - 3)))
+        return [base * base**k for k in range(count)]
+    exponent = math.log2(n)
+    if exponent.is_integer():
+        return _power_of_2_slopes(n)
+    lower = 2 ** math.floor(exponent)
+    # Remaining heads take every second slope of the next power-of-two sequence.
+    extra = _get_interleave(2 * lower)[0::2][: n - lower]
+    return _power_of_2_slopes(lower) + extra
+def _fill_with_neg_inf(t):
+    """Fill a tensor with -inf through a float32 intermediate and return it cast back to the type of ``t``."""
+    as_float32 = t.float()
+    as_float32.fill_(float("-inf"))
+    return as_float32.type_as(t)
+def _buffered_future_mask(tensor, maxpos, alibi, attn_heads):
+    """Return a ``maxpos`` x ``maxpos`` strictly-upper-triangular -inf mask with ``alibi`` added.
+    The sum is converted with ``.to(tensor)`` and its first axis is cropped to
+    ``tensor.shape[0] * attn_heads`` entries.
+    """
+    upper_neg_inf = torch.triu(_fill_with_neg_inf(torch.zeros([maxpos, maxpos])), 1)
+    biased = (upper_neg_inf.unsqueeze(0) + alibi).to(tensor)
+    leading = tensor.shape[0] * attn_heads
+    return biased[:leading, :maxpos, :maxpos]
+def _gen_alibi_mask(tensor, n_head, max_pos):
+    """Build the additive attention bias for ``n_head`` heads and ``max_pos`` positions.
+    The result is a strictly-upper-triangular -inf matrix plus, per head, that
+    head's slope (from ``_get_interleave``) times a position offset.
+    ``tensor`` is accepted but not read anywhere in this function.
+    """
+    slopes = torch.Tensor(_get_interleave(n_head))
+    # Values run from -(max_pos - 1) up to 0.
+    position_point = torch.arange(max_pos) - max_pos + 1
+    # Expanded to (n_head, 1, max_pos).
+    position_point = position_point.unsqueeze(0).unsqueeze(0).expand(n_head, -1, -1)
+    # NOTE(review): position_point[0] is (1, max_pos), so torch.diag should yield
+    # only its first element; subtracting it makes the offsets start at 0 - confirm.
+    diag = torch.diag(position_point[0])
+    position_point = position_point - diag.unsqueeze(0).unsqueeze(0).transpose(-1, -2)
+    alibi = slopes.unsqueeze(1).unsqueeze(1) * position_point
+    alibi = alibi.view(n_head, 1, max_pos)
+    # Strictly-upper triangle is -inf, the rest 0.
+    alibi_mask = torch.triu(_fill_with_neg_inf(torch.zeros([max_pos, max_pos])), 1)
+    # Broadcasting the (n_head, 1, max_pos) bias over the square mask.
+    alibi_mask = alibi_mask.unsqueeze(0) + alibi
+    return alibi_mask
+class RMSNorm(torch.nn.Module):
+    """Root-mean-square normalisation over the last dimension with a learned scale.
+    The scale ``weight`` is allocated with ``torch.empty``; no values are assigned to it here.
+    """
+    def __init__(self, hidden_size, epsilon=1e-6):
         super().__init__()
+        self.weight = torch.nn.Parameter(torch.empty(hidden_size))
+        self.epsilon = epsilon
     def forward(self, hidden_states):
+        # The mean of squares is taken on a float32 copy of the input.
+        mean_square = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True)
+        normed = hidden_states * torch.rsqrt(mean_square + self.epsilon)
+        # Cast down only when the scale itself is stored in half precision.
+        if self.weight.dtype in (torch.float16, torch.bfloat16):
+            normed = normed.to(self.weight.dtype)
+        return self.weight * normed
+class MLP(torch.nn.Module):
+    """Gated feed-forward block computing ``down_proj(act_fn(gate_proj(x)) * up_proj(x))`` with bias-free linears."""
+    def __init__(self, hidden_size: int, intermediate_size: int, hidden_act: str):
         super().__init__()
+        def _bias_free(in_features, out_features):
+            return torch.nn.Linear(in_features, out_features, bias=False)
+        # Creation order (gate, down, up) is left unchanged.
+        self.gate_proj = _bias_free(hidden_size, intermediate_size)
+        self.down_proj = _bias_free(intermediate_size, hidden_size)
+        self.up_proj = _bias_free(hidden_size, intermediate_size)
         self.act_fn = ACT2FN[hidden_act]
     def forward(self, x):
+        gate = self.act_fn(self.gate_proj(x))
+        return self.down_proj(gate * self.up_proj(x))
+class BaichuanAttention(torch.nn.Module):
     def __init__(self, config: BaichuanConfig):
+        """Read the head geometry from ``config`` and create the ``W_pack`` and ``o_proj`` linears.
+        Raises ``ValueError`` when ``head_dim * num_heads`` does not equal ``hidden_size``.
+        """
         super().__init__()
         self.config = config
+        hidden_size, num_heads = config.hidden_size, config.num_attention_heads
+        self.hidden_size = hidden_size
+        self.num_heads = num_heads
+        self.head_dim = hidden_size // num_heads
+        self.max_position_embeddings = config.model_max_length
+        if self.head_dim * num_heads != hidden_size:
+            raise ValueError(
+                f"hidden_size {self.hidden_size} is not divisible by num_heads {self.num_heads}"
+            )
+        # A single projection with 3 * hidden_size outputs; forward splits it three ways.
+        self.W_pack = torch.nn.Linear(hidden_size, 3 * hidden_size, bias=False)
+        self.o_proj = torch.nn.Linear(num_heads * self.head_dim, hidden_size, bias=False)
     def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
+        """View ``tensor`` as ``(bsz, seq_len, num_heads, head_dim)``, swap axes 1 and 2, and return it contiguous."""
+        per_head = tensor.view(bsz, seq_len, self.num_heads, self.head_dim)
+        return per_head.transpose(1, 2).contiguous()
     def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        past_key_value: Optional[Tuple[torch.Tensor]] = None,
+        output_attentions: bool = False,
+        use_cache: bool = False,
     ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
         bsz, q_len, _ = hidden_states.size()
         proj = self.W_pack(hidden_states)
+        proj = (
+            proj.unflatten(-1, (3, self.hidden_size))
+            .unsqueeze(0)
+            .transpose(0, -2)
+            .squeeze(-2)
+        )
+        query_states = (
+            proj[0].view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+        )
+        key_states = (
+            proj[1].view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+        )
+        value_states = (
+            proj[2].view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+        )
         kv_seq_len = key_states.shape[-2]
         if past_key_value is not None:
             kv_seq_len += past_key_value[0].shape[-2]
         if past_key_value is not None:
             # reuse k, v, self_attention
         past_key_value = (key_states, value_states) if use_cache else None
         if xops is not None and self.training:
             attn_weights = None
+            # query_states = query_states.transpose(1, 2)
+            # key_states = key_states.transpose(1, 2)
+            # value_states = value_states.transpose(1, 2)
+            # attn_output = xops.memory_efficient_attention(
+            #     query_states, key_states, value_states, attn_bias=attention_mask
+            # )
             with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=True, enable_mem_efficient=True):
                 attn_output = F.scaled_dot_product_attention(query_states, key_states, value_states, attn_mask = attention_mask)
             attn_output = attn_output.transpose(1, 2)
+        else:
+            attn_weights = torch.matmul(
+                query_states, key_states.transpose(2, 3)
+            ) / math.sqrt(self.head_dim)
+            if attention_mask is not None:
+                if q_len == 1:  # inference with cache
+                    if len(attention_mask.size()) == 4:
+                        attention_mask = attention_mask[:, :, -1:, :]
+                    else:
+                        attention_mask = attention_mask[:, -1:, :]
+                attn_weights = attn_weights + attention_mask
+                attn_weights = torch.max(
+                    attn_weights, torch.tensor(torch.finfo(attn_weights.dtype).min)
+                )
+            attn_weights = torch.nn.functional.softmax(attn_weights, dim=-1)
+            attn_output = torch.matmul(attn_weights, value_states)
+            attn_output = attn_output.transpose(1, 2)
         attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
         attn_output = self.o_proj(attn_output)
         return attn_output, attn_weights, past_key_value
+class BaichuanLayer(torch.nn.Module):
     def __init__(self, config: BaichuanConfig):
+        """Create the self-attention module, the MLP and the two RMSNorm modules, all sized from ``config``."""
         super().__init__()
         self.hidden_size = config.hidden_size
+        self.self_attn = BaichuanAttention(config=config)
+        self.mlp = MLP(self.hidden_size, config.intermediate_size, config.hidden_act)
+        # Both norms use the same width and epsilon.
+        norm_width, norm_eps = config.hidden_size, config.rms_norm_eps
+        self.input_layernorm = RMSNorm(norm_width, epsilon=norm_eps)
+        self.post_attention_layernorm = RMSNorm(norm_width, epsilon=norm_eps)
     def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        past_key_value: Optional[Tuple[torch.Tensor]] = None,
+        output_attentions: Optional[bool] = False,
+        use_cache: Optional[bool] = False,
+    ) -> Tuple[
+        torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]
+    ]:
         residual = hidden_states
         hidden_states = self.input_layernorm(hidden_states)
         hidden_states, self_attn_weights, present_key_value = self.self_attn(
             hidden_states=hidden_states,
             attention_mask=attention_mask,
             past_key_value=past_key_value,
             output_attentions=output_attentions,
             use_cache=use_cache,
         outputs = (hidden_states,)
         if use_cache:
             outputs += (present_key_value,)
     config_class = BaichuanConfig
     base_model_prefix = "model"
     supports_gradient_checkpointing = True
+    _no_split_modules = ["BaichuanLayer"]
     _keys_to_ignore_on_load_unexpected = [r"decoder\.version"]
     def _init_weights(self, module):
+        """Re-initialise ``module`` in place when it is a ``Linear`` or an ``Embedding``.
+        Weights are drawn from a normal distribution with mean 0 and standard
+        deviation ``config.initializer_range``; a linear bias and an embedding's
+        padding row are zeroed. Any other module type is left untouched.
+        """
         std = self.config.initializer_range
+        is_linear = isinstance(module, torch.nn.Linear)
+        if not is_linear and not isinstance(module, torch.nn.Embedding):
+            return
+        module.weight.data.normal_(mean=0.0, std=std)
+        if is_linear:
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif module.padding_idx is not None:
+            module.weight.data[module.padding_idx].zero_()
         super().__init__(config)
         self.padding_idx = config.pad_token_id
         self.vocab_size = config.vocab_size
+        self.n_head = config.num_attention_heads
+        self.embed_tokens = torch.nn.Embedding(
+            config.vocab_size, config.hidden_size, self.padding_idx
+        )
+        self.layers = torch.nn.ModuleList(
+            [BaichuanLayer(config) for _ in range(config.num_hidden_layers)]
+        )
+        self.norm = RMSNorm(config.hidden_size, epsilon=config.rms_norm_eps)
+        self.gradient_checkpointing = config.gradient_checkpointing
         self.post_init()
+        self.max_cache_pos = config.model_max_length
+        self.first_run = True
+        self.alibi_mask = None
     def get_input_embeddings(self):
+        """Return the module stored in ``self.embed_tokens``."""
         return self.embed_tokens
     def set_input_embeddings(self, value):
+        """Replace ``self.embed_tokens`` with ``value``."""
         self.embed_tokens = value
+    def get_alibi_mask(self, tensor, seq_length_with_past):
+        """Return the additive attention bias for ``seq_length_with_past`` positions.
+        In training mode the bias is rebuilt on every call. Otherwise it is
+        sliced from the non-persistent ``future_mask`` buffer, which is created
+        on the first call and regenerated when a longer length is requested.
+        """
+        if self.training:
+            slopes = torch.Tensor(_get_interleave(self.n_head))
+            # Values run from -(seq_length_with_past - 1) up to 0.
+            position_point = (
+                torch.arange(seq_length_with_past) - seq_length_with_past + 1
             )
+            # Expanded to (n_head, seq_length_with_past, seq_length_with_past).
+            position_point = (
+                position_point.unsqueeze(0)
+                .unsqueeze(0)
+                .expand(self.n_head, seq_length_with_past, -1)
             )
+            # Subtracting the diagonal as a column turns entry [h, i, j] into j - i.
+            diag = torch.diag(position_point[0])
+            position_point = position_point - diag.unsqueeze(0).unsqueeze(0).transpose(
+                -1, -2
             )
+            alibi = slopes.unsqueeze(1).unsqueeze(1) * position_point
+            mask = _buffered_future_mask(
+                tensor, seq_length_with_past, alibi, self.n_head
+            )
+        else:
+            # First non-training call: build the buffer for max_cache_pos positions.
+            if self.first_run:
+                self.first_run = False
+                self.register_buffer(
+                    "future_mask",
+                    _gen_alibi_mask(tensor, self.n_head, self.max_cache_pos).to(
+                        tensor
+                    ),
+                    persistent=False,
+                )
+            # A longer request than the buffer covers: raise the limit and rebuild.
+            if seq_length_with_past > self.max_cache_pos:
+                self.max_cache_pos = seq_length_with_past
+                self.register_buffer(
+                    "future_mask",
+                    _gen_alibi_mask(tensor, self.n_head, self.max_cache_pos).to(
+                        tensor
+                    ),
+                    persistent=False,
+                )
+            mask = self.future_mask[
+                : self.n_head, :seq_length_with_past, :seq_length_with_past
+            ]
+        return mask
     def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = False,
+        output_attentions: Optional[bool] = False,
+        output_hidden_states: Optional[bool] = False,
+        return_dict: Optional[bool] = True,
     ) -> Union[Tuple, BaseModelOutputWithPast]:
         if input_ids is not None and inputs_embeds is not None:
+            raise ValueError(
+                "You cannot provide both input_ids and inputs_embeds simultaneously"
+            )
         elif input_ids is not None:
             batch_size, seq_length = input_ids.shape
         elif inputs_embeds is not None:
             batch_size, seq_length, _ = inputs_embeds.shape
         else:
+            raise ValueError("You need to provide input_ids or inputs_embeds")
+        return_dict = (
+            return_dict if return_dict is not None else self.config.use_return_dict
+        )
         seq_length_with_past = seq_length
         if past_key_values is not None:
             past_key_values_length = past_key_values[0][0].shape[2]
             seq_length_with_past = seq_length_with_past + past_key_values_length
         if inputs_embeds is None:
             inputs_embeds = self.embed_tokens(input_ids)
+        if self.training:
+            if (
+                self.alibi_mask is None
+                or self.alibi_mask.shape[-1] != seq_length_with_past
+            ):
+                self.alibi_mask = self.get_alibi_mask(
+                    inputs_embeds, seq_length_with_past
+                )
+            alibi_mask = self.alibi_mask
+        else:
+            alibi_mask = self.get_alibi_mask(inputs_embeds, seq_length_with_past)
+        if attention_mask is not None:
+            if len(attention_mask.shape) == 2:
+                expanded_mask = attention_mask.to(alibi_mask.dtype)
+                expanded_mask = torch.tril(
+                    torch.gt(expanded_mask[:, :, None] * expanded_mask[:, None, :], 0)
+                ) * torch.eq(expanded_mask[:, :, None] - expanded_mask[:, None, :], 0)
+            else:
+                expanded_mask = attention_mask
+            bsz = inputs_embeds.size(0)
+            src_len, tgt_len = alibi_mask.size()[-2:]
+            expanded_mask = (
+                expanded_mask.unsqueeze(1)
+                .expand(bsz, 1, src_len, tgt_len)
+                .to(alibi_mask.dtype)
             )
+            inverted_mask = 1.0 - expanded_mask
+            inverted_mask = inverted_mask.masked_fill(
+                inverted_mask.to(torch.bool), torch.finfo(alibi_mask.dtype).min
+            )
+            attention_mask = inverted_mask + alibi_mask.unsqueeze(0)
+        else:
+            attention_mask = alibi_mask
         hidden_states = inputs_embeds
             if output_hidden_states:
                 all_hidden_states += (hidden_states,)
+            past_key_value = (
+                past_key_values[idx] if past_key_values is not None else None
+            )
             if self.gradient_checkpointing and self.training:
                     create_custom_forward(decoder_layer),
                     hidden_states,
                     attention_mask,
                     None,
                 )
             else:
                 layer_outputs = decoder_layer(
                     hidden_states,
                     attention_mask=attention_mask,
                     past_key_value=past_key_value,
                     output_attentions=output_attentions,
                     use_cache=use_cache,
         next_cache = next_decoder_cache if use_cache else None
         if not return_dict:
+            return tuple(
+                v
+                for v in [hidden_states, next_cache, all_hidden_states, all_self_attns]
+                if v is not None
+            )
         return BaseModelOutputWithPast(
             last_hidden_state=hidden_states,
             past_key_values=next_cache,
             self.first_flag = True
         elif self.first_flag:
             self.first_flag = False
+            self.weight = nn.Parameter(nn.functional.normalize(self.weight))
             norm_weight = self.weight
         else:
             norm_weight = self.weight
     finally:
         _init_weights = old_init_weights
 class BaichuanForCausalLM(BaichuanPreTrainedModel):
     def __init__(self, config, *model_args, **model_kwargs):
+        """Build the backbone and LM head, then optionally apply offline 4-bit quantization.
+        Quantization runs only when ``config.quantization_config`` exists, is a
+        ``dict`` and has a truthy ``load_in_4bit`` entry.
+        Raises:
+            ImportError: if the ``quantizer`` helpers cannot be imported; the
+                underlying import failure is attached as ``__cause__``.
+        """
         super().__init__(config, *model_args, **model_kwargs)
         self.model = BaichuanModel(config)
         self.lm_head = NormHead(config.hidden_size, config.vocab_size, bias=False)
         if hasattr(config, "quantization_config") and isinstance(config.quantization_config, dict) and config.quantization_config.get('load_in_4bit', False):
             try:
+                # init_model_weight_int4 is not used below; it stays in the import
+                # so a quantizer module missing that name still fails here.
                 from .quantizer import quantize_offline, init_model_weight_int4
+            except ImportError as err:
+                raise ImportError("Needs quantize_offline to run quantize.") from err
             quantize_offline(self, 4)
         # Initialize weights and apply final processing
         self.post_init()
         use_safetensors: bool = None,
         **kwargs,
     ):
         # Load config if we don't provide a configuration
         if not isinstance(config, PretrainedConfig):
             config_path = config if config is not None else pretrained_model_name_or_path
             )
         else:
             model_kwargs = kwargs
+        if hasattr(config, "quantization_config") and config.quantization_config['load_in_4bit']:
+            try:
+                from .quantizer import init_model_weight_int4
+                from accelerate import init_empty_weights, dispatch_model, infer_auto_device_map
+                from accelerate.utils import CustomDtype
+                from accelerate.utils import get_balanced_memory
+            except ImportError:
+                raise ImportError(f"Needs import model weight init func to run quantize.")
+            # Instantiate model.
+            init_contexts = [no_init_weights(_enable=True)]
+            init_contexts.append(init_empty_weights())
+            with ContextManagers(init_contexts):
+                model = cls(config)
+            model_file = os.path.join(pretrained_model_name_or_path, 'pytorch_model.bin')
+            state_dict = torch.load(model_file, map_location="cpu")
+            model.is_quantized = True
+            device_map = kwargs.pop("device_map", None)
+            torch_dtype = kwargs.pop("torch_dtype", None)
+            if device_map is not None:
+                kwargs = {"no_split_module_classes": model._no_split_modules}
+                target_dtype = CustomDtype.INT4
+                max_memory = get_balanced_memory(
+                    model,
+                    dtype=target_dtype,
+                    low_zero=(device_map == "balanced_low_0"),
+                    max_memory=None,
+                    **kwargs,
+                )
+                kwargs["max_memory"] = max_memory
+                device_map = infer_auto_device_map(model, dtype=target_dtype, **kwargs)
+            model = init_model_weight_int4(config, model, state_dict)
+            # Set model in evaluation mode to deactivate DropOut modules by default
+            model.eval()
+            # If it is a model with generation capabilities, attempt to load the generation config
+            if model.can_generate():
+                try:
+                    model.generation_config = GenerationConfig.from_pretrained(
+                        pretrained_model_name_or_path,
+                        cache_dir=cache_dir,
+                        force_download=force_download,
+                        resume_download=False,
+                        proxies=None,
+                        local_files_only=local_files_only,
+                        token=token,
+                        revision=revision,
+                        subfolder="",
+                        _from_auto=False,
+                        _from_pipeline=None,
+                        **kwargs,
+                    )
+                except (OSError, TypeError):
+                    logger.info(
+                        "Generation config file not found, using a generation config created from the model config."
+                    )
+                    pass
+            if device_map is not None:
+                dispatch_model(model, device_map=device_map)
+            return model
         return super(BaichuanForCausalLM, cls).from_pretrained(pretrained_model_name_or_path, *model_args,
                 config=config, cache_dir=cache_dir, ignore_mismatched_sizes=ignore_mismatched_sizes,
                 force_download=force_download, local_files_only=local_files_only, token=token, revision=revision,
+                use_safetensors=use_safetensors, **kwargs)
     def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = False,
+        output_hidden_states: Optional[bool] = False,
+        return_dict: Optional[bool] = True,
+        **kwargs,
     ) -> Union[Tuple, CausalLMOutputWithPast]:
+        return_dict = (
+            return_dict if return_dict is not None else self.config.use_return_dict
         )
         # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
         outputs = self.model(
             input_ids=input_ids,
             attention_mask=attention_mask,
             past_key_values=past_key_values,
             inputs_embeds=inputs_embeds,
             use_cache=use_cache,
             attentions=outputs.attentions,
         )
+    def quantize(self, bits: int):
+        """Quantize this model through ``quantize_online`` from the sibling ``quantizer`` module.
+        Args:
+            bits: forwarded unchanged as the second argument of ``quantize_online``.
+        Returns:
+            Whatever ``quantize_online(self, bits)`` returns.
+        Raises:
+            ImportError: if ``quantize_online`` cannot be imported; the underlying
+                import failure is attached as ``__cause__``.
+        """
+        try:
+            from .quantizer import quantize_online
+        except ImportError as err:
+            # Chain the original failure so the missing module or name stays visible.
+            raise ImportError("Needs QLinear to run quantize.") from err
+        return quantize_online(self, bits)
     def prepare_inputs_for_generation(
+        self,
+        input_ids: torch.LongTensor,
+        past_key_values: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        **kwargs,
     ):
         if past_key_values:
             input_ids = input_ids[:, -1:]
         # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
         if inputs_embeds is not None and past_key_values is None:
             model_inputs = {"inputs_embeds": inputs_embeds}
         model_inputs.update(
             {
                 "past_key_values": past_key_values,
                 "use_cache": kwargs.get("use_cache"),
                 "attention_mask": attention_mask,
     @staticmethod
     def _reorder_cache(past_key_values, beam_idx):
+        """Select rows ``beam_idx`` along dim 0 of every cached tensor; returns a tuple of per-layer tuples."""
+        reordered = []
+        for layer_past in past_key_values:
+            layer_states = [state.index_select(0, beam_idx) for state in layer_past]
+            reordered.append(tuple(layer_states))
+        return tuple(reordered)
+    def _build_chat_input(
+        self, tokenizer, messages: List[dict], max_new_tokens: int = 0
+    ):
+        """Encode a chat history into a ``(1, n)`` LongTensor prompt on ``self.device``.
+        Messages are walked newest-first. Assistant turns are held back until the
+        user turn that precedes them is reached, then the whole round is prepended
+        if it fits the token budget. The budget is ``model_max_length`` minus
+        ``max_new_tokens``, but never less than half of ``model_max_length``. The
+        prompt is cut from the left to the budget and an assistant token is then
+        appended. Raises ``ValueError`` for any role other than user/assistant.
+        """
+        gen_cfg = self.generation_config
+        max_new_tokens = max_new_tokens or gen_cfg.max_new_tokens
+        context_len = self.config.model_max_length
+        budget = max(context_len // 2, context_len - max_new_tokens)
+        prompt, pending = [], []
+        for message in reversed(messages):
+            tokens = tokenizer.encode(message["content"])
+            role = message["role"]
+            if role == "assistant":
+                pending = [gen_cfg.assistant_token_id] + tokens + [gen_cfg.eos_token_id] + pending
+                continue
+            if role != "user":
+                raise ValueError(f"message role not supported yet: {message['role']}")
+            pending = [gen_cfg.user_token_id] + tokens + pending
+            # The newest round is always kept; older rounds only while they fit.
+            if prompt and len(prompt) + len(pending) > budget:
+                break
+            prompt = pending + prompt
+            if len(prompt) >= budget:
+                break
+            pending = []
+        prompt = prompt[-budget:]  # truncate left
+        prompt.append(gen_cfg.assistant_token_id)
+        return torch.LongTensor([prompt]).to(self.device)
     def chat(self, tokenizer, messages: List[dict], stream=False,
              generation_config: Optional[GenerationConfig]=None):
+        """Generate a reply to ``messages``.
+        With ``stream`` false, runs ``generate`` and returns the decoded tokens
+        that follow the prompt. With ``stream`` true, starts ``generate`` on a
+        new thread and returns the ``TextIterStreamer`` it writes to.
+        """
         generation_config = generation_config or self.generation_config
         input_ids = build_chat_input(self, tokenizer, messages, generation_config.max_new_tokens)
+        if not stream:
+            outputs = self.generate(input_ids, generation_config=generation_config)
+            prompt_len = len(input_ids[0])
+            return tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)
+        streamer = TextIterStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
+        worker = Thread(
+            target=self.generate,
+            kwargs={
+                "inputs": input_ids,
+                "streamer": streamer,
+                "generation_config": generation_config,
+            },
+        )
+        worker.start()
+        return streamer
+    def HuatuoChat(self, tokenizer, messages: List[dict], stream=False,
+             generation_config: Optional[GenerationConfig]=None):
+        generation_config = generation_config or self.generation_config
+        input_ids = build_chat_input(self, tokenizer, messages, generation_config.max_new_tokens)
         if stream:
             streamer = TextIterStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
             Thread(target=self.generate, kwargs=dict(