derek33125 committed
Commit 146cc0e
1 Parent(s): d4b7138

Updating model based on GLM4 update


Updating the model to the newest version of GLM4-Chat, following the new requirements on its model page.
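For reference, a minimal loading sketch under the updated requirements (transformers >= 4.42.4, `trust_remote_code`, and the new `attn_implementation` key in config.json). The repo id below is a placeholder; point it at this repository or the upstream base.

```python
# Minimal sketch, assuming transformers >= 4.42.4 and a GPU with bfloat16 support.
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

repo_id = "THUDM/glm-4-9b-chat"  # placeholder -- substitute this repository's id
tokenizer = AutoTokenizer.from_pretrained(repo_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    repo_id,
    torch_dtype=torch.bfloat16,   # matches "torch_dtype": "bfloat16" in config.json
    attn_implementation="sdpa",   # default written into config.json by this commit
    trust_remote_code=True,
).eval()
```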

config.json CHANGED
@@ -1,50 +1,45 @@
  {
  "_name_or_path": "THUDM/glm-4-9b-chat",
- "add_bias_linear": false,
- "add_qkv_bias": true,
- "apply_query_key_layer_scaling": true,
- "apply_residual_connection_post_layernorm": false,
  "architectures": [
- "ChatGLMForConditionalGeneration"
  ],
- "attention_dropout": 0.0,
- "attention_softmax_in_fp32": true,
  "auto_map": {
  "AutoConfig": "configuration_chatglm.ChatGLMConfig",
  "AutoModel": "modeling_chatglm.ChatGLMForConditionalGeneration",
- "AutoModelForCausalLM": "THUDM/glm-4-9b-chat--modeling_chatglm.ChatGLMForConditionalGeneration",
- "AutoModelForSeq2SeqLM": "THUDM/glm-4-9b-chat--modeling_chatglm.ChatGLMForConditionalGeneration",
- "AutoModelForSequenceClassification": "THUDM/glm-4-9b-chat--modeling_chatglm.ChatGLMForSequenceClassification"
  },
  "bias_dropout_fusion": true,
- "classifier_dropout": null,
- "eos_token_id": [
- 151329,
- 151336,
- 151338
- ],
  "ffn_hidden_size": 13696,
  "fp32_residual_connection": false,
  "hidden_dropout": 0.0,
  "hidden_size": 4096,
  "kv_channels": 128,
  "layernorm_epsilon": 1.5625e-07,
- "model_type": "chatglm",
  "multi_query_attention": true,
  "multi_query_group_num": 2,
  "num_attention_heads": 32,
  "num_hidden_layers": 40,
  "num_layers": 40,
  "original_rope": true,
- "pad_token_id": 151329,
  "padded_vocab_size": 151552,
  "post_layer_norm": true,
  "rmsnorm": true,
- "rope_ratio": 500,
  "seq_length": 131072,
- "tie_word_embeddings": false,
- "torch_dtype": "bfloat16",
- "transformers_version": "4.41.2",
  "use_cache": true,
- "vocab_size": 151552
- }

  {
  "_name_or_path": "THUDM/glm-4-9b-chat",
+ "model_type": "chatglm",
  "architectures": [
+ "ChatGLMModel"
  ],
  "auto_map": {
  "AutoConfig": "configuration_chatglm.ChatGLMConfig",
  "AutoModel": "modeling_chatglm.ChatGLMForConditionalGeneration",
+ "AutoModelForCausalLM": "modeling_chatglm.ChatGLMForConditionalGeneration",
+ "AutoModelForSeq2SeqLM": "modeling_chatglm.ChatGLMForConditionalGeneration",
+ "AutoModelForSequenceClassification": "modeling_chatglm.ChatGLMForSequenceClassification"
  },
+ "add_bias_linear": false,
+ "add_qkv_bias": true,
+ "apply_query_key_layer_scaling": true,
+ "apply_residual_connection_post_layernorm": false,
+ "attention_dropout": 0.0,
+ "attention_softmax_in_fp32": true,
+ "attn_implementation": "sdpa",
  "bias_dropout_fusion": true,
  "ffn_hidden_size": 13696,
  "fp32_residual_connection": false,
  "hidden_dropout": 0.0,
  "hidden_size": 4096,
  "kv_channels": 128,
  "layernorm_epsilon": 1.5625e-07,
  "multi_query_attention": true,
  "multi_query_group_num": 2,
  "num_attention_heads": 32,
  "num_hidden_layers": 40,
  "num_layers": 40,
+ "rope_ratio": 500,
  "original_rope": true,
  "padded_vocab_size": 151552,
  "post_layer_norm": true,
  "rmsnorm": true,
  "seq_length": 131072,
  "use_cache": true,
+ "torch_dtype": "bfloat16",
+ "transformers_version": "4.42.4",
+ "tie_word_embeddings": false,
+ "eos_token_id": [151329, 151336, 151338],
+ "pad_token_id": 151329
+ }
generation_config.json CHANGED
@@ -1,13 +1,13 @@
  {
- "do_sample": true,
  "eos_token_id": [
  151329,
  151336,
  151338
  ],
- "max_length": 128000,
  "pad_token_id": 151329,
  "temperature": 0.8,
  "top_p": 0.8,
- "transformers_version": "4.41.2"
- }

  {
  "eos_token_id": [
  151329,
  151336,
  151338
  ],
  "pad_token_id": 151329,
+ "do_sample": true,
  "temperature": 0.8,
+ "max_length": 128000,
  "top_p": 0.8,
+ "transformers_version": "4.42.4"
+ }
modeling_chatglm.py CHANGED
@@ -1,19 +1,14 @@
1
  """ PyTorch ChatGLM model. """
2
- import json
3
  import math
4
- import copy
5
- import warnings
6
- import re
7
  import sys
8
-
9
  import torch
10
  import torch.utils.checkpoint
11
  import torch.nn.functional as F
12
  from torch import nn
13
  from torch.nn import CrossEntropyLoss, LayerNorm, MSELoss, BCEWithLogitsLoss
14
  from torch.nn.utils import skip_init
15
- from typing import Optional, Tuple, Union, List, Callable, Dict, Any
16
- from copy import deepcopy
17
 
18
  from transformers.modeling_outputs import (
19
  BaseModelOutputWithPast,
@@ -23,10 +18,19 @@ from transformers.modeling_outputs import (
23
  from transformers.modeling_utils import PreTrainedModel
24
  from transformers.utils import logging, is_torch_npu_available
25
  from transformers.generation.logits_process import LogitsProcessor
26
- from transformers.generation.utils import LogitsProcessorList, StoppingCriteriaList, GenerationConfig, ModelOutput
27
 
28
  from .configuration_chatglm import ChatGLMConfig
29
 
 
 
30
  # flags required to enable jit fusion kernels
31
 
32
  if sys.platform != 'darwin' and not is_torch_npu_available():
@@ -40,6 +44,7 @@ logger = logging.get_logger(__name__)
40
  _CHECKPOINT_FOR_DOC = "THUDM/ChatGLM"
41
  _CONFIG_FOR_DOC = "ChatGLMConfig"
42
 
 
43
  def default_init(cls, *args, **kwargs):
44
  return cls(*args, **kwargs)
45
 
@@ -159,12 +164,13 @@ class RMSNorm(torch.nn.Module):
159
  class CoreAttention(torch.nn.Module):
160
  def __init__(self, config: ChatGLMConfig, layer_number):
161
  super(CoreAttention, self).__init__()
162
-
163
  self.apply_query_key_layer_scaling = config.apply_query_key_layer_scaling
164
  self.attention_softmax_in_fp32 = config.attention_softmax_in_fp32
165
  if self.apply_query_key_layer_scaling:
166
  self.attention_softmax_in_fp32 = True
167
  self.layer_number = max(1, layer_number)
 
168
 
169
  projection_size = config.kv_channels * config.num_attention_heads
170
 
@@ -183,91 +189,199 @@ class CoreAttention(torch.nn.Module):
183
  self.attention_dropout = torch.nn.Dropout(config.attention_dropout)
184
 
185
  def forward(self, query_layer, key_layer, value_layer, attention_mask):
186
- pytorch_major_version = int(torch.__version__.split('.')[0])
187
- if pytorch_major_version >= 2:
188
- if attention_mask is None and query_layer.shape[2] == key_layer.shape[2]:
189
- context_layer = torch.nn.functional.scaled_dot_product_attention(query_layer, key_layer, value_layer,
190
- is_causal=True)
191
- else:
192
- if attention_mask is not None:
193
- attention_mask = ~attention_mask
194
- context_layer = torch.nn.functional.scaled_dot_product_attention(query_layer, key_layer, value_layer,
195
- attention_mask)
196
- context_layer = context_layer.transpose(1, 2).contiguous()
197
- new_context_layer_shape = context_layer.size()[:-2] + (self.hidden_size_per_partition,)
198
- context_layer = context_layer.reshape(*new_context_layer_shape)
 
 
 
 
199
  else:
200
- # Raw attention scores
 
 
 
 
201
 
202
- # [b, np, sq, sk]
203
- output_size = (query_layer.size(0), query_layer.size(1), query_layer.size(2), key_layer.size(2))
204
 
205
- # [b, np, sq, hn] -> [b * np, sq, hn]
206
- query_layer = query_layer.view(output_size[0] * output_size[1], output_size[2], -1)
207
- # [b, np, sk, hn] -> [b * np, sk, hn]
208
- key_layer = key_layer.view(output_size[0] * output_size[1], output_size[3], -1)
 
209
 
210
- # preallocting input tensor: [b * np, sq, sk]
211
- matmul_input_buffer = torch.empty(
212
- output_size[0] * output_size[1], output_size[2], output_size[3], dtype=query_layer.dtype,
213
- device=query_layer.device
 
 
214
  )
215
 
216
- # Raw attention scores. [b * np, sq, sk]
217
- matmul_result = torch.baddbmm(
218
- matmul_input_buffer,
219
- query_layer, # [b * np, sq, hn]
220
- key_layer.transpose(1, 2), # [b * np, hn, sk]
221
- beta=0.0,
222
- alpha=(1.0 / self.norm_factor),
 
 
223
  )
224
 
225
- # change view to [b, np, sq, sk]
226
- attention_scores = matmul_result.view(*output_size)
227
-
228
- # ===========================
229
- # Attention probs and dropout
230
- # ===========================
231
-
232
- # attention scores and attention mask [b, np, sq, sk]
233
- if self.attention_softmax_in_fp32:
234
- attention_scores = attention_scores.float()
235
- if self.coeff is not None:
236
- attention_scores = attention_scores * self.coeff
237
- if attention_mask is None and attention_scores.shape[2] == attention_scores.shape[3]:
238
- attention_mask = torch.ones(output_size[0], 1, output_size[2], output_size[3],
239
- device=attention_scores.device, dtype=torch.bool)
240
- attention_mask.tril_()
241
- attention_mask = ~attention_mask
242
- if attention_mask is not None:
243
- attention_scores = attention_scores.masked_fill(attention_mask, float("-inf"))
244
- attention_probs = F.softmax(attention_scores, dim=-1)
245
- attention_probs = attention_probs.type_as(value_layer)
246
-
247
- # This is actually dropping out entire tokens to attend to, which might
248
- # seem a bit unusual, but is taken from the original Transformer paper.
249
- attention_probs = self.attention_dropout(attention_probs)
250
-
251
- # query layer shape: [b * np, sq, hn]
252
- # value layer shape: [b, np, sk, hn]
253
- # attention shape: [b, np, sq, sk]
254
- # context layer shape: [b, np, sq, hn]
255
- output_size = (value_layer.size(0), value_layer.size(1), query_layer.size(1), value_layer.size(3))
256
- # change view [b * np, sk, hn]
257
- value_layer = value_layer.view(output_size[0] * output_size[1], value_layer.size(2), -1)
258
- # change view [b * np, sq, sk]
259
- attention_probs = attention_probs.view(output_size[0] * output_size[1], output_size[2], -1)
260
- # matmul: [b * np, sq, hn]
261
- context_layer = torch.bmm(attention_probs, value_layer)
262
- # change view [b, np, sq, hn]
263
- context_layer = context_layer.view(*output_size)
264
- # [b, np, sq, hn] --> [b, sq, np, hn]
265
- context_layer = context_layer.transpose(1, 2).contiguous()
266
- # [b, sq, np, hn] --> [b, sq, hp]
267
- new_context_layer_shape = context_layer.size()[:-2] + (self.hidden_size_per_partition,)
268
- context_layer = context_layer.reshape(*new_context_layer_shape)
269
 
270
- return context_layer
 
 
 
 
271
 
272
 
273
  class SelfAttention(torch.nn.Module):
@@ -299,7 +413,7 @@ class SelfAttention(torch.nn.Module):
299
  device=device, **_config_to_kwargs(config)
300
  )
301
 
302
- self.core_attention = CoreAttention(config, self.layer_number)
303
 
304
  # Output.
305
  self.dense = nn.Linear(self.projection_size, config.hidden_size, bias=config.add_bias_linear,
@@ -378,7 +492,8 @@ class SelfAttention(torch.nn.Module):
378
  value_layer = torch.cat((cache_v, value_layer), dim=2)
379
  if use_cache:
380
  if kv_cache is None:
381
- kv_cache = torch.cat((key_layer.unsqueeze(0).unsqueeze(0), value_layer.unsqueeze(0).unsqueeze(0)), dim=1)
 
382
  else:
383
  kv_cache = (key_layer, value_layer)
384
  else:
@@ -644,12 +759,18 @@ class ChatGLMPreTrainedModel(PreTrainedModel):
644
  config_class = ChatGLMConfig
645
  base_model_prefix = "transformer"
646
  _no_split_modules = ["GLMBlock"]
 
 
647
 
648
  def _init_weights(self, module: nn.Module):
649
  """Initialize the weights."""
650
  return
651
 
652
  def get_masks(self, input_ids, past_key_values, padding_mask=None):
 
 
 
 
653
  batch_size, seq_length = input_ids.shape
654
  full_attention_mask = torch.ones(batch_size, seq_length, seq_length, device=input_ids.device)
655
  full_attention_mask.tril_()
@@ -672,11 +793,6 @@ class ChatGLMPreTrainedModel(PreTrainedModel):
672
  position_ids = torch.arange(seq_length, dtype=torch.long, device=device).unsqueeze(0).repeat(batch_size, 1)
673
  return position_ids
674
 
675
- def gradient_checkpointing_enable(self, gradient_checkpointing_kwargs=None):
676
- if not self.supports_gradient_checkpointing:
677
- raise ValueError(f"{self.__class__.__name__} does not support gradient checkpointing.")
678
-
679
-
680
  class Embedding(torch.nn.Module):
681
  """Language model embeddings."""
682
 
@@ -724,7 +840,8 @@ class ChatGLMModel(ChatGLMPreTrainedModel):
724
  config.hidden_size // config.num_attention_heads if config.kv_channels is None else config.kv_channels
725
  )
726
 
727
- self.rotary_pos_emb = RotaryEmbedding(rotary_dim // 2, rope_ratio=config.rope_ratio, original_impl=config.original_rope,
 
728
  device=device, dtype=config.torch_dtype)
729
  self.encoder = init_method(GLMTransformer, config, **init_kwargs)
730
  self.output_layer = init_method(nn.Linear, config.hidden_size, config.padded_vocab_size, bias=False,
@@ -745,6 +862,7 @@ class ChatGLMModel(ChatGLMPreTrainedModel):
745
  past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None,
746
  inputs_embeds: Optional[torch.Tensor] = None,
747
  use_cache: Optional[bool] = None,
 
748
  output_hidden_states: Optional[bool] = None,
749
  return_dict: Optional[bool] = None,
750
  ):
@@ -809,9 +927,10 @@ class ChatGLMForConditionalGeneration(ChatGLMPreTrainedModel):
809
  standardize_cache_format: bool = False,
810
  ) -> Dict[str, Any]:
811
  # update past_key_values
812
- model_kwargs["past_key_values"] = self._extract_past_from_model_output(
813
  outputs, standardize_cache_format=standardize_cache_format
814
  )
 
815
 
816
  # update attention mask
817
  if "attention_mask" in model_kwargs:
@@ -936,201 +1055,6 @@ class ChatGLMForConditionalGeneration(ChatGLMPreTrainedModel):
936
  for layer_past in past
937
  )
938
 
939
- def process_response(self, output, history):
940
- content = ""
941
- history = deepcopy(history)
942
- for response in output.split("<|assistant|>"):
943
- if "\n" in response:
944
- metadata, content = response.split("\n", maxsplit=1)
945
- else:
946
- metadata, content = "", response
947
- if not metadata.strip():
948
- content = content.strip()
949
- history.append({"role": "assistant", "metadata": metadata, "content": content})
950
- content = content.replace("[[训练时间]]", "2023年")
951
- else:
952
- history.append({"role": "assistant", "metadata": metadata, "content": content})
953
- if history[0]["role"] == "system" and "tools" in history[0]:
954
- parameters = json.loads(content)
955
- content = {"name": metadata.strip(), "parameters": parameters}
956
- else:
957
- content = {"name": metadata.strip(), "content": content}
958
- return content, history
959
-
960
- @torch.inference_mode()
961
- def chat(self, tokenizer, query: str, history: List[Dict] = None, role: str = "user",
962
- max_length: int = 8192, num_beams=1, do_sample=True, top_p=0.8, temperature=0.8, logits_processor=None,
963
- **kwargs):
964
- if history is None:
965
- history = []
966
- if logits_processor is None:
967
- logits_processor = LogitsProcessorList()
968
- logits_processor.append(InvalidScoreLogitsProcessor())
969
- gen_kwargs = {"max_length": max_length, "num_beams": num_beams, "do_sample": do_sample, "top_p": top_p,
970
- "temperature": temperature, "logits_processor": logits_processor, **kwargs}
971
- history.append({"role": role, "content": query})
972
- inputs = tokenizer.apply_chat_template(history, add_generation_prompt=True, tokenize=True,
973
- return_tensors="pt", return_dict=True)
974
- inputs = inputs.to(self.device)
975
- eos_token_id = [tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids("<|user|>"),
976
- tokenizer.convert_tokens_to_ids("<|observation|>")]
977
- outputs = self.generate(**inputs, **gen_kwargs, eos_token_id=eos_token_id)
978
- outputs = outputs.tolist()[0][len(inputs["input_ids"][0]):-1]
979
- response = tokenizer.decode(outputs)
980
- response, history = self.process_response(response, history)
981
- return response, history
982
-
983
- @torch.inference_mode()
984
- def stream_chat(self, tokenizer, query: str, history: List[Dict] = None, role: str = "user",
985
- past_key_values=None, max_length: int = 8192, do_sample=True, top_p=0.8, temperature=0.8,
986
- logits_processor=None, return_past_key_values=False, **kwargs):
987
- if history is None:
988
- history = []
989
- if logits_processor is None:
990
- logits_processor = LogitsProcessorList()
991
- logits_processor.append(InvalidScoreLogitsProcessor())
992
- eos_token_id = [tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids("<|user|>"),
993
- tokenizer.convert_tokens_to_ids("<|observation|>")]
994
- gen_kwargs = {"max_length": max_length, "do_sample": do_sample, "top_p": top_p,
995
- "temperature": temperature, "logits_processor": logits_processor, **kwargs}
996
- if past_key_values is None:
997
- inputs = tokenizer.apply_chat_template(history + [{"role": role, "content": query}],
998
- add_generation_prompt=True, tokenize=True, return_tensors="pt",
999
- return_dict=True)
1000
- else:
1001
- inputs = tokenizer.apply_chat_template([{"role": role, "content": query}], add_special_tokens=False,
1002
- add_generation_prompt=True, tokenize=True, return_tensors="pt",
1003
- return_dict=True)
1004
- inputs = inputs.to(self.device)
1005
- if past_key_values is not None:
1006
- past_length = past_key_values[0][0].shape[2]
1007
- inputs.position_ids += past_length
1008
- attention_mask = inputs.attention_mask
1009
- attention_mask = torch.cat((attention_mask.new_ones(1, past_length), attention_mask), dim=1)
1010
- inputs['attention_mask'] = attention_mask
1011
- history.append({"role": role, "content": query})
1012
- for outputs in self.stream_generate(**inputs, past_key_values=past_key_values,
1013
- eos_token_id=eos_token_id, return_past_key_values=return_past_key_values,
1014
- **gen_kwargs):
1015
- if return_past_key_values:
1016
- outputs, past_key_values = outputs
1017
- outputs = outputs.tolist()[0][len(inputs["input_ids"][0]):-1]
1018
- response = tokenizer.decode(outputs)
1019
- if response and response[-1] != "�":
1020
- response, new_history = self.process_response(response, history)
1021
- if return_past_key_values:
1022
- yield response, new_history, past_key_values
1023
- else:
1024
- yield response, new_history
1025
-
1026
- @torch.inference_mode()
1027
- def stream_generate(
1028
- self,
1029
- input_ids,
1030
- generation_config: Optional[GenerationConfig] = None,
1031
- logits_processor: Optional[LogitsProcessorList] = None,
1032
- stopping_criteria: Optional[StoppingCriteriaList] = None,
1033
- prefix_allowed_tokens_fn: Optional[Callable[[int, torch.Tensor], List[int]]] = None,
1034
- return_past_key_values=False,
1035
- **kwargs,
1036
- ):
1037
- batch_size, input_ids_seq_length = input_ids.shape[0], input_ids.shape[-1]
1038
-
1039
- if generation_config is None:
1040
- generation_config = self.generation_config
1041
- generation_config = copy.deepcopy(generation_config)
1042
- model_kwargs = generation_config.update(**kwargs)
1043
- model_kwargs["use_cache"] = generation_config.use_cache
1044
- bos_token_id, eos_token_id = generation_config.bos_token_id, generation_config.eos_token_id
1045
-
1046
- if isinstance(eos_token_id, int):
1047
- eos_token_id = [eos_token_id]
1048
- eos_token_id_tensor = torch.tensor(eos_token_id).to(input_ids.device) if eos_token_id is not None else None
1049
-
1050
- has_default_max_length = kwargs.get("max_length") is None and generation_config.max_length is not None
1051
- if has_default_max_length and generation_config.max_new_tokens is None:
1052
- warnings.warn(
1053
- f"Using `max_length`'s default ({generation_config.max_length}) to control the generation length. "
1054
- "This behaviour is deprecated and will be removed from the config in v5 of Transformers -- we"
1055
- " recommend using `max_new_tokens` to control the maximum length of the generation.",
1056
- UserWarning,
1057
- )
1058
- elif generation_config.max_new_tokens is not None:
1059
- generation_config.max_length = generation_config.max_new_tokens + input_ids_seq_length
1060
- if not has_default_max_length:
1061
- logger.warn(
1062
- f"Both `max_new_tokens` (={generation_config.max_new_tokens}) and `max_length`(="
1063
- f"{generation_config.max_length}) seem to have been set. `max_new_tokens` will take precedence. "
1064
- "Please refer to the documentation for more information. "
1065
- "(https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)",
1066
- UserWarning,
1067
- )
1068
-
1069
- if input_ids_seq_length >= generation_config.max_length:
1070
- input_ids_string = "decoder_input_ids" if self.config.is_encoder_decoder else "input_ids"
1071
- logger.warning(
1072
- f"Input length of {input_ids_string} is {input_ids_seq_length}, but `max_length` is set to"
1073
- f" {generation_config.max_length}. This can lead to unexpected behavior. You should consider"
1074
- " increasing `max_new_tokens`."
1075
- )
1076
-
1077
- # 2. Set generation parameters if not already defined
1078
- logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList()
1079
- stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList()
1080
-
1081
- logits_processor = self._get_logits_processor(
1082
- generation_config=generation_config,
1083
- input_ids_seq_length=input_ids_seq_length,
1084
- encoder_input_ids=input_ids,
1085
- prefix_allowed_tokens_fn=prefix_allowed_tokens_fn,
1086
- logits_processor=logits_processor,
1087
- )
1088
-
1089
- stopping_criteria = self._get_stopping_criteria(
1090
- generation_config=generation_config, stopping_criteria=stopping_criteria
1091
- )
1092
- logits_warper = self._get_logits_warper(generation_config)
1093
-
1094
- unfinished_sequences = input_ids.new(input_ids.shape[0]).fill_(1)
1095
- scores = None
1096
- while True:
1097
- model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)
1098
- # forward pass to get next token
1099
- outputs = self(
1100
- **model_inputs,
1101
- return_dict=True,
1102
- output_attentions=False,
1103
- output_hidden_states=False,
1104
- )
1105
-
1106
- next_token_logits = outputs.logits[:, -1, :]
1107
-
1108
- # pre-process distribution
1109
- next_token_scores = logits_processor(input_ids, next_token_logits)
1110
- next_token_scores = logits_warper(input_ids, next_token_scores)
1111
-
1112
- # sample
1113
- probs = nn.functional.softmax(next_token_scores, dim=-1)
1114
- if generation_config.do_sample:
1115
- next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1)
1116
- else:
1117
- next_tokens = torch.argmax(probs, dim=-1)
1118
- # update generated ids, model inputs, and length for next step
1119
- input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1)
1120
- model_kwargs = self._update_model_kwargs_for_generation(
1121
- outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder
1122
- )
1123
- unfinished_sequences = unfinished_sequences.mul(
1124
- next_tokens.tile(eos_token_id_tensor.shape[0], 1).ne(eos_token_id_tensor.unsqueeze(1)).prod(dim=0)
1125
- )
1126
- if return_past_key_values:
1127
- yield input_ids, outputs.past_key_values
1128
- else:
1129
- yield input_ids
1130
- # stop when each sentence is finished, or if we exceed the maximum length
1131
- if unfinished_sequences.max() == 0 or stopping_criteria(input_ids, scores):
1132
- break
1133
-
1134
 
1135
  class ChatGLMForSequenceClassification(ChatGLMPreTrainedModel):
1136
  def __init__(self, config: ChatGLMConfig, empty_init=True, device=None):
@@ -1139,7 +1063,7 @@ class ChatGLMForSequenceClassification(ChatGLMPreTrainedModel):
1139
  self.num_labels = config.num_labels
1140
  self.transformer = ChatGLMModel(config, empty_init=empty_init, device=device)
1141
 
1142
- self.classifier_head = nn.Linear(config.hidden_size, config.num_labels, bias=True, dtype=torch.half)
1143
  if config.classifier_dropout is not None:
1144
  self.dropout = nn.Dropout(config.classifier_dropout)
1145
  else:
@@ -1156,6 +1080,7 @@ class ChatGLMForSequenceClassification(ChatGLMPreTrainedModel):
1156
  inputs_embeds: Optional[torch.LongTensor] = None,
1157
  labels: Optional[torch.LongTensor] = None,
1158
  use_cache: Optional[bool] = None,
 
1159
  output_hidden_states: Optional[bool] = None,
1160
  return_dict: Optional[bool] = None,
1161
  ) -> Union[Tuple[torch.Tensor, ...], SequenceClassifierOutputWithPast]:
@@ -1169,6 +1094,7 @@ class ChatGLMForSequenceClassification(ChatGLMPreTrainedModel):
1169
  past_key_values=past_key_values,
1170
  inputs_embeds=inputs_embeds,
1171
  use_cache=use_cache,
 
1172
  output_hidden_states=output_hidden_states,
1173
  return_dict=return_dict,
1174
  )
 
1
  """ PyTorch ChatGLM model. """
2
+
3
  import math
 
 
 
4
  import sys
 
5
  import torch
6
  import torch.utils.checkpoint
7
  import torch.nn.functional as F
8
  from torch import nn
9
  from torch.nn import CrossEntropyLoss, LayerNorm, MSELoss, BCEWithLogitsLoss
10
  from torch.nn.utils import skip_init
11
+ from typing import Optional, Tuple, Union, List, Dict, Any
 
12
 
13
  from transformers.modeling_outputs import (
14
  BaseModelOutputWithPast,
 
18
  from transformers.modeling_utils import PreTrainedModel
19
  from transformers.utils import logging, is_torch_npu_available
20
  from transformers.generation.logits_process import LogitsProcessor
21
+ from transformers.generation.utils import ModelOutput
22
 
23
  from .configuration_chatglm import ChatGLMConfig
24
 
25
+ try:
26
+ from transformers.utils import is_flash_attn_greater_or_equal_2_10, is_flash_attn_2_available
27
+
28
+ if is_flash_attn_2_available():
29
+ from flash_attn import flash_attn_func, flash_attn_varlen_func
30
+ from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa
31
+ except:
32
+ pass
33
+
34
  # flags required to enable jit fusion kernels
35
 
36
  if sys.platform != 'darwin' and not is_torch_npu_available():
 
44
  _CHECKPOINT_FOR_DOC = "THUDM/ChatGLM"
45
  _CONFIG_FOR_DOC = "ChatGLMConfig"
46
 
47
+
48
  def default_init(cls, *args, **kwargs):
49
  return cls(*args, **kwargs)
50
 
 
164
  class CoreAttention(torch.nn.Module):
165
  def __init__(self, config: ChatGLMConfig, layer_number):
166
  super(CoreAttention, self).__init__()
167
+ self.config = config
168
  self.apply_query_key_layer_scaling = config.apply_query_key_layer_scaling
169
  self.attention_softmax_in_fp32 = config.attention_softmax_in_fp32
170
  if self.apply_query_key_layer_scaling:
171
  self.attention_softmax_in_fp32 = True
172
  self.layer_number = max(1, layer_number)
173
+ self.is_causal = True
174
 
175
  projection_size = config.kv_channels * config.num_attention_heads
176
 
 
189
  self.attention_dropout = torch.nn.Dropout(config.attention_dropout)
190
 
191
  def forward(self, query_layer, key_layer, value_layer, attention_mask):
192
+ # [b, np, sq, sk]
193
+ output_size = (query_layer.size(0), query_layer.size(1), query_layer.size(2), key_layer.size(2))
194
+
195
+ # [b, np, sq, hn] -> [b * np, sq, hn]
196
+ query_layer = query_layer.view(output_size[0] * output_size[1], output_size[2], -1)
197
+ # [b, np, sk, hn] -> [b * np, sk, hn]
198
+ key_layer = key_layer.view(output_size[0] * output_size[1], output_size[3], -1)
199
+
200
+ # preallocting input tensor: [b * np, sq, sk]
201
+ matmul_input_buffer = torch.empty(
202
+ output_size[0] * output_size[1], output_size[2], output_size[3], dtype=query_layer.dtype,
203
+ device=query_layer.device
204
+ )
205
+
206
+ # Raw attention scores. [b * np, sq, sk]
207
+ matmul_result = torch.baddbmm(
208
+ matmul_input_buffer,
209
+ query_layer, # [b * np, sq, hn]
210
+ key_layer.transpose(1, 2), # [b * np, hn, sk]
211
+ beta=0.0,
212
+ alpha=(1.0 / self.norm_factor),
213
+ )
214
+
215
+ # change view to [b, np, sq, sk]
216
+ attention_scores = matmul_result.view(*output_size)
217
+
218
+ # ===========================
219
+ # Attention probs and dropout
220
+ # ===========================
221
+
222
+ # attention scores and attention mask [b, np, sq, sk]
223
+ if self.attention_softmax_in_fp32:
224
+ attention_scores = attention_scores.float()
225
+ if self.coeff is not None:
226
+ attention_scores = attention_scores * self.coeff
227
+ if attention_mask is None and attention_scores.shape[2] == attention_scores.shape[3]:
228
+ attention_mask = torch.ones(output_size[0], 1, output_size[2], output_size[3],
229
+ device=attention_scores.device, dtype=torch.bool)
230
+ attention_mask.tril_()
231
+ attention_mask = ~attention_mask
232
+ if attention_mask is not None:
233
+ attention_scores = attention_scores.masked_fill(attention_mask, float("-inf"))
234
+ attention_probs = F.softmax(attention_scores, dim=-1)
235
+ attention_probs = attention_probs.type_as(value_layer)
236
+
237
+ # This is actually dropping out entire tokens to attend to, which might
238
+ # seem a bit unusual, but is taken from the original Transformer paper.
239
+ attention_probs = self.attention_dropout(attention_probs)
240
+
241
+ # query layer shape: [b * np, sq, hn]
242
+ # value layer shape: [b, np, sk, hn]
243
+ # attention shape: [b, np, sq, sk]
244
+ # context layer shape: [b, np, sq, hn]
245
+ output_size = (value_layer.size(0), value_layer.size(1), query_layer.size(1), value_layer.size(3))
246
+ # change view [b * np, sk, hn]
247
+ value_layer = value_layer.view(output_size[0] * output_size[1], value_layer.size(2), -1)
248
+ # change view [b * np, sq, sk]
249
+ attention_probs = attention_probs.view(output_size[0] * output_size[1], output_size[2], -1)
250
+ # matmul: [b * np, sq, hn]
251
+ context_layer = torch.bmm(attention_probs, value_layer)
252
+ # change view [b, np, sq, hn]
253
+ context_layer = context_layer.view(*output_size)
254
+ # [b, np, sq, hn] --> [b, sq, np, hn]
255
+ context_layer = context_layer.transpose(1, 2).contiguous()
256
+ # [b, sq, np, hn] --> [b, sq, hp]
257
+ new_context_layer_shape = context_layer.size()[:-2] + (self.hidden_size_per_partition,)
258
+ context_layer = context_layer.reshape(*new_context_layer_shape)
259
+
260
+ return context_layer
261
+
262
+
263
+ class SdpaAttention(CoreAttention):
264
+ def forward(self, query_layer, key_layer, value_layer, attention_mask):
265
+ if attention_mask is None and query_layer.shape[2] == key_layer.shape[2]:
266
+ context_layer = torch.nn.functional.scaled_dot_product_attention(query_layer, key_layer, value_layer,
267
+ is_causal=True,
268
+ dropout_p=self.config.attention_dropout if self.training else 0.0)
269
  else:
270
+ if attention_mask is not None:
271
+ attention_mask = ~attention_mask
272
+ context_layer = torch.nn.functional.scaled_dot_product_attention(query_layer, key_layer, value_layer,
273
+ attention_mask,
274
+ dropout_p=self.config.attention_dropout if self.training else 0.0)
275
+ context_layer = context_layer.transpose(1, 2).contiguous()
276
+ new_context_layer_shape = context_layer.size()[:-2] + (self.hidden_size_per_partition,)
277
+ context_layer = context_layer.reshape(*new_context_layer_shape)
278
+ return context_layer
279
+
280
+
281
+ def _get_unpad_data(attention_mask):
282
+ seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
283
+ indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
284
+ max_seqlen_in_batch = seqlens_in_batch.max().item()
285
+ cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
286
+ return (
287
+ indices,
288
+ cu_seqlens,
289
+ max_seqlen_in_batch,
290
+ )
291
 
 
 
292
 
293
+ # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2
294
+ class FlashAttention2(CoreAttention):
295
+ def __init__(self, *args, **kwargs):
296
+ super().__init__(*args, **kwargs)
297
+ self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
298
 
299
+ def forward(self, query_states, key_states, value_states, attention_mask):
300
+ query_states = query_states.transpose(1, 2)
301
+ key_states = key_states.transpose(1, 2)
302
+ value_states = value_states.transpose(1, 2)
303
+ batch_size, query_length = query_states.shape[:2]
304
+ if not self._flash_attn_uses_top_left_mask:
305
+ causal = self.is_causal
306
+ else:
307
+ # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__.
308
+ causal = self.is_causal and query_length != 1
309
+ dropout = self.config.attention_dropout if self.training else 0.0
310
+ # Contains at least one padding token in the sequence
311
+ if attention_mask is not None:
312
+ query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input(
313
+ query_states, key_states, value_states, attention_mask, query_length
314
  )
315
 
316
+ cu_seqlens_q, cu_seqlens_k = cu_seq_lens
317
+ max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens
318
+
319
+ attn_output_unpad = flash_attn_varlen_func(
320
+ query_states,
321
+ key_states,
322
+ value_states,
323
+ cu_seqlens_q=cu_seqlens_q,
324
+ cu_seqlens_k=cu_seqlens_k,
325
+ max_seqlen_q=max_seqlen_in_batch_q,
326
+ max_seqlen_k=max_seqlen_in_batch_k,
327
+ dropout_p=dropout,
328
+ softmax_scale=None,
329
+ causal=causal,
330
  )
331
 
332
+ attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length)
333
+ else:
334
+ attn_output = flash_attn_func(
335
+ query_states, key_states, value_states, dropout, softmax_scale=None, causal=causal
336
+ )
337
+ attn_output = attn_output.reshape(batch_size, query_length, self.hidden_size_per_partition).contiguous()
338
+ return attn_output
 
 
 
 
 
339
 
340
+ def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length):
341
+ indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask)
342
+ batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape
343
+
344
+ key_layer = index_first_axis(
345
+ key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
346
+ )
347
+ value_layer = index_first_axis(
348
+ value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
349
+ )
350
+ if query_length == kv_seq_len:
351
+ query_layer = index_first_axis(
352
+ query_layer.reshape(batch_size * kv_seq_len, self.num_attention_heads_per_partition, head_dim),
353
+ indices_k
354
+ )
355
+ cu_seqlens_q = cu_seqlens_k
356
+ max_seqlen_in_batch_q = max_seqlen_in_batch_k
357
+ indices_q = indices_k
358
+ elif query_length == 1:
359
+ max_seqlen_in_batch_q = 1
360
+ cu_seqlens_q = torch.arange(
361
+ batch_size + 1, dtype=torch.int32, device=query_layer.device
362
+ ) # There is a memcpy here, that is very bad.
363
+ indices_q = cu_seqlens_q[:-1]
364
+ query_layer = query_layer.squeeze(1)
365
+ else:
366
+ # The -q_len: slice assumes left padding.
367
+ attention_mask = attention_mask[:, -query_length:]
368
+ query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask)
369
+
370
+ return (
371
+ query_layer,
372
+ key_layer,
373
+ value_layer,
374
+ indices_q,
375
+ (cu_seqlens_q, cu_seqlens_k),
376
+ (max_seqlen_in_batch_q, max_seqlen_in_batch_k),
377
+ )
378
+
379
+
380
+ CORE_ATTENTION_CLASSES = {
381
+ "eager": CoreAttention,
382
+ "sdpa": SdpaAttention,
383
+ "flash_attention_2": FlashAttention2
384
+ }
385
 
386
 
387
  class SelfAttention(torch.nn.Module):
 
413
  device=device, **_config_to_kwargs(config)
414
  )
415
 
416
+ self.core_attention = CORE_ATTENTION_CLASSES[config._attn_implementation](config, self.layer_number)
417
 
418
  # Output.
419
  self.dense = nn.Linear(self.projection_size, config.hidden_size, bias=config.add_bias_linear,
 
492
  value_layer = torch.cat((cache_v, value_layer), dim=2)
493
  if use_cache:
494
  if kv_cache is None:
495
+ kv_cache = torch.cat((key_layer.unsqueeze(0).unsqueeze(0), value_layer.unsqueeze(0).unsqueeze(0)),
496
+ dim=1)
497
  else:
498
  kv_cache = (key_layer, value_layer)
499
  else:
 
759
  config_class = ChatGLMConfig
760
  base_model_prefix = "transformer"
761
  _no_split_modules = ["GLMBlock"]
762
+ _supports_flash_attn_2 = True
763
+ _supports_sdpa = True
764
 
765
  def _init_weights(self, module: nn.Module):
766
  """Initialize the weights."""
767
  return
768
 
769
  def get_masks(self, input_ids, past_key_values, padding_mask=None):
770
+ if self.config._attn_implementation == "flash_attention_2":
771
+ if padding_mask is not None and not padding_mask.all():
772
+ return padding_mask
773
+ return None
774
  batch_size, seq_length = input_ids.shape
775
  full_attention_mask = torch.ones(batch_size, seq_length, seq_length, device=input_ids.device)
776
  full_attention_mask.tril_()
 
793
  position_ids = torch.arange(seq_length, dtype=torch.long, device=device).unsqueeze(0).repeat(batch_size, 1)
794
  return position_ids
795
 
 
 
 
 
 
796
  class Embedding(torch.nn.Module):
797
  """Language model embeddings."""
798
 
 
840
  config.hidden_size // config.num_attention_heads if config.kv_channels is None else config.kv_channels
841
  )
842
 
843
+ self.rotary_pos_emb = RotaryEmbedding(rotary_dim // 2, rope_ratio=config.rope_ratio,
844
+ original_impl=config.original_rope,
845
  device=device, dtype=config.torch_dtype)
846
  self.encoder = init_method(GLMTransformer, config, **init_kwargs)
847
  self.output_layer = init_method(nn.Linear, config.hidden_size, config.padded_vocab_size, bias=False,
 
862
  past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None,
863
  inputs_embeds: Optional[torch.Tensor] = None,
864
  use_cache: Optional[bool] = None,
865
+ output_attentions: Optional[bool] = None,
866
  output_hidden_states: Optional[bool] = None,
867
  return_dict: Optional[bool] = None,
868
  ):
 
927
  standardize_cache_format: bool = False,
928
  ) -> Dict[str, Any]:
929
  # update past_key_values
930
+ cache_name, cache = self._extract_past_from_model_output(
931
  outputs, standardize_cache_format=standardize_cache_format
932
  )
933
+ model_kwargs[cache_name] = cache
934
 
935
  # update attention mask
936
  if "attention_mask" in model_kwargs:
 
1055
  for layer_past in past
1056
  )
1057
 
 
 
 
 
 
1058
 
1059
  class ChatGLMForSequenceClassification(ChatGLMPreTrainedModel):
1060
  def __init__(self, config: ChatGLMConfig, empty_init=True, device=None):
 
1063
  self.num_labels = config.num_labels
1064
  self.transformer = ChatGLMModel(config, empty_init=empty_init, device=device)
1065
 
1066
+ self.classifier_head = nn.Linear(config.hidden_size, config.num_labels, bias=True, dtype=config.torch_dtype)
1067
  if config.classifier_dropout is not None:
1068
  self.dropout = nn.Dropout(config.classifier_dropout)
1069
  else:
 
1080
  inputs_embeds: Optional[torch.LongTensor] = None,
1081
  labels: Optional[torch.LongTensor] = None,
1082
  use_cache: Optional[bool] = None,
1083
+ output_attentions: Optional[bool] = None,
1084
  output_hidden_states: Optional[bool] = None,
1085
  return_dict: Optional[bool] = None,
1086
  ) -> Union[Tuple[torch.Tensor, ...], SequenceClassifierOutputWithPast]:
 
1094
  past_key_values=past_key_values,
1095
  inputs_embeds=inputs_embeds,
1096
  use_cache=use_cache,
1097
+ output_attentions=output_attentions,
1098
  output_hidden_states=output_hidden_states,
1099
  return_dict=return_dict,
1100
  )
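The modeling change above routes attention through CORE_ATTENTION_CLASSES ("eager", "sdpa", "flash_attention_2") keyed on config._attn_implementation, so the backend can be chosen at load time. A hedged sketch, reusing repo_id from the loading snippet ("flash_attention_2" assumes the flash-attn package is installed):

```python
# Sketch: overriding the attention backend introduced in this commit.
# "sdpa" is the default written into config.json; "eager" is the pure-PyTorch fallback.
model = AutoModelForCausalLM.from_pretrained(
    repo_id,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",  # or "eager" / "sdpa"
    trust_remote_code=True,
)
```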
tokenization_chatglm.py CHANGED
@@ -63,22 +63,22 @@ class ChatGLM4Tokenizer(PreTrainedTokenizer):
63
  vocab.update(self.added_tokens_encoder)
64
  return vocab
65
 
66
- def convert_tokens_to_string(self, tokens: List[Union[bytes, str]]) -> str:
67
  """
68
  Converts a sequence of tokens in a single string.
69
  """
70
  text = ""
71
  temp = b""
72
  for t in tokens:
 
 
73
  if isinstance(t, str):
74
  if temp:
75
  text += temp.decode("utf-8", errors="replace")
76
- temp = b""
77
- text += t
78
  elif isinstance(t, bytes):
79
  temp += t
80
  else:
81
- raise TypeError("token should only be of type types or str")
82
  if temp:
83
  text += temp.decode("utf-8", errors="replace")
84
  return text
@@ -141,98 +141,98 @@ class ChatGLM4Tokenizer(PreTrainedTokenizer):
141
  else:
142
  return str(f"<|{role}|>{metadata}\n{message}")
143
 
144
- def apply_chat_template(
145
- self,
146
- conversation: Union[List[Dict[str, str]], List[List[Dict[str, str]]], "Conversation"],
147
- add_generation_prompt: bool = False,
148
- tokenize: bool = True,
149
- padding: bool = False,
150
- truncation: bool = False,
151
- max_length: Optional[int] = None,
152
- return_tensors: Optional[Union[str, TensorType]] = None,
153
- return_dict: bool = False,
154
- tokenizer_kwargs: Optional[Dict[str, Any]] = None,
155
- add_special_tokens: bool = True,
156
- **kwargs,
157
- ) -> Union[str, List[int], List[str], List[List[int]], BatchEncoding]:
158
-
159
- if return_dict and not tokenize:
160
- raise ValueError(
161
- "`return_dict=True` is incompatible with `tokenize=False`, because there is no dict "
162
- "of tokenizer outputs to return."
163
- )
164
-
165
- def handle_single_conversation(conversation):
166
- input_ids = self.get_prefix_tokens() if add_special_tokens else []
167
- input_message = "[gMASK]<sop>" if add_special_tokens else ""
168
- for item in conversation:
169
- if item.get("tools"):
170
- tools = item["tools"]
171
- content = "你是一个名为 GLM-4 的人工智能助手。你是基于智谱AI训练的语言模型 GLM-4 模型开发的,你的任务是针对用户的问题和要求提供适当的答复和支持。"
172
- for tool in tools:
173
- if tool["type"] == "function":
174
- function = tool["function"]
175
- content += f"\n\n## {function['name']}\n\n{json.dumps(function, ensure_ascii=False, indent=4)}"
176
- content += "\n在调用上述函数时,请使用 Json 格式表示调用的参数。"
177
- elif tool["type"] == "python":
178
- content += "\n\n## python\n\n当你向 `python` 发送包含 Python 代码的消息时,该代码将会在一个有状态的 Jupyter notebook 环境中执行。\n`python` 返回代码执行的输出,或在执行 60 秒后返回超时。\n`/mnt/data` 将会持久化存储你的文件。在此会话中,`python` 无法访问互联网。不要使用 `python` 进行任何网络请求或者在线 API 调用,这些在线内容的访问将不会成功。"
179
- elif tool["type"] == "simple_browser":
180
- content += "\n\n## simple_browser\n\n你可以使用 `simple_browser` 工具。该工具支持以下函数:\n`search(query: str, recency_days: int)`:使用搜索引擎进行查询并显示结果,可以使用 `recency_days` 参数控制搜索内容的时效性。\n`mclick(ids: list[int])`:获取一系列指定 id 的页面内容。每次调用时,须选择3-10个页面。选择多个角度的页面,同时尽可能选择可信任的信息来源。考虑到部分页面是无法加载的,你也可以多打开一些可能有用的页面而不用担心内容过多。\n`open_url(url: str)`:打开指定的 URL。\n\n使用 `【{引用 id}†{引用文本}】` 来引用内容。\n\n操作步骤:1. 使用 `search` 来获得信息列表; 2. 使用 `mclick` 来获取指定 ID 页面的内容; 3. 根据获得的内容进行回复。在回复中应当引用信息来源。\n 如果用户提供了 URL,也可以用 `open_url` 直接打开页面。\n如果初次搜索结果没有找到合适的信息,也可以再次使用 `search` 进行搜索。"
181
- elif tool["type"] == "cogview":
182
- content += "\n\n## cogview\n\n如果用户的请求中包含了对图像的描述,你可以使用 `cogview` 来生成图像并展示给用户。你需要向 `cogview` 发送图像描述,规则:\n- 发送给 `cogview` 的消息必须使用英语。用户的中文描述必须完全翻译为英语。\n- 应当尽可能详细地描述图像生成的需求,需求描述约 100 英文单词。\n- 保持用户原始描述的意图。不要虚构内容或者没见过的人物。\n- 如无特殊说明,所在地为中国,持有中国立场并遵循中国社会主义价值观。"
183
- else:
184
- raise NotImplementedError(f"Unknown tool type {tool['type']}")
185
- input = self.build_single_message("system", "", content, tokenize=tokenize)
186
- if tokenize:
187
- input_ids.extend(input)
188
- else:
189
- input_message += input
190
- if item["content"]:
191
- input = self.build_single_message(
192
- item["role"],
193
- item.get("metadata", ""),
194
- item["content"],
195
- tokenize=tokenize
196
- )
197
- if tokenize:
198
- input_ids.extend(input)
199
- else:
200
- input_message += input
201
- if add_generation_prompt:
202
- if tokenize:
203
- input_ids.extend([self.convert_tokens_to_ids("<|assistant|>")])
204
- else:
205
- input_message += "<|assistant|>"
206
-
207
- return input_ids if tokenize else input_message
208
-
209
- # Main logic to handle different conversation formats
210
- if isinstance(conversation, list) and all(isinstance(i, dict) for i in conversation):
211
- result = handle_single_conversation(conversation)
212
- elif isinstance(conversation, list) and all(isinstance(i, list) for i in conversation):
213
- result = [handle_single_conversation(c) for c in conversation]
214
- elif hasattr(conversation, "messages"):
215
- result = handle_single_conversation(conversation.messages)
216
- else:
217
- raise ValueError("Invalid conversation format")
218
-
219
- if tokenize:
220
- output = self.batch_encode_plus(
221
- [result] if isinstance(result[0], int) else result,
222
- padding=padding,
223
- truncation=truncation,
224
- max_length=max_length,
225
- return_tensors=return_tensors,
226
- is_split_into_words=True,
227
- add_special_tokens=False
228
- )
229
- if return_dict:
230
- return output
231
- else:
232
- return output["input_ids"]
233
- else:
234
- return result
235
-
236
 
237
  def build_inputs_with_special_tokens(
238
  self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
 
63
  vocab.update(self.added_tokens_encoder)
64
  return vocab
65
 
66
+ def convert_tokens_to_string(self, tokens: List[Union[bytes, str, int]]) -> str:
67
  """
68
  Converts a sequence of tokens in a single string.
69
  """
70
  text = ""
71
  temp = b""
72
  for t in tokens:
73
+ if isinstance(t, int):
74
+ t = chr(t)
75
  if isinstance(t, str):
76
  if temp:
77
  text += temp.decode("utf-8", errors="replace")
 
 
78
  elif isinstance(t, bytes):
79
  temp += t
80
  else:
81
+ raise TypeError("token should only be of type int, bytes or str")
82
  if temp:
83
  text += temp.decode("utf-8", errors="replace")
84
  return text
 
141
  else:
142
  return str(f"<|{role}|>{metadata}\n{message}")
143
 
144
+ # Use Jinja Template in tokenizer_config.json
145
+ # def apply_chat_template(
146
+ # self,
147
+ # conversation: Union[List[Dict[str, str]], List[List[Dict[str, str]]], "Conversation"],
148
+ # add_generation_prompt: bool = False,
149
+ # tokenize: bool = True,
150
+ # padding: bool = False,
151
+ # truncation: bool = False,
152
+ # max_length: Optional[int] = None,
153
+ # return_tensors: Optional[Union[str, TensorType]] = None,
154
+ # return_dict: bool = False,
155
+ # tokenizer_kwargs: Optional[Dict[str, Any]] = None,
156
+ # add_special_tokens: bool = True,
157
+ # **kwargs,
158
+ # ) -> Union[str, List[int], List[str], List[List[int]], BatchEncoding]:
159
+ #
160
+ # if return_dict and not tokenize:
161
+ # raise ValueError(
162
+ # "`return_dict=True` is incompatible with `tokenize=False`, because there is no dict "
163
+ # "of tokenizer outputs to return."
164
+ # )
165
+ #
166
+ # def handle_single_conversation(conversation):
167
+ # input_ids = self.get_prefix_tokens() if add_special_tokens else []
168
+ # input_message = "[gMASK]<sop>" if add_special_tokens else ""
169
+ # for item in conversation:
170
+ # if item.get("tools"):
171
+ # tools = item["tools"]
172
+ # content = "你是一个名为 GhatGLM 的人工智能助手。你是基于智谱AI训练的语言模型 GLM-4 模型开发的,你的任务是针对用户的问题和要求提供适当的答复和支持。"
173
+ # content += "\n\n# 可用工具"
174
+ # for tool in tools:
175
+ # if tool["type"] == "function":
176
+ # function = tool["function"]
177
+ # content += f"\n\n## {function['name']}\n\n{json.dumps(function, ensure_ascii=False, indent=4)}"
178
+ # content += "\n在调用上述函数时,请使用 Json 格式表示调用的参数。"
179
+ # elif tool["type"] == "python":
180
+ # content += "\n\n## python\n\n当你向 `python` 发送包含 Python 代码的消息时,该代码将会在一个有状态的 Jupyter notebook 环境中执行。\n`python` 返回代码执行的输出,或在执行 60 秒后返回超时。\n`/mnt/data` 将会持久化存储你的文件。在此会话中,`python` 无法访问互联网。不要使用 `python` 进行任何网络请求或者在线 API 调用,这些在线内容的访问将不会成功。"
181
+ # elif tool["type"] == "simple_browser":
182
+ # content += "\n\n## simple_browser\n\n你可以使用 `simple_browser` 工具。该工具支持以下函数:\n`search(query: str, recency_days: int)`:使用搜索引擎进行查询并显示结果,可以使用 `recency_days` 参数控制搜索内容的时效性。\n`mclick(ids: list[int])`:获取一系列指定 id 的页面内容。每次调用时,须选择3-10个页面。选择多个角度的页面,同时尽可能选择可信任的信息来源。考虑到部分页面是无法加载的,你也可以多打开一些可能有用的页面而不用担心内容过多。\n`open_url(url: str)`:打开指定的 URL。\n\n使用 `【{引用 id}†{引用文本}】` 来引用内容。\n\n操作步骤:1. 使用 `search` 来获得信息列表; 2. 使用 `mclick` 来获取指定 ID 页面的内容; 3. 根据获得的内容进行回复。在回复中应当引用信息来源。\n 如果用户提供了 URL,也可以用 `open_url` 直接打开页面。\n如果初次搜索结果没有找到合适的信息,也可以再次使用 `search` 进行搜索。"
183
+ # elif tool["type"] == "cogview":
184
+ # content += "\n\n## cogview\n\n如果用户的请求中包含了对图像的描述,你可以使用 `cogview` 来生成图像并展示给用户。你需要向 `cogview` 发送图像描述,规则:\n- 发送给 `cogview` 的消息必须使用英语。用户的中文描述必须完全翻译为英语。\n- 应当尽可能详细地描述图像生成的需求,需求描述约 100 英文单词。\n- 保持用户原始描述的意图。不要虚构内容或者没见过的人物。\n- 如无特殊说明,所在地为中国,持有中国立场并遵循中国社会主义价值观。"
185
+ # else:
186
+ # raise NotImplementedError(f"Unknown tool type {tool['type']}")
187
+ # input = self.build_single_message("system", "", content, tokenize=tokenize)
188
+ # if tokenize:
189
+ # input_ids.extend(input)
190
+ # else:
191
+ # input_message += input
192
+ # if item["content"]:
193
+ # input = self.build_single_message(
194
+ # item["role"],
195
+ # item.get("metadata", ""),
196
+ # item["content"],
197
+ # tokenize=tokenize
198
+ # )
199
+ # if tokenize:
200
+ # input_ids.extend(input)
201
+ # else:
202
+ # input_message += input
203
+ # if add_generation_prompt:
204
+ # if tokenize:
205
+ # input_ids.extend([self.convert_tokens_to_ids("<|assistant|>")])
206
+ # else:
207
+ # input_message += "<|assistant|>"
208
+ # return input_ids if tokenize else input_message
209
+ #
210
+ # # Main logic to handle different conversation formats
211
+ # if isinstance(conversation, list) and all(isinstance(i, dict) for i in conversation):
212
+ # result = handle_single_conversation(conversation)
213
+ # elif isinstance(conversation, list) and all(isinstance(i, list) for i in conversation):
214
+ # result = [handle_single_conversation(c) for c in conversation]
215
+ # elif hasattr(conversation, "messages"):
216
+ # result = handle_single_conversation(conversation.messages)
217
+ # else:
218
+ # raise ValueError("Invalid conversation format")
219
+ #
220
+ # if tokenize:
221
+ # output = self.batch_encode_plus(
222
+ # [result] if isinstance(result[0], int) else result,
223
+ # padding=padding,
224
+ # truncation=truncation,
225
+ # max_length=max_length,
226
+ # return_tensors=return_tensors,
227
+ # is_split_into_words=True,
228
+ # add_special_tokens=False
229
+ # )
230
+ # if return_dict:
231
+ # return output
232
+ # else:
233
+ # return output["input_ids"]
234
+ # else:
235
+ # return result
236
 
237
  def build_inputs_with_special_tokens(
238
  self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
tokenizer_config.json CHANGED
@@ -1,4 +1,10 @@
  {
  "added_tokens_decoder": {
  "151329": {
  "content": "<|endoftext|>",
@@ -113,36 +119,16 @@
  "special": true
  }
  },
- "additional_special_tokens": [
- "<|endoftext|>",
- "[MASK]",
- "[gMASK]",
- "[sMASK]",
- "<sop>",
- "<eop>",
- "<|system|>",
- "<|user|>",
- "<|assistant|>",
- "<|observation|>",
- "<|begin_of_image|>",
- "<|end_of_image|>",
- "<|begin_of_video|>",
- "<|end_of_video|>"
- ],
- "auto_map": {
- "AutoTokenizer": [
- "tokenization_chatglm.ChatGLM4Tokenizer",
- null
- ]
- },
- "chat_template": "{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{{ '[gMASK]<sop>' + system_message }}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|user|>\n' + content + '<|assistant|>' }}{% elif message['role'] == 'assistant' %}{{ '\n' + content }}{% endif %}{% endfor %}",
  "clean_up_tokenization_spaces": false,
  "do_lower_case": false,
  "eos_token": "<|endoftext|>",
- "model_max_length": 128000,
  "pad_token": "<|endoftext|>",
  "padding_side": "left",
  "remove_space": false,
- "split_special_tokens": false,
  "tokenizer_class": "ChatGLM4Tokenizer"
  }

  {
+ "auto_map": {
+ "AutoTokenizer": [
+ "tokenization_chatglm.ChatGLM4Tokenizer",
+ null
+ ]
+ },
  "added_tokens_decoder": {
  "151329": {
  "content": "<|endoftext|>",

  "special": true
  }
  },
+ "additional_special_tokens": ["<|endoftext|>", "[MASK]", "[gMASK]", "[sMASK]", "<sop>", "<eop>", "<|system|>",
+ "<|user|>", "<|assistant|>", "<|observation|>", "<|begin_of_image|>", "<|end_of_image|>",
+ "<|begin_of_video|>", "<|end_of_video|>"],
  "clean_up_tokenization_spaces": false,
+ "chat_template": "[gMASK]<sop>{% for item in messages %}{% if item['tools'] is defined %}<|system|>\n你是一个名为 ChatGLM 的人工智能助手。你是基于智谱AI训练的语言模型 GLM-4 模型开发的,你的任务是针对用户的问题和要求提供适当的答复和支持。\n\n# 可用工具{% set tools = item['tools'] %}{% for tool in tools %}{% if tool['type'] == 'function' %}\n\n## {{ tool['function']['name'] }}\n\n{{ tool['function'] | tojson(indent=4) }}\n在调用上述函数时,请使用 Json 格式表示调用的参数。{% elif tool['type'] == 'python' %}\n\n## python\n\n当你向 `python` 发送包含 Python 代码的消息时,该代码将会在一个有状态的 Jupyter notebook 环境中执行。\n`python` 返回代码执行的输出,或在执行 60 秒后返回超时。\n`/mnt/data` 将会持久化存储你的文件。在此会话中,`python` 无法访问互联网。不要使用 `python` 进行任何网络请求或者在线 API 调用,这些在线内容的访问将不会成功。{% elif tool['type'] == 'simple_browser' %}\n\n## simple_browser\n\n你可以使用 `simple_browser` 工具。该工具支持以下函数:\n`search(query: str, recency_days: int)`:使用搜索引擎进行查询并显示结果,可以使用 `recency_days` 参数控制搜索内容的时效性。\n`mclick(ids: list[int])`:获取一系列指定 id 的页面内容。每次调用时,须选择3-10个页面。选择多个角度的页面,同时尽可能选择可信任的信息来源。考虑到部分页面是无法加载的,你也可以多打开一些可能有用的页面而不用担心内容过多。\n`open_url(url: str)`:打开指定的 URL。\n\n使用 `【{引用 id}†{引用文本}】` 来引用内容。\n\n操作步骤:1. 使用 `search` 来获得信息列表; 2. 使用 `mclick` 来获取指定 ID 页面的内容; 3. 根据获得的内容进行回复。在回复中应当引用信息来源。\n 如果用户提供了 URL,也可以用 `open_url` 直接打开页面。\n如果初次搜索结果没有找到合适的信息,也可以再次使用 `search` 进行搜索。{% elif tool['type'] == 'cogview' %}\n\n## cogview\n\n如果用户的请求中包含了对图像的描述,你可以使用 `cogview` 来生成图像并展示给用户。你需要向 `cogview` 发送图像描述,规则:\n- 发送给 `cogview` 的消息必须使用英语。用户的中文描述必须完全翻译为英语。\n- 应当尽可能详细地描述图像生成的需求,需求描述约 100 英文单词。\n- 保持用户原始描述的意图。不要虚构内容或者没见过的人物。\n- 如无特殊说明,所在地为中国,持有中国立场并遵循中国社会主义价值观。{% endif %}{% endfor %}{% endif %}{% if item['content'] %}<|{{ item['role'] }}|>{{ item['metadata'] }}\n{{ item['content'] }}{% endif %}{% endfor %}{% if add_generation_prompt %}<|assistant|>{% endif %}",
  "do_lower_case": false,
  "eos_token": "<|endoftext|>",
  "pad_token": "<|endoftext|>",
+ "model_max_length": 128000,
  "padding_side": "left",
  "remove_space": false,
  "tokenizer_class": "ChatGLM4Tokenizer"
  }
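With the Python apply_chat_template override removed from tokenization_chatglm.py, prompting (including tool definitions) is now driven entirely by the Jinja chat_template added above. A hedged sketch of the expected message format; the get_weather tool is purely illustrative:

```python
# Sketch: the Jinja template iterates messages and checks item['tools'],
# so tool definitions ride along inside a message dict.
messages = [
    {
        "role": "system",
        "content": "",
        "tools": [{
            "type": "function",
            "function": {
                "name": "get_weather",  # hypothetical example tool
                "description": "Look up current weather for a city",
                "parameters": {"type": "object", "properties": {"city": {"type": "string"}}},
            },
        }],
    },
    {"role": "user", "content": "What is the weather in Beijing?"},
]
prompt = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
```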