duzx16 committed on
Commit
d493e51
1 Parent(s): dba7772

Update apply_chat_template

Browse files
Files changed (1) hide show
  1. tokenization_chatglm.py +6 -6
tokenization_chatglm.py CHANGED
@@ -63,22 +63,22 @@ class ChatGLM4Tokenizer(PreTrainedTokenizer):
63
  vocab.update(self.added_tokens_encoder)
64
  return vocab
65
 
66
def convert_tokens_to_string(self, tokens: List[Union[bytes, str, int]]) -> str:
    """
    Converts a sequence of tokens in a single string.

    Consecutive ``bytes`` tokens are buffered and decoded together as UTF-8, so a
    multi-byte character split across several tokens is reassembled before
    decoding. ``str`` tokens are appended verbatim and ``int`` tokens are first
    converted with ``chr``. Bytes that cannot be decoded are replaced with U+FFFD.

    Args:
        tokens: The tokens to join, in output order.

    Returns:
        The concatenated text.

    Raises:
        TypeError: If a token is not an int, bytes or str.
    """
    pieces = []
    pending = b""
    for t in tokens:
        if isinstance(t, int):
            t = chr(t)
        if isinstance(t, str):
            if pending:
                pieces.append(pending.decode("utf-8", errors="replace"))
                # Clear the buffer once flushed; otherwise the same bytes would
                # be decoded and emitted again at the next flush.
                pending = b""
            # The string token itself must be part of the output.
            pieces.append(t)
        elif isinstance(t, bytes):
            pending += t
        else:
            raise TypeError("token should only be of type int, bytes or str")
    if pending:
        pieces.append(pending.decode("utf-8", errors="replace"))
    return "".join(pieces)
@@ -168,7 +168,8 @@ class ChatGLM4Tokenizer(PreTrainedTokenizer):
168
  for item in conversation:
169
  if item.get("tools"):
170
  tools = item["tools"]
171
- content = "你是一个名为 GLM-4 的人工智能助手。你是基于智谱AI训练的语言模型 GLM-4 模型开发的,你的任务是针对用户的问题和要求提供适当的答复和支持。"
 
172
  for tool in tools:
173
  if tool["type"] == "function":
174
  function = tool["function"]
@@ -203,7 +204,6 @@ class ChatGLM4Tokenizer(PreTrainedTokenizer):
203
  input_ids.extend([self.convert_tokens_to_ids("<|assistant|>")])
204
  else:
205
  input_message += "<|assistant|>"
206
-
207
  return input_ids if tokenize else input_message
208
 
209
  # Main logic to handle different conversation formats
 
63
  vocab.update(self.added_tokens_encoder)
64
  return vocab
65
 
66
def convert_tokens_to_string(self, tokens: List[Union[bytes, str]]) -> str:
    """
    Converts a sequence of tokens in a single string.

    Consecutive ``bytes`` tokens are buffered and decoded together as UTF-8, so a
    multi-byte character split across several tokens is reassembled before
    decoding. ``str`` tokens are appended verbatim. Bytes that cannot be decoded
    are replaced with U+FFFD.

    Args:
        tokens: The tokens to join, in output order.

    Returns:
        The concatenated text.

    Raises:
        TypeError: If a token is neither bytes nor str.
    """
    pieces = []
    pending = b""
    for t in tokens:
        if isinstance(t, str):
            if pending:
                # Flush buffered bytes before the string so ordering is kept.
                pieces.append(pending.decode("utf-8", errors="replace"))
                pending = b""
            pieces.append(t)
        elif isinstance(t, bytes):
            pending += t
        else:
            raise TypeError("token should only be of type bytes or str")
    if pending:
        pieces.append(pending.decode("utf-8", errors="replace"))
    return "".join(pieces)
 
168
  for item in conversation:
169
  if item.get("tools"):
170
  tools = item["tools"]
171
+ content = "你是一个名为 GhatGLM 的人工智能助手。你是基于智谱AI训练的语言模型 GLM-4 模型开发的,你的任务是针对用户的问题和要求提供适当的答复和支持。"
172
+ content += "\n\n# 可用工具"
173
  for tool in tools:
174
  if tool["type"] == "function":
175
  function = tool["function"]
 
204
  input_ids.extend([self.convert_tokens_to_ids("<|assistant|>")])
205
  else:
206
  input_message += "<|assistant|>"
 
207
  return input_ids if tokenize else input_message
208
 
209
  # Main logic to handle different conversation formats