Files changed (3)
  1. README.md +1 -1
  2. tokenization_chatglm.py +32 -7
  3. tokenizer_config.json +19 -0
README.md CHANGED
@@ -4,7 +4,7 @@
  ```python
  >>> from transformers import AutoTokenizer, AutoModel
  >>> tokenizer = AutoTokenizer.from_pretrained("thu-coai/CharacterGLM-6B", trust_remote_code=True)
- >>> model = AutoModel.from_pretrained("LingxinAI/CharacterGLM-6b", trust_remote_code=True, device='cuda')
+ >>> model = AutoModel.from_pretrained("thu-coai/CharacterGLM-6b", trust_remote_code=True, device='cuda')
  >>> model = model.eval()
  >>> session_meta = {'user_info': '我是陆星辰,是一个男性,是一位知名导演,也是苏梦远的合作导演。我擅长拍摄音乐题材的电影。苏梦远对我的态度是尊敬的,并视我为良师益友。', 'bot_info': '苏梦远,本名苏远心,是一位当红的国内女歌手及演员。在参加选秀节目后,凭借独特的嗓音及出众的舞台魅力迅速成名,进入娱乐圈。她外表美丽动人,但真正的魅力在于她的才华和勤奋。苏梦远是音乐学院毕业的优秀生,善于创作,拥有多首热门原创歌曲。除了音乐方面的成就,她还热衷于慈善事业,积极参加公益活动,用实际行动传递正能量。在工作中,她对待工作非常敬业,拍戏时总是全身心投入角色,赢得了业内人士的赞誉和粉丝的喜爱。虽然在娱乐圈,但她始终保持低调、谦逊的态度,深得同行尊重。在表达时,苏梦远喜欢使用“我们”和“一起”,强调团队精神。', 'bot_name': '苏梦远', 'user_name': '陆星辰'}
  >>> response, history = model.chat(tokenizer, session_meta, "你好", history=[])
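The fix points the model load at the same `thu-coai` namespace already used for the tokenizer. As an illustrative follow-up (not part of the model card), the `history` returned by `model.chat` can be passed back in to continue the same session; the second query below is hypothetical:

```python
>>> print(response)
>>> # Hypothetical next turn, reusing the returned history and the same session_meta
>>> response, history = model.chat(tokenizer, session_meta, "最近的新歌进展如何?", history=history)
```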
tokenization_chatglm.py CHANGED
@@ -1,5 +1,5 @@
  import os
- import torch
+ import re
  from typing import List, Optional, Union, Dict
  from sentencepiece import SentencePieceProcessor
  from transformers import PreTrainedTokenizer
@@ -27,9 +27,22 @@ class SPTokenizer:
              self.special_tokens[token] = self.n_words
              self.index_special_tokens[self.n_words] = token
              self.n_words += 1
-
-     def tokenize(self, s: str):
-         return self.sp_model.EncodeAsPieces(s)
+         self.role_special_token_expression = "|".join([re.escape(token) for token in special_tokens])  # for apply_chat_template
+
+     def tokenize(self, s: str, encode_special_tokens=False):
+         if encode_special_tokens:
+             last_index = 0
+             t = []
+             for match in re.finditer(self.role_special_token_expression, s):
+                 if last_index < match.start():
+                     t.extend(self.sp_model.EncodeAsPieces(s[last_index:match.start()]))
+                 t.append(s[match.start():match.end()])
+                 last_index = match.end()
+             if last_index < len(s):
+                 t.extend(self.sp_model.EncodeAsPieces(s[last_index:]))
+             return t
+         else:
+             return self.sp_model.EncodeAsPieces(s)

      def encode(self, s: str, bos: bool = False, eos: bool = False) -> List[int]:
          assert type(s) is str
@@ -41,7 +54,18 @@ class SPTokenizer:
          return t

      def decode(self, t: List[int]) -> str:
-         return self.sp_model.decode(t)
+         text, buffer = "", []
+         for token in t:
+             if token in self.index_special_tokens:
+                 if buffer:
+                     text += self.sp_model.decode(buffer)
+                     buffer = []
+                 text += self.index_special_tokens[token]
+             else:
+                 buffer.append(token)
+         if buffer:
+             text += self.sp_model.decode(buffer)
+         return text

      def decode_tokens(self, tokens: List[str]) -> str:
          text = self.sp_model.DecodePieces(tokens)
@@ -65,7 +89,7 @@ class ChatGLMTokenizer(PreTrainedTokenizer):

      model_input_names = ["input_ids", "attention_mask", "position_ids"]

-     def __init__(self, vocab_file, padding_side="left", clean_up_tokenization_spaces=False, **kwargs):
+     def __init__(self, vocab_file, padding_side="left", clean_up_tokenization_spaces=False, encode_special_tokens=False, **kwargs):
          self.name = "GLMTokenizer"

          self.vocab_file = vocab_file
@@ -75,6 +99,7 @@ class ChatGLMTokenizer(PreTrainedTokenizer):
              "<eos>": self.tokenizer.eos_id,
              "<pad>": self.tokenizer.pad_id
          }
+         self.encode_special_tokens = encode_special_tokens
          super().__init__(padding_side=padding_side, clean_up_tokenization_spaces=clean_up_tokenization_spaces, **kwargs)

      def get_command(self, token):
@@ -110,7 +135,7 @@ class ChatGLMTokenizer(PreTrainedTokenizer):
          return vocab

      def _tokenize(self, text, **kwargs):
-         return self.tokenizer.tokenize(text)
+         return self.tokenizer.tokenize(text, encode_special_tokens=self.encode_special_tokens)

      def _convert_token_to_id(self, token):
          """ Converts a token (str) in an id using the vocab. """
tokenizer_config.json CHANGED
@@ -1,10 +1,29 @@
  {
+   "added_tokens_decoder": {
+     "64790": {
+       "content": "[gMASK]",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "64792": {
+       "content": "sop",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     }
+   },
    "auto_map": {
      "AutoTokenizer": [
        "tokenization_chatglm.ChatGLMTokenizer",
        null
      ]
    },
+   "chat_template": "{% set ns = namespace() %}[gMASK]sop{% for message in messages %}{% if loop.first %}{% set ns.bot_name = message['bot_name'] %}{% set ns.user_name = message['user_name'] %}以下是一段{{ message['bot_name'] }}和{{ message['user_name'] }}之间的对话。{%+ if message['bot_profile'] is defined and message['bot_profile']|length +%}\n关于{{ message['bot_name'] }}的信息:{{ message['bot_profile']|replace('\n', ' ') }}{% endif %}{%+ if message['user_profile'] is defined and message['user_profile']|length +%}\n关于{{ message['user_name'] }}的信息:{{ message['user_profile']|replace('\n', ' ') }}{% endif %}{%+ else +%}\n[{% if message['role'] == 'user' %}{{ ns.user_name }}{% else %}{{ ns.bot_name }}{% endif %}]{{ message['content']|replace('\n', ' ') }}{% endif %}{% endfor %}{%+ if add_generation_prompt +%}\n[{{ ns.bot_name }}]{% endif %}",
    "clean_up_tokenization_spaces": true,
    "do_lower_case": false,
    "model_max_length": 1000000000000000019884624838656,