xu-song committed on
Commit
d27a756
·
1 Parent(s): a37f943

fix chatglm; new feature about add_special_tokens;

Browse files
app.py CHANGED
@@ -59,7 +59,7 @@ with gr.Blocks(css="css/style.css", title="Tokenizer Arena") as demo:
59
  gr.Markdown("## Input Text")
60
  dropdown_examples = gr.Dropdown(
61
  # ["空格测试", "标点测试", "符号测试", "数字测试"],
62
- ["spaces", "punctuations", "symbols", "digits"],
63
  value="Examples",
64
  type="index",
65
  show_label=False,
 
59
  gr.Markdown("## Input Text")
60
  dropdown_examples = gr.Dropdown(
61
  # ["空格测试", "标点测试", "符号测试", "数字测试"],
62
+ ["space", "punctuation", "symbol", "number"],
63
  value="Examples",
64
  type="index",
65
  show_label=False,
config.py CHANGED
@@ -1,3 +1,2 @@
1
-
2
-
3
- USE_REMOTE = False
 
1
+ USE_REMOTE = False
2
+ ADD_SPECIAL_TOKEN = False
 
examples.py CHANGED
@@ -2,9 +2,9 @@ examples = {
2
  "en": [
3
  ["spaces: 2spaces 8spaces\t1tab\t\t2tab\n1newline", "llama", "chatglm2_6b"], # chatglm 有blank_n,
4
  # !?。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏.
5
- ["punctuations: ,.:/?+=\",。!?;【】〔〕〖〗", "baichuan", "llama"],
6
- ["symbols: 🦙❤❥웃유♋☮✊☏☢☚✔☑♚▢♪✈✞÷↑↓▤▥⊙■□▣▽¿─│♥❣▬▫☿Ⓐ ✋✉☣☤", "baichuan", "llama"],
7
- ["digits: (10086 + 98) = 100184", "baichuan", "llama"]
8
  ]
9
  ,
10
  "zh": [
 
2
  "en": [
3
  ["spaces: 2spaces 8spaces\t1tab\t\t2tab\n1newline", "llama", "chatglm2_6b"], # chatglm 有blank_n,
4
  # !?。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏.
5
+ ["punctuation: ,.:/?+=\",。!?;【】〔〕〖〗", "baichuan", "llama"],
6
+ ["symbol: 🦙❤❥웃유♋☮✊☏☢☚✔☑♚▢♪✈✞÷↑↓▤▥⊙■□▣▽¿─│♥❣▬▫☿Ⓐ ✋✉☣☤", "baichuan", "llama"],
7
+ ["number: (10086 + 98) = 100184", "baichuan", "llama"]
8
  ]
9
  ,
10
  "zh": [
js/onload.js CHANGED
@@ -3,7 +3,7 @@ function() {
3
  //$("footer a")["href"] = "https://github.com/xu-song/tokenizer-arena/issues"
4
  //$("footer a").childNodes[0].textContent ="Send Feedback"
5
 
6
- document.querySelectorAll("footer a")[0].childNodes[0].textContent ="Send Feedback";
7
  document.querySelectorAll("footer a")[0].href = "https://github.com/xu-song/tokenizer-arena/issues";
8
 
9
  // download button
 
3
  //$("footer a")["href"] = "https://github.com/xu-song/tokenizer-arena/issues"
4
  //$("footer a").childNodes[0].textContent ="Send Feedback"
5
 
6
+ document.querySelectorAll("footer a")[0].childNodes[0].textContent ="Send Feedback"; // 🤔Reporting Issues
7
  document.querySelectorAll("footer a")[0].href = "https://github.com/xu-song/tokenizer-arena/issues";
8
 
9
  // download button
util.py CHANGED
@@ -2,6 +2,7 @@ import gradio as gr
2
  import json
3
  import socket
4
  import pandas as pd
 
5
  from vocab import load_tokener
6
  from utils.zh_util import iter_vocab
7
  from utils.log_util import logger
@@ -16,7 +17,10 @@ def tokenize(text, tokenizer_type, color_num=5):
16
  logger.info("param=" + json.dumps({"text": text, "tokenizer_type": tokenizer_type}, ensure_ascii=False))
17
  pos_tokens = []
18
  tokenizer = load_tokener(tokenizer_type)
19
- encoding = tokenizer.encode(text)
 
 
 
20
 
21
  table = []
22
 
 
2
  import json
3
  import socket
4
  import pandas as pd
5
+ import config
6
  from vocab import load_tokener
7
  from utils.zh_util import iter_vocab
8
  from utils.log_util import logger
 
17
  logger.info("param=" + json.dumps({"text": text, "tokenizer_type": tokenizer_type}, ensure_ascii=False))
18
  pos_tokens = []
19
  tokenizer = load_tokener(tokenizer_type)
20
+ if config.ADD_SPECIAL_TOKEN:
21
+ encoding = tokenizer.encode(text, add_special_tokens=True)
22
+ else:
23
+ encoding = tokenizer.encode(text, add_special_tokens=False)
24
 
25
  table = []
26
 
utils/compress_rate_util.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+
3
+
4
+ 中文数据
5
+ 英文数据:
6
+
7
+
8
+
9
+ """
utils/speed_util.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ """
2
+ 分词速度
3
+ """
vocab/chatglm_6b/chatglm_6b/tokenization_chatglm.py CHANGED
@@ -195,6 +195,7 @@ class ChatGLMTokenizer(PreTrainedTokenizer):
195
  padding_side="left",
196
  **kwargs
197
  ) -> None:
 
198
  super().__init__(
199
  do_lower_case=do_lower_case,
200
  remove_space=remove_space,
@@ -212,7 +213,7 @@ class ChatGLMTokenizer(PreTrainedTokenizer):
212
  self.mask_token = mask_token
213
  self.gMASK_token = gmask_token
214
 
215
- self.sp_tokenizer = SPTokenizer(vocab_file)
216
 
217
  """ Initialisation """
218
 
 
195
  padding_side="left",
196
  **kwargs
197
  ) -> None:
198
+ self.sp_tokenizer = SPTokenizer(vocab_file)
199
  super().__init__(
200
  do_lower_case=do_lower_case,
201
  remove_space=remove_space,
 
213
  self.mask_token = mask_token
214
  self.gMASK_token = gmask_token
215
 
216
+
217
 
218
  """ Initialisation """
219
 
vocab/chatglm_6b/test_chatglm.py CHANGED
@@ -33,7 +33,7 @@ from transformers import AutoTokenizer
33
 
34
  os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
35
  # tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True)
36
- tokenizer = AutoTokenizer.from_pretrained("tokenizer", trust_remote_code=True)
37
 
38
 
39
  def encode_text(text):
@@ -105,6 +105,7 @@ def test_tokens():
105
 
106
 
107
  test_tokens()
 
108
 
109
  # tokenizer.sp_tokenizer.text_tokenizer.convert_token_to_id(x) + tokenizer.sp_tokenizer.num_image_tokens
110
 
 
33
 
34
  os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
35
  # tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True)
36
+ tokenizer = AutoTokenizer.from_pretrained("chatglm_6b/", trust_remote_code=True)
37
 
38
 
39
  def encode_text(text):
 
105
 
106
 
107
  test_tokens()
108
+ encode_text("good job d的 算法")
109
 
110
  # tokenizer.sp_tokenizer.text_tokenizer.convert_token_to_id(x) + tokenizer.sp_tokenizer.num_image_tokens
111
 
vocab/gpt_35_turbo/README.md CHANGED
@@ -24,6 +24,14 @@ special_token
24
  {"id": 100276, "token": "<|endofprompt|>", "token_decode": "<|endofprompt|>", "token_len": 15, "zh_count": 0, "space_count": 0, "digit_count": 0, "zh_symbol_count": 0}
25
  ```
26
 
 
 
 
 
 
 
 
 
27
  ## 词典文件
28
 
29
 
 
24
  {"id": 100276, "token": "<|endofprompt|>", "token_decode": "<|endofprompt|>", "token_len": 15, "zh_count": 0, "space_count": 0, "digit_count": 0, "zh_symbol_count": 0}
25
  ```
26
 
27
+ 汉字+符号
28
+ ```
29
+ {"id": 39045, "token": ",请", "token_decode": ",请", "token_len": 2, "zh_count": 1, "space_count": 0, "digit_count": 0, "zh_symbol_count": 0}
30
+ ```
31
+
32
+
33
+
34
+
35
  ## 词典文件
36
 
37
 
vocab/gpt_35_turbo/__init__.py CHANGED
@@ -1,4 +1,6 @@
1
-
 
 
2
 
3
  import tiktoken
4
  from tiktoken import Encoding
@@ -22,17 +24,19 @@ def decode(self, tokens, errors="replace", skip_special_tokens=False):
22
  decode_str = "null"
23
  return decode_str
24
 
 
25
  def convert_ids_to_tokens(self, tokens, skip_special_tokens=False):
26
  """
27
  为什么没有这个方法?
28
  """
29
  try:
30
- return tokenizer.decode_tokens_bytes(tokens)
31
  except:
32
  # 什么要返回None?见zh_util.py
33
  # 16个空闲id, 100256 100261-100275
34
  return [None for token in tokens]
35
 
 
36
  def get_vocab(self, token_type="str"):
37
  """Returns vocab as a dict
38
  :param token_type: ["str", "byte"]
@@ -59,10 +63,17 @@ def get_vocab(self, token_type="str"):
59
  return vocab
60
 
61
 
 
 
 
 
 
 
 
62
 
63
  # tiktoken patch
 
 
64
  Encoding.decode = decode
65
  Encoding.convert_ids_to_tokens = convert_ids_to_tokens
66
  Encoding.get_vocab = get_vocab
67
-
68
-
 
1
+ """
2
+ ,请
3
+ """
4
 
5
  import tiktoken
6
  from tiktoken import Encoding
 
24
  decode_str = "null"
25
  return decode_str
26
 
27
+
28
  def convert_ids_to_tokens(self, tokens, skip_special_tokens=False):
29
  """
30
  为什么没有这个方法?
31
  """
32
  try:
33
+ return self.decode_tokens_bytes(tokens)
34
  except:
35
  # 什么要返回None?见zh_util.py
36
  # 16个空闲id, 100256 100261-100275
37
  return [None for token in tokens]
38
 
39
+
40
  def get_vocab(self, token_type="str"):
41
  """Returns vocab as a dict
42
  :param token_type: ["str", "byte"]
 
63
  return vocab
64
 
65
 
66
def encode(self, *args, **kwargs):
    """Encode text, ignoring Hugging Face-only keyword arguments.

    Callers written against the Hugging Face tokenizer interface pass
    ``add_special_tokens=...``. The original encoder (kept as
    ``self._encode``) does not take that keyword, so it is dropped here
    before delegating. The singular spelling ``add_special_token`` is
    dropped as well, for backward compatibility with earlier callers.

    :param args: positional arguments forwarded unchanged to ``_encode``.
    :param kwargs: keyword arguments forwarded to ``_encode`` after the
        compatibility-only keys are removed.
    :return: the result of ``self._encode``.
    """
    for hf_only_key in ("add_special_tokens", "add_special_token"):
        kwargs.pop(hf_only_key, None)
    return self._encode(*args, **kwargs)
72
+
73
 
74
  # tiktoken patch
75
+ Encoding._encode = Encoding.encode
76
+ Encoding.encode = encode
77
  Encoding.decode = decode
78
  Encoding.convert_ids_to_tokens = convert_ids_to_tokens
79
  Encoding.get_vocab = get_vocab
 
 
vocab/gpt_35_turbo/decode_test.py CHANGED
@@ -1,6 +1,13 @@
1
 
2
  from vocab.gpt_35_turbo import tokenizer
3
 
4
- print(tokenizer.decode([100256]))
5
 
 
 
 
 
 
 
 
 
6
  print(tokenizer.convert_ids_to_tokens([100256]))
 
1
 
2
  from vocab.gpt_35_turbo import tokenizer
3
 
 
4
 
5
+ text = "你好,请告诉我聚乙烯是什么"
6
+ encoding = tokenizer.encode(text)
7
+
8
+
9
+ print(tokenizer.decode([6744]))
10
+ print(tokenizer.convert_ids_to_tokens([6744]))
11
+
12
+ print(tokenizer.decode([100256]))
13
  print(tokenizer.convert_ids_to_tokens([100256]))
vocab/gpt_35_turbo/test_tiktoken.py CHANGED
@@ -12,7 +12,9 @@ import tiktoken
12
 
13
 
14
  tokenizer = tiktoken.encoding_for_model('gpt-3.5-turbo')
15
- encoding = tokenizer.encode("a bcjik今天天气颗粒剂范大将军发卡卡萨")
 
 
16
  decoding_bytes = tokenizer.decode_tokens_bytes(encoding)
17
  print(encoding)
18
  print(decoding_bytes)
 
12
 
13
 
14
  tokenizer = tiktoken.encoding_for_model('gpt-3.5-turbo')
15
+ text = "你好,请告诉我聚乙烯是什么"
16
+ # text = "a bcjik今天天气颗粒剂范大将军发卡卡萨"
17
+ encoding = tokenizer.encode(text)
18
  decoding_bytes = tokenizer.decode_tokens_bytes(encoding)
19
  print(encoding)
20
  print(decoding_bytes)