eson committed
Commit adcfb97
1 Parent(s): 44c3329

fix tiktoken special tokens

tokenizer/tiktoken_patch.py CHANGED
@@ -6,6 +6,8 @@ def decode(self, tokens, errors="replace", skip_special_tokens=False):
     """
     The default decode may raise an error; see decode_test.py for details.
     skip_special_tokens exists for compatibility with hf_tokenizer.
+
+    What is the difference between errors=replace, ignore, and strict?
     """
     try:
         decode_str = self._core_bpe.decode_bytes(tokens).decode("utf-8", errors=errors)
@@ -58,6 +60,7 @@ def encode(self, *args, **kwargs):
     add_special_tokens exists for compatibility with hf_tokenizer.
     """
     kwargs.pop("add_special_tokens", None)
+    kwargs["allowed_special"] = "all"
     return self._encode(*args, **kwargs)


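The two added lines above are the substance of the fix. Below is a minimal sketch, not part of the commit, of the behaviour they touch, using the public tiktoken API; the encoding name "cl100k_base" and the sample strings are arbitrary placeholders.

import tiktoken

enc = tiktoken.get_encoding("cl100k_base")

# By default tiktoken refuses text that contains a special token such as
# <|endoftext|> (it raises ValueError); allowed_special="all", which the patch
# forces, encodes the special token to its reserved id instead.
ids = enc.encode("hello <|endoftext|>", allowed_special="all")
print(ids)

# errors= only matters when the token ids split a multi-byte UTF-8 character,
# which byte-level BPE can do; dropping one byte mid-character shows the modes:
broken = enc.decode_bytes(enc.encode("你好"))[:-1]
print(broken.decode("utf-8", errors="replace"))   # invalid bytes become U+FFFD
print(broken.decode("utf-8", errors="ignore"))    # invalid bytes are dropped
# broken.decode("utf-8", errors="strict") raises UnicodeDecodeError
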
vocab/__init__.py CHANGED
@@ -15,11 +15,11 @@ tokenizer.type = TokenizerType.ByteBPE.name
 tokenizer.implementation = TokenizerImpl.SentencePiece.name  # https://github.com/facebookresearch/llama/blob/main/llama/tokenizer.py
 "HFGPT2Tokenizer", "HFTokenizer", "GPT2BPETokenizer", "CharLevelTokenizer", "TiktokenTokenizer", "SPMTokenizer", https://github.com/EleutherAI/gpt-neox/blob/main/tools/preprocess_data.py
 
-- bert
+- google/bert
   - Features
     - Vocabulary: tokens starting with ## denote subwords
   - Examples:
-- sentencepiece:
+- google/sentencepiece:
   - Features:
   - Training:
   - Files: *.sp_model or *.model (optional .vocab file)
@@ -28,10 +28,10 @@ tokenizer.implementation = TokenizerImpl.SentencePiece.name # https://github.c
   - Loading: `import sentencepiece as spm; spm.SentencePieceProcessor().Load(vocab_file)`
   - Methods: a SentencePieceProcessor instance; sp_model.id_to_piece; ships tokenizer.json and tokenizer.model
   - Vocabulary: contains the ▁ character (U+2581), which marks a space or the start of a sentence.
-  - Examples: llama, baichuan, orion
+  - Examples: google-t5, llama, baichuan, orion
 - icetk: a fork of sentencepiece that supports image_tokenizer
   - glm, chatglm1, chatglm2
-- tiktoken
+- openai/tiktoken
 - hf_tokenizer
   - Features:
     - Files: tokenizer.json (contains the content of the latter two files), merges.txt, vocab.json
@@ -65,102 +65,103 @@ uniq_tokenizers = [
     ""
 ]
 
+# TODO: alias/abbr, hf_path, tokenizer_class, comments,
 all_tokenizers = [
-    "gpt2",
-    "gpt2_chinese",
-
-    # bert family
-    "bert_base_cased",
-    "bert_base_uncased",
-    "bert_base_chinese",
-    "roberta_chinese_clue",
-    "kplug",
-
-    # gpt2 family
-    "moss",
-    #
-    # ######
-    "chatyuan_large_v2",
-    "prompt_clue",
-    #
-    # #### bloom family
-    "bloom",
-    # "bloomz_6b4_zh",
-    # "belle_7b_2m",  # both the model and the vocab are based on bloom
+    ##### bert family
+    ("bert_base_cased", "", ""),
+    ("bert_base_uncased", "",),
+    ("bert_base_chinese",),
+    ("roberta_chinese_clue",),
+    ("kplug",),
+    ("gpt2_chinese",),
+
+    ##### GPT2Tokenizer
+    ("gpt2",),  #
+    ("moss",),
+    ("bloom",),
+    # ("bloomz_6b4_zh",
+    # ("belle_7b_2m",  # both the model and the vocab are based on bloom
     #
-    "gpt_nexo_20b",
-    "qwen1_5_14b_chat",
-    # "gpt_neox_chinese_v1",
-    #
-    # ##### glm family
-    # "glm_chinese",
-    "chatglm_6b",
-    "chatglm2_6b",
-    "chatglm3_6b",
-    #
-    # #### llama / alpaca family
-    "llama",  # Chinese single-character tokens: 700, multi-character tokens: 0
-    "llama2",
-    "chinese_llama",  #
-    "chinese_llama2",  #
-    # "chinese_alpaca_lora_7b",  # the Chinese Alpaca model further fine-tunes the Chinese LLaMA model above on instruction data
-    # "belle_llama_ext_7b",
-    # "alpaca_7b",
-    "baichuan",
-    "baichuan2",
-    "internlm_chat_7b",
-    "internlm2_chat_7b",
-    "internlm2_math_7b",
-    "internlm_xcomposer_7b",
-    "falcon_7b",
-    "falcon_180b",
+    ("gpt_nexo_20b",),  # ~50k vocab
+    ("qwen1_5_14b_chat",),  # ~150k vocab, a bit slow
+    ("starchat_alpha",),
+
+    ####### google/sentencepiece tokenizer:
+    # T5 llama internlm
+    ("t5_small",),
+    ("t5_base",),
+    ("t5_large",),
+    ("chatyuan_large_v2",),
+    ("prompt_clue",),
+
+    ("llama",),  # Chinese single-character tokens: 700, multi-character tokens: 0
+    ("llama2",),
+    ("chinese_llama",),  #
+    ("chinese_llama2",),  #
+    # ("chinese_alpaca_lora_7b",  # the Chinese Alpaca model further fine-tunes the Chinese LLaMA model above on instruction data
+    # ("belle_llama_ext_7b",
+    # ("alpaca_7b",
+    ("baichuan",),
+    ("baichuan2",),
+    ("internlm_chat_7b",),
+    ("internlm2_chat_7b",),
+    ("internlm2_math_7b",),
+    ("internlm_xcomposer_7b",),
+    ("falcon_7b",),
+    ("falcon_180b",),
     # "goat",
 
+    # ##### glm family
+    # "glm_chinese",),
+    ("chatglm_6b",),
+    ("chatglm2_6b",),
+    ("chatglm3_6b",),
+
+
     # tiktoken family
-    "qwen_1_8b_chat",
-    "qwen_7b_chat",
-    "qwen_72b_chat",
-    "text_davinci_003",
-    "code_davinci_002",
-    "gpt_35_turbo",
-    "gpt_4",
+    ("qwen_1_8b_chat",),
+    ("qwen_7b_chat",),
+    ("qwen_72b_chat",),
+    ("text_davinci_003",),
+    ("code_davinci_002",),
+    ("gpt_35_turbo",),
+    ("gpt_4",),
 
     # uncategorized
-    "skywork_13b_base",
-    "skywork_13b_math",
-    "mistral_7b",
-    "mixtral_8_7b",
-    "t5_small",
-    "t5_base",
-    "t5_large",
-    "flan_t5_base",
-    "fastchat_t5_3b",
-    "pko_t5_large",
-    "wizardcoder_15b_v1",
-    "yi_6b",
-    "yi_34b",
-    "yi_vl34b",
-    "orion_14b_chat",
-    "phi_1",
-    "phi_2",
-    "solar_10_7b",
-    "mobilebert_uncased",
-    "mobilenet_v2",
-    "switch_c_2048",
-    "byt5_small",
-    "mt5_large",
-    "wizardcoder_python_7b_v1",
-    "wizardlm_7b_v1",
-    "wizardmath_70b_v1",
-    "tigerbot_70b_chat_v4_4k",
-    "tigerbot_13b_chat_v2",
-    "deepseek_coder_33b_instruct",
-    "deepseek_llm_7b_base",
-    "gemma_7b",
-    "olmo_7b",
-    "aya_101",
+    ("skywork_13b_base",),
+    ("skywork_13b_math",),
+    ("mistral_7b",),
+    ("mixtral_8_7b",),
+
+    ("flan_t5_base",),
+    ("fastchat_t5_3b",),
+    ("pko_t5_large",),
+    ("wizardcoder_15b_v1",),
+    ("yi_6b",),
+    ("yi_34b",),
+    ("yi_vl34b",),
+    ("orion_14b_chat",),
+    ("phi_1",),
+    ("phi_2",),
+    ("solar_10_7b",),
+    ("mobilebert_uncased",),
+    ("mobilenet_v2",),
+    ("switch_c_2048",),
+    ("byt5_small",),
+    ("mt5_large",),
+    ("wizardcoder_python_7b_v1",),
+    ("wizardlm_7b_v1",),
+    ("wizardmath_70b_v1",),
+    ("tigerbot_70b_chat_v4_4k",),
+    ("tigerbot_13b_chat_v2",),
+    ("deepseek_coder_33b_instruct",),
+    ("deepseek_llm_7b_base",),
+    ("gemma_7b",),
+    ("olmo_7b",),
+    ("aya_101",),
 ]
 
+all_tokenizers = [tokenizer[0] for tokenizer in all_tokenizers]
 all_tokenizers = sorted(all_tokenizers)
 
 
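As a companion to the sentencepiece notes in the doc comment above, here is a minimal loading sketch, not part of the commit; "tokenizer.model" stands in for any *.model / *.sp_model file (e.g. the one shipped with llama or baichuan).

import sentencepiece as spm

sp_model = spm.SentencePieceProcessor()
sp_model.Load("tokenizer.model")               # placeholder path

print(sp_model.GetPieceSize())                 # vocabulary size
print(sp_model.IdToPiece(5))                   # pieces use ▁ (U+2581) for space / word start
print(sp_model.EncodeAsPieces("hello world"))  # e.g. ['▁hello', '▁world'] for many models
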
 
vocab/starchat_alpha/__init__.py ADDED
@@ -0,0 +1,5 @@
+
+
+from transformers import AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/starchat-alpha")
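A brief usage sketch, not part of the commit: the object returned by AutoTokenizer.from_pretrained above is an ordinary Hugging Face tokenizer, so the standard calls apply; the sample string is arbitrary.

text = "def print_hello_world():"
print(tokenizer.tokenize(text))   # subword pieces
ids = tokenizer.encode(text)      # token ids
print(tokenizer.decode(ids))      # decode back to text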