add more tokenizers
Browse files
- requirements.txt +2 -1
- util.py +1 -1
- vocab/__init__.py +10 -2
- vocab/gemma_7b/__init__.py +7 -0
- vocab/gpt2/__init__.py +1 -1
- vocab/mobilenet_v2/__init__.py +11 -0
- vocab/mt5_large/__init__.py +5 -0
- vocab/qwen1_5_14b_chat/README.md +18 -0
- vocab/qwen1_5_14b_chat/__init__.py +8 -0
requirements.txt
CHANGED
@@ -4,4 +4,5 @@ tiktoken
|
|
4 |
icetk
|
5 |
torch
|
6 |
zhon
|
7 |
-
nltk
|
|
|
|
4 |
icetk
|
5 |
torch
|
6 |
zhon
|
7 |
+
nltk
|
8 |
+
boto3
|
util.py
CHANGED
@@ -35,7 +35,7 @@ def tokenize(text, tokenizer_type, color_num=5):
|
|
35 |
token_str = token.decode("utf-8")
|
36 |
except:
|
37 |
token_str = token.decode("utf-8", errors="ignore")
|
38 |
-
logger.error("decode_error: " + json.dumps( # gpt_35_turbo 经常有token会decode error,这里用来记录一下
|
39 |
{"tokenizer_type": tokenizer_type, "token": str(token), "token_str": token_str},
|
40 |
ensure_ascii=False))
|
41 |
|
|
|
35 |
token_str = token.decode("utf-8")
|
36 |
except:
|
37 |
token_str = token.decode("utf-8", errors="ignore")
|
38 |
+
logger.error(f"{idx}: decode_error: " + json.dumps( # gpt_35_turbo 经常有token会decode error,这里用来记录一下
|
39 |
{"tokenizer_type": tokenizer_type, "token": str(token), "token_str": token_str},
|
40 |
ensure_ascii=False))
|
41 |
|
vocab/__init__.py
CHANGED
@@ -20,6 +20,8 @@ tokenizer.implementation = TokenizerImpl.SentencePiece.name # https://github.c
|
|
20 |
- 示例:
|
21 |
- gpt2
|
22 |
- 特征:
|
|
|
|
|
23 |
- sentencepiece:
|
24 |
- 特征:.sp_model 是SentencePieceProcessor类型,sp_model.id_to_piece,有tokenizer.json tokenizer.model,词典字符有 ▁,
|
25 |
- 示例:llama,baichuan
|
@@ -28,11 +30,14 @@ tokenizer.implementation = TokenizerImpl.SentencePiece.name # https://github.c
|
|
28 |
- tiktoken
|
29 |
- hf_tokenizer
|
30 |
- 特征:
|
|
|
|
|
31 |
- .model 是 tokenizer.models.BPE 类型
|
32 |
- 词典有 Ġ "\u0120" 开头
|
33 |
-
- 有1个tokenizer.json(包括 merge vocab),或者分开独立文件
|
34 |
- .model.from_file .model.save .model.token_to_id .model.tokenize
|
35 |
-
- 示例:gpt_neox_20b, moss, bloom
|
|
|
|
|
36 |
- tiktoken
|
37 |
- 特征:空格就是空格,
|
38 |
- 示例:gpt3.5 gpt4, qwen,
|
@@ -76,6 +81,7 @@ all_tokenizers = [
|
|
76 |
# "belle_7b_2m", # 模型和词典都基于bloom
|
77 |
#
|
78 |
"gpt_nexo_20b",
|
|
|
79 |
# "gpt_neox_chinese_v1",
|
80 |
#
|
81 |
# ##### glm系列
|
@@ -134,6 +140,7 @@ all_tokenizers = [
|
|
134 |
"mobilenet_v2",
|
135 |
"switch_c_2048",
|
136 |
"byt5_small",
|
|
|
137 |
"wizardcoder_python_7b_v1",
|
138 |
"wizardlm_7b_v1",
|
139 |
"wizardmath_70b_v1",
|
@@ -141,6 +148,7 @@ all_tokenizers = [
|
|
141 |
"tigerbot_13b_chat_v2",
|
142 |
"deepseek_coder_33b_instruct",
|
143 |
"deepseek_llm_7b_base",
|
|
|
144 |
|
145 |
|
146 |
]
|
|
|
20 |
- 示例:
|
21 |
- gpt2
|
22 |
- 特征:
|
23 |
+
- 词典:
|
24 |
+
|
25 |
- sentencepiece:
|
26 |
- 特征:.sp_model 是SentencePieceProcessor类型,sp_model.id_to_piece,有tokenizer.json tokenizer.model,词典字符有 ▁,
|
27 |
- 示例:llama,baichuan
|
|
|
30 |
- tiktoken
|
31 |
- hf_tokenizer
|
32 |
- 特征:
|
33 |
+
- 文件: tokenizer.json(包含后两个文件的内容), merges.txt, vocab.json
|
34 |
+
- added_tokens 在vocab中不一定存在。
|
35 |
- .model 是 tokenizer.models.BPE 类型
|
36 |
- 词典有 Ġ "\u0120" 开头
|
|
|
37 |
- .model.from_file .model.save .model.token_to_id .model.tokenize
|
38 |
+
- 示例:gpt2, gpt_neox_20b, moss, bloom, qwen2
|
39 |
+
- 构造词典:
|
40 |
+
- ss
|
41 |
- tiktoken
|
42 |
- 特征:空格就是空格,
|
43 |
- 示例:gpt3.5 gpt4, qwen,
|
|
|
81 |
# "belle_7b_2m", # 模型和词典都基于bloom
|
82 |
#
|
83 |
"gpt_nexo_20b",
|
84 |
+
"qwen1_5_14b_chat",
|
85 |
# "gpt_neox_chinese_v1",
|
86 |
#
|
87 |
# ##### glm系列
|
|
|
140 |
"mobilenet_v2",
|
141 |
"switch_c_2048",
|
142 |
"byt5_small",
|
143 |
+
"mt5_large",
|
144 |
"wizardcoder_python_7b_v1",
|
145 |
"wizardlm_7b_v1",
|
146 |
"wizardmath_70b_v1",
|
|
|
148 |
"tigerbot_13b_chat_v2",
|
149 |
"deepseek_coder_33b_instruct",
|
150 |
"deepseek_llm_7b_base",
|
151 |
+
"gemma_7b",
|
152 |
|
153 |
|
154 |
]
|
vocab/gemma_7b/__init__.py
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
|
3 |
+
from transformers import AutoTokenizer
|
4 |
+
|
5 |
+
tokenizer = AutoTokenizer.from_pretrained("google/gemma-7b", trust_remote_code=True)
|
6 |
+
|
7 |
+
tokenizer.comments = ""
|
vocab/gpt2/__init__.py
CHANGED
@@ -8,7 +8,7 @@ from vocab import TokenizerType, TokenizerImpl
|
|
8 |
# TOKENIZER_DIR = os.path.join(CURRENT_DIR, "tokenizer")
|
9 |
# tokenizer = GPT2Tokenizer.from_pretrained(TOKENIZER_DIR)
|
10 |
|
11 |
-
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
|
12 |
|
13 |
# tokenizer.type = TokenizerType.
|
14 |
|
|
|
8 |
# TOKENIZER_DIR = os.path.join(CURRENT_DIR, "tokenizer")
|
9 |
# tokenizer = GPT2Tokenizer.from_pretrained(TOKENIZER_DIR)
|
10 |
|
11 |
+
tokenizer = GPT2Tokenizer.from_pretrained("openai-community/gpt2")
|
12 |
|
13 |
# tokenizer.type = TokenizerType.
|
14 |
|
vocab/mobilenet_v2/__init__.py
CHANGED
@@ -1,2 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
from transformers import AutoTokenizer
|
2 |
tokenizer = AutoTokenizer.from_pretrained("google/mobilenet_v2_1.0_224", trust_remote_code=True)
|
|
|
1 |
+
"""
|
2 |
+
## error
|
3 |
+
File "/home/user/app/vocab/mobilenet_v2/__init__.py", line 2, in <module>
|
4 |
+
tokenizer = AutoTokenizer.from_pretrained("google/mobilenet_v2_1.0_224", trust_remote_code=True)
|
5 |
+
File "/home/user/.local/lib/python3.10/site-packages/transformers/models/auto/tokenization_auto.py", line 830, in from_pretrained
|
6 |
+
tokenizer_class_py, tokenizer_class_fast = TOKENIZER_MAPPING[type(config)]
|
7 |
+
File "/home/user/.local/lib/python3.10/site-packages/transformers/models/auto/auto_factory.py", line 748, in __getitem__
|
8 |
+
raise KeyError(key)
|
9 |
+
KeyError: <class 'transformers.models.mobilenet_v2.configuration_mobilenet_v2.MobileNetV2Config'>
|
10 |
+
"""
|
11 |
+
|
12 |
from transformers import AutoTokenizer
|
13 |
tokenizer = AutoTokenizer.from_pretrained("google/mobilenet_v2_1.0_224", trust_remote_code=True)
|
vocab/mt5_large/__init__.py
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
from transformers import AutoTokenizer
|
3 |
+
|
4 |
+
|
5 |
+
tokenizer = AutoTokenizer.from_pretrained("google/mt5-large")
|
vocab/qwen1_5_14b_chat/README.md
ADDED
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
|
3 |
+
## eos token
|
4 |
+
|
5 |
+
|
6 |
+
```sh
|
7 |
+
151643 <|endoftext|>
|
8 |
+
151645 <|im_end|>
|
9 |
+
```
|
10 |
+
|
11 |
+
|
12 |
+
`eos_token_id=[151643, 151645]` [chat generation_config.json](https://huggingface.co/Qwen/Qwen1.5-7B-Chat/blob/main/generation_config.json),在`model.generate`时会加载这个文件。实际sft时的eos是 `<|im_end|>`,并未用到 `<|endoftext|>`。
|
13 |
+
|
14 |
+
|
15 |
+
`eos_token_id=151643` [base generation_config.json](https://huggingface.co/Qwen/Qwen1.5-14B/blob/main/generation_config.json),在预训练阶段用的是`<|endoftext|>`
|
16 |
+
|
17 |
+
`eos_token_id=151643` [tokenization_qwen2.py](https://github.com/huggingface/transformers/blob/2a9b1f80c45cab19b542bc7cc004937d39d6f6fb/src/transformers/models/qwen2/tokenization_qwen2.py#L150),`print(tokenizer.eos_token_id)` 会显示 `151643`,是为了兼容 base model和chat model。
|
18 |
+
|
vocab/qwen1_5_14b_chat/__init__.py
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
|
3 |
+
import os
|
4 |
+
from transformers import AutoTokenizer
|
5 |
+
|
6 |
+
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen1.5-14B-Chat", trust_remote_code=True)
|
7 |
+
|
8 |
+
tokenizer.comments = ""
|