xu-song committed
Commit 5425d5d
Parent: e6543ac

add more tokenizer

requirements.txt CHANGED
@@ -4,4 +4,5 @@ tiktoken
 icetk
 torch
 zhon
-nltk
+nltk
+boto3
util.py CHANGED
@@ -35,7 +35,7 @@ def tokenize(text, tokenizer_type, color_num=5):
         token_str = token.decode("utf-8")
     except:
         token_str = token.decode("utf-8", errors="ignore")
-        logger.error("decode_error: " + json.dumps(  # gpt_35_turbo often produces tokens that fail to decode; log them here
+        logger.error(f"{idx}: decode_error: " + json.dumps(  # gpt_35_turbo often produces tokens that fail to decode; log them here
             {"tokenizer_type": tokenizer_type, "token": str(token), "token_str": token_str},
             ensure_ascii=False))

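The decode errors being logged come from byte-level BPE: a multi-byte UTF-8 character can be split across several tokens, so a single token's bytes may not decode on their own. A minimal sketch of the failure mode, assuming `tiktoken` is installed (the sample string is only illustrative):

```python
# Illustrative sketch (assumption: tiktoken is installed). A single byte-level
# BPE token may hold only part of a multi-byte UTF-8 character, which is why
# util.py falls back to errors="ignore" and logs the token.
import tiktoken

enc = tiktoken.encoding_for_model("gpt-3.5-turbo")
for token_id in enc.encode("中文词表"):
    token_bytes = enc.decode_single_token_bytes(token_id)  # raw bytes of one token
    try:
        print(token_id, token_bytes.decode("utf-8"))
    except UnicodeDecodeError:
        # mirrors the logger.error branch in util.py
        print(token_id, token_bytes.decode("utf-8", errors="ignore"), "<- decode error")
```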
vocab/__init__.py CHANGED
@@ -20,6 +20,8 @@ tokenizer.implementation = TokenizerImpl.SentencePiece.name  # https://github.c
 - Examples:
 - gpt2
 - Features:
+- Vocabulary:
+
 - sentencepiece:
 - Features: .sp_model is a SentencePieceProcessor; sp_model.id_to_piece; has tokenizer.json and tokenizer.model; vocab pieces contain ▁
 - Examples: llama, baichuan
@@ -28,11 +30,14 @@ tokenizer.implementation = TokenizerImpl.SentencePiece.name  # https://github.c
 - tiktoken
 - hf_tokenizer
 - Features:
+- Files: tokenizer.json (contains the contents of the next two files), merges.txt, vocab.json
+- added_tokens are not necessarily present in the vocab.
 - .model is of type tokenizer.models.BPE
 - vocab entries start with Ġ "\u0120"
-- one tokenizer.json (including merges and vocab), or split into separate files
 - .model.from_file .model.save .model.token_to_id .model.tokenize
-- Examples: gpt_neox_20b, moss, bloom
+- Examples: gpt2, gpt_neox_20b, moss, bloom, qwen2
+- Building the vocabulary:
+- ss
 - tiktoken
 - Features: spaces stay as spaces
 - Examples: gpt3.5, gpt4, qwen
@@ -76,6 +81,7 @@ all_tokenizers = [
     # "belle_7b_2m",  # model and vocab are both based on bloom
     #
     "gpt_nexo_20b",
+    "qwen1_5_14b_chat",
     # "gpt_neox_chinese_v1",
     #
     # ##### glm series
@@ -134,6 +140,7 @@ all_tokenizers = [
     "mobilenet_v2",
     "switch_c_2048",
     "byt5_small",
+    "mt5_large",
     "wizardcoder_python_7b_v1",
     "wizardlm_7b_v1",
     "wizardmath_70b_v1",
@@ -141,6 +148,7 @@ all_tokenizers = [
     "tigerbot_13b_chat_v2",
     "deepseek_coder_33b_instruct",
     "deepseek_llm_7b_base",
+    "gemma_7b",


 ]
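The implementation notes above can be verified by inspecting the tokenizers directly. A minimal sketch, assuming `transformers`, `tokenizers`, and `sentencepiece` are installed and the hub checkpoints are reachable (`huggyllama/llama-7b` is used here only as a convenient public llama checkpoint, not one of this repo's vocab/ packages):

```python
# Sketch of the differences described above; the checkpoints are assumptions.
from transformers import AutoTokenizer

# HF fast tokenizer (byte-level BPE): the backend model is tokenizers.models.BPE
# and vocab entries mark a leading space with Ġ ("\u0120").
gpt2_tok = AutoTokenizer.from_pretrained("gpt2")
print(type(gpt2_tok.backend_tokenizer.model))                     # tokenizers.models.BPE
print([t for t in gpt2_tok.get_vocab() if t.startswith("Ġ")][:5])

# SentencePiece-backed slow tokenizer: .sp_model is a SentencePieceProcessor
# and pieces mark a leading space with ▁.
llama_tok = AutoTokenizer.from_pretrained("huggyllama/llama-7b", use_fast=False)
print(type(llama_tok.sp_model))                                   # SentencePieceProcessor
print(llama_tok.sp_model.id_to_piece(500))
```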
vocab/gemma_7b/__init__.py ADDED
@@ -0,0 +1,7 @@
+
+
+from transformers import AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("google/gemma-7b", trust_remote_code=True)
+
+tokenizer.comments = ""
vocab/gpt2/__init__.py CHANGED
@@ -8,7 +8,7 @@ from vocab import TokenizerType, TokenizerImpl
 # TOKENIZER_DIR = os.path.join(CURRENT_DIR, "tokenizer")
 # tokenizer = GPT2Tokenizer.from_pretrained(TOKENIZER_DIR)

-tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
+tokenizer = GPT2Tokenizer.from_pretrained("openai-community/gpt2")

 # tokenizer.type = TokenizerType.

vocab/mobilenet_v2/__init__.py CHANGED
@@ -1,2 +1,13 @@
+"""
+## error
+  File "/home/user/app/vocab/mobilenet_v2/__init__.py", line 2, in <module>
+    tokenizer = AutoTokenizer.from_pretrained("google/mobilenet_v2_1.0_224", trust_remote_code=True)
+  File "/home/user/.local/lib/python3.10/site-packages/transformers/models/auto/tokenization_auto.py", line 830, in from_pretrained
+    tokenizer_class_py, tokenizer_class_fast = TOKENIZER_MAPPING[type(config)]
+  File "/home/user/.local/lib/python3.10/site-packages/transformers/models/auto/auto_factory.py", line 748, in __getitem__
+    raise KeyError(key)
+KeyError: <class 'transformers.models.mobilenet_v2.configuration_mobilenet_v2.MobileNetV2Config'>
+"""
+
 from transformers import AutoTokenizer
 tokenizer = AutoTokenizer.from_pretrained("google/mobilenet_v2_1.0_224", trust_remote_code=True)
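As the traceback records, MobileNetV2Config has no entry in AutoTokenizer's TOKENIZER_MAPPING, so the load above still raises KeyError at import time. A hedged sketch of how the failure could be contained (skipping the entry is an assumption, not what this commit does):

```python
# Hypothetical guard, not part of this commit: catch the KeyError documented
# above instead of letting the vision-only checkpoint crash the module import.
from transformers import AutoTokenizer

try:
    tokenizer = AutoTokenizer.from_pretrained(
        "google/mobilenet_v2_1.0_224", trust_remote_code=True
    )
except KeyError as err:
    tokenizer = None  # assumption: treat "no tokenizer registered" as a skipped entry
    print(f"no tokenizer class mapped for this config: {err}")
```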
vocab/mt5_large/__init__.py ADDED
@@ -0,0 +1,5 @@
+
+from transformers import AutoTokenizer
+
+
+tokenizer = AutoTokenizer.from_pretrained("google/mt5-large")
vocab/qwen1_5_14b_chat/README.md ADDED
@@ -0,0 +1,18 @@
+
+
+## eos token
+
+
+```sh
+151643 <|endoftext|>
+151645 <|im_end|>
+```
+
+
+`eos_token_id=[151643, 151645]` in the [chat generation_config.json](https://huggingface.co/Qwen/Qwen1.5-7B-Chat/blob/main/generation_config.json); this file is loaded by `model.generate`. The eos actually used during SFT is `<|im_end|>`; `<|endoftext|>` is not used there.
+
+
+`eos_token_id=151643` in the [base generation_config.json](https://huggingface.co/Qwen/Qwen1.5-14B/blob/main/generation_config.json); `<|endoftext|>` is the eos used during pretraining.
+
+`eos_token_id=151643` in [tokenization_qwen2.py](https://github.com/huggingface/transformers/blob/2a9b1f80c45cab19b542bc7cc004937d39d6f6fb/src/transformers/models/qwen2/tokenization_qwen2.py#L150); `print(tokenizer.eos_token_id)` prints `151643`, which keeps the base model and the chat model compatible.
+
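A quick way to confirm these ids from the hub (a sketch that assumes network access to the `Qwen/Qwen1.5-14B-Chat` repo):

```python
# Sketch (assumes hub access): check the eos ids quoted above.
from transformers import AutoTokenizer, GenerationConfig

tok = AutoTokenizer.from_pretrained("Qwen/Qwen1.5-14B-Chat", trust_remote_code=True)
print(tok.eos_token_id, tok.convert_ids_to_tokens(tok.eos_token_id))  # 151643 <|endoftext|>

gen_cfg = GenerationConfig.from_pretrained("Qwen/Qwen1.5-14B-Chat")
print(gen_cfg.eos_token_id)                         # [151643, 151645] in the chat config
print(tok.convert_ids_to_tokens([151643, 151645]))  # ['<|endoftext|>', '<|im_end|>']
```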
vocab/qwen1_5_14b_chat/__init__.py ADDED
@@ -0,0 +1,8 @@
+
+
+import os
+from transformers import AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen1.5-14B-Chat", trust_remote_code=True)
+
+tokenizer.comments = ""