eson committed
Commit bbefe94
1 Parent(s): 24b4aa5

add olmo tokenizer

Files changed (3)
  1. requirements.txt +2 -1
  2. vocab/__init__.py +24 -15
  3. vocab/olmo_7b/__init__.py +4 -0
requirements.txt CHANGED
@@ -5,4 +5,5 @@ icetk
  torch
  zhon
  nltk
- boto3
+ boto3
+ ai2-olmo
vocab/__init__.py CHANGED
@@ -17,14 +17,18 @@ tokenizer.implementation = TokenizerImpl.SentencePiece.name # https://github.c

  - bert
  - features
+ - vocab: tokens beginning with ## denote subwords
  - examples:
- - gpt2
- - features:
- - vocab:
-
  - sentencepiece:
- - features: .sp_model is of type SentencePieceProcessor, sp_model.id_to_piece; has tokenizer.json and tokenizer.model; vocab characters include ▁,
- - examples: llama, baichuan
+ - features:
+ - training:
+ - files: *.sp_model or *.model (optional file: .vocab)
+ - implementation:
+ - training: `import sentencepiece as spm; spm.SentencePieceTrainer.train` or `spm_train`
+ - loading: `import sentencepiece as spm; spm.SentencePieceProcessor().Load(vocab_file)`
+ - methods: is of type SentencePieceProcessor, sp_model.id_to_piece; has tokenizer.json and tokenizer.model,
+ - vocab: vocabulary characters include ▁ (U+2581), marking a space or the start of a sentence.
+ - examples: llama, baichuan, orion
  - icetk: a fork of sentencepiece; supports image_tokenizer
  - glm, chatglm1, chatglm2
  - tiktoken
@@ -32,6 +36,10 @@ tokenizer.implementation = TokenizerImpl.SentencePiece.name # https://github.c
  - features:
  - files: tokenizer.json (contains the contents of the latter two files), merges.txt, vocab.json
  - added_tokens do not necessarily appear in the vocab.
+ - implementation:
+ - training:
+ - loading:
+ - methods:
  - .model is of type tokenizer.models.BPE
  - vocab has tokens beginning with Ġ "\u0120"
  - .model.from_file .model.save .model.token_to_id .model.tokenize
@@ -116,7 +124,7 @@ all_tokenizers = [
  "code_davinci_002",
  "gpt_35_turbo",
  "gpt_4",
-
+
  # uncategorized
  "skywork_13b_base",
  "skywork_13b_math",
@@ -141,20 +149,21 @@ all_tokenizers = [
  "switch_c_2048",
  "byt5_small",
  "mt5_large",
- "wizardcoder_python_7b_v1",
- "wizardlm_7b_v1",
- "wizardmath_70b_v1",
- "tigerbot_70b_chat_v4_4k",
- "tigerbot_13b_chat_v2",
- "deepseek_coder_33b_instruct",
- "deepseek_llm_7b_base",
+ "wizardcoder_python_7b_v1",
+ "wizardlm_7b_v1",
+ "wizardmath_70b_v1",
+ "tigerbot_70b_chat_v4_4k",
+ "tigerbot_13b_chat_v2",
+ "deepseek_coder_33b_instruct",
+ "deepseek_llm_7b_base",
  "gemma_7b",
-
+ "olmo_7b",

  ]

  all_tokenizers = sorted(all_tokenizers)

+
  class TokenizerType(Enum):
  """
  - https://huggingface.co/docs/transformers/tokenizer_summary
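
The expanded docstring above describes the sentencepiece workflow (train via `spm.SentencePieceTrainer.train`, load via `SentencePieceProcessor().Load`, inspect pieces with `id_to_piece`, a ▁-prefixed vocabulary). The following is a minimal, hedged illustration, not part of the commit; the corpus path, model prefix, and vocab size are placeholder assumptions.

    import sentencepiece as spm

    # Train a toy model (assumes a small plain-text file at ./corpus.txt;
    # vocab_size must be small enough for the corpus or training fails).
    spm.SentencePieceTrainer.train(
        input="corpus.txt", model_prefix="toy", vocab_size=200, model_type="bpe"
    )

    # Load the resulting *.model file and inspect the vocabulary.
    sp = spm.SentencePieceProcessor()
    sp.Load("toy.model")
    print(sp.encode("Hello world", out_type=str))  # pieces use ▁ (U+2581) where a space / sentence start was
    print(sp.id_to_piece(3))                       # map an id back to its piece

For the Ġ-prefixed BPE vocabularies described in the second hunk, a fast tokenizer from transformers exposes the underlying tokenizers.models.BPE object; gpt2 is used here only as a familiar stand-in, not something this repo requires.

    from transformers import AutoTokenizer

    gpt2 = AutoTokenizer.from_pretrained("gpt2")
    print(gpt2.tokenize("Hello world"))   # e.g. ['Hello', 'Ġworld']; Ġ is "\u0120"
    bpe = gpt2.backend_tokenizer.model    # a tokenizers.models.BPE instance
    print(bpe.token_to_id("Ġworld"))      # the .model.token_to_id lookup mentioned in the docstring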
vocab/olmo_7b/__init__.py ADDED
@@ -0,0 +1,4 @@
+
+ from transformers import AutoTokenizer
+
+ tokenizer = AutoTokenizer.from_pretrained("allenai/OLMo-7B")
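
A note on the new module: when this commit was made, OLMo was not yet built into transformers itself, which is presumably why `ai2-olmo` is added to requirements.txt above; installing it provides an `hf_olmo` shim that registers OLMo's tokenizer and model classes with the Auto* loaders. A hedged usage sketch, not part of the commit (the `hf_olmo` import, the `trust_remote_code` alternative, and the sample text are assumptions):

    import hf_olmo  # noqa: F401  (from the ai2-olmo package; importing it registers OLMo with transformers' Auto* classes)
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("allenai/OLMo-7B")
    # Alternative without hf_olmo:
    # tokenizer = AutoTokenizer.from_pretrained("allenai/OLMo-7B", trust_remote_code=True)

    ids = tokenizer.encode("Hello OLMo")
    print(tokenizer.convert_ids_to_tokens(ids))  # the raw BPE pieces
    print(tokenizer.decode(ids))                 # back to a string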