eson committed
Commit bbefe94
1 Parent(s): 24b4aa5

add olmo tokenizer

Files changed (3)
  1. requirements.txt +2 -1
  2. vocab/__init__.py +24 -15
  3. vocab/olmo_7b/__init__.py +4 -0
requirements.txt CHANGED
@@ -5,4 +5,5 @@ icetk
  torch
  zhon
  nltk
- boto3
+ boto3
+ ai2-olmo
vocab/__init__.py CHANGED
@@ -17,14 +17,18 @@ tokenizer.implementation = TokenizerImpl.SentencePiece.name # https://github.c

  - bert
  - features
+ - vocab: tokens beginning with ## denote subwords
  - examples:
- - gpt2
- - features:
- - vocab:
-
  - sentencepiece:
- - features: .sp_model is of type SentencePieceProcessor, sp_model.id_to_piece; has tokenizer.json and tokenizer.model; vocab characters include ▁,
- - examples: llama, baichuan
+ - features:
+ - training:
+ - files: *.sp_model or *.model (optional file: .vocab)
+ - implementation:
+ - training: `import sentencepiece as spm; spm.SentencePieceTrainer.train` or `spm_train`
+ - loading: `import sentencepiece as spm; spm.SentencePieceProcessor().Load(vocab_file)`
+ - methods: is of type SentencePieceProcessor, sp_model.id_to_piece; has tokenizer.json and tokenizer.model,
+ - vocab: vocabulary characters include ▁ (U+2581), marking a space or the start of a sentence.
+ - examples: llama, baichuan, orion
  - icetk: a fork of sentencepiece; supports image_tokenizer
  - glm, chatglm1, chatglm2
  - tiktoken
@@ -32,6 +36,10 @@ tokenizer.implementation = TokenizerImpl.SentencePiece.name # https://github.c
  - features:
  - files: tokenizer.json (contains the contents of the latter two files), merges.txt, vocab.json
  - added_tokens do not necessarily appear in the vocab.
+ - implementation:
+ - training:
+ - loading:
+ - methods:
  - .model is of type tokenizer.models.BPE
  - vocab has tokens beginning with Ġ "\u0120"
  - .model.from_file .model.save .model.token_to_id .model.tokenize
@@ -116,7 +124,7 @@ all_tokenizers = [
  "code_davinci_002",
  "gpt_35_turbo",
  "gpt_4",
-
+
  # uncategorized
  "skywork_13b_base",
  "skywork_13b_math",
@@ -141,20 +149,21 @@ all_tokenizers = [
  "switch_c_2048",
  "byt5_small",
  "mt5_large",
- "wizardcoder_python_7b_v1",
- "wizardlm_7b_v1",
- "wizardmath_70b_v1",
- "tigerbot_70b_chat_v4_4k",
- "tigerbot_13b_chat_v2",
- "deepseek_coder_33b_instruct",
- "deepseek_llm_7b_base",
+ "wizardcoder_python_7b_v1",
+ "wizardlm_7b_v1",
+ "wizardmath_70b_v1",
+ "tigerbot_70b_chat_v4_4k",
+ "tigerbot_13b_chat_v2",
+ "deepseek_coder_33b_instruct",
+ "deepseek_llm_7b_base",
  "gemma_7b",
-
+ "olmo_7b",

  ]

  all_tokenizers = sorted(all_tokenizers)

+
  class TokenizerType(Enum):
  """
  - https://huggingface.co/docs/transformers/tokenizer_summary
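
The expanded docstring above describes the sentencepiece workflow (train via `spm.SentencePieceTrainer.train`, load via `SentencePieceProcessor().Load`, inspect pieces with `id_to_piece`, a ▁-prefixed vocabulary). The following is a minimal, hedged illustration, not part of the commit; the corpus path, model prefix, and vocab size are placeholder assumptions.

    import sentencepiece as spm

    # Train a toy model (assumes a small plain-text file at ./corpus.txt;
    # vocab_size must be small enough for the corpus or training fails).
    spm.SentencePieceTrainer.train(
        input="corpus.txt", model_prefix="toy", vocab_size=200, model_type="bpe"
    )

    # Load the resulting *.model file and inspect the vocabulary.
    sp = spm.SentencePieceProcessor()
    sp.Load("toy.model")
    print(sp.encode("Hello world", out_type=str))  # pieces use ▁ (U+2581) where a space / sentence start was
    print(sp.id_to_piece(3))                       # map an id back to its piece

For the Ġ-prefixed BPE vocabularies described in the second hunk, a fast tokenizer from transformers exposes the underlying tokenizers.models.BPE object; gpt2 is used here only as a familiar stand-in, not something this repo requires.

    from transformers import AutoTokenizer

    gpt2 = AutoTokenizer.from_pretrained("gpt2")
    print(gpt2.tokenize("Hello world"))   # e.g. ['Hello', 'Ġworld']; Ġ is "\u0120"
    bpe = gpt2.backend_tokenizer.model    # a tokenizers.models.BPE instance
    print(bpe.token_to_id("Ġworld"))      # the .model.token_to_id lookup mentioned in the docstring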
vocab/olmo_7b/__init__.py ADDED
@@ -0,0 +1,4 @@
+
+ from transformers import AutoTokenizer
+
+ tokenizer = AutoTokenizer.from_pretrained("allenai/OLMo-7B")
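
A note on the new module: when this commit was made, OLMo was not yet built into transformers itself, which is presumably why `ai2-olmo` is added to requirements.txt above; installing it provides an `hf_olmo` shim that registers OLMo's tokenizer and model classes with the Auto* loaders. A hedged usage sketch, not part of the commit (the `hf_olmo` import, the `trust_remote_code` alternative, and the sample text are assumptions):

    import hf_olmo  # noqa: F401  (from the ai2-olmo package; importing it registers OLMo with transformers' Auto* classes)
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("allenai/OLMo-7B")
    # Alternative without hf_olmo:
    # tokenizer = AutoTokenizer.from_pretrained("allenai/OLMo-7B", trust_remote_code=True)

    ids = tokenizer.encode("Hello OLMo")
    print(tokenizer.convert_ids_to_tokens(ids))  # the raw BPE pieces
    print(tokenizer.decode(ids))                 # back to a string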