eson committed
Commit adcfb97
1 Parent(s): 44c3329

fix tiktoken special tokens

tokenizer/tiktoken_patch.py CHANGED
@@ -6,6 +6,8 @@ def decode(self, tokens, errors="replace", skip_special_tokens=False):
     """
     The default decode may raise an error; see decode_test.py for details.
     skip_special_tokens exists for compatibility with hf_tokenizer.
+
+    What is the difference between errors=replace, ignore, and strict?
     """
     try:
         decode_str = self._core_bpe.decode_bytes(tokens).decode("utf-8", errors=errors)
@@ -58,6 +60,7 @@ def encode(self, *args, **kwargs):
     add_special_tokens exists for compatibility with hf_tokenizer.
     """
     kwargs.pop("add_special_tokens", None)
+    kwargs["allowed_special"] = "all"
     return self._encode(*args, **kwargs)


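The two added lines above are the substance of the fix. Below is a minimal sketch, not part of the commit, of the behaviour they touch, using the public tiktoken API; the encoding name "cl100k_base" and the sample strings are arbitrary placeholders.

import tiktoken

enc = tiktoken.get_encoding("cl100k_base")

# By default tiktoken refuses text that contains a special token such as
# <|endoftext|> (it raises ValueError); allowed_special="all", which the patch
# forces, encodes the special token to its reserved id instead.
ids = enc.encode("hello <|endoftext|>", allowed_special="all")
print(ids)

# errors= only matters when the token ids split a multi-byte UTF-8 character,
# which byte-level BPE can do; dropping one byte mid-character shows the modes:
broken = enc.decode_bytes(enc.encode("你好"))[:-1]
print(broken.decode("utf-8", errors="replace"))   # invalid bytes become U+FFFD
print(broken.decode("utf-8", errors="ignore"))    # invalid bytes are dropped
# broken.decode("utf-8", errors="strict") raises UnicodeDecodeError
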
vocab/__init__.py CHANGED
@@ -15,11 +15,11 @@ tokenizer.type = TokenizerType.ByteBPE.name
 tokenizer.implementation = TokenizerImpl.SentencePiece.name  # https://github.com/facebookresearch/llama/blob/main/llama/tokenizer.py
 "HFGPT2Tokenizer", "HFTokenizer", "GPT2BPETokenizer", "CharLevelTokenizer", "TiktokenTokenizer", "SPMTokenizer", https://github.com/EleutherAI/gpt-neox/blob/main/tools/preprocess_data.py
 
-- bert
+- google/bert
   - Features
     - Vocabulary: tokens starting with ## denote subwords
   - Examples:
-- sentencepiece:
+- google/sentencepiece:
   - Features:
   - Training:
   - Files: *.sp_model or *.model (optional .vocab file)
@@ -28,10 +28,10 @@ tokenizer.implementation = TokenizerImpl.SentencePiece.name # https://github.c
   - Loading: `import sentencepiece as spm; spm.SentencePieceProcessor().Load(vocab_file)`
   - Methods: a SentencePieceProcessor instance; sp_model.id_to_piece; ships tokenizer.json and tokenizer.model
   - Vocabulary: contains the ▁ character (U+2581), which marks a space or the start of a sentence.
-  - Examples: llama, baichuan, orion
+  - Examples: google-t5, llama, baichuan, orion
 - icetk: a fork of sentencepiece that supports image_tokenizer
   - glm, chatglm1, chatglm2
-- tiktoken
+- openai/tiktoken
 - hf_tokenizer
   - Features:
     - Files: tokenizer.json (contains the content of the latter two files), merges.txt, vocab.json
@@ -65,102 +65,103 @@ uniq_tokenizers = [
     ""
 ]
 
+# TODO: alias/abbr, hf_path, tokenizer_class, comments,
 all_tokenizers = [
-    "gpt2",
-    "gpt2_chinese",
-
-    # bert family
-    "bert_base_cased",
-    "bert_base_uncased",
-    "bert_base_chinese",
-    "roberta_chinese_clue",
-    "kplug",
-
-    # gpt2 family
-    "moss",
-    #
-    # ######
-    "chatyuan_large_v2",
-    "prompt_clue",
-    #
-    # #### bloom family
-    "bloom",
-    # "bloomz_6b4_zh",
-    # "belle_7b_2m",  # both the model and the vocab are based on bloom
+    ##### bert family
+    ("bert_base_cased", "", ""),
+    ("bert_base_uncased", "",),
+    ("bert_base_chinese",),
+    ("roberta_chinese_clue",),
+    ("kplug",),
+    ("gpt2_chinese",),
+
+    ##### GPT2Tokenizer
+    ("gpt2",),  #
+    ("moss",),
+    ("bloom",),
+    # ("bloomz_6b4_zh",
+    # ("belle_7b_2m",  # both the model and the vocab are based on bloom
     #
-    "gpt_nexo_20b",
-    "qwen1_5_14b_chat",
-    # "gpt_neox_chinese_v1",
-    #
-    # ##### glm family
-    # "glm_chinese",
-    "chatglm_6b",
-    "chatglm2_6b",
-    "chatglm3_6b",
-    #
-    # #### llama / alpaca family
-    "llama",  # Chinese single-character tokens: 700, multi-character tokens: 0
-    "llama2",
-    "chinese_llama",  #
-    "chinese_llama2",  #
-    # "chinese_alpaca_lora_7b",  # the Chinese Alpaca model further fine-tunes the Chinese LLaMA model above on instruction data
-    # "belle_llama_ext_7b",
-    # "alpaca_7b",
-    "baichuan",
-    "baichuan2",
-    "internlm_chat_7b",
-    "internlm2_chat_7b",
-    "internlm2_math_7b",
-    "internlm_xcomposer_7b",
-    "falcon_7b",
-    "falcon_180b",
+    ("gpt_nexo_20b",),  # ~50k vocab
+    ("qwen1_5_14b_chat",),  # ~150k vocab, a bit slow
+    ("starchat_alpha",),
+
+    ####### google/sentencepiece tokenizer:
+    # T5 llama internlm
+    ("t5_small",),
+    ("t5_base",),
+    ("t5_large",),
+    ("chatyuan_large_v2",),
+    ("prompt_clue",),
+
+    ("llama",),  # Chinese single-character tokens: 700, multi-character tokens: 0
+    ("llama2",),
+    ("chinese_llama",),  #
+    ("chinese_llama2",),  #
+    # ("chinese_alpaca_lora_7b",  # the Chinese Alpaca model further fine-tunes the Chinese LLaMA model above on instruction data
+    # ("belle_llama_ext_7b",
+    # ("alpaca_7b",
+    ("baichuan",),
+    ("baichuan2",),
+    ("internlm_chat_7b",),
+    ("internlm2_chat_7b",),
+    ("internlm2_math_7b",),
+    ("internlm_xcomposer_7b",),
+    ("falcon_7b",),
+    ("falcon_180b",),
     # "goat",
 
+    # ##### glm family
+    # "glm_chinese",),
+    ("chatglm_6b",),
+    ("chatglm2_6b",),
+    ("chatglm3_6b",),
+
+
     # tiktoken family
-    "qwen_1_8b_chat",
-    "qwen_7b_chat",
-    "qwen_72b_chat",
-    "text_davinci_003",
-    "code_davinci_002",
-    "gpt_35_turbo",
-    "gpt_4",
+    ("qwen_1_8b_chat",),
+    ("qwen_7b_chat",),
+    ("qwen_72b_chat",),
+    ("text_davinci_003",),
+    ("code_davinci_002",),
+    ("gpt_35_turbo",),
+    ("gpt_4",),
 
     # uncategorized
-    "skywork_13b_base",
-    "skywork_13b_math",
-    "mistral_7b",
-    "mixtral_8_7b",
-    "t5_small",
-    "t5_base",
-    "t5_large",
-    "flan_t5_base",
-    "fastchat_t5_3b",
-    "pko_t5_large",
-    "wizardcoder_15b_v1",
-    "yi_6b",
-    "yi_34b",
-    "yi_vl34b",
-    "orion_14b_chat",
-    "phi_1",
-    "phi_2",
-    "solar_10_7b",
-    "mobilebert_uncased",
-    "mobilenet_v2",
-    "switch_c_2048",
-    "byt5_small",
-    "mt5_large",
-    "wizardcoder_python_7b_v1",
-    "wizardlm_7b_v1",
-    "wizardmath_70b_v1",
-    "tigerbot_70b_chat_v4_4k",
-    "tigerbot_13b_chat_v2",
-    "deepseek_coder_33b_instruct",
-    "deepseek_llm_7b_base",
-    "gemma_7b",
-    "olmo_7b",
-    "aya_101",
+    ("skywork_13b_base",),
+    ("skywork_13b_math",),
+    ("mistral_7b",),
+    ("mixtral_8_7b",),
+
+    ("flan_t5_base",),
+    ("fastchat_t5_3b",),
+    ("pko_t5_large",),
+    ("wizardcoder_15b_v1",),
+    ("yi_6b",),
+    ("yi_34b",),
+    ("yi_vl34b",),
+    ("orion_14b_chat",),
+    ("phi_1",),
+    ("phi_2",),
+    ("solar_10_7b",),
+    ("mobilebert_uncased",),
+    ("mobilenet_v2",),
+    ("switch_c_2048",),
+    ("byt5_small",),
+    ("mt5_large",),
+    ("wizardcoder_python_7b_v1",),
+    ("wizardlm_7b_v1",),
+    ("wizardmath_70b_v1",),
+    ("tigerbot_70b_chat_v4_4k",),
+    ("tigerbot_13b_chat_v2",),
+    ("deepseek_coder_33b_instruct",),
+    ("deepseek_llm_7b_base",),
+    ("gemma_7b",),
+    ("olmo_7b",),
+    ("aya_101",),
 ]
 
+all_tokenizers = [tokenizer[0] for tokenizer in all_tokenizers]
 all_tokenizers = sorted(all_tokenizers)
 
 
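As a companion to the sentencepiece notes in the doc comment above, here is a minimal loading sketch, not part of the commit; "tokenizer.model" stands in for any *.model / *.sp_model file (e.g. the one shipped with llama or baichuan).

import sentencepiece as spm

sp_model = spm.SentencePieceProcessor()
sp_model.Load("tokenizer.model")               # placeholder path

print(sp_model.GetPieceSize())                 # vocabulary size
print(sp_model.IdToPiece(5))                   # pieces use ▁ (U+2581) for space / word start
print(sp_model.EncodeAsPieces("hello world"))  # e.g. ['▁hello', '▁world'] for many models
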
 
vocab/starchat_alpha/__init__.py ADDED
@@ -0,0 +1,5 @@
+
+
+from transformers import AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/starchat-alpha")
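A brief usage sketch, not part of the commit: the object returned by AutoTokenizer.from_pretrained above is an ordinary Hugging Face tokenizer, so the standard calls apply; the sample string is arbitrary.

text = "def print_hello_world():"
print(tokenizer.tokenize(text))   # subword pieces
ids = tokenizer.encode(text)      # token ids
print(tokenizer.decode(ids))      # decode back to text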