eson committed
Commit a6aee1d
1 Parent(s): 6b70021

add zephyr

vocab/__init__.py CHANGED
@@ -70,7 +70,7 @@ uniq_tokenizers = [
     ""
 ]
 
-# TODO: alias/abbr, hf_path, tokenizer_class/type, comments,
+# TODO: alias/abbr, description, hf_path, tokenizer_class/type, comments, Organization
 all_tokenizers = [
     ##### bert series
     ("bert_base_cased", "", "bert"),
@@ -99,7 +99,7 @@ all_tokenizers = [
     ("chatyuan_large_v2", "", "sentencepiece"),
     ("prompt_clue", "", "sentencepiece"),
 
-    ("llama", "", "sentencepiece"),  # 'Chinese single-char tokens': 700, 'Chinese multi-char tokens': 0
+    ("llama", "", "sentencepiece", "llama uses single-digit tokens and thus needs 4 tokens to encode the number 1000"),  # 'Chinese single-char tokens': 700, 'Chinese multi-char tokens': 0
     ("llama2", "", "sentencepiece"),
     ("chinese_llama", "", "sentencepiece"),  #
     ("chinese_llama2", "", "sentencepiece"),  #
@@ -168,6 +168,7 @@ all_tokenizers = [
     ("gemma_7b",),
     ("olmo_7b",),
     ("aya_101",),
+    ("zephyr_7b_beta",)
 ]
 
 all_tokenizers = [tokenizer[0] for tokenizer in all_tokenizers]
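
Note: the new comment on the llama entry refers to LLaMA's SentencePiece vocabulary splitting numbers digit by digit, so "1000" costs 4 digit tokens. A minimal sketch to check this, assuming access to a LLaMA-style tokenizer ("hf-internal-testing/llama-tokenizer" is used here only as a stand-in; any local LLaMA checkpoint behaves the same way):

# Sketch only, not part of this commit: show digit-by-digit tokenization.
from transformers import AutoTokenizer

# Assumed stand-in for a LLaMA SentencePiece tokenizer.
llama_tok = AutoTokenizer.from_pretrained("hf-internal-testing/llama-tokenizer")

ids = llama_tok.encode("1000", add_special_tokens=False)
print(len(ids), llama_tok.convert_ids_to_tokens(ids))
# Each digit should come out as its own token (the commit note counts 4 for "1000");
# an extra "▁" prefix piece may also appear depending on the tokenizer settings.
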
vocab/zephyr_7b_beta/__init__.py ADDED
@@ -0,0 +1,5 @@
+
+
+from transformers import AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta")
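
For reference, a usage sketch (an assumption about how the repo consumes these per-tokenizer modules, not part of the commit): once "zephyr_7b_beta" is listed in all_tokenizers, the module added above can be imported and exercised like any other entry.

# Sketch only: load the tokenizer object defined in vocab/zephyr_7b_beta/__init__.py.
from vocab.zephyr_7b_beta import tokenizer

print(tokenizer.vocab_size)                        # vocabulary size of zephyr-7b-beta
print(tokenizer.tokenize("add zephyr tokenizer"))  # sample tokenization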