eson committed
Commit 2bd606a
1 Parent(s): f331792

remove vocabs; update compression_app; add character_app;

This view is limited to 50 files because the commit contains too many changes.
Files changed (50)
  1. .gitattributes +0 -10
  2. .gitignore +5 -1
  3. vocab/README.md → README.2.md +3 -1
  4. README.md +16 -1
  5. app.py +7 -5
  6. character_app.py +80 -0
  7. character_util.py +213 -0
  8. app_compression.py → compression_app.py +42 -40
  9. utils/compression_util.py → compression_util.py +151 -61
  10. config.py +0 -20
  11. patcher/README.md +15 -0
  12. patcher/sptokenizer_patch_deprecated.py +0 -105
  13. patcher/sptokenizer_wrapper.py +0 -61
  14. patcher/tiktoken_patch.py +2 -2
  15. app_playground.py → playground_app.py +34 -19
  16. examples.py → playground_examples.py +9 -9
  17. util.py → playground_util.py +39 -35
  18. requirements.txt +3 -1
  19. stats/character_stats.json +1712 -0
  20. stats/compress_rate.json +0 -4286
  21. stats/compression_rate.json +0 -0
  22. utils/byte_util.py +0 -0
  23. utils/character_util.py +0 -231
  24. utils/convert_sp_to_json.py +0 -4
  25. utils/fn_util.py +0 -0
  26. utils/lang_util.py +26 -30
  27. utils/lang_util_2.py +0 -115
  28. utils/oov.md +202 -0
  29. utils/oov_util.py +109 -3
  30. utils/speed_util.py +0 -9
  31. utils/symbol.py +0 -35
  32. utils/text_util.py +12 -1
  33. utils/vocab.jd.txt.v2 +0 -10268
  34. vocab.py +453 -0
  35. vocab/Intern_gpt/README.md +0 -0
  36. vocab/__init__.py +0 -260
  37. vocab/_alpaca_7b/README.md +0 -0
  38. vocab/_goat/README.md +0 -0
  39. vocab/_goat/__init__.py +0 -0
  40. vocab/albert/__init__.py +0 -6
  41. vocab/aya_101/__init__.py +0 -5
  42. vocab/baichuan/Baichuan-7B/config.json +0 -26
  43. vocab/baichuan/Baichuan-7B/configuration_baichuan.py +0 -66
  44. vocab/baichuan/Baichuan-7B/special_tokens_map.json +0 -23
  45. vocab/baichuan/Baichuan-7B/tokenization_baichuan.py +0 -250
  46. vocab/baichuan/Baichuan-7B/tokenizer.model +0 -3
  47. vocab/baichuan/Baichuan-7B/tokenizer_config.json +0 -35
  48. vocab/baichuan/__init__.py +0 -19
  49. vocab/baichuan/demo.py +0 -6
  50. vocab/baichuan/error.md +0 -8
.gitattributes CHANGED
@@ -33,13 +33,3 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
-vocab/belle_7b_2m/belle-7b-2m/tokenizer.json filter=lfs diff=lfs merge=lfs -text
-vocab/bloom/tokenizer/tokenizer.json filter=lfs diff=lfs merge=lfs -text
-vocab/gemma_7b/gemma-7b/tokenizer.model filter=lfs diff=lfs merge=lfs -text
-vocab/gemma_7b/gemma-7b/tokenizer.json filter=lfs diff=lfs merge=lfs -text
-vocab/grok_1/tokenizer.model filter=lfs diff=lfs merge=lfs -text
-vocab/llama3/Meta-Llama-3-70B/tokenizer.json filter=lfs diff=lfs merge=lfs -text
-vocab/mistral_7b/Mistral-7B-v0.1/tokenizer.json filter=lfs diff=lfs merge=lfs -text
-vocab/mistral_7b/Mistral-7B-v0.1/tokenizer.model filter=lfs diff=lfs merge=lfs -text
-vocab/mixtral_8_7b/Mixtral-8x7B-v0.1/tokenizer.json filter=lfs diff=lfs merge=lfs -text
-vocab/mixtral_8_7b/Mixtral-8x7B-v0.1/tokenizer.model filter=lfs diff=lfs merge=lfs -text
.gitignore CHANGED
@@ -14,4 +14,8 @@ downloads/
 eggs/
 .eggs/
 .idea/
-gradio_cached_examples
+gradio_cached_examples
+stats/
+test/
+wip/
+tools/
vocab/README.md → README.2.md RENAMED
@@ -67,7 +67,7 @@ carol
 ```
 
 
-##
+## @@
 
 https://github.com/pytorch/fairseq/blob/master/tests/test_noising.py#L37
 
@@ -77,6 +77,8 @@ https://github.com/pytorch/fairseq/blob/master/tests/test_noising.py#L37
 
 Similar to BERT, except that BERT marks word-suffix pieces, while here the marked pieces are word prefixes.
 
+This looks like https://github.com/rsennrich/subword-nmt
+
 
 ## GPT2
 
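The two marker conventions contrasted above are easiest to see in how detokenization undoes them. A minimal sketch (not code from this repo; the helper names are made up):

```python
# Minimal sketch of the two subword-marker conventions (illustrative only).

def detok_subword_nmt(tokens):
    # subword-nmt / fairseq style: a trailing "@@" means the piece glues to
    # the NEXT piece, i.e. the marked piece is the prefix of a word.
    return "".join(t[:-2] if t.endswith("@@") else t + " " for t in tokens).strip()

def detok_bert_wordpiece(tokens):
    # BERT WordPiece style: a leading "##" means the piece glues to the
    # PREVIOUS piece, i.e. the marked piece is a word suffix/continuation.
    out = ""
    for t in tokens:
        out += t[2:] if t.startswith("##") else ((" " + t) if out else t)
    return out

assert detok_subword_nmt(["un@@", "relat@@", "ed", "words"]) == "unrelated words"
assert detok_bert_wordpiece(["un", "##relat", "##ed", "words"]) == "unrelated words"
```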
README.md CHANGED
@@ -7,6 +7,8 @@ sdk: gradio
 sdk_version: 4.28.3
 app_file: app.py
 pinned: false
+datasets:
+- cc100
 ---
 
 
@@ -210,4 +212,17 @@ python utils/compress_rate_util.py
 
 - Getting the most out of your tokenizer for pre-training and domain adaptation
 - Efficient and Effective Text Encoding for Chinese LLaMA and Alpaca
-- https://huggingface.co/spaces/Xenova/the-tokenizer-playground
+- blog
+  - https://help.openai.com/en/articles/4936856-what-are-tokens-and-how-to-count-them
+  - https://huggingface.co/docs/transformers/tokenizer_summary#sentencepiece
+  - https://www.huaxiaozhuan.com/%E5%B7%A5%E5%85%B7/huggingface_transformer/chapters/1_tokenizer.html
+  - https://zhuanlan.zhihu.com/p/652520262
+  - https://github.com/QwenLM/Qwen/blob/main/tokenization_note_zh.md
+- demo
+  - https://huggingface.co/spaces/Xenova/the-tokenizer-playground
+  - https://github.com/dqbd/tiktokenizer
+  - https://chat.lmsys.org/?leaderboard
+  - https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard
+- paper
+  - ss
+  -
app.py CHANGED
@@ -1,16 +1,18 @@
 
-import gradio as gr
-from app_playground import demo as tab_playground
-from app_compression import demo as tab_compression
+from playground_app import demo as playground_tab
+from compression_app import demo as compression_tab
+from character_app import demo as character_tab
 from patcher.gr_interface import TabbedInterface
 
 
 demo = TabbedInterface(
-    [tab_playground, tab_compression],
-    [" ⚔️ Playground", "🏆 Compression Leaderboard",],  # encoding speed, decoding speed, character classes (zh, num, etc., regex supported), supported languages, organization.
+    [playground_tab, compression_tab, character_tab],
+    [" ⚔️ Playground", "🏆 Compression Leaderboard", "📊 Character Statistics"],  # encoding speed, decoding speed, character classes (zh, num, etc., regex supported), supported languages, organization.
     title='<div align="center">Tokenizer Arena ⚔️</div>',
     css="css/style.css"
 )
 
+demo.load(js=open("js/onload.js", "r", encoding="utf-8").read())
+
 if __name__ == "__main__":
     demo.launch()
character_app.py ADDED
@@ -0,0 +1,80 @@
+import gradio as gr
+from character_util import get_character_table
+
+all_columns = [
+    ("digit", "digit"),
+    ("space", "space"),
+    ("lang-chinese", 'zh'),
+    ("lang-korea", 'ko'),
+    ("lang-japanese", 'ja'),
+    # ("byte", "byte"),
+    # ("oov", "oov")
+]
+default_columns = ["digit", "zh"]
+
+# columns = ["lang-zh", "lang-korea", "lang-japanese", "number", "space", "bytes", "oov"]
+
+abbr2name = {column[1]: column[0].split('-')[-1] for column in all_columns}
+
+
+def get_column_info(columns):
+    print(columns)
+    markdown = ""
+    for column in columns:
+        markdown += f"- `num({column})`: num of tokens containing {abbr2name[column]} characters\n" \
+                    f"- `len({column})`: `min,median,max` length of tokens containing {abbr2name[column]} characters\n"
+    return markdown
+
+
+with gr.Blocks() as demo:
+    gr.Markdown("## 🛠️ Setting")  # ⚙
+    with gr.Accordion("Please select the type of character you want to count.", open=True):
+        # file size 💽 🖴, tokens 🧮
+        with gr.Row():
+            with gr.Column():
+                columns = gr.Checkboxgroup(
+                    all_columns,
+                    value=default_columns,
+                    label="character type",
+                    # info=""
+                )
+                gr.Markdown(
+                    "To count other types of characters, you can modify [character_util.py]"
+                    "(https://huggingface.co/spaces/eson/tokenizer-arena/blob/main/character_util.py). "
+                )
+            column_info = gr.Markdown(
+                get_column_info(default_columns)
+            )
+
+    gr.Markdown("## 📊 Character Statistics")
+    search_bar = gr.Textbox(
+        placeholder="🔍 Search by tokenizer or organization (e.g., 'llama', 'openai') and press ENTER...",
+        show_label=False,
+        elem_id="search-bar",
+    )
+    compress_rate_table = gr.Dataframe(datatype="html", wrap=True)
+
+    search_bar.submit(
+        get_character_table,
+        inputs=[search_bar, columns],
+        outputs=compress_rate_table
+    )
+    columns.change(
+        get_character_table,
+        inputs=[search_bar, columns],
+        outputs=compress_rate_table
+    )
+    columns.change(
+        get_column_info,
+        inputs=[columns],
+        outputs=column_info
+    )
+
+    demo.load(
+        get_character_table,
+        inputs=[search_bar, columns],
+        outputs=compress_rate_table
+    )
+
+if __name__ == "__main__":
+    demo.launch()
character_util.py ADDED
@@ -0,0 +1,213 @@
+"""
+TODO:
+1. traditional vs. simplified Chinese, language detection
+2. verify the number of space tokens for bert
+3. add token_impl
+4.
+"""
+import os
+import json
+import numpy as np
+import pandas as pd
+from collections import Counter, defaultdict
+from vocab import tokenizer_factory
+from typing import Optional, Union, Literal
+from utils.log_util import logger
+from utils.text_util import contains_digit, get_space_count
+from utils.lang_util import detect_language, language_ranges
+
+CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
+
+
+def _to_unicode(text):
+    return ''.join(r'\u{:04X}'.format(ord(chr)) for chr in text)
+
+
+def _get_coding_length(tokenizer, vocab, filter=None):
+    """
+    oov character may be tokenized into more than one token.
+    """
+    all_length = []
+    for word in vocab:
+        if len(word) > 1:
+            continue
+        if filter is not None and filter(word):
+            continue
+        try:
+            tokens = tokenizer.encode(word)
+        except Exception as e:
+            print(e)
+
+        all_length.append(len(tokens))
+        # if len(tokens.ids) > 1:
+        # if len(tokens) > 3:
+        #     print(word, tokens)
+
+    dist_length = Counter(all_length)
+    mean_length = round(sum(all_length) / len(all_length), 2)
+    return dist_length, mean_length
+
+
+cache = {}
+
+
+def _dist(token_lens):
+    """
+    :param token_lens:
+    :return: min,median,max of token_lens
+    """
+    if not token_lens:
+        return "-"
+    return f"{min(token_lens)},{round(np.median(token_lens))},{max(token_lens)}"
+
+
+def iter_vocab(
+        tokenizer_name: str,
+        from_cache: bool = True,
+        cache_dir: str = "stats",
+) -> Union[pd.DataFrame, dict]:
+    """
+    :param tokenizer_name:
+    :param from_cache:
+    :param cache_dir:
+    :return:
+    """
+    tokenizer_config = tokenizer_factory.get_tokenizer_config(tokenizer_name)
+
+    cache_dir = os.path.join(CURRENT_DIR, cache_dir)
+    os.makedirs(cache_dir, exist_ok=True)
+
+    # load from cache
+    cache_path = os.path.join(cache_dir, "character_stats.json")
+    if not cache and os.path.exists(cache_path):
+        with open(cache_path, "r", encoding="utf-8") as f_tmp:
+            cache.update(json.load(f_tmp))
+    if from_cache and tokenizer_name in cache:
+        logger.info(f"load {tokenizer_config.name_or_path} from cache")
+        return cache[tokenizer_name]
+
+    tokenizer = tokenizer_factory.get_tokenizer(tokenizer_name)
+
+    tokens_by_lang = {lang[1]: [] for lang in language_ranges.keys()}
+    digit_tokens = []
+    space_tokens = []
+    byte_tokens = []
+
+    buffer = []
+    for token_id in range(tokenizer.vocab_size):
+        # for token_id in tokenizer.get_vocab():
+        # for token_id in range(len(tokenizer)):
+        decode_str = tokenizer.decode([token_id], skip_special_tokens=False)
+        token = tokenizer.convert_ids_to_tokens([token_id], skip_special_tokens=False)[0]
+        tags = []
+        if token is None:  # some vocabularies have empty (non-contiguous) ids
+            continue
+        if isinstance(token, bytes):
+            token = token.decode("utf-8", errors="ignore")
+
+        if hasattr(tokenizer, "sp_model"):  # based on the sentencepiece package
+            if tokenizer.sp_model.is_byte(token_id):
+                tags.append("is_byte")
+                byte_tokens.append(token)
+
+        language_tags = detect_language(decode_str)
+        for language in language_tags:
+            tokens_by_lang[language[1]].append(decode_str)
+
+        if contains_digit(decode_str):
+            tags.append("digit")
+            digit_tokens.append(decode_str)
+
+        space_count = get_space_count(decode_str)
+        if space_count > 0:
+            space_tokens.append(decode_str)
+
+        buffer.append(json.dumps(
+            {
+                "id": token_id,
+                "token": token,
+                "token_decode": decode_str,
+                "token_dumps": json.dumps(token),
+                "token_unicode": _to_unicode(token),
+                "token_len": len(decode_str),
+            },
+            ensure_ascii=False) + "\n")
+
+    result = {
+        "tokenizer": tokenizer_factory.get_name_with_hyperlink(tokenizer_name),
+        "organization": tokenizer_config.org,
+        # "impl": str(tokenizer.__class__),
+        # "vocab_size-": tokenizer.vocab_size,  # vocab_size_without_added_token
+        "vocab_size": len(tokenizer),
+
+        # "mean encoding length of Chinese characters": mean_length,  # no need to track: a vocab containing many Chinese characters generally implies short encodings for them.
+        # "distribution of Chinese character encoding lengths": json.dumps(dist_length),
+
+        "num(digit)": len(digit_tokens),
+        "len(digit)": _dist([len(token) for token in digit_tokens]),
+        "num(space)": len(space_tokens),
+        "len(space)": _dist([len(token) for token in space_tokens]),
+
+        # "num(byte)": len(byte_tokens)
+    }
+
+    for lang, tokens in tokens_by_lang.items():
+        result[f"num({lang})"] = len(tokens)
+        result["len(" + lang + ")"] = _dist([len(token) for token in tokens])
+
+    out_path = os.path.join(cache_dir, f"iter_vocab/{tokenizer_name.replace('/', '_')}.vocab.jsonl")
+    with open(out_path, "w", encoding="utf-8") as f_out:
+        for line in buffer:
+            f_out.write(line)
+    len_before = len(cache)
+    cache[tokenizer_name] = result
+    len_after = len(cache)
+    logger.info(f"saving {tokenizer_name} to memory and file cache: {len_before}->{len_after}")
+    with open(cache_path, "w", encoding="utf-8") as f_out:
+        f_out.write(json.dumps(cache, ensure_ascii=False, indent=2))
+    return result
+
+
+def to_dataframe(stats, columns):
+    table = []
+    for stat in stats.values():
+        filtered_stat = {}
+        for k, v in stat.items():
+            if not k.startswith("num") and not k.startswith("len"):
+                filtered_stat[k] = v
+            if any(column in k for column in columns):
+                k = k.replace("ja-kana", "kana")
+                filtered_stat[k] = v
+        table.append(filtered_stat)
+    df = pd.DataFrame(table)
+    return df
+
+
+def get_character_table(
+        tokenizer_filter: Optional[str] = None,
+        columns: Optional[str] = None,
+        return_type: Optional[Literal["dict", "dataframe"]] = "dataframe"
+) -> Union[pd.DataFrame, dict]:
+    """
+    """
+    logger.info(f"columns: {columns}, tokenizer_filter: {tokenizer_filter}")
+    stats = {}
+    if tokenizer_filter is not None:
+        tokenizer_names = [tokenizer_config.name_or_path for tokenizer_config in tokenizer_factory.all_tokenizer_configs
+                           if tokenizer_filter.lower() in tokenizer_config.name_or_path.lower()]
+    else:
+        tokenizer_names = tokenizer_factory.all_tokenizer_names
+
+    for tokenizer_name in tokenizer_names:
+        stat = iter_vocab(tokenizer_name)
+        stats[tokenizer_name] = stat
+
+    if return_type == "dataframe":
+        stats = to_dataframe(stats, columns)
+    return stats
+
+
+if __name__ == "__main__":
+    # aa = get_character_table(tokenizer_filter="baichuan")
+    df = get_character_table()
+    logger.info(f"\n{df.to_markdown(index=False)}")
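A hedged usage sketch for the new module (assuming it runs from the Space's root directory with the `vocab` and `utils` packages importable); the `len(...)` columns come out of `_dist` as `min,median,max` strings:

```python
# Hypothetical usage of character_util.py, run from the Space's root directory.
from character_util import get_character_table

# Only tokenizers whose name matches the filter are scanned; `columns` selects
# which num(...)/len(...) fields to keep, e.g. len(zh) is "min,median,max".
df = get_character_table(tokenizer_filter="llama", columns=["digit", "zh"])
print(df.to_markdown(index=False))
```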
app_compression.py → compression_app.py RENAMED
@@ -1,6 +1,14 @@
+"""
+TODO:
+- track tokenizer_impl
+- track OOV
+- track reversal
+- add math, code
+"""
+
 import gradio as gr
-from utils.compression_util import get_compression_leaderboard
-from utils.compression_util import common_corpuses
+from compression_util import get_compression_leaderboard, common_corpuses
+
 
 with gr.Blocks() as demo:
     # gr.Markdown("## Convertor")
@@ -44,63 +52,56 @@ with gr.Blocks() as demo:
     # )
 
     gr.Markdown("## 🛠️ Setting")  # ⚙
-    with gr.Accordion("Please select corpus and measure of compression rate ...", open=True):
+    with gr.Accordion("Please select the corpus and measure of compression rate.", open=True):
         # file size 💽 🖴, tokens 🧮
-        # gr.Markdown(
-        #     "Please select corpus and measure of compression rate.\n"
-        #     "`num_of_trillion_tokens` `num_of_billion_tokens`\n"
-        #     "- `b_tokens/g_bytes` measures how many billion tokens per gigabytes corpus. \n"
-        #     "- `t_tokens/t_bytes` measures how many trillion tokens per terabytes corpus. \n"
-        #     "- `n_chars/n_tokens` measures how many chars per token in the current corpus. \n\n"
-        #     "All the above measures are depend on corpus. You can reproduce this "
-        #     "procedure at [github](https://github.com/xu-song/tokenizer-arena/)."
-        # )
-
+        # Total amount of disk used
         with gr.Row():
-            compress_rate_corpus = gr.Dropdown(
-                common_corpuses,  # , "code"
-                value=["cc100-en", "cc100-zh-Hans"],
-                label="corpus",
-                multiselect=True
-                # info=""
-            )
+            with gr.Column():
+                compress_rate_corpus = gr.Dropdown(
+                    common_corpuses,  # , "code"
+                    value=["cc100/en", "cc100/zh-Hans", "cc100/fr", "cc100/es"],
+                    label="corpus",
+                    multiselect=True
+                    # info=""
+                )
 
-            # unit of file_size: gigabyte terabyte
-            # unit of token_num: million billion trillion
-            # The most common units of measurement include length (meter, inch, foot), weight (gram, kilogram, pound), volume (liter, gallon, milliliter), time (second, minute, hour)
-            compress_rate_unit = gr.Radio(
-                ["b_tokens/g_bytes", "t_tokens/t_bytes"],
-                value="b_tokens/g_bytes",
-                label="measure",
-            )
+                # unit of file_size: gigabyte terabyte
+                # unit of token_num: million billion trillion
+                # The most common units of measurement include length (meter, inch, foot), weight (gram, kilogram, pound), volume (liter, gallon, milliliter), time (second, minute, hour)
+                compress_rate_unit = gr.Radio(
+                    ["b_tokens/g_bytes", "t_tokens/t_bytes"],
+                    value="b_tokens/g_bytes",
+                    label="measure",  # evaluation metric
+                )
 
-        gr.Markdown(
-            # "`num_of_trillion_tokens` `num_of_billion_tokens`\n"
-            "- `b_tokens/g_bytes` measures how many billion tokens per gigabytes corpus. \n"
-            "- `t_tokens/t_bytes` measures how many trillion tokens per terabytes corpus. \n"
-            "- `n_chars/n_tokens` measures how many chars per token in the tokenized corpus. \n"
-            # "\nAll the above measures are depend on corpus. You can reproduce this "
-            # "procedure at [github](https://github.com/xu-song/tokenizer-arena/)."
-        )
+            gr.Markdown(
+                "- `corpus`: tokenization is performed on the selected subsets of [cc100](https://huggingface.co/datasets/cc100) corpus.\n"
+                "- `b_tokens/g_bytes` measures how many billion tokens per gigabytes corpus.\n"
+                "- `t_tokens/t_bytes` measures how many trillion tokens per terabytes corpus.\n"
+                # "- `g_bytes/b_tokens` measures how many gigabytes corpus per billion tokens.\n"
+                # "- `t_bytes/t_tokens` measures how many terabytes corpus per trillion tokens.\n"
+                "- `char/token` measures how many chars per token on the tokenized corpus.\n"
+                "- `oov_ratio`: out-of-vocabulary ratio on the selected corpus. 👉 get [oov charset](https://huggingface.co/spaces/eson/tokenizer-arena/blob/main/stats/compression_rate.json)\n\n"
+                "You can reproduce this procedure with [compression_util.py](https://huggingface.co/spaces/eson/tokenizer-arena/blob/main/compression_util.py)."
+            )
 
     gr.Markdown("## 🏆 Compression Rate Leaderboard")
     search_bar = gr.Textbox(
-        placeholder="🔍 Search tokenizers(e.g., 'llama') and press ENTER...",
+        placeholder="🔍 Search by tokenizer or organization (e.g., 'llama', 'openai') and press ENTER...",
         show_label=False,
        elem_id="search-bar",
    )
-    compress_rate_table = gr.Dataframe()
+    compress_rate_table = gr.Dataframe(datatype="html")
 
    # func call
    compress_rate_corpus.change(
        get_compression_leaderboard,
-        inputs=[compress_rate_corpus, compress_rate_unit],
+        inputs=[compress_rate_corpus, compress_rate_unit, search_bar],
        outputs=compress_rate_table
    )
    compress_rate_unit.change(
        get_compression_leaderboard,
-        inputs=[compress_rate_corpus, compress_rate_unit],
+        inputs=[compress_rate_corpus, compress_rate_unit, search_bar],
        outputs=compress_rate_table
    )
    # file_size.change(
@@ -123,5 +124,6 @@
         inputs=[compress_rate_corpus, compress_rate_unit],
         outputs=compress_rate_table
     )
+
 if __name__ == "__main__":
     demo.launch()
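To make the `b_tokens/g_bytes` and `char/token` measures above concrete, a small worked example (not code from the commit; whether "gigabyte" means 10^9 or 1024^3 bytes here is an assumption, check `unit_convertor` in compression_util.py for the actual conversion):

```python
# Illustrative numbers only. Assumes 1 GB = 1024**3 bytes.
n_tokens = 2_600_000_000   # tokens produced over the corpus
n_bytes = 10_000_000_000   # corpus size in bytes
n_chars = 9_100_000_000    # corpus size in characters

b_tokens_per_g_bytes = (n_tokens / 1e9) / (n_bytes / 1024**3)
chars_per_token = n_chars / n_tokens
print(round(b_tokens_per_g_bytes, 3))  # ~0.279: fewer tokens per GB = better compression
print(round(chars_per_token, 3))       # ~3.5 characters per token
```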
utils/compression_util.py → compression_util.py RENAMED
@@ -2,8 +2,8 @@
 
 Chinese data: clue superclue
 English data: glue cnn_dailymail gigaword
-code data:
-numbers:
+code:
+math:
 
 """
 
@@ -13,15 +13,15 @@ import sys
 import pandas as pd
 from datasets import load_dataset
 from utils.log_util import logger
-from vocab import load_tokener
-from vocab import all_tokenizers
+from vocab import tokenizer_factory, TokenizerConfig
 from typing import List, Optional, Union, Literal
 
 CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
 
 common_units = ["g_bytes/b_tokens", "b_tokens/g_bytes", "t_bytes/t_tokens", "t_tokens/t_bytes", "n_chars/n_tokens", ]
-common_corpuses = sorted(["cc100-en", "cc100-zh-Hans", "cc100-es", "cc100-fr", "cc100-de", "cc100-ko",
-                          "cc100-fa", "cc100-ar", "cc100-ja"])
+
+common_corpuses = sorted(["cc100/en", "cc100/zh-Hans", "cc100/es", "cc100/fr", "cc100/de", "cc100/ko",
+                          "cc100/fa", "cc100/ar", "cc100/ja"])
 
 VALID_CODES_CC100 = [
     "am", "ar", "as", "az", "be", "bg", "bn", "bn_rom", "br", "bs", "ca", "cs", "cy", "da", "de",
@@ -44,9 +44,12 @@ def get_n_bytes_of_string(string_text):
 
 
 def unit_convertor(stat, unit):
-    n_tokens = stat["n_tokens"]
-    n_chars = stat["n_chars"]
-    n_bytes = stat["n_bytes"]
+    n_tokens = stat["_n_tokens"]
+    n_chars = stat["_n_chars"]
+    n_bytes = stat["_n_bytes"]
+
+    if n_tokens is None:
+        return None
 
     n_tokens_in_billion = n_tokens / (1000 * 1000 * 1000)
     n_tokens_in_trillion = n_tokens / (1000 * 1000 * 1000 * 1000)
@@ -57,11 +60,9 @@ def unit_convertor(stat, unit):
 
     if unit == "n_tokens/n_bytes":
         value = n_tokens / n_bytes
-
-    # the average number of characters per token
-    elif unit in ["n_chars/n_tokens", "chars_per_token"]:  # important: how many characters one token contains on average.
+    elif unit in ["char/token", "chars_per_token"]:  # important: how many characters one token contains on average.
         value = n_chars / n_tokens
-    elif unit == "n_tokens/n_chars":  # how many tokens does one Chinese character need?
+    elif unit in ["token/char", "tokens_per_char"]:  # how many tokens does one Chinese character need?
         value = n_tokens / n_chars
     elif unit == "g_bytes/b_tokens":
         value = n_bytes_in_gb / n_tokens_in_billion
@@ -76,14 +77,48 @@ def unit_convertor(stat, unit):
     return round(value, 3)
 
 
+def _merge_stats_by_corpus(stats_by_corpus, oov_threshold=0.3):
+    """
+    """
+    all_stats = list(stats_by_corpus.values())
+    assert len(set([stats["tokenizer"] for stats in all_stats])) == 1
+    reversible = all(stat['reversible'] for stat in all_stats)
+    is_support = all(stat['oov_ratio'] < oov_threshold for stat in all_stats)
+
+    merged_stats = {
+        "tokenizer": all_stats[0]["tokenizer"],
+        "organization": all_stats[0]["organization"],
+        "vocab_size": all_stats[0]["vocab_size"],
+        "_n_bytes": 0,
+        "_n_tokens": 0 if is_support else None,
+        "_n_chars": 0,
+        "_n_oov_chars": 0,
+        "reversible": True,
+    }
+    for stats in all_stats:
+        merged_stats["_n_bytes"] += stats["_n_bytes"]
+        merged_stats["_n_chars"] += stats["_n_chars"]
+        if is_support:  # The number of tokens cannot be accurately counted, when there are too many UNKs.
+            merged_stats["_n_tokens"] += stats["_n_tokens"]
+        merged_stats["_n_oov_chars"] += stats["_n_oov_chars"]
+        merged_stats["reversible"] &= stats['reversible']
+
+    merged_stats.update({
+        "oov_ratio": float("%.4g" % (stats["_n_oov_chars"] / stats["_n_chars"])),
+        "reversible": reversible
+    })
+    return merged_stats
+
+
 def to_dataframe(stats, units=None):
     if units is None:
         units = common_units
     elif not isinstance(units, list):
         units = [units]
     table = []
-    for tokenizer_name, stat in stats.items():
-        columns = {"tokenizer": tokenizer_name, "vocab_size": stat["vocab_size"]}
+
+    for stat in stats.values():
+        columns = {k: v for k, v in stat.items() if not k.startswith("_")}
         for unit in units:
             if unit not in stat:
                 columns[unit] = unit_convertor(stat, unit)
@@ -98,105 +133,159 @@ cache = {}
 
 
 def tokenize_corpus(
-        tokenizer_name: str,
+        tokenizer_name: str,  # results can come from cache without loading the tokenizer
         corpuses: List[str],
-        cache_path: str = "stats/compress_rate.json"
+        cache_dir: str = "stats"
 ) -> dict:
     """
     This needs its own cache, because it is slow.
-    :param tokenizer_name:
+    :param tokenizer_config: can be used without loading
     :param corpuses:
     :param cache_path:
     :return:
     """
 
-    def _tokenize(tokenizer, datasets):
+    def _char_based_oov(src_text, decode_text):
+        oov_chars = []
+        for char in src_text:
+            if char not in decode_text:
+                oov_chars.append(char)
+
+        n_oov_chars = len(oov_chars)
+        oov_charset = list(dict.fromkeys(oov_chars))
+        return n_oov_chars, oov_charset
+
+    def _tokenize(tokenizer, datasets, detail_path=None):
+        """
+        export_diff: true | false
+        :param tokenizer:
+        :param datasets:
+        :param detail_path:
+        :return:
+        """
+        n_bytes = 0
         n_tokens = 0
         n_chars = 0
-        n_bytes = 0
+        n_oov_chars = 0
+        diff_details = []
+        oov_charset = set()
+        unk_token_id = None
+        if hasattr(tokenizer, "unk_token"):
+            unk_token_id = tokenizer.unk_token_id
         for dataset in datasets:
             for item in dataset:
                 text = item["text"]
                 n_bytes += get_n_bytes_of_string(text)
                 n_chars += len(text)
-                encodings = tokenizer.encode(text)
-                n_tokens += len(encodings)
+                ids = tokenizer.encode(text, add_special_tokens=False)
+
+                # detect oov
+                decode_text = tokenizer.decode(ids)
+                decode_text_without_unk = tokenizer.decode([token_id for token_id in ids if token_id != unk_token_id])
+                if decode_text != text:
+                    _n_oov_chars, _oov_charset = _char_based_oov(text, decode_text_without_unk)
+                    diff_details.append(
+                        {
+                            "text": text,
+                            "decode_text": decode_text,
+                            "decode_text_without_unk": decode_text_without_unk,
+                            "n_oov_chars": _n_oov_chars,
+                            'oov_ratio': _n_oov_chars / len(text),
+                            'oov_charset': json.dumps(_oov_charset, ensure_ascii=False),
+                        }
+                    )
+                    n_oov_chars += _n_oov_chars
+                    oov_charset.update(_oov_charset)
+                n_tokens += len(ids)
         stat = {
-            # "vocab_size": len(tokenizer.vocab_size,
-            "vocab_size": len(tokenizer),
-            "n_bytes": n_bytes,
-            "n_tokens": n_tokens,
-            "n_chars": n_chars,
+            "_n_bytes": n_bytes,
+            "_n_tokens": n_tokens,
+            "_n_chars": n_chars,
+            "_n_oov_chars": n_oov_chars,
+            "oov_ratio": n_oov_chars / n_chars,
+            '_oov_charset': json.dumps(list(oov_charset), ensure_ascii=False),
+            "reversible": len(diff_details) == 0
        }
+
+        if detail_path and diff_details:
+            logger.info(f"saving tokenization detail to '{detail_path}'")
+            with open(detail_path, "w", encoding="utf-8") as f:
+                f.write(json.dumps(diff_details, ensure_ascii=False, indent=2))
+            # print(f"{tokenizer_config.name_or_path}, {infer_tokenizer_type(tokenizer_config)}\n"
+            #       f"reversible: false; unk_token: {get_unk(tokenizer_config)},"
+            #       f" unk_ratio: {unk_count / len(encoding):.4f}; oov: []")
+            # for diff_detail in diff_details:
+            #     # print(f"text[{i}] = {str(bytes(text[i:], 'utf-8'))}\n"
+            #     #       f"decoding[{i}] = {str(bytes(decoding[i:], 'utf-8'))}")
+            #     f.write(f"text= {json.dumps(text[i:], ensure_ascii=False)}, \n"
+            #             f"decoding[{i}] = {json.dumps(decoding[i:], ensure_ascii=False)}")
        return stat
 
    # load from cache
-    cache_id = f"{tokenizer_name}.{'.'.join(corpuses)}"
+    cache_id = f"{tokenizer_name} @ {'.'.join(corpuses)}"
+    cache_path = os.path.join(cache_dir, "compression_rate.json")
    if not cache and os.path.exists(cache_path):
        with open(cache_path, "r", encoding="utf-8") as f_tmp:
            cache.update(json.load(f_tmp))
    if cache_id in cache:
-        logger.info(f"loading {cache_id} from in-memory cache")
+        # logger.info(f"loading {cache_id} from in-memory cache")
        return cache[cache_id]
 
    # tokenize corpus
-    tokenizer = load_tokener(tokenizer_name)
-    datasets = [load_dataset("eson/cc100-samples", corpus.replace("cc100-", ""), split="train") for corpus in corpuses]
-    stat = _tokenize(tokenizer, datasets)
+    tokenizer = tokenizer_factory.get_tokenizer(tokenizer_name)
+    datasets = [load_dataset("eson/cc100-samples", corpus.replace("cc100/", ""), split="train") for corpus in corpuses]
+
+    stat = {
+        "tokenizer": tokenizer_factory.get_name_with_hyperlink(tokenizer_name),
+        "organization": tokenizer_factory.get_tokenizer_config(tokenizer_name).org,
+        "vocab_size": len(tokenizer),
+    }
+    tokenize_detail_dir = os.path.join(cache_dir, "compression_rate")
+    os.makedirs(tokenize_detail_dir, exist_ok=True)
+    tokenize_detail_path = os.path.join(tokenize_detail_dir, cache_id.replace("/", ".") + ".diff.json")
+    stat.update(_tokenize(tokenizer, datasets, detail_path=tokenize_detail_path))
+    # add basic info
 
    # save to cache
    len_before = len(cache)
    cache[cache_id] = stat
    len_after = len(cache)
-    logger.info(f"saving {cache_id} to in-memory and file cache: {len_before}->{len_after}")
+    logger.info(f"saving '{cache_id}' to memory and file cache '{cache_path}': {len_before}->{len_after}")
    with open(cache_path, "w", encoding="utf-8") as f_tmp:
-        json.dump(cache, f_tmp, indent=2)
+        json.dump(cache, f_tmp, ensure_ascii=False, indent=2)
    return stat
 
 
 def get_compression_leaderboard(
-        corpuses: List[str] = ['cc100-en'],
+        corpuses: List[str] = ['cc100/en'],
        unit: str = "b_tokens/g_bytes",
        tokenizer_filter: Optional[str] = None,
        return_type: Optional[Literal["dict", "dataframe"]] = "dataframe"
 ) -> Union[pd.DataFrame, dict]:
    """
-    ## TODO
-    - search by organization,
    """
    logger.info(f"corpuses: {corpuses}; unit: {unit}; tokenizer_filter: {tokenizer_filter}")
    stats = {}
    if tokenizer_filter is not None:
-        tokenizers = [tokenizer_name for tokenizer_name in all_tokenizers if tokenizer_filter in tokenizer_name]
+        tokenizer_names = [tokenizer_name for tokenizer_name in tokenizer_factory.all_tokenizer_names
+                           if tokenizer_filter.lower() in tokenizer_name.lower()]
    else:
-        tokenizers = all_tokenizers
-    for lang in corpuses:
-        for tokenizer_name in tokenizers:
-            stat = tokenize_corpus(tokenizer_name, [lang])
-            stats[tokenizer_name] = stat
+        tokenizer_names = tokenizer_factory.all_tokenizer_names
+    for tokenizer_name in tokenizer_names:
+        stats_by_corpus = {}
+        for corpus in corpuses:
+            stats_by_corpus[corpus] = tokenize_corpus(tokenizer_name, [corpus])
+        stats[tokenizer_name] = _merge_stats_by_corpus(stats_by_corpus)
 
    if return_type == "dataframe":
        token_number_unit, file_size_unit = unit.split("/")
        reverse_unit = f"{file_size_unit}/{token_number_unit}"
-        stats = to_dataframe(stats, [unit, reverse_unit, "n_chars/n_tokens"])
-        stats = stats.sort_values(unit)
-        stats = stats.rename(columns={unit: f' ⬆️{unit}'})
+        stats = to_dataframe(stats, [unit, reverse_unit, "char/token"])
+        stats = stats.sort_values(["oov_ratio", unit], ascending=[True, True])
+        stats = stats.rename(columns={"oov_ratio": f' ⬆️oov_ratio'}).rename(columns={unit: f' ⬆️{unit}'})  # ⬇
    return stats
 
 
-def update_compress_rate():
-    pass
-
-
-def test():
-    tokenizer_name = "gpt_4"
-    tokenizer = load_tokener(tokenizer_name)
-    stats = {tokenizer_name: tokenize_corpus(tokenizer, ["cc100-en", "cc100-zh-Hans"])}
-    df = to_dataframe(stats)
-    # print(df.to_markdown(index=False, tablefmt='fancy_grid'))
-    logger.info(f"\n{df.to_markdown(index=False)}")
-
-
 def main():
@@ -204,11 +293,12 @@ def main():
     else:
         tokenizer_filter = None
         corpuses = common_corpuses
-    df = get_compression_leaderboard(corpuses)
+    # tokenizer_filter = "openai"
+    # corpuses = ["cc100/en", "cc100/zh-Hans"]
+    df = get_compression_leaderboard(corpuses, tokenizer_filter=tokenizer_filter)
     # print(df.to_markdown(index=False, tablefmt='fancy_grid'))
     logger.info(f"\n{df.to_markdown(index=False)}")
 
 
 if __name__ == "__main__":
     main()
-    # test()
config.py DELETED
@@ -1,20 +0,0 @@
-USE_REMOTE = False  # use remote tokenizer or local tokenizer
-
-# load_vocab_with_SPECIAL_TOKEN = True  # if special tokens are excluded, the computed vocab size is wrong and overlap_token counts are inconsistent.
-
-# encoding config
-ADD_SPECIAL_TOKEN = False
-
-#
-LAZY_IMPORT = True
-
-# DEBUG: set the environment variable RUST_BACKTRACE=full
-#
-
-default_user_input = """\
-Replace this text in the input field to see how tokenization works.
-Buenos días!
-华为发布Mate60手机。
-ラグビーワールドカップ2023フランス"""
-default_tokenizer_type_1 = "llama3"
-default_tokenizer_type_2 = "gpt_4"
patcher/README.md ADDED
@@ -0,0 +1,15 @@
+
+
+## The vocab_size inconsistency problem
+
+
+- .vocab_size
+  - Size of the base vocabulary (without the added tokens)
+  - from https://huggingface.co/transformers/v2.11.0/main_classes/tokenizer.html
+- len(tokenizer)
+  - Size of the full vocabulary with the added tokens.
+  - https://github.com/huggingface/transformers/issues/12632
+- max(tokenizer.get_vocab().values())
+  - includes non-contiguous token_ids
+  - https://github.com/huggingface/transformers/issues/4875
+
@@ -1,105 +0,0 @@
1
- """
2
-
3
- ## adapt to transformer tokenizer
4
-
5
- https://github.com/huggingface/transformers/blob/v4.40.1/src/transformers/tokenization_utils.py#L379
6
-
7
- ## usage
8
-
9
- - grok
10
-
11
- ## 风险评估
12
-
13
- - 可能会干扰 sentencepiece.SentencePieceProcessor的正常使用,比如 .vocab_size 原来是个方法,patch后是个property
14
-
15
-
16
- ## TODO
17
-
18
- 不用patch,改用wrapper。常见的 tokenizer通常是封装的 sentencepiece,
19
- """
20
-
21
- import sentencepiece
22
-
23
-
24
- @property
25
- def vocab_size(self):
26
- """Returns vocab size"""
27
- return self.get_piece_size()
28
-
29
-
30
- def get_vocab(self):
31
- """Returns vocab as a dict"""
32
- vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
33
- # vocab.update(self.added_tokens_encoder)
34
- return vocab
35
-
36
-
37
- def _tokenize(self, text):
38
- """Returns a tokenized string."""
39
- return self.encode(text, out_type=str)
40
-
41
-
42
- def _convert_token_to_id(self, token):
43
- """Converts a token (str) in an id using the vocab."""
44
- return self.piece_to_id(token)
45
-
46
-
47
- def _convert_id_to_token(self, index):
48
- """Converts an index (integer) in a token (str) using the vocab."""
49
- token = self.IdToPiece(index)
50
- return token
51
-
52
-
53
- def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
54
- """ copy from transformers.PreTrainedTokenizer
55
- Converts a single index or a sequence of indices in a token or a sequence of tokens, using the vocabulary and
56
- added tokens.
57
-
58
- Args:
59
- ids (`int` or `List[int]`):
60
- The token id (or token ids) to convert to tokens.
61
- skip_special_tokens (`bool`, *optional*, defaults to `False`):
62
- Whether or not to remove special tokens in the decoding.
63
-
64
- Returns:
65
- `str` or `List[str]`: The decoded token(s).
66
- """
67
- self._added_tokens_decoder = {} # add by xs
68
- if isinstance(ids, int):
69
- if ids in self._added_tokens_decoder:
70
- return self._added_tokens_decoder[ids].content
71
- else:
72
- return self._convert_id_to_token(ids)
73
- tokens = []
74
- for index in ids:
75
- index = int(index)
76
- if skip_special_tokens and index in self.all_special_ids:
77
- continue
78
- if index in self._added_tokens_decoder:
79
- tokens.append(self._added_tokens_decoder[index].content)
80
- else:
81
- tokens.append(self._convert_id_to_token(index))
82
- return tokens
83
-
84
-
85
- def encode(self, *args, **kwargs):
86
- """
87
- add_special_token 是为了兼容 hf_tokenizer
88
- """
89
- kwargs.pop("add_special_tokens", None)
90
- kwargs.pop("allowed_special", None)
91
- return self.Encode(*args, **kwargs)
92
-
93
-
94
- def decode(self, *args, **kwargs):
95
- kwargs.pop("skip_special_tokens", None)
96
- return self.Decode(*args, **kwargs)
97
-
98
-
99
- sentencepiece.SentencePieceProcessor.vocab_size = vocab_size #
100
- sentencepiece.SentencePieceProcessor.get_vocab = get_vocab
101
- sentencepiece.SentencePieceProcessor._convert_id_to_token = _convert_id_to_token
102
- sentencepiece.SentencePieceProcessor.convert_ids_to_tokens = convert_ids_to_tokens
103
- # sentencepiece.SentencePieceProcessor.tokenize = _tokenize
104
- sentencepiece.SentencePieceProcessor.encode = encode
105
- sentencepiece.SentencePieceProcessor.decode = decode
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
patcher/sptokenizer_wrapper.py DELETED
@@ -1,61 +0,0 @@
-""" Wrap sentencepiece.SentencePieceProcessor so that it conforms to the transformers tokenizer interface
-
-## reference
-
-
-## usage
-
-- grok
-
-"""
-
-import sentencepiece as spm
-from transformers import PreTrainedTokenizer
-
-
-class SPTokenizerWrapper(PreTrainedTokenizer):
-    """
-
-    ## impl in PreTrainedTokenizer
-    - convert_ids_to_tokens
-    """
-
-    def __init__(self, vocab_file):
-        self.vocab_file = vocab_file
-        self.sp_model = spm.SentencePieceProcessor(self.vocab_file)
-        super().__init__()
-
-    @property
-    def vocab_size(self):
-        """Returns vocab size"""
-        return self.sp_model.get_piece_size()
-
-    def get_vocab(self):
-        """Returns vocab as a dict"""
-        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
-        return vocab
-
-    def _convert_token_to_id(self, token):
-        """Converts a token (str) in an id using the vocab."""
-        return self.sp_model.piece_to_id(token)
-
-    def _convert_id_to_token(self, index):
-        """Converts an index (integer) in a token (str) using the vocab."""
-        token = self.sp_model.IdToPiece(index)
-        return token
-
-    # def (self, ids, skip_special_tokens=False):  # impl in PreTrainedTokenizer
-
-
-    def encode(self, *args, **kwargs):
-        kwargs.pop("add_special_tokens", None)
-        kwargs.pop("allowed_special", None)
-        return self.sp_model.Encode(*args, **kwargs)
-
-    def decode(self, *args, **kwargs):
-        kwargs.pop("skip_special_tokens", None)
-        return self.sp_model.Decode(*args, **kwargs)
-
-
-
-# PreTrainedTokenizer.convert_ids_to_tokens
patcher/tiktoken_patch.py CHANGED
@@ -70,8 +70,8 @@ def get_vocab(self, token_type="str"):
 
 @property
 def vocab_size(self):
-    """Returns vocab size"""
-    return self.n_vocab
+    """Returns vocab size without special tokens"""
+    return len(self._mergeable_ranks)
 
 
 def encode(self, *args, **kwargs):
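Why the patched property counts `_mergeable_ranks`: in tiktoken, `n_vocab` is derived from the maximum token id (special tokens included), while the mergeable ranks are the plain BPE vocabulary. A sketch; the underscored attributes are tiktoken internals and may change between versions:

```python
import tiktoken

enc = tiktoken.get_encoding("cl100k_base")
print(enc.n_vocab)                # max token id + 1, special tokens included
print(len(enc._mergeable_ranks))  # BPE merges only (what the patch returns)
print(enc._special_tokens)        # e.g. {'<|endoftext|>': 100257, ...}
```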
app_playground.py → playground_app.py RENAMED
@@ -36,9 +36,12 @@ table
 """
 
 import gradio as gr
-from vocab import all_tokenizers
-from util import *
-from examples import example_fn, example_types
+from vocab import tokenizer_factory
+from playground_examples import example_types, example_fn
+from playground_util import tokenize, tokenize_pair, basic_count, get_overlap_token_size, on_load
+
+
+
 
 get_window_url_params = """
 function(url_params) {
@@ -48,6 +51,8 @@ get_window_url_params = """
     }
 """
 
+all_tokenizer_name = [(config.name_display, config.name_or_path) for config in tokenizer_factory.all_tokenizer_configs]
+
 with gr.Blocks() as demo:
     # links: https://www.coderstool.com/utf8-encoding-decoding
     # feature: enter some text and tokenize it
@@ -60,6 +65,7 @@ with gr.Blocks() as demo:
             example_types,
             value="Examples",
             type="index",
+            allow_custom_value=True,
             show_label=False,
             container=False,
             scale=0,
@@ -102,21 +108,26 @@ with gr.Blocks() as demo:
         with gr.Column(scale=6):
             with gr.Group():
                 tokenizer_name_1 = gr.Dropdown(
-                    all_tokenizers,
+                    all_tokenizer_name,
                     label="Tokenizer 1",
                 )
             with gr.Group():
                 with gr.Row():
+                    organization_1 = gr.TextArea(
+                        label="Organization",
+                        lines=1,
+                        elem_classes="statistics",
+                    )
                     stats_vocab_size_1 = gr.TextArea(
                         label="Vocab Size",
                         lines=1,
                        elem_classes="statistics"
                    )
-                    stats_zh_token_size_1 = gr.TextArea(
-                        label="ZH char/word",
-                        lines=1,
-                        elem_classes="statistics",
-                    )
+                    # stats_zh_token_size_1 = gr.TextArea(
+                    #     label="ZH char/word",
+                    #     lines=1,
+                    #     elem_classes="statistics",
+                    # )
                    # stats_compress_rate_1 = gr.TextArea(
                    #     label="Compress Rate",
                    #     lines=1,
@@ -140,21 +151,26 @@ with gr.Blocks() as demo:
         with gr.Column(scale=6):
             with gr.Group():
                 tokenizer_name_2 = gr.Dropdown(
-                    all_tokenizers,
+                    all_tokenizer_name,
                     label="Tokenizer 2",
                 )
             with gr.Group():
                 with gr.Row():
-                    stats_vocab_size_2 = gr.TextArea(
-                        label="VocabSize",
+                    organization_2 = gr.TextArea(
+                        label="Organization",
                         lines=1,
-                        elem_classes="statistics"
+                        elem_classes="statistics",
                     )
-                    stats_zh_token_size_2 = gr.TextArea(
-                        label="ZH char/word",  # Chinese chars/words
+                    stats_vocab_size_2 = gr.TextArea(
+                        label="Vocab Size",
                         lines=1,
-                        elem_classes="statistics",
+                        elem_classes="statistics"
                     )
+                    # stats_zh_token_size_2 = gr.TextArea(
+                    #     label="ZH char/word",  # Chinese chars/words
+                    #     lines=1,
+                    #     elem_classes="statistics",
+                    # )
                    # stats_compress_rate_2 = gr.TextArea(
                    #     label="Compress Rate",
                    #     lines=1,
@@ -196,7 +212,7 @@ with gr.Blocks() as demo:
 
     tokenizer_name_1.change(tokenize, [user_input, tokenizer_name_1],
                             [output_text_1, output_table_1])
-    tokenizer_name_1.change(basic_count, [tokenizer_name_1], [stats_vocab_size_1, stats_zh_token_size_1])
+    tokenizer_name_1.change(basic_count, [tokenizer_name_1], [stats_vocab_size_1, organization_1])
     tokenizer_name_1.change(get_overlap_token_size, [tokenizer_name_1, tokenizer_name_2],
                             [stats_overlap_token_size_1, stats_overlap_token_size_2])
     # tokenizer_type_1.change(get_compress_rate, [tokenizer_type_1, compress_rate_corpus, compress_rate_unit],
@@ -209,7 +225,7 @@ with gr.Blocks() as demo:
 
     tokenizer_name_2.change(tokenize, [user_input, tokenizer_name_2],
                             [output_text_2, output_table_2])
-    tokenizer_name_2.change(basic_count, [tokenizer_name_2], [stats_vocab_size_2, stats_zh_token_size_2])
+    tokenizer_name_2.change(basic_count, [tokenizer_name_2], [stats_vocab_size_2, organization_2])
     tokenizer_name_2.change(get_overlap_token_size, [tokenizer_name_1, tokenizer_name_2],
                             [stats_overlap_token_size_1, stats_overlap_token_size_2])
     # tokenizer_type_2.change(get_compress_rate,
@@ -235,7 +251,6 @@ with gr.Blocks() as demo:
         [user_input, tokenizer_name_1, tokenizer_name_2]
     )
 
-    demo.load(js=open("js/onload.js", "r", encoding="utf-8").read())
     demo.load(
         fn=on_load,
         inputs=[user_input],  # just passing an empty object works here.
examples.py → playground_examples.py RENAMED
@@ -19,11 +19,11 @@ https://www.computerhope.com/jargon/s/specchar.htm
 
 examples = {
     "en": [
-        ["number: (10086 + 98) = 100184", "llama", "bloom"],  #
-        ["whitespace:  2spaces  8spaces\t1tab\t\t2tab\n1newline", "llama", "bert_base_cased"],  # chatglm has blank_n tokens; bert drops the spaces
+        ["number: (10086 + 98) = 100184", "huggyllama/llama-7b", "bigscience/bloom"],  #
+        ["whitespace:  2spaces  8spaces\t1tab\t\t2tab\n1newline", "huggyllama/llama-7b", "google-bert/bert-base-cased"],  # chatglm has blank_n tokens; bert drops the spaces
         # !?。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏.
-        ["punctuation: ,.:/?+=\",。!?;【】〔〕〖〗", "gemma_7b", "llama"],  # the llama vocab is a bit small
-        ["symbol: 🦙❤❥웃유♋☮✊☏☢☚✔☑♚▢♪✈✞÷↑↓▤▥⊙■□▣▽¿─│♥❣▬▫☿Ⓐ ✋✉☣☤", "baichuan", "llama"],
+        ["punctuation: ,.:/?+=\",。!?;【】〔〕〖〗", "google/gemma-7b", "huggyllama/llama-7b"],  # the llama vocab is a bit small
+        ["symbol: 🦙❤❥웃유♋☮✊☏☢☚✔☑♚▢♪✈✞÷↑↓▤▥⊙■□▣▽¿─│♥❣▬▫☿Ⓐ ✋✉☣☤", "baichuan-inc/Baichuan-7B", "huggyllama/llama-7b"],
         # ["special: [PAD] [UNK] [CLS] [SEP] [MASK] <|system|> <|user|> <|assistant|> <|endoftext|>", "", ""],
     ],
     "zh": [
@@ -37,16 +37,16 @@ examples = {
 
 more_examples = [
     # bert series
-    ("bert_base_cased", "bert_base_uncased", "", ""),  # # clue VS kplug, bert VS clue
-    ("bert_base_cased", "clue", "", "added []()"),
-    ("clue", "kplug", "", ""),
+    ("bert-base-cased", "bert-base-uncased", "", ""),  # # clue VS kplug, bert VS clue
+    ("bert-base-cased", "clue", "", "added []()"),
+    ("roberta-chinese-clue", "kplug", "", ""),
 
     # llama series (sentencepiece-based)
     ("baichuan", "baichuan2", "baichuan2 supports multiple spaces  , multiple newlines\n\n\n, do not add dummy prefix as Baichuan1"),
     ("llama", "baichuan2", "baichuan2 supports multiple spaces  , multiple newlines\n\n"),
-    ("llama", "chinese_llama2", ""),
+    ("llama", "chinese-llama-2-7b", ""),
     ("llama", "llama3", "expanded vocabulary"),
-    ("chinese_llama", "chinese_llama2", ""),
+    ("chinese-llama-lora-7b", "chinese-llama-2-7b", ""),
 
     # glm series (sentencepiece-based)
     ("glm", "chatglm1", ""),
util.py → playground_util.py RENAMED
@@ -1,22 +1,33 @@
1
  import gradio as gr
2
  import json
3
  import pandas as pd
4
- import config
5
- from vocab import load_tokener
6
- from utils.character_util import iter_vocab
7
  from utils.log_util import logger
8
- from utils.compression_util import tokenize_corpus, unit_convertor
9
  from functools import lru_cache
10
 
 
 
 
 
 
 
 
 
11
 
12
  @lru_cache
13
- def tokenize(text, tokenizer_name, color_num=5):
 
 
 
 
 
14
  """
15
  """
16
  logger.info("param=" + json.dumps({"text": text, "tokenizer_type": tokenizer_name}, ensure_ascii=False))
17
  pos_tokens = []
18
- tokenizer = load_tokener(tokenizer_name)
19
- if config.ADD_SPECIAL_TOKEN:
20
  encoding = tokenizer.encode(text, add_special_tokens=True)
21
  else:
22
  encoding = tokenizer.encode(text, add_special_tokens=False)
@@ -34,7 +45,7 @@ def tokenize(text, tokenizer_name, color_num=5):
34
  token_str = token.decode("utf-8")
35
  except:
36
  token_str = token.decode("utf-8", errors="ignore")
37
- logger.error(f"{idx}: decode_error: " + json.dumps( # gpt_35_turbo 经常有token会decode error,这里用来记录一下
38
  {"tokenizer_type": tokenizer_name, "token": str(token), "token_str": token_str},
39
  ensure_ascii=False))
40
 
@@ -45,7 +56,8 @@ def tokenize(text, tokenizer_name, color_num=5):
45
  token_bytes = bytes(token_str, "utf-8")
46
  # json_dumps = json.dumps(token_str)
47
  else:
48
- logger.error(f"{idx}: wrong type for token {token_id} {type(token)} " + json.dumps({"text": text, "tokenizer_type": tokenizer_name}, ensure_ascii=False))
 
49
  token_str = token
50
  token_bytes = token
51
  # continue
@@ -82,30 +94,22 @@ def tokenize_pair(text, tokenizer_type_1, tokenizer_type_2):
82
  @lru_cache
83
  def basic_count(tokenizer_name):
84
  stats = iter_vocab(tokenizer_name)
85
- return stats['vocab_size'], f'{stats["中文token数"]}'
86
  # return tokenizer.vocab_size, f'{stats["中文汉字数"]["中文单字"]}/{stats["中文汉字数"]["中文多字"]}'
87
 
88
- def get_compress_rate(tokenizer_type, all_corpus, unit):
89
- tokenizer = load_tokener(tokenizer_type)
90
- compress_rate_stats = tokenize_corpus(tokenizer, all_corpus)
91
- compress_rate = unit_convertor(compress_rate_stats, unit)
92
- return compress_rate
93
 
94
- # def get_all_compress_rate(corpuses, unit):
95
- # stats = {}
96
- # for lang in corpuses:
97
- # print("###" * 10 + lang)
98
- # for tokenizer_name in tokenizers:
99
- # tokenizer = load_tokener(tokenizer_name)
100
- # stat = tokenize_corpus(tokenizer, [lang])
101
- # stats[tokenizer_name] = stat
102
- # pprint(stats)
103
 
104
 
105
  @lru_cache
106
- def get_overlap_token_size(tokenizer_type_1, tokenizer_type_2):
107
- tokenizer1 = load_tokener(tokenizer_type_1)
108
- tokenizer2 = load_tokener(tokenizer_type_2)
109
 
110
  vocab_set_1 = tokenizer1.get_vocab().keys()
111
  vocab_set_2 = tokenizer2.get_vocab().keys()
@@ -121,11 +125,10 @@ def get_overlap_token_size(tokenizer_type_1, tokenizer_type_2):
121
  overlap_tokens = vocab_set_1 & vocab_set_2
122
  overlap_token_size = len(overlap_tokens)
123
  logger.info(
124
- f"{overlap_token_size} OverlapTokens of {tokenizer_type_1} {tokenizer_type_2}: {list(overlap_tokens)[:10]}")
125
  return overlap_token_size, overlap_token_size
126
 
127
 
128
-
129
  def on_load(url_params, request: gr.Request):
130
  """
131
  onLoad
@@ -148,15 +151,16 @@ def on_load(url_params, request: gr.Request):
148
  # if "referer" in request.headers: # not work for huggingface-space
149
  # url_params = parse_qs(urlparse(request.headers["referer"]).query)
150
  # url_params = {k: v[0] for k, v in url_params.items() if len(v) > 0}
151
- tokenizer_type_1 = url_params.get("tokenizer1", config.default_tokenizer_type_1)
152
- tokenizer_type_2 = url_params.get("tokenizer2", config.default_tokenizer_type_2)
153
- text = url_params.get("text", config.default_user_input)
154
  logger.info(f"client_ip: {client_ip}; params: {url_params}")
155
  return text, tokenizer_type_1, tokenizer_type_2
156
 
157
 
158
- def compress_rate_unit_change(unit):
159
- return gr.update(label=f"Compress Rate: {unit}"), gr.update(label=f"Compress Rate: {unit}"),
 
160
 
161
  def test_coding():
162
  bytes1 = b'\xe4\xb8\xad'
@@ -164,5 +168,5 @@ def test_coding():
164
 
165
 
166
  if __name__ == "__main__":
167
- print(get_overlap_token_size("gpt_35_turbo", "gpt_4"))
168
  # print(basic_count("internlm_chat_7b"))
 
1
  import gradio as gr
2
  import json
3
  import pandas as pd
4
+ from vocab import tokenizer_factory
5
+ from character_util import iter_vocab
 
6
  from utils.log_util import logger
 
7
  from functools import lru_cache
8
 
9
+ default_user_input = """\
10
+ Replace this text in the input field to see how tokenization works.
11
+ Buenos días!
12
+ 华为发布Mate60手机。
13
+ ラグビーワールドカップ2023フランス"""
14
+ # default_tokenizer_name_1 = "Meta/llama3"
15
+ default_tokenizer_name_1 = "gradientai/Llama-3-8B-Instruct-Gradient-1048k"
16
+ default_tokenizer_name_2 = "openai/gpt-4"
17
 
18
  @lru_cache
19
+ def tokenize(
20
+ text: str,
21
+ tokenizer_name: str,
22
+ color_num: int = 5,
23
+ add_special_token: bool = False
24
+ ):
25
  """
26
  """
27
  logger.info("param=" + json.dumps({"text": text, "tokenizer_type": tokenizer_name}, ensure_ascii=False))
28
  pos_tokens = []
29
+ tokenizer = tokenizer_factory.get_tokenizer(tokenizer_name)
30
+ if add_special_token:
31
  encoding = tokenizer.encode(text, add_special_tokens=True)
32
  else:
33
  encoding = tokenizer.encode(text, add_special_tokens=False)
 
45
  token_str = token.decode("utf-8")
46
  except UnicodeDecodeError:
47
  token_str = token.decode("utf-8", errors="ignore")
48
+ logger.error(f"{idx}: decode_error: " + json.dumps( # gpt_35_turbo 经常有token会decode error,这里用来记录一下
49
  {"tokenizer_type": tokenizer_name, "token": str(token), "token_str": token_str},
50
  ensure_ascii=False))
51
 
 
56
  token_bytes = bytes(token_str, "utf-8")
57
  # json_dumps = json.dumps(token_str)
58
  else:
59
+ logger.error(f"{idx}: wrong type for token {token_id} {type(token)} " + json.dumps(
60
+ {"text": text, "tokenizer_type": tokenizer_name}, ensure_ascii=False))
61
  token_str = token
62
  token_bytes = token
63
  # continue
 
94
  @lru_cache
95
  def basic_count(tokenizer_name):
96
  stats = iter_vocab(tokenizer_name)
97
+ return stats['vocab_size'], f'{stats["organization"]}'
98
  # return tokenizer.vocab_size, f'{stats["中文汉字数"]["中文单字"]}/{stats["中文汉字数"]["中文多字"]}'
99
 
 
 
 
 
 
100
 
101
+ # def get_compress_rate(tokenizer_name, all_corpus, unit):
102
+ # tokenizer = tokenizer_factory.get_tokenizer(tokenizer_name)
103
+ # compress_rate_stats = tokenize_corpus(tokenizer, all_corpus)
104
+ # compress_rate = unit_convertor(compress_rate_stats, unit)
105
+ # return compress_rate
106
+
 
 
 
107
 
108
 
109
  @lru_cache
110
+ def get_overlap_token_size(tokenizer_name_1, tokenizer_name_2):
111
+ tokenizer1 = tokenizer_factory.get_tokenizer(tokenizer_name_1)
112
+ tokenizer2 = tokenizer_factory.get_tokenizer(tokenizer_name_2)
113
 
114
  vocab_set_1 = tokenizer1.get_vocab().keys()
115
  vocab_set_2 = tokenizer2.get_vocab().keys()
 
125
  overlap_tokens = vocab_set_1 & vocab_set_2
126
  overlap_token_size = len(overlap_tokens)
127
  logger.info(
128
+ f"{overlap_token_size} OverlapTokens of {tokenizer_name_1} {tokenizer_name_2}: {list(overlap_tokens)[:10]}")
129
  return overlap_token_size, overlap_token_size
130
 
131
 
 
132
  def on_load(url_params, request: gr.Request):
133
  """
134
  onLoad
 
151
  # if "referer" in request.headers: # not work for huggingface-space
152
  # url_params = parse_qs(urlparse(request.headers["referer"]).query)
153
  # url_params = {k: v[0] for k, v in url_params.items() if len(v) > 0}
154
+ tokenizer_type_1 = url_params.get("tokenizer1", default_tokenizer_name_1)
155
+ tokenizer_type_2 = url_params.get("tokenizer2", default_tokenizer_name_2)
156
+ text = url_params.get("text", default_user_input)
157
  logger.info(f"client_ip: {client_ip}; params: {url_params}")
158
  return text, tokenizer_type_1, tokenizer_type_2
159
 
160
 
161
+ # def compress_rate_unit_change(unit):
162
+ # return gr.update(label=f"Compress Rate: {unit}"), gr.update(label=f"Compress Rate: {unit}"),
163
+
164
 
165
  def test_coding():
166
  bytes1 = b'\xe4\xb8\xad'
 
168
 
169
 
170
  if __name__ == "__main__":
171
+ print(get_overlap_token_size("gpt-35-turbo", "gpt-4"))
172
  # print(basic_count("internlm_chat_7b"))
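
Both helpers are wrapped in `@lru_cache`, so repeated UI calls with the same tokenizer names hit memory instead of reloading vocabularies. A minimal usage sketch (assuming the renamed module is importable as `playground_util` and that both names are registered with `tokenizer_factory`):

```python
from playground_util import basic_count, get_overlap_token_size

# get_overlap_token_size returns the shared-token count twice
# (once per UI widget it feeds).
overlap, _ = get_overlap_token_size("gpt-35-turbo", "gpt-4")
print(f"shared tokens: {overlap}")

# basic_count returns (vocab_size, organization) from the cached vocab stats.
vocab_size, organization = basic_count("openai/gpt-4")
print(f"vocab_size={vocab_size}, organization={organization}")
```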
requirements.txt CHANGED
@@ -6,4 +6,6 @@ torch
6
  zhon
7
  nltk
8
  boto3
9
- ai2-olmo==0.2.4
 
 
 
6
  zhon
7
  nltk
8
  boto3
9
+ ai2-olmo
10
+ ipadic
11
+ fugashi
stats/character_stats.json ADDED
@@ -0,0 +1,1712 @@
1
+ {
2
+ "FacebookAI/xlm-roberta-base": {
3
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/FacebookAI/xlm-roberta-base\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">xlm-roberta-base</a>",
4
+ "organization": "Facebook",
5
+ "vocab_size": 250002,
6
+ "num(digit)": 2728,
7
+ "len(digit)": "1,3,9",
8
+ "num(space)": 1,
9
+ "len(space)": "1,1,1",
10
+ "num(ar)": 14644,
11
+ "len(ar)": "1,4,16",
12
+ "num(zh)": 18457,
13
+ "len(zh)": "1,2,16",
14
+ "num(ja)": 20572,
15
+ "len(ja)": "1,2,16",
16
+ "num(ja-kana)": 3434,
17
+ "len(ja-kana)": "1,3,12",
18
+ "num(ko)": 5373,
19
+ "len(ko)": "1,2,8"
20
+ },
21
+ "clue/roberta_chinese_clue_tiny": {
22
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/clue/roberta_chinese_clue_tiny\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">roberta-chinese-clue</a>",
23
+ "organization": "CLUE",
24
+ "vocab_size": 8021,
25
+ "num(digit)": 230,
26
+ "len(digit)": "1,4,10",
27
+ "num(space)": 0,
28
+ "len(space)": "-",
29
+ "num(ar)": 30,
30
+ "len(ar)": "1,2,3",
31
+ "num(zh)": 5689,
32
+ "len(zh)": "1,1,1",
33
+ "num(ja)": 5691,
34
+ "len(ja)": "1,1,3",
35
+ "num(ja-kana)": 0,
36
+ "len(ja-kana)": "-",
37
+ "num(ko)": 0,
38
+ "len(ko)": "-"
39
+ },
40
+ "dbmdz/bert-base-german-uncased": {
41
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/dbmdz/bert-base-german-uncased\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">bert-base-german-uncased</a>",
42
+ "organization": "dbmdz",
43
+ "vocab_size": 31102,
44
+ "num(digit)": 1733,
45
+ "len(digit)": "1,4,12",
46
+ "num(space)": 0,
47
+ "len(space)": "-",
48
+ "num(ar)": 0,
49
+ "len(ar)": "-",
50
+ "num(zh)": 0,
51
+ "len(zh)": "-",
52
+ "num(ja)": 0,
53
+ "len(ja)": "-",
54
+ "num(ja-kana)": 0,
55
+ "len(ja-kana)": "-",
56
+ "num(ko)": 0,
57
+ "len(ko)": "-"
58
+ },
59
+ "google-bert/bert-base-cased": {
60
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/google-bert/bert-base-cased\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">bert-base-cased</a>",
61
+ "organization": "Google",
62
+ "vocab_size": 28996,
63
+ "num(digit)": 926,
64
+ "len(digit)": "1,4,11",
65
+ "num(space)": 0,
66
+ "len(space)": "-",
67
+ "num(ar)": 94,
68
+ "len(ar)": "1,3,4",
69
+ "num(zh)": 226,
70
+ "len(zh)": "1,2,3",
71
+ "num(ja)": 390,
72
+ "len(ja)": "1,2,3",
73
+ "num(ja-kana)": 164,
74
+ "len(ja-kana)": "1,2,3",
75
+ "num(ko)": 10,
76
+ "len(ko)": "1,2,3"
77
+ },
78
+ "google-bert/bert-base-chinese": {
79
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/google-bert/bert-base-chinese\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">bert-base-chinese</a>",
80
+ "organization": "Google",
81
+ "vocab_size": 21128,
82
+ "num(digit)": 1451,
83
+ "len(digit)": "1,3,12",
84
+ "num(space)": 2,
85
+ "len(space)": "1,2,3",
86
+ "num(ar)": 30,
87
+ "len(ar)": "1,2,3",
88
+ "num(zh)": 14642,
89
+ "len(zh)": "1,2,3",
90
+ "num(ja)": 15197,
91
+ "len(ja)": "1,3,15",
92
+ "num(ja-kana)": 553,
93
+ "len(ja-kana)": "1,3,15",
94
+ "num(ko)": 0,
95
+ "len(ko)": "-"
96
+ },
97
+ "google-bert/bert-base-german-cased": {
98
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/google-bert/bert-base-german-cased\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">bert-base-german-cased</a>",
99
+ "organization": "Google",
100
+ "vocab_size": 30000,
101
+ "num(digit)": 4065,
102
+ "len(digit)": "1,11,22",
103
+ "num(space)": 0,
104
+ "len(space)": "-",
105
+ "num(ar)": 0,
106
+ "len(ar)": "-",
107
+ "num(zh)": 0,
108
+ "len(zh)": "-",
109
+ "num(ja)": 0,
110
+ "len(ja)": "-",
111
+ "num(ja-kana)": 0,
112
+ "len(ja-kana)": "-",
113
+ "num(ko)": 0,
114
+ "len(ko)": "-"
115
+ },
116
+ "google-bert/bert-base-multilingual-cased": {
117
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/google-bert/bert-base-multilingual-cased\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">bert-base-multilingual-cased</a>",
118
+ "organization": "Google",
119
+ "vocab_size": 119547,
120
+ "num(digit)": 2583,
121
+ "len(digit)": "1,3,13",
122
+ "num(space)": 0,
123
+ "len(space)": "-",
124
+ "num(ar)": 4873,
125
+ "len(ar)": "1,5,14",
126
+ "num(zh)": 13542,
127
+ "len(zh)": "1,2,3",
128
+ "num(ja)": 14880,
129
+ "len(ja)": "1,3,10",
130
+ "num(ja-kana)": 1336,
131
+ "len(ja-kana)": "1,4,10",
132
+ "num(ko)": 3271,
133
+ "len(ko)": "1,3,6"
134
+ },
135
+ "google-bert/bert-base-multilingual-uncased": {
136
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/google-bert/bert-base-multilingual-uncased\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">bert-base-multilingual-uncased</a>",
137
+ "organization": "Google",
138
+ "vocab_size": 105879,
139
+ "num(digit)": 2510,
140
+ "len(digit)": "1,3,13",
141
+ "num(space)": 2,
142
+ "len(space)": "1,2,3",
143
+ "num(ar)": 4530,
144
+ "len(ar)": "1,5,13",
145
+ "num(zh)": 16658,
146
+ "len(zh)": "1,2,3",
147
+ "num(ja)": 17858,
148
+ "len(ja)": "1,3,10",
149
+ "num(ja-kana)": 1188,
150
+ "len(ja-kana)": "1,4,10",
151
+ "num(ko)": 0,
152
+ "len(ko)": "-"
153
+ },
154
+ "google-bert/bert-base-uncased": {
155
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/google-bert/bert-base-uncased\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">bert-base-uncased</a>",
156
+ "organization": "Google",
157
+ "vocab_size": 30522,
158
+ "num(digit)": 2056,
159
+ "len(digit)": "1,4,11",
160
+ "num(space)": 0,
161
+ "len(space)": "-",
162
+ "num(ar)": 88,
163
+ "len(ar)": "1,3,5",
164
+ "num(zh)": 488,
165
+ "len(zh)": "1,2,3",
166
+ "num(ja)": 676,
167
+ "len(ja)": "1,2,3",
168
+ "num(ja-kana)": 188,
169
+ "len(ja-kana)": "1,2,3",
170
+ "num(ko)": 0,
171
+ "len(ko)": "-"
172
+ },
173
+ "google/mobilebert-uncased": {
174
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/google/mobilebert-uncased\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">mobilebert-uncased</a>",
175
+ "organization": "Google",
176
+ "vocab_size": 30522,
177
+ "num(digit)": 2056,
178
+ "len(digit)": "1,4,11",
179
+ "num(space)": 0,
180
+ "len(space)": "-",
181
+ "num(ar)": 88,
182
+ "len(ar)": "1,3,5",
183
+ "num(zh)": 488,
184
+ "len(zh)": "1,2,3",
185
+ "num(ja)": 676,
186
+ "len(ja)": "1,2,3",
187
+ "num(ja-kana)": 188,
188
+ "len(ja-kana)": "1,2,3",
189
+ "num(ko)": 0,
190
+ "len(ko)": "-"
191
+ },
192
+ "tohoku-nlp/bert-base-japanese": {
193
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/tohoku-nlp/bert-base-japanese\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">bert-base-japanese</a>",
194
+ "organization": "Tohoku",
195
+ "vocab_size": 32000,
196
+ "num(digit)": 669,
197
+ "len(digit)": "1,3,5",
198
+ "num(space)": 0,
199
+ "len(space)": "-",
200
+ "num(ar)": 10,
201
+ "len(ar)": "1,3,3",
202
+ "num(zh)": 18792,
203
+ "len(zh)": "1,2,11",
204
+ "num(ja)": 28367,
205
+ "len(ja)": "1,2,13",
206
+ "num(ja-kana)": 12359,
207
+ "len(ja-kana)": "1,4,13",
208
+ "num(ko)": 0,
209
+ "len(ko)": "-"
210
+ },
211
+ "gpt-4": {
212
+ "tokenizer": "<a target=\"_blank\" href=\"https://github.com/openai/tiktoken\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">gpt-4</a>",
213
+ "organization": "OpenAI",
214
+ "vocab_size": 100277,
215
+ "num(digit)": 1110,
216
+ "len(digit)": "1,3,3",
217
+ "num(space)": 47472,
218
+ "len(space)": "1,7,128",
219
+ "num(ar)": 113,
220
+ "len(ar)": "1,2,10",
221
+ "num(zh)": 868,
222
+ "len(zh)": "1,1,7",
223
+ "num(ja)": 1035,
224
+ "len(ja)": "1,1,7",
225
+ "num(ja-kana)": 169,
226
+ "len(ja-kana)": "1,1,7",
227
+ "num(ko)": 299,
228
+ "len(ko)": "1,2,4"
229
+ },
230
+ "llama3": {
231
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/gradientai/Llama-3-8B-Instruct-Gradient-1048k\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">llama3</a>",
232
+ "organization": "Meta",
233
+ "vocab_size": 128256,
234
+ "num(digit)": 1110,
235
+ "len(digit)": "1,3,3",
236
+ "num(space)": 60860,
237
+ "len(space)": "1,6,128",
238
+ "num(ar)": 3810,
239
+ "len(ar)": "1,4,11",
240
+ "num(zh)": 4424,
241
+ "len(zh)": "1,1,7",
242
+ "num(ja)": 5387,
243
+ "len(ja)": "1,2,8",
244
+ "num(ja-kana)": 1086,
245
+ "len(ja-kana)": "1,2,8",
246
+ "num(ko)": 2281,
247
+ "len(ko)": "1,2,6"
248
+ },
249
+ "google-t5/t5-large": {
250
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/google-t5/t5-large\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">t5</a>",
251
+ "organization": "Google",
252
+ "vocab_size": 32100,
253
+ "num(digit)": 1133,
254
+ "len(digit)": "1,3,13",
255
+ "num(space)": 0,
256
+ "len(space)": "-",
257
+ "num(ar)": 0,
258
+ "len(ar)": "-",
259
+ "num(zh)": 0,
260
+ "len(zh)": "-",
261
+ "num(ja)": 0,
262
+ "len(ja)": "-",
263
+ "num(ja-kana)": 0,
264
+ "len(ja-kana)": "-",
265
+ "num(ko)": 0,
266
+ "len(ko)": "-"
267
+ },
268
+ "google/byt5-small": {
269
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/google/byt5-small\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">byt5-small</a>",
270
+ "organization": "Google",
271
+ "vocab_size": 384,
272
+ "num(digit)": 10,
273
+ "len(digit)": "1,1,1",
274
+ "num(space)": 10,
275
+ "len(space)": "1,1,1",
276
+ "num(ar)": 0,
277
+ "len(ar)": "-",
278
+ "num(zh)": 0,
279
+ "len(zh)": "-",
280
+ "num(ja)": 0,
281
+ "len(ja)": "-",
282
+ "num(ja-kana)": 0,
283
+ "len(ja-kana)": "-",
284
+ "num(ko)": 0,
285
+ "len(ko)": "-"
286
+ },
287
+ "google/mt5-large": {
288
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/google/mt5-large\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">mt5-large</a>",
289
+ "organization": "Google",
290
+ "vocab_size": 250100,
291
+ "num(digit)": 16829,
292
+ "len(digit)": "1,4,16",
293
+ "num(space)": 1,
294
+ "len(space)": "1,1,1",
295
+ "num(ar)": 7459,
296
+ "len(ar)": "1,3,16",
297
+ "num(zh)": 21489,
298
+ "len(zh)": "1,2,16",
299
+ "num(ja)": 27078,
300
+ "len(ja)": "1,2,16",
301
+ "num(ja-kana)": 9160,
302
+ "len(ja-kana)": "1,3,14",
303
+ "num(ko)": 4041,
304
+ "len(ko)": "1,1,10"
305
+ },
306
+ "lmsys/fastchat-t5-3b-v1.0": {
307
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/lmsys/fastchat-t5-3b-v1.0\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">fastchat-t5-3b-v1.0</a>",
308
+ "organization": "LMSYS",
309
+ "vocab_size": 32110,
310
+ "num(digit)": 1033,
311
+ "len(digit)": "1,3,8",
312
+ "num(space)": 0,
313
+ "len(space)": "-",
314
+ "num(ar)": 0,
315
+ "len(ar)": "-",
316
+ "num(zh)": 0,
317
+ "len(zh)": "-",
318
+ "num(ja)": 0,
319
+ "len(ja)": "-",
320
+ "num(ja-kana)": 0,
321
+ "len(ja-kana)": "-",
322
+ "num(ko)": 0,
323
+ "len(ko)": "-"
324
+ },
325
+ "paust/pko-t5-large": {
326
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/paust/pko-t5-large\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">pko-t5-large</a>",
327
+ "organization": "PAUST",
328
+ "vocab_size": 50358,
329
+ "num(digit)": 51,
330
+ "len(digit)": "1,2,3",
331
+ "num(space)": 10,
332
+ "len(space)": "1,1,1",
333
+ "num(ar)": 0,
334
+ "len(ar)": "-",
335
+ "num(zh)": 0,
336
+ "len(zh)": "-",
337
+ "num(ja)": 0,
338
+ "len(ja)": "-",
339
+ "num(ja-kana)": 0,
340
+ "len(ja-kana)": "-",
341
+ "num(ko)": 49050,
342
+ "len(ko)": "1,2,16"
343
+ },
344
+ "bloom": {
345
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/bigscience/bloom\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">bloom</a>",
346
+ "organization": "BigScience",
347
+ "vocab_size": 250680,
348
+ "num(digit)": 6629,
349
+ "len(digit)": "1,4,50",
350
+ "num(space)": 140180,
351
+ "len(space)": "1,6,600",
352
+ "num(ar)": 20854,
353
+ "len(ar)": "1,5,16",
354
+ "num(zh)": 30603,
355
+ "len(zh)": "1,2,23",
356
+ "num(ja)": 30816,
357
+ "len(ja)": "1,2,23",
358
+ "num(ja-kana)": 214,
359
+ "len(ja-kana)": "1,1,3",
360
+ "num(ko)": 338,
361
+ "len(ko)": "1,1,3"
362
+ },
363
+ "llama": {
364
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/huggyllama/llama-7b\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">llama</a>",
365
+ "organization": "Meta",
366
+ "vocab_size": 32000,
367
+ "num(digit)": 20,
368
+ "len(digit)": "1,1,1",
369
+ "num(space)": 61,
370
+ "len(space)": "1,2,15",
371
+ "num(ar)": 55,
372
+ "len(ar)": "1,1,2",
373
+ "num(zh)": 700,
374
+ "len(zh)": "1,1,1",
375
+ "num(ja)": 837,
376
+ "len(ja)": "1,1,1",
377
+ "num(ja-kana)": 137,
378
+ "len(ja-kana)": "1,1,1",
379
+ "num(ko)": 111,
380
+ "len(ko)": "1,1,1"
381
+ },
382
+ "ClueAI/ChatYuan-large-v2": {
383
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/ClueAI/ChatYuan-large-v2\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">ChatYuan-large-v2</a>",
384
+ "organization": "CLUE",
385
+ "vocab_size": 32128,
386
+ "num(digit)": 740,
387
+ "len(digit)": "1,3,9",
388
+ "num(space)": 0,
389
+ "len(space)": "-",
390
+ "num(ar)": 2,
391
+ "len(ar)": "1,1,1",
392
+ "num(zh)": 29591,
393
+ "len(zh)": "1,2,16",
394
+ "num(ja)": 29736,
395
+ "len(ja)": "1,2,16",
396
+ "num(ja-kana)": 145,
397
+ "len(ja-kana)": "1,1,2",
398
+ "num(ko)": 0,
399
+ "len(ko)": "-"
400
+ },
401
+ "Meta/llama3": {
402
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/gradientai/Llama-3-8B-Instruct-Gradient-1048k\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">llama3</a>",
403
+ "organization": "Meta",
404
+ "vocab_size": 128256,
405
+ "num(digit)": 1110,
406
+ "len(digit)": "1,3,3",
407
+ "num(space)": 60860,
408
+ "len(space)": "1,6,128",
409
+ "num(ar)": 3810,
410
+ "len(ar)": "1,4,11",
411
+ "num(zh)": 4424,
412
+ "len(zh)": "1,1,7",
413
+ "num(ja)": 5387,
414
+ "len(ja)": "1,2,8",
415
+ "num(ja-kana)": 1086,
416
+ "len(ja-kana)": "1,2,8",
417
+ "num(ko)": 2281,
418
+ "len(ko)": "1,2,6"
419
+ },
420
+ "openai/gpt-4": {
421
+ "tokenizer": "<a target=\"_blank\" href=\"https://github.com/openai/tiktoken\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">gpt-4</a>",
422
+ "organization": "OpenAI",
423
+ "vocab_size": 100277,
424
+ "num(digit)": 1110,
425
+ "len(digit)": "1,3,3",
426
+ "num(space)": 47472,
427
+ "len(space)": "1,7,128",
428
+ "num(ar)": 113,
429
+ "len(ar)": "1,2,10",
430
+ "num(zh)": 868,
431
+ "len(zh)": "1,1,7",
432
+ "num(ja)": 1035,
433
+ "len(ja)": "1,1,7",
434
+ "num(ja-kana)": 169,
435
+ "len(ja-kana)": "1,1,7",
436
+ "num(ko)": 299,
437
+ "len(ko)": "1,2,4"
438
+ },
439
+ "gradientai/Llama-3-8B-Instruct-Gradient-1048k": {
440
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/gradientai/Llama-3-8B-Instruct-Gradient-1048k\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">llama3</a>",
441
+ "organization": "Meta",
442
+ "vocab_size": 128256,
443
+ "num(digit)": 1110,
444
+ "len(digit)": "1,3,3",
445
+ "num(space)": 60860,
446
+ "len(space)": "1,6,128",
447
+ "num(ar)": 3810,
448
+ "len(ar)": "1,4,11",
449
+ "num(zh)": 4424,
450
+ "len(zh)": "1,1,7",
451
+ "num(ja)": 5387,
452
+ "len(ja)": "1,2,8",
453
+ "num(ja-kana)": 1086,
454
+ "len(ja-kana)": "1,2,8",
455
+ "num(ko)": 2281,
456
+ "len(ko)": "1,2,6"
457
+ },
458
+ "bigscience/bloom": {
459
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/bigscience/bloom\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">bloom</a>",
460
+ "organization": "BigScience",
461
+ "vocab_size": 250680,
462
+ "num(digit)": 6629,
463
+ "len(digit)": "1,4,50",
464
+ "num(space)": 140180,
465
+ "len(space)": "1,6,600",
466
+ "num(ar)": 20854,
467
+ "len(ar)": "1,5,16",
468
+ "num(zh)": 30603,
469
+ "len(zh)": "1,2,23",
470
+ "num(ja)": 30816,
471
+ "len(ja)": "1,2,23",
472
+ "num(ja-kana)": 214,
473
+ "len(ja-kana)": "1,1,3",
474
+ "num(ko)": 338,
475
+ "len(ko)": "1,1,3"
476
+ },
477
+ "huggyllama/llama-7b": {
478
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/huggyllama/llama-7b\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">llama</a>",
479
+ "organization": "Meta",
480
+ "vocab_size": 32000,
481
+ "num(digit)": 20,
482
+ "len(digit)": "1,1,1",
483
+ "num(space)": 61,
484
+ "len(space)": "1,2,15",
485
+ "num(ar)": 55,
486
+ "len(ar)": "1,1,2",
487
+ "num(zh)": 700,
488
+ "len(zh)": "1,1,1",
489
+ "num(ja)": 837,
490
+ "len(ja)": "1,1,1",
491
+ "num(ja-kana)": 137,
492
+ "len(ja-kana)": "1,1,1",
493
+ "num(ko)": 111,
494
+ "len(ko)": "1,1,1"
495
+ },
496
+ "baichuan-inc/Baichuan-7B": {
497
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/baichuan-inc/Baichuan-7B\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">baichuan</a>",
498
+ "organization": "Baichuan",
499
+ "vocab_size": 64000,
500
+ "num(digit)": 335,
501
+ "len(digit)": "1,14,14",
502
+ "num(space)": 13,
503
+ "len(space)": "1,1,1",
504
+ "num(ar)": 299,
505
+ "len(ar)": "1,1,2",
506
+ "num(zh)": 27676,
507
+ "len(zh)": "1,1,9",
508
+ "num(ja)": 28522,
509
+ "len(ja)": "1,1,9",
510
+ "num(ja-kana)": 178,
511
+ "len(ja-kana)": "1,1,1",
512
+ "num(ko)": 1591,
513
+ "len(ko)": "1,1,1"
514
+ },
515
+ "01-ai/Yi-34B": {
516
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/01-ai/Yi-34B\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">Yi-34B</a>",
517
+ "organization": "Yi",
518
+ "vocab_size": 64000,
519
+ "num(digit)": 200,
520
+ "len(digit)": "1,13,15",
521
+ "num(space)": 24274,
522
+ "len(space)": "1,7,16",
523
+ "num(ar)": 18,
524
+ "len(ar)": "1,1,4",
525
+ "num(zh)": 21356,
526
+ "len(zh)": "1,2,12",
527
+ "num(ja)": 21407,
528
+ "len(ja)": "1,2,12",
529
+ "num(ja-kana)": 51,
530
+ "len(ja-kana)": "1,1,2",
531
+ "num(ko)": 28,
532
+ "len(ko)": "1,1,2"
533
+ },
534
+ "01-ai/Yi-6B": {
535
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/01-ai/Yi-6B\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">Yi-6B</a>",
536
+ "organization": "Yi",
537
+ "vocab_size": 64000,
538
+ "num(digit)": 200,
539
+ "len(digit)": "1,13,15",
540
+ "num(space)": 24274,
541
+ "len(space)": "1,7,16",
542
+ "num(ar)": 18,
543
+ "len(ar)": "1,1,4",
544
+ "num(zh)": 21356,
545
+ "len(zh)": "1,2,12",
546
+ "num(ja)": 21407,
547
+ "len(ja)": "1,2,12",
548
+ "num(ja-kana)": 51,
549
+ "len(ja-kana)": "1,1,2",
550
+ "num(ko)": 28,
551
+ "len(ko)": "1,1,2"
552
+ },
553
+ "01-ai/Yi-VL-34B": {
554
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/01-ai/Yi-VL-34B\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">Yi-VL-34B</a>",
555
+ "organization": "Yi",
556
+ "vocab_size": 64000,
557
+ "num(digit)": 200,
558
+ "len(digit)": "1,13,15",
559
+ "num(space)": 43,
560
+ "len(space)": "1,2,15",
561
+ "num(ar)": 18,
562
+ "len(ar)": "1,1,4",
563
+ "num(zh)": 21356,
564
+ "len(zh)": "1,2,12",
565
+ "num(ja)": 21407,
566
+ "len(ja)": "1,2,12",
567
+ "num(ja-kana)": 51,
568
+ "len(ja-kana)": "1,1,2",
569
+ "num(ko)": 28,
570
+ "len(ko)": "1,1,2"
571
+ },
572
+ "ClassCat/gpt2-base-french": {
573
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/ClassCat/gpt2-base-french\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">gpt2-base-french</a>",
574
+ "organization": "ClassCat",
575
+ "vocab_size": 50000,
576
+ "num(digit)": 1833,
577
+ "len(digit)": "1,4,5",
578
+ "num(space)": 31889,
579
+ "len(space)": "1,7,32",
580
+ "num(ar)": 41,
581
+ "len(ar)": "1,1,4",
582
+ "num(zh)": 27,
583
+ "len(zh)": "1,1,1",
584
+ "num(ja)": 46,
585
+ "len(ja)": "1,1,2",
586
+ "num(ja-kana)": 19,
587
+ "len(ja-kana)": "1,1,2",
588
+ "num(ko)": 0,
589
+ "len(ko)": "-"
590
+ },
591
+ "ClassCat/gpt2-base-spanish": {
592
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/ClassCat/gpt2-base-spanish\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">gpt2-base-spanish</a>",
593
+ "organization": "ClassCat",
594
+ "vocab_size": 50000,
595
+ "num(digit)": 1492,
596
+ "len(digit)": "1,4,9",
597
+ "num(space)": 34496,
598
+ "len(space)": "1,8,32",
599
+ "num(ar)": 36,
600
+ "len(ar)": "1,1,4",
601
+ "num(zh)": 13,
602
+ "len(zh)": "1,1,1",
603
+ "num(ja)": 36,
604
+ "len(ja)": "1,1,2",
605
+ "num(ja-kana)": 23,
606
+ "len(ja-kana)": "1,1,2",
607
+ "num(ko)": 0,
608
+ "len(ko)": "-"
609
+ },
610
+ "ClueAI/PromptCLUE-base": {
611
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/ClueAI/PromptCLUE-base\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">PromptCLUE-base</a>",
612
+ "organization": "CLUE",
613
+ "vocab_size": 32128,
614
+ "num(digit)": 740,
615
+ "len(digit)": "1,3,9",
616
+ "num(space)": 0,
617
+ "len(space)": "-",
618
+ "num(ar)": 2,
619
+ "len(ar)": "1,1,1",
620
+ "num(zh)": 29591,
621
+ "len(zh)": "1,2,16",
622
+ "num(ja)": 29736,
623
+ "len(ja)": "1,2,16",
624
+ "num(ja-kana)": 145,
625
+ "len(ja-kana)": "1,1,2",
626
+ "num(ko)": 0,
627
+ "len(ko)": "-"
628
+ },
629
+ "CohereForAI/aya-101": {
630
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/CohereForAI/aya-101\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">aya-101</a>",
631
+ "organization": "Cohere For AI",
632
+ "vocab_size": 250100,
633
+ "num(digit)": 16829,
634
+ "len(digit)": "1,4,16",
635
+ "num(space)": 1,
636
+ "len(space)": "1,1,1",
637
+ "num(ar)": 7459,
638
+ "len(ar)": "1,3,16",
639
+ "num(zh)": 21489,
640
+ "len(zh)": "1,2,16",
641
+ "num(ja)": 27078,
642
+ "len(ja)": "1,2,16",
643
+ "num(ja-kana)": 9160,
644
+ "len(ja-kana)": "1,3,14",
645
+ "num(ko)": 4041,
646
+ "len(ko)": "1,1,10"
647
+ },
648
+ "EleutherAI/gpt-neox-20b": {
649
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/EleutherAI/gpt-neox-20b\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">gpt-neox-20b</a>",
650
+ "organization": "EleutherAI",
651
+ "vocab_size": 50277,
652
+ "num(digit)": 2036,
653
+ "len(digit)": "1,3,35",
654
+ "num(space)": 28996,
655
+ "len(space)": "1,7,512",
656
+ "num(ar)": 94,
657
+ "len(ar)": "1,2,4",
658
+ "num(zh)": 313,
659
+ "len(zh)": "1,1,2",
660
+ "num(ja)": 480,
661
+ "len(ja)": "1,1,4",
662
+ "num(ja-kana)": 167,
663
+ "len(ja-kana)": "1,1,4",
664
+ "num(ko)": 25,
665
+ "len(ko)": "1,1,2"
666
+ },
667
+ "HuggingFaceH4/starchat-alpha": {
668
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/HuggingFaceH4/starchat-alpha\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">starchat-alpha</a>",
669
+ "organization": "-",
670
+ "vocab_size": 49156,
671
+ "num(digit)": 10,
672
+ "len(digit)": "1,1,1",
673
+ "num(space)": 16515,
674
+ "len(space)": "1,6,256",
675
+ "num(ar)": 84,
676
+ "len(ar)": "1,2,4",
677
+ "num(zh)": 2030,
678
+ "len(zh)": "1,1,7",
679
+ "num(ja)": 2368,
680
+ "len(ja)": "1,1,8",
681
+ "num(ja-kana)": 360,
682
+ "len(ja-kana)": "1,2,8",
683
+ "num(ko)": 491,
684
+ "len(ko)": "1,2,5"
685
+ },
686
+ "HuggingFaceH4/zephyr-7b-beta": {
687
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/HuggingFaceH4/zephyr-7b-beta\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">zephyr-7b-beta</a>",
688
+ "organization": "HuggingFace",
689
+ "vocab_size": 32000,
690
+ "num(digit)": 20,
691
+ "len(digit)": "1,1,1",
692
+ "num(space)": 85,
693
+ "len(space)": "1,3,15",
694
+ "num(ar)": 71,
695
+ "len(ar)": "1,1,2",
696
+ "num(zh)": 1459,
697
+ "len(zh)": "1,1,2",
698
+ "num(ja)": 1593,
699
+ "len(ja)": "1,1,2",
700
+ "num(ja-kana)": 134,
701
+ "len(ja-kana)": "1,1,1",
702
+ "num(ko)": 346,
703
+ "len(ko)": "1,1,1"
704
+ },
705
+ "LLM360/CrystalCoder": {
706
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/LLM360/CrystalCoder\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">CrystalCoder</a>",
707
+ "organization": "MBZUAI",
708
+ "vocab_size": 32022,
709
+ "num(digit)": 20,
710
+ "len(digit)": "1,1,1",
711
+ "num(space)": 61,
712
+ "len(space)": "1,2,15",
713
+ "num(ar)": 55,
714
+ "len(ar)": "1,1,2",
715
+ "num(zh)": 700,
716
+ "len(zh)": "1,1,1",
717
+ "num(ja)": 837,
718
+ "len(ja)": "1,1,1",
719
+ "num(ja-kana)": 137,
720
+ "len(ja-kana)": "1,1,1",
721
+ "num(ko)": 111,
722
+ "len(ko)": "1,1,1"
723
+ },
724
+ "NousResearch/Llama-2-7b-chat-hf": {
725
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/NousResearch/Llama-2-7b-chat-hf\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">llama2</a>",
726
+ "organization": "Meta",
727
+ "vocab_size": 32001,
728
+ "num(digit)": 20,
729
+ "len(digit)": "1,1,1",
730
+ "num(space)": 61,
731
+ "len(space)": "1,2,15",
732
+ "num(ar)": 55,
733
+ "len(ar)": "1,1,2",
734
+ "num(zh)": 700,
735
+ "len(zh)": "1,1,1",
736
+ "num(ja)": 837,
737
+ "len(ja)": "1,1,1",
738
+ "num(ja-kana)": 137,
739
+ "len(ja-kana)": "1,1,1",
740
+ "num(ko)": 111,
741
+ "len(ko)": "1,1,1"
742
+ },
743
+ "OrionStarAI/Orion-14B-Chat": {
744
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/OrionStarAI/Orion-14B-Chat\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">Orion-14B-Chat</a>",
745
+ "organization": "OrionStar",
746
+ "vocab_size": 84608,
747
+ "num(digit)": 1559,
748
+ "len(digit)": "1,4,14",
749
+ "num(space)": 18383,
750
+ "len(space)": "1,6,16",
751
+ "num(ar)": 102,
752
+ "len(ar)": "1,1,1",
753
+ "num(zh)": 46998,
754
+ "len(zh)": "1,2,16",
755
+ "num(ja)": 49644,
756
+ "len(ja)": "1,2,16",
757
+ "num(ja-kana)": 2987,
758
+ "len(ja-kana)": "1,3,11",
759
+ "num(ko)": 5110,
760
+ "len(ko)": "1,2,7"
761
+ },
762
+ "Qwen/Qwen-7B-Chat": {
763
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/Qwen/Qwen-7B-Chat\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">Qwen</a>",
764
+ "organization": "Alibaba",
765
+ "vocab_size": 151851,
766
+ "num(digit)": 10,
767
+ "len(digit)": "1,1,1",
768
+ "num(space)": 55883,
769
+ "len(space)": "1,6,128",
770
+ "num(ar)": 4018,
771
+ "len(ar)": "1,3,12",
772
+ "num(zh)": 25557,
773
+ "len(zh)": "1,2,7",
774
+ "num(ja)": 27206,
775
+ "len(ja)": "1,2,11",
776
+ "num(ja-kana)": 2089,
777
+ "len(ja-kana)": "1,3,11",
778
+ "num(ko)": 3495,
779
+ "len(ko)": "1,1,5"
780
+ },
781
+ "Qwen/Qwen1.5-14B-Chat": {
782
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/Qwen/Qwen1.5-14B-Chat\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">Qwen1.5</a>",
783
+ "organization": "Alibaba",
784
+ "vocab_size": 151646,
785
+ "num(digit)": 10,
786
+ "len(digit)": "1,1,1",
787
+ "num(space)": 55883,
788
+ "len(space)": "1,6,128",
789
+ "num(ar)": 4018,
790
+ "len(ar)": "1,3,12",
791
+ "num(zh)": 25557,
792
+ "len(zh)": "1,2,7",
793
+ "num(ja)": 27206,
794
+ "len(ja)": "1,2,11",
795
+ "num(ja-kana)": 2089,
796
+ "len(ja-kana)": "1,3,11",
797
+ "num(ko)": 3495,
798
+ "len(ko)": "1,1,5"
799
+ },
800
+ "Skywork/Skywork-13B-Math": {
801
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/Skywork/Skywork-13B-Math\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">Skywork-13B-Math</a>",
802
+ "organization": "Kunlun",
803
+ "vocab_size": 65519,
804
+ "num(digit)": 20,
805
+ "len(digit)": "1,1,1",
806
+ "num(space)": 62,
807
+ "len(space)": "1,2,15",
808
+ "num(ar)": 56,
809
+ "len(ar)": "1,1,2",
810
+ "num(zh)": 33913,
811
+ "len(zh)": "1,2,5",
812
+ "num(ja)": 34064,
813
+ "len(ja)": "1,2,5",
814
+ "num(ja-kana)": 150,
815
+ "len(ja-kana)": "1,1,1",
816
+ "num(ko)": 111,
817
+ "len(ko)": "1,1,1"
818
+ },
819
+ "Skywork/Skywork-13B-base": {
820
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/Skywork/Skywork-13B-base\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">Skywork-13B-base</a>",
821
+ "organization": "Kunlun",
822
+ "vocab_size": 65519,
823
+ "num(digit)": 20,
824
+ "len(digit)": "1,1,1",
825
+ "num(space)": 62,
826
+ "len(space)": "1,2,15",
827
+ "num(ar)": 56,
828
+ "len(ar)": "1,1,2",
829
+ "num(zh)": 33913,
830
+ "len(zh)": "1,2,5",
831
+ "num(ja)": 34064,
832
+ "len(ja)": "1,2,5",
833
+ "num(ja-kana)": 150,
834
+ "len(ja-kana)": "1,1,1",
835
+ "num(ko)": 111,
836
+ "len(ko)": "1,1,1"
837
+ },
838
+ "THUDM/chatglm-6b": {
839
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/THUDM/chatglm-6b\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">chatglm-6b</a>",
840
+ "organization": "Tsinghua",
841
+ "vocab_size": 130344,
842
+ "num(digit)": 20,
843
+ "len(digit)": "1,1,1",
844
+ "num(space)": 93,
845
+ "len(space)": "1,34,80",
846
+ "num(ar)": 137,
847
+ "len(ar)": "1,2,4",
848
+ "num(zh)": 61358,
849
+ "len(zh)": "1,2,16",
850
+ "num(ja)": 61784,
851
+ "len(ja)": "1,2,16",
852
+ "num(ja-kana)": 439,
853
+ "len(ja-kana)": "1,2,5",
854
+ "num(ko)": 114,
855
+ "len(ko)": "1,1,3"
856
+ },
857
+ "THUDM/chatglm2-6b": {
858
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/THUDM/chatglm2-6b\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">chatglm2-6b</a>",
859
+ "organization": "Tsinghua",
860
+ "vocab_size": 64787,
861
+ "num(digit)": 20,
862
+ "len(digit)": "1,1,1",
863
+ "num(space)": 67,
864
+ "len(space)": "1,2,15",
865
+ "num(ar)": 57,
866
+ "len(ar)": "1,1,2",
867
+ "num(zh)": 30922,
868
+ "len(zh)": "1,2,16",
869
+ "num(ja)": 31065,
870
+ "len(ja)": "1,2,16",
871
+ "num(ja-kana)": 143,
872
+ "len(ja-kana)": "1,1,1",
873
+ "num(ko)": 604,
874
+ "len(ko)": "1,1,1"
875
+ },
876
+ "THUDM/chatglm3-6b": {
877
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/THUDM/chatglm3-6b\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">chatglm3-6b</a>",
878
+ "organization": "Tsinghua",
879
+ "vocab_size": 64796,
880
+ "num(digit)": 20,
881
+ "len(digit)": "1,1,1",
882
+ "num(space)": 67,
883
+ "len(space)": "1,2,15",
884
+ "num(ar)": 57,
885
+ "len(ar)": "1,1,2",
886
+ "num(zh)": 30922,
887
+ "len(zh)": "1,2,16",
888
+ "num(ja)": 31065,
889
+ "len(ja)": "1,2,16",
890
+ "num(ja-kana)": 143,
891
+ "len(ja-kana)": "1,1,1",
892
+ "num(ko)": 604,
893
+ "len(ko)": "1,1,1"
894
+ },
895
+ "TigerResearch/tigerbot-13b-chat-v2": {
896
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/TigerResearch/tigerbot-13b-chat-v2\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">tigerbot-13b-chat-v2</a>",
897
+ "organization": "Tigerobo",
898
+ "vocab_size": 60515,
899
+ "num(digit)": 20,
900
+ "len(digit)": "1,1,1",
901
+ "num(space)": 61,
902
+ "len(space)": "1,2,15",
903
+ "num(ar)": 55,
904
+ "len(ar)": "1,1,2",
905
+ "num(zh)": 28603,
906
+ "len(zh)": "1,2,16",
907
+ "num(ja)": 28770,
908
+ "len(ja)": "1,2,16",
909
+ "num(ja-kana)": 167,
910
+ "len(ja-kana)": "1,1,2",
911
+ "num(ko)": 261,
912
+ "len(ko)": "1,1,1"
913
+ },
914
+ "TigerResearch/tigerbot-70b-chat-v4-4k": {
915
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/TigerResearch/tigerbot-70b-chat-v4-4k\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">tigerbot-70b-chat-v4-4k</a>",
916
+ "organization": "Tigerobo",
917
+ "vocab_size": 65110,
918
+ "num(digit)": 20,
919
+ "len(digit)": "1,1,1",
920
+ "num(space)": 61,
921
+ "len(space)": "1,2,15",
922
+ "num(ar)": 55,
923
+ "len(ar)": "1,1,2",
924
+ "num(zh)": 30509,
925
+ "len(zh)": "1,2,16",
926
+ "num(ja)": 32061,
927
+ "len(ja)": "1,2,16",
928
+ "num(ja-kana)": 2071,
929
+ "len(ja-kana)": "1,2,8",
930
+ "num(ko)": 1504,
931
+ "len(ko)": "1,1,5"
932
+ },
933
+ "Upstage/SOLAR-10.7B-v1.0": {
934
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/Upstage/SOLAR-10.7B-v1.0\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">SOLAR-10.7B-v1.0</a>",
935
+ "organization": "-",
936
+ "vocab_size": 32000,
937
+ "num(digit)": 20,
938
+ "len(digit)": "1,1,1",
939
+ "num(space)": 85,
940
+ "len(space)": "1,3,15",
941
+ "num(ar)": 71,
942
+ "len(ar)": "1,1,2",
943
+ "num(zh)": 1459,
944
+ "len(zh)": "1,1,2",
945
+ "num(ja)": 1593,
946
+ "len(ja)": "1,1,2",
947
+ "num(ja-kana)": 134,
948
+ "len(ja-kana)": "1,1,1",
949
+ "num(ko)": 346,
950
+ "len(ko)": "1,1,1"
951
+ },
952
+ "WizardLM/WizardCoder-15B-V1.0": {
953
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/WizardLM/WizardCoder-15B-V1.0\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">WizardCoder-15B-V1.0</a>",
954
+ "organization": "Microsoft",
955
+ "vocab_size": 49153,
956
+ "num(digit)": 10,
957
+ "len(digit)": "1,1,1",
958
+ "num(space)": 16515,
959
+ "len(space)": "1,6,256",
960
+ "num(ar)": 84,
961
+ "len(ar)": "1,2,4",
962
+ "num(zh)": 2030,
963
+ "len(zh)": "1,1,7",
964
+ "num(ja)": 2368,
965
+ "len(ja)": "1,1,8",
966
+ "num(ja-kana)": 360,
967
+ "len(ja-kana)": "1,2,8",
968
+ "num(ko)": 491,
969
+ "len(ko)": "1,2,5"
970
+ },
971
+ "WizardLM/WizardCoder-Python-7B-V1.0": {
972
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/WizardLM/WizardCoder-Python-7B-V1.0\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">WizardCoder-Python-7B-V1.0</a>",
973
+ "organization": "Microsoft",
974
+ "vocab_size": 32001,
975
+ "num(digit)": 20,
976
+ "len(digit)": "1,1,1",
977
+ "num(space)": 61,
978
+ "len(space)": "1,2,15",
979
+ "num(ar)": 55,
980
+ "len(ar)": "1,1,2",
981
+ "num(zh)": 700,
982
+ "len(zh)": "1,1,1",
983
+ "num(ja)": 837,
984
+ "len(ja)": "1,1,1",
985
+ "num(ja-kana)": 137,
986
+ "len(ja-kana)": "1,1,1",
987
+ "num(ko)": 111,
988
+ "len(ko)": "1,1,1"
989
+ },
990
+ "WizardLM/WizardLM-7B-V1.0": {
991
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/WizardLM/WizardLM-7B-V1.0\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">WizardLM-7B-V1.0</a>",
992
+ "organization": "Microsoft",
993
+ "vocab_size": 32001,
994
+ "num(digit)": 20,
995
+ "len(digit)": "1,1,1",
996
+ "num(space)": 61,
997
+ "len(space)": "1,2,15",
998
+ "num(ar)": 55,
999
+ "len(ar)": "1,1,2",
1000
+ "num(zh)": 700,
1001
+ "len(zh)": "1,1,1",
1002
+ "num(ja)": 837,
1003
+ "len(ja)": "1,1,1",
1004
+ "num(ja-kana)": 137,
1005
+ "len(ja-kana)": "1,1,1",
1006
+ "num(ko)": 111,
1007
+ "len(ko)": "1,1,1"
1008
+ },
1009
+ "WizardLM/WizardMath-70B-V1.0": {
1010
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/WizardLM/WizardMath-70B-V1.0\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">WizardMath-70B-V1.0</a>",
1011
+ "organization": "Microsoft",
1012
+ "vocab_size": 32002,
1013
+ "num(digit)": 20,
1014
+ "len(digit)": "1,1,1",
1015
+ "num(space)": 61,
1016
+ "len(space)": "1,2,15",
1017
+ "num(ar)": 55,
1018
+ "len(ar)": "1,1,2",
1019
+ "num(zh)": 700,
1020
+ "len(zh)": "1,1,1",
1021
+ "num(ja)": 837,
1022
+ "len(ja)": "1,1,1",
1023
+ "num(ja-kana)": 137,
1024
+ "len(ja-kana)": "1,1,1",
1025
+ "num(ko)": 111,
1026
+ "len(ko)": "1,1,1"
1027
+ },
1028
+ "abeja/gpt-neox-japanese-2.7b": {
1029
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/abeja/gpt-neox-japanese-2.7b\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">gpt-neox-japanese-2.7b</a>",
1030
+ "organization": "ABEJA",
1031
+ "vocab_size": 32000,
1032
+ "num(digit)": 20,
1033
+ "len(digit)": "1,1,1",
1034
+ "num(space)": 0,
1035
+ "len(space)": "-",
1036
+ "num(ar)": 0,
1037
+ "len(ar)": "-",
1038
+ "num(zh)": 15176,
1039
+ "len(zh)": "1,2,2",
1040
+ "num(ja)": 31482,
1041
+ "len(ja)": "1,2,3",
1042
+ "num(ja-kana)": 16306,
1043
+ "len(ja-kana)": "1,3,3",
1044
+ "num(ko)": 0,
1045
+ "len(ko)": "-"
1046
+ },
1047
+ "ai21labs/Jamba-v0.1": {
1048
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/ai21labs/Jamba-v0.1\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">Jamba-v0.1</a>",
1049
+ "organization": "AI21",
1050
+ "vocab_size": 65536,
1051
+ "num(digit)": 1556,
1052
+ "len(digit)": "1,16,17",
1053
+ "num(space)": 39501,
1054
+ "len(space)": "1,7,32",
1055
+ "num(ar)": 867,
1056
+ "len(ar)": "1,3,8",
1057
+ "num(zh)": 1157,
1058
+ "len(zh)": "1,1,2",
1059
+ "num(ja)": 1287,
1060
+ "len(ja)": "1,1,2",
1061
+ "num(ja-kana)": 130,
1062
+ "len(ja-kana)": "1,1,2",
1063
+ "num(ko)": 312,
1064
+ "len(ko)": "1,1,2"
1065
+ },
1066
+ "allenai/OLMo-7B": {
1067
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/allenai/OLMo-7B\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">OLMo-7B</a>",
1068
+ "organization": "Allen AI",
1069
+ "vocab_size": 50280,
1070
+ "num(digit)": 2036,
1071
+ "len(digit)": "1,3,35",
1072
+ "num(space)": 29019,
1073
+ "len(space)": "1,7,512",
1074
+ "num(ar)": 94,
1075
+ "len(ar)": "1,2,4",
1076
+ "num(zh)": 313,
1077
+ "len(zh)": "1,1,2",
1078
+ "num(ja)": 480,
1079
+ "len(ja)": "1,1,4",
1080
+ "num(ja-kana)": 167,
1081
+ "len(ja-kana)": "1,1,4",
1082
+ "num(ko)": 25,
1083
+ "len(ko)": "1,1,2"
1084
+ },
1085
+ "baichuan-inc/Baichuan2-7B-Chat": {
1086
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/baichuan-inc/Baichuan2-7B-Chat\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">baichuan2</a>",
1087
+ "organization": "Baichuan",
1088
+ "vocab_size": 125696,
1089
+ "num(digit)": 1023,
1090
+ "len(digit)": "1,14,14",
1091
+ "num(space)": 26013,
1092
+ "len(space)": "1,7,32",
1093
+ "num(ar)": 335,
1094
+ "len(ar)": "1,1,27",
1095
+ "num(zh)": 70398,
1096
+ "len(zh)": "1,2,32",
1097
+ "num(ja)": 71269,
1098
+ "len(ja)": "1,2,32",
1099
+ "num(ja-kana)": 206,
1100
+ "len(ja-kana)": "1,1,9",
1101
+ "num(ko)": 1595,
1102
+ "len(ko)": "1,1,2"
1103
+ },
1104
+ "ckiplab/gpt2-base-chinese": {
1105
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/ckiplab/gpt2-base-chinese\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">gpt2-base-chinese</a>",
1106
+ "organization": "SINICA",
1107
+ "vocab_size": 21128,
1108
+ "num(digit)": 1451,
1109
+ "len(digit)": "1,3,12",
1110
+ "num(space)": 2,
1111
+ "len(space)": "1,2,3",
1112
+ "num(ar)": 30,
1113
+ "len(ar)": "1,2,3",
1114
+ "num(zh)": 14642,
1115
+ "len(zh)": "1,2,3",
1116
+ "num(ja)": 15197,
1117
+ "len(ja)": "1,3,15",
1118
+ "num(ja-kana)": 553,
1119
+ "len(ja-kana)": "1,3,15",
1120
+ "num(ko)": 0,
1121
+ "len(ko)": "-"
1122
+ },
1123
+ "cyberagent/open-calm-7b": {
1124
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/cyberagent/open-calm-7b\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">open-calm-7b</a>",
1125
+ "organization": "CyberAgent",
1126
+ "vocab_size": 52000,
1127
+ "num(digit)": 690,
1128
+ "len(digit)": "1,3,5",
1129
+ "num(space)": 1698,
1130
+ "len(space)": "1,4,33",
1131
+ "num(ar)": 10,
1132
+ "len(ar)": "1,1,4",
1133
+ "num(zh)": 30775,
1134
+ "len(zh)": "1,3,31",
1135
+ "num(ja)": 45790,
1136
+ "len(ja)": "1,3,31",
1137
+ "num(ja-kana)": 32535,
1138
+ "len(ja-kana)": "1,3,31",
1139
+ "num(ko)": 0,
1140
+ "len(ko)": "-"
1141
+ },
1142
+ "databricks/dbrx-instruct": {
1143
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/databricks/dbrx-instruct\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">dbrx-instruct</a>",
1144
+ "organization": "Databricks",
1145
+ "vocab_size": 100280,
1146
+ "num(digit)": 1126,
1147
+ "len(digit)": "1,3,17",
1148
+ "num(space)": 47400,
1149
+ "len(space)": "1,7,128",
1150
+ "num(ar)": 113,
1151
+ "len(ar)": "1,2,10",
1152
+ "num(zh)": 868,
1153
+ "len(zh)": "1,1,7",
1154
+ "num(ja)": 1035,
1155
+ "len(ja)": "1,1,7",
1156
+ "num(ja-kana)": 169,
1157
+ "len(ja-kana)": "1,1,7",
1158
+ "num(ko)": 299,
1159
+ "len(ko)": "1,2,4"
1160
+ },
1161
+ "deepseek-ai/DeepSeek-V2": {
1162
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/deepseek-ai/DeepSeek-V2\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">DeepSeek-V2</a>",
1163
+ "organization": "DeepSeek",
1164
+ "vocab_size": 100002,
1165
+ "num(digit)": 10,
1166
+ "len(digit)": "1,1,1",
1167
+ "num(space)": 48073,
1168
+ "len(space)": "1,7,128",
1169
+ "num(ar)": 48,
1170
+ "len(ar)": "1,1,4",
1171
+ "num(zh)": 18052,
1172
+ "len(zh)": "1,2,16",
1173
+ "num(ja)": 18090,
1174
+ "len(ja)": "1,2,16",
1175
+ "num(ja-kana)": 38,
1176
+ "len(ja-kana)": "1,1,2",
1177
+ "num(ko)": 16,
1178
+ "len(ko)": "1,1,2"
1179
+ },
1180
+ "deepseek-ai/deepseek-coder-33b-instruct": {
1181
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/deepseek-ai/deepseek-coder-33b-instruct\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">deepseek-coder-33b-instruct</a>",
1182
+ "organization": "DeepSeek",
1183
+ "vocab_size": 32022,
1184
+ "num(digit)": 10,
1185
+ "len(digit)": "1,1,1",
1186
+ "num(space)": 15254,
1187
+ "len(space)": "1,6,65",
1188
+ "num(ar)": 12,
1189
+ "len(ar)": "1,1,2",
1190
+ "num(zh)": 4803,
1191
+ "len(zh)": "1,2,4",
1192
+ "num(ja)": 4804,
1193
+ "len(ja)": "1,2,4",
1194
+ "num(ja-kana)": 1,
1195
+ "len(ja-kana)": "1,1,1",
1196
+ "num(ko)": 0,
1197
+ "len(ko)": "-"
1198
+ },
1199
+ "deepseek-ai/deepseek-llm-7b-base": {
1200
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/deepseek-ai/deepseek-llm-7b-base\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">deepseek-llm-7b-base</a>",
1201
+ "organization": "DeepSeek",
1202
+ "vocab_size": 100015,
1203
+ "num(digit)": 10,
1204
+ "len(digit)": "1,1,1",
1205
+ "num(space)": 48073,
1206
+ "len(space)": "1,7,128",
1207
+ "num(ar)": 48,
1208
+ "len(ar)": "1,1,4",
1209
+ "num(zh)": 18052,
1210
+ "len(zh)": "1,2,16",
1211
+ "num(ja)": 18090,
1212
+ "len(ja)": "1,2,16",
1213
+ "num(ja-kana)": 38,
1214
+ "len(ja-kana)": "1,1,2",
1215
+ "num(ko)": 16,
1216
+ "len(ko)": "1,1,2"
1217
+ },
1218
+ "eson/kplug-base-encoder": {
1219
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/eson/kplug-base-encoder\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">kplug</a>",
1220
+ "organization": "JD",
1221
+ "vocab_size": 10261,
1222
+ "num(digit)": 420,
1223
+ "len(digit)": "1,3,12",
1224
+ "num(space)": 0,
1225
+ "len(space)": "-",
1226
+ "num(ar)": 0,
1227
+ "len(ar)": "-",
1228
+ "num(zh)": 5764,
1229
+ "len(zh)": "1,1,1",
1230
+ "num(ja)": 5766,
1231
+ "len(ja)": "1,1,3",
1232
+ "num(ja-kana)": 0,
1233
+ "len(ja-kana)": "-",
1234
+ "num(ko)": 0,
1235
+ "len(ko)": "-"
1236
+ },
1237
+ "fnlp/moss-moon-003-sft": {
1238
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/fnlp/moss-moon-003-sft\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">moss-moon-003-sft</a>",
1239
+ "organization": "Fudan",
1240
+ "vocab_size": 106072,
1241
+ "num(digit)": 1848,
1242
+ "len(digit)": "1,3,16",
1243
+ "num(space)": 33566,
1244
+ "len(space)": "1,7,102",
1245
+ "num(ar)": 25,
1246
+ "len(ar)": "1,1,4",
1247
+ "num(zh)": 54230,
1248
+ "len(zh)": "1,2,15",
1249
+ "num(ja)": 54381,
1250
+ "len(ja)": "1,2,15",
1251
+ "num(ja-kana)": 152,
1252
+ "len(ja-kana)": "1,1,7",
1253
+ "num(ko)": 0,
1254
+ "len(ko)": "-"
1255
+ },
1256
+ "google/gemma-7b": {
1257
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/google/gemma-7b\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">gemma-7b</a>",
1258
+ "organization": "Google",
1259
+ "vocab_size": 256000,
1260
+ "num(digit)": 134,
1261
+ "len(digit)": "1,10,12",
1262
+ "num(space)": 125662,
1263
+ "len(space)": "1,7,31",
1264
+ "num(ar)": 6274,
1265
+ "len(ar)": "1,4,15",
1266
+ "num(zh)": 23767,
1267
+ "len(zh)": "1,2,12",
1268
+ "num(ja)": 28852,
1269
+ "len(ja)": "1,2,12",
1270
+ "num(ja-kana)": 7061,
1271
+ "len(ja-kana)": "1,3,12",
1272
+ "num(ko)": 2295,
1273
+ "len(ko)": "1,1,5"
1274
+ },
1275
+ "google/switch-c-2048": {
1276
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/google/switch-c-2048\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">switch-c-2048</a>",
1277
+ "organization": "Google",
1278
+ "vocab_size": 32100,
1279
+ "num(digit)": 1133,
1280
+ "len(digit)": "1,3,13",
1281
+ "num(space)": 0,
1282
+ "len(space)": "-",
1283
+ "num(ar)": 0,
1284
+ "len(ar)": "-",
1285
+ "num(zh)": 0,
1286
+ "len(zh)": "-",
1287
+ "num(ja)": 0,
1288
+ "len(ja)": "-",
1289
+ "num(ja-kana)": 0,
1290
+ "len(ja-kana)": "-",
1291
+ "num(ko)": 0,
1292
+ "len(ko)": "-"
1293
+ },
1294
+ "hfl/chinese-alpaca-lora-7b": {
1295
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/hfl/chinese-alpaca-lora-7b\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">chinese-alpaca-lora-7b</a>",
1296
+ "organization": "-",
1297
+ "vocab_size": 49954,
1298
+ "num(digit)": 614,
1299
+ "len(digit)": "1,3,5",
1300
+ "num(space)": 61,
1301
+ "len(space)": "1,2,15",
1302
+ "num(ar)": 55,
1303
+ "len(ar)": "1,1,2",
1304
+ "num(zh)": 17839,
1305
+ "len(zh)": "1,2,13",
1306
+ "num(ja)": 17993,
1307
+ "len(ja)": "1,2,13",
1308
+ "num(ja-kana)": 154,
1309
+ "len(ja-kana)": "1,1,1",
1310
+ "num(ko)": 135,
1311
+ "len(ko)": "1,1,1"
1312
+ },
1313
+ "hfl/chinese-llama-2-7b": {
1314
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/hfl/chinese-llama-2-7b\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">chinese-llama-2-7b</a>",
1315
+ "organization": "-",
1316
+ "vocab_size": 55296,
1317
+ "num(digit)": 20,
1318
+ "len(digit)": "1,1,1",
1319
+ "num(space)": 61,
1320
+ "len(space)": "1,2,15",
1321
+ "num(ar)": 55,
1322
+ "len(ar)": "1,1,2",
1323
+ "num(zh)": 23974,
1324
+ "len(zh)": "1,2,16",
1325
+ "num(ja)": 24111,
1326
+ "len(ja)": "1,2,16",
1327
+ "num(ja-kana)": 137,
1328
+ "len(ja-kana)": "1,1,1",
1329
+ "num(ko)": 111,
1330
+ "len(ko)": "1,1,1"
1331
+ },
1332
+ "hfl/chinese-llama-lora-7b": {
1333
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/hfl/chinese-llama-lora-7b\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">chinese-llama-lora-7b</a>",
1334
+ "organization": "-",
1335
+ "vocab_size": 49953,
1336
+ "num(digit)": 614,
1337
+ "len(digit)": "1,3,5",
1338
+ "num(space)": 61,
1339
+ "len(space)": "1,2,15",
1340
+ "num(ar)": 55,
1341
+ "len(ar)": "1,1,2",
1342
+ "num(zh)": 17839,
1343
+ "len(zh)": "1,2,13",
1344
+ "num(ja)": 17993,
1345
+ "len(ja)": "1,2,13",
1346
+ "num(ja-kana)": 154,
1347
+ "len(ja-kana)": "1,1,1",
1348
+ "num(ko)": 135,
1349
+ "len(ko)": "1,1,1"
1350
+ },
1351
+ "hfl/llama-3-chinese-8b": {
1352
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/hfl/llama-3-chinese-8b\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">llama-3-chinese-8b</a>",
1353
+ "organization": "-",
1354
+ "vocab_size": 128256,
1355
+ "num(digit)": 1110,
1356
+ "len(digit)": "1,3,3",
1357
+ "num(space)": 60860,
1358
+ "len(space)": "1,6,128",
1359
+ "num(ar)": 3810,
1360
+ "len(ar)": "1,4,11",
1361
+ "num(zh)": 4424,
1362
+ "len(zh)": "1,1,7",
1363
+ "num(ja)": 5387,
1364
+ "len(ja)": "1,2,8",
1365
+ "num(ja-kana)": 1086,
1366
+ "len(ja-kana)": "1,2,8",
1367
+ "num(ko)": 2281,
1368
+ "len(ko)": "1,2,6"
1369
+ },
1370
+ "hpcai-tech/grok-1": {
1371
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/hpcai-tech/grok-1\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">grok-1</a>",
1372
+ "organization": "xAI",
1373
+ "vocab_size": 131072,
1374
+ "num(digit)": 40,
1375
+ "len(digit)": "1,6,13",
1376
+ "num(space)": 399,
1377
+ "len(space)": "1,3,16",
1378
+ "num(ar)": 69,
1379
+ "len(ar)": "1,2,4",
1380
+ "num(zh)": 1626,
1381
+ "len(zh)": "1,2,7",
1382
+ "num(ja)": 3118,
1383
+ "len(ja)": "1,2,8",
1384
+ "num(ja-kana)": 1908,
1385
+ "len(ja-kana)": "1,2,8",
1386
+ "num(ko)": 67,
1387
+ "len(ko)": "1,1,2"
1388
+ },
1389
+ "internlm/internlm-chat-7b": {
1390
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/internlm/internlm-chat-7b\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">internlm-chat-7b</a>",
1391
+ "organization": "Shanghai AI Lab",
1392
+ "vocab_size": 103168,
1393
+ "num(digit)": 1259,
1394
+ "len(digit)": "1,3,19",
1395
+ "num(space)": 33008,
1396
+ "len(space)": "1,6,128",
1397
+ "num(ar)": 6702,
1398
+ "len(ar)": "1,4,16",
1399
+ "num(zh)": 32000,
1400
+ "len(zh)": "1,2,15",
1401
+ "num(ja)": 32866,
1402
+ "len(ja)": "1,2,15",
1403
+ "num(ja-kana)": 864,
1404
+ "len(ja-kana)": "1,2,9",
1405
+ "num(ko)": 298,
1406
+ "len(ko)": "1,1,1"
1407
+ },
1408
+ "internlm/internlm-xcomposer-7b": {
1409
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/internlm/internlm-xcomposer-7b\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">internlm-xcomposer-7b</a>",
1410
+ "organization": "Shanghai AI Lab",
1411
+ "vocab_size": 103168,
1412
+ "num(digit)": 1261,
1413
+ "len(digit)": "1,3,19",
1414
+ "num(space)": 33008,
1415
+ "len(space)": "1,6,128",
1416
+ "num(ar)": 6702,
1417
+ "len(ar)": "1,4,16",
1418
+ "num(zh)": 32000,
1419
+ "len(zh)": "1,2,15",
1420
+ "num(ja)": 32866,
1421
+ "len(ja)": "1,2,15",
1422
+ "num(ja-kana)": 864,
1423
+ "len(ja-kana)": "1,2,9",
1424
+ "num(ko)": 298,
1425
+ "len(ko)": "1,1,1"
1426
+ },
1427
+ "internlm/internlm2-chat-7b": {
1428
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/internlm/internlm2-chat-7b\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">internlm2-chat-7b</a>",
1429
+ "organization": "Shanghai AI Lab",
1430
+ "vocab_size": 92544,
1431
+ "num(digit)": 1261,
1432
+ "len(digit)": "1,3,18",
1433
+ "num(space)": 28681,
1434
+ "len(space)": "1,7,128",
1435
+ "num(ar)": 30,
1436
+ "len(ar)": "1,1,1",
1437
+ "num(zh)": 31148,
1438
+ "len(zh)": "1,2,15",
1439
+ "num(ja)": 31296,
1440
+ "len(ja)": "1,2,15",
1441
+ "num(ja-kana)": 148,
1442
+ "len(ja-kana)": "1,1,1",
1443
+ "num(ko)": 83,
1444
+ "len(ko)": "1,1,1"
1445
+ },
1446
+ "internlm/internlm2-math-7b": {
1447
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/internlm/internlm2-math-7b\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">internlm2-math-7b</a>",
1448
+ "organization": "Shanghai AI Lab",
1449
+ "vocab_size": 92544,
1450
+ "num(digit)": 1261,
1451
+ "len(digit)": "1,3,18",
1452
+ "num(space)": 28681,
1453
+ "len(space)": "1,7,128",
1454
+ "num(ar)": 30,
1455
+ "len(ar)": "1,1,1",
1456
+ "num(zh)": 31148,
1457
+ "len(zh)": "1,2,15",
1458
+ "num(ja)": 31296,
1459
+ "len(ja)": "1,2,15",
1460
+ "num(ja-kana)": 148,
1461
+ "len(ja-kana)": "1,1,1",
1462
+ "num(ko)": 83,
1463
+ "len(ko)": "1,1,1"
1464
+ },
1465
+ "microsoft/Phi-3-mini-4k-instruct": {
1466
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/microsoft/Phi-3-mini-4k-instruct\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">Phi-3-mini-4k-instruct</a>",
1467
+ "organization": "Microsoft",
1468
+ "vocab_size": 32011,
1469
+ "num(digit)": 20,
1470
+ "len(digit)": "1,1,1",
1471
+ "num(space)": 61,
1472
+ "len(space)": "1,2,15",
1473
+ "num(ar)": 55,
1474
+ "len(ar)": "1,1,2",
1475
+ "num(zh)": 700,
1476
+ "len(zh)": "1,1,1",
1477
+ "num(ja)": 837,
1478
+ "len(ja)": "1,1,1",
1479
+ "num(ja-kana)": 137,
1480
+ "len(ja-kana)": "1,1,1",
1481
+ "num(ko)": 111,
1482
+ "len(ko)": "1,1,1"
1483
+ },
1484
+ "microsoft/phi-1": {
1485
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/microsoft/phi-1\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">phi-1</a>",
1486
+ "organization": "Microsoft",
1487
+ "vocab_size": 50295,
1488
+ "num(digit)": 1691,
1489
+ "len(digit)": "1,3,16",
1490
+ "num(space)": 33129,
1491
+ "len(space)": "1,7,66",
1492
+ "num(ar)": 22,
1493
+ "len(ar)": "1,1,3",
1494
+ "num(zh)": 51,
1495
+ "len(zh)": "1,1,4",
1496
+ "num(ja)": 183,
1497
+ "len(ja)": "1,1,7",
1498
+ "num(ja-kana)": 133,
1499
+ "len(ja-kana)": "1,1,7",
1500
+ "num(ko)": 0,
1501
+ "len(ko)": "-"
1502
+ },
1503
+ "microsoft/phi-2": {
1504
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/microsoft/phi-2\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">phi-2</a>",
1505
+ "organization": "Microsoft",
1506
+ "vocab_size": 50295,
1507
+ "num(digit)": 1691,
1508
+ "len(digit)": "1,3,16",
1509
+ "num(space)": 33129,
1510
+ "len(space)": "1,7,66",
1511
+ "num(ar)": 22,
1512
+ "len(ar)": "1,1,3",
1513
+ "num(zh)": 51,
1514
+ "len(zh)": "1,1,4",
1515
+ "num(ja)": 183,
1516
+ "len(ja)": "1,1,7",
1517
+ "num(ja-kana)": 133,
1518
+ "len(ja-kana)": "1,1,7",
1519
+ "num(ko)": 0,
1520
+ "len(ko)": "-"
1521
+ },
1522
+ "mistralai/Mistral-7B-v0.1": {
1523
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/mistralai/Mistral-7B-v0.1\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">Mistral-7B-v0.1</a>",
1524
+ "organization": "Mistral",
1525
+ "vocab_size": 32000,
1526
+ "num(digit)": 20,
1527
+ "len(digit)": "1,1,1",
1528
+ "num(space)": 85,
1529
+ "len(space)": "1,3,15",
1530
+ "num(ar)": 71,
1531
+ "len(ar)": "1,1,2",
1532
+ "num(zh)": 1459,
1533
+ "len(zh)": "1,1,2",
1534
+ "num(ja)": 1593,
1535
+ "len(ja)": "1,1,2",
1536
+ "num(ja-kana)": 134,
1537
+ "len(ja-kana)": "1,1,1",
1538
+ "num(ko)": 346,
1539
+ "len(ko)": "1,1,1"
1540
+ },
1541
+ "mistralai/Mixtral-8x7B-v0.1": {
1542
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/mistralai/Mixtral-8x7B-v0.1\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">Mixtral-8x7B-v0.1</a>",
1543
+ "organization": "Mistral",
1544
+ "vocab_size": 32000,
1545
+ "num(digit)": 20,
1546
+ "len(digit)": "1,1,1",
1547
+ "num(space)": 85,
1548
+ "len(space)": "1,3,15",
1549
+ "num(ar)": 71,
1550
+ "len(ar)": "1,1,2",
1551
+ "num(zh)": 1459,
1552
+ "len(zh)": "1,1,2",
1553
+ "num(ja)": 1593,
1554
+ "len(ja)": "1,1,2",
1555
+ "num(ja-kana)": 134,
1556
+ "len(ja-kana)": "1,1,1",
1557
+ "num(ko)": 346,
1558
+ "len(ko)": "1,1,1"
1559
+ },
1560
+ "openai-community/gpt2": {
1561
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/openai-community/gpt2\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">gpt2</a>",
1562
+ "organization": "OpenAI",
1563
+ "vocab_size": 50257,
1564
+ "num(digit)": 1691,
1565
+ "len(digit)": "1,3,16",
1566
+ "num(space)": 33129,
1567
+ "len(space)": "1,7,66",
1568
+ "num(ar)": 22,
1569
+ "len(ar)": "1,1,3",
1570
+ "num(zh)": 51,
1571
+ "len(zh)": "1,1,4",
1572
+ "num(ja)": 183,
1573
+ "len(ja)": "1,1,7",
1574
+ "num(ja-kana)": 133,
1575
+ "len(ja-kana)": "1,1,7",
1576
+ "num(ko)": 0,
1577
+ "len(ko)": "-"
1578
+ },
1579
+ "openai/code-davinci-002": {
1580
+ "tokenizer": "<a target=\"_blank\" href=\"https://github.com/openai/tiktoken\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">code-davinci-002</a>",
1581
+ "organization": "OpenAI",
1582
+ "vocab_size": 50281,
1583
+ "num(digit)": 1691,
1584
+ "len(digit)": "1,3,16",
1585
+ "num(space)": 33175,
1586
+ "len(space)": "1,7,66",
1587
+ "num(ar)": 22,
1588
+ "len(ar)": "1,1,3",
1589
+ "num(zh)": 51,
1590
+ "len(zh)": "1,1,4",
1591
+ "num(ja)": 183,
1592
+ "len(ja)": "1,1,7",
1593
+ "num(ja-kana)": 133,
1594
+ "len(ja-kana)": "1,1,7",
1595
+ "num(ko)": 0,
1596
+ "len(ko)": "-"
1597
+ },
1598
+ "openai/gpt-3.5-turbo": {
1599
+ "tokenizer": "<a target=\"_blank\" href=\"https://github.com/openai/tiktoken\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">gpt-3.5-turbo</a>",
1600
+ "organization": "OpenAI",
1601
+ "vocab_size": 100277,
1602
+ "num(digit)": 1110,
1603
+ "len(digit)": "1,3,3",
1604
+ "num(space)": 47472,
1605
+ "len(space)": "1,7,128",
1606
+ "num(ar)": 113,
1607
+ "len(ar)": "1,2,10",
1608
+ "num(zh)": 868,
1609
+ "len(zh)": "1,1,7",
1610
+ "num(ja)": 1035,
1611
+ "len(ja)": "1,1,7",
1612
+ "num(ja-kana)": 169,
1613
+ "len(ja-kana)": "1,1,7",
1614
+ "num(ko)": 299,
1615
+ "len(ko)": "1,2,4"
1616
+ },
1617
+ "openai/gpt-4o": {
1618
+ "tokenizer": "<a target=\"_blank\" href=\"https://github.com/openai/tiktoken\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">gpt-4o</a>",
1619
+ "organization": "OpenAI",
1620
+ "vocab_size": 200019,
1621
+ "num(digit)": 1110,
1622
+ "len(digit)": "1,3,3",
1623
+ "num(space)": 109316,
1624
+ "len(space)": "1,6,128",
1625
+ "num(ar)": 8055,
1626
+ "len(ar)": "1,4,12",
1627
+ "num(zh)": 7563,
1628
+ "len(zh)": "1,2,11",
1629
+ "num(ja)": 8292,
1630
+ "len(ja)": "1,2,11",
1631
+ "num(ja-kana)": 809,
1632
+ "len(ja-kana)": "1,2,11",
1633
+ "num(ko)": 2365,
1634
+ "len(ko)": "1,2,8"
1635
+ },
1636
+ "openai/text-davinci-003": {
1637
+ "tokenizer": "<a target=\"_blank\" href=\"https://github.com/openai/tiktoken\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">text-davinci-003</a>",
1638
+ "organization": "OpenAI",
1639
+ "vocab_size": 50281,
1640
+ "num(digit)": 1691,
1641
+ "len(digit)": "1,3,16",
1642
+ "num(space)": 33175,
1643
+ "len(space)": "1,7,66",
1644
+ "num(ar)": 22,
1645
+ "len(ar)": "1,1,3",
1646
+ "num(zh)": 51,
1647
+ "len(zh)": "1,1,4",
1648
+ "num(ja)": 183,
1649
+ "len(ja)": "1,1,7",
1650
+ "num(ja-kana)": 133,
1651
+ "len(ja-kana)": "1,1,7",
1652
+ "num(ko)": 0,
1653
+ "len(ko)": "-"
1654
+ },
1655
+ "thu-coai/CharacterGLM-6B": {
1656
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/thu-coai/CharacterGLM-6B\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">CharacterGLM-6B</a>",
1657
+ "organization": "Tsinghua",
1658
+ "vocab_size": 64789,
1659
+ "num(digit)": 20,
1660
+ "len(digit)": "1,1,1",
1661
+ "num(space)": 67,
1662
+ "len(space)": "1,2,15",
1663
+ "num(ar)": 57,
1664
+ "len(ar)": "1,1,2",
1665
+ "num(zh)": 30922,
1666
+ "len(zh)": "1,2,16",
1667
+ "num(ja)": 31065,
1668
+ "len(ja)": "1,2,16",
1669
+ "num(ja-kana)": 143,
1670
+ "len(ja-kana)": "1,1,1",
1671
+ "num(ko)": 604,
1672
+ "len(ko)": "1,1,1"
1673
+ },
1674
+ "tiiuae/falcon-180b": {
1675
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/tiiuae/falcon-180b\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">falcon-180b</a>",
1676
+ "organization": "TII",
1677
+ "vocab_size": 65024,
1678
+ "num(digit)": 1108,
1679
+ "len(digit)": "1,3,3",
1680
+ "num(space)": 40202,
1681
+ "len(space)": "1,7,65",
1682
+ "num(ar)": 21,
1683
+ "len(ar)": "1,1,4",
1684
+ "num(zh)": 1627,
1685
+ "len(zh)": "1,1,3",
1686
+ "num(ja)": 1652,
1687
+ "len(ja)": "1,1,3",
1688
+ "num(ja-kana)": 25,
1689
+ "len(ja-kana)": "1,1,1",
1690
+ "num(ko)": 1,
1691
+ "len(ko)": "1,1,1"
1692
+ },
1693
+ "tiiuae/falcon-7b": {
1694
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/tiiuae/falcon-7b\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">falcon-7b</a>",
1695
+ "organization": "TII",
1696
+ "vocab_size": 65024,
1697
+ "num(digit)": 1108,
1698
+ "len(digit)": "1,3,3",
1699
+ "num(space)": 40202,
1700
+ "len(space)": "1,7,65",
1701
+ "num(ar)": 21,
1702
+ "len(ar)": "1,1,4",
1703
+ "num(zh)": 1627,
1704
+ "len(zh)": "1,1,3",
1705
+ "num(ja)": 1652,
1706
+ "len(ja)": "1,1,3",
1707
+ "num(ja-kana)": 25,
1708
+ "len(ja-kana)": "1,1,1",
1709
+ "num(ko)": 1,
1710
+ "len(ko)": "1,1,1"
1711
+ }
1712
+ }
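The character_stats.json fields above pair each num(script) count with a len(script) triplet. Below is a minimal sketch of how such stats could be reproduced, assuming the triplet reads as "min,median,max" decoded-token length and that script membership is tested with simplified Unicode ranges; both are inferences from the data (the repo's own checks presumably live in character_util.py and utils/text_util.py), and SCRIPT_PATTERNS / script_stats are illustrative names, not the repo's API.

import re
import statistics
from transformers import AutoTokenizer

# Hypothetical, simplified script tests (illustrative stand-ins only).
SCRIPT_PATTERNS = {
    "digit": re.compile(r"[0-9]"),
    "zh": re.compile(r"[\u4e00-\u9fff]"),       # CJK unified ideographs
    "ja-kana": re.compile(r"[\u3040-\u30ff]"),  # hiragana + katakana
    "ko": re.compile(r"[\uac00-\ud7af]"),       # hangul syllables
}

def script_stats(model_name: str) -> dict:
    """For each script, count vocab tokens whose decoded text contains it,
    and report decoded lengths as "min,median,max" (e.g. "1,2,16")."""
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    stats = {"vocab_size": tokenizer.vocab_size}
    for script, pattern in SCRIPT_PATTERNS.items():
        lengths = [
            len(decoded)
            for token_id in range(tokenizer.vocab_size)
            if pattern.search(decoded := tokenizer.decode([token_id]))
        ]
        stats[f"num({script})"] = len(lengths)
        stats[f"len({script})"] = (
            f"{min(lengths)},{round(statistics.median(lengths))},{max(lengths)}"
            if lengths else "-"
        )
    return stats

print(script_stats("gpt2"))

For the compress_rate.json entries deleted below (presumably superseded by the new stats/compression_rate.json), compression can be read directly off the raw counts: for example, "gpt2.cc100-en" encodes 1,124,813 bytes into 258,428 tokens, i.e. 1124813 / 258428 ≈ 4.35 bytes per token, where more bytes per token means better compression.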
stats/compress_rate.json DELETED
@@ -1,4286 +0,0 @@
1
- {
2
- "amber.cc100-en": {
3
- "vocab_size": 32000,
4
- "n_bytes": 1124813,
5
- "n_tokens": 294627,
6
- "n_chars": 1121360
7
- },
8
- "aya_101.cc100-en": {
9
- "vocab_size": 250100,
10
- "n_bytes": 1124813,
11
- "n_tokens": 317881,
12
- "n_chars": 1121360
13
- },
14
- "baichuan.cc100-en": {
15
- "vocab_size": 64000,
16
- "n_bytes": 1124813,
17
- "n_tokens": 280108,
18
- "n_chars": 1121360
19
- },
20
- "baichuan2.cc100-en": {
21
- "vocab_size": 125696,
22
- "n_bytes": 1124813,
23
- "n_tokens": 269011,
24
- "n_chars": 1121360
25
- },
26
- "bert_base_cased.cc100-en": {
27
- "vocab_size": 28996,
28
- "n_bytes": 1124813,
29
- "n_tokens": 288022,
30
- "n_chars": 1121360
31
- },
32
- "bert_base_chinese.cc100-en": {
33
- "vocab_size": 21128,
34
- "n_bytes": 1124813,
35
- "n_tokens": 377068,
36
- "n_chars": 1121360
37
- },
38
- "bert_base_uncased.cc100-en": {
39
- "vocab_size": 30522,
40
- "n_bytes": 1124813,
41
- "n_tokens": 280575,
42
- "n_chars": 1121360
43
- },
44
- "bloom.cc100-en": {
45
- "vocab_size": 250680,
46
- "n_bytes": 1124813,
47
- "n_tokens": 257405,
48
- "n_chars": 1121360
49
- },
50
- "byt5_small.cc100-en": {
51
- "vocab_size": 384,
52
- "n_bytes": 1124813,
53
- "n_tokens": 1134813,
54
- "n_chars": 1121360
55
- },
56
- "character_glm_6b.cc100-en": {
57
- "vocab_size": 64789,
58
- "n_bytes": 1124813,
59
- "n_tokens": 289347,
60
- "n_chars": 1121360
61
- },
62
- "chatglm2_6b.cc100-en": {
63
- "vocab_size": 64787,
64
- "n_bytes": 1124813,
65
- "n_tokens": 289329,
66
- "n_chars": 1121360
67
- },
68
- "chatglm3_6b.cc100-en": {
69
- "vocab_size": 64796,
70
- "n_bytes": 1124813,
71
- "n_tokens": 289347,
72
- "n_chars": 1121360
73
- },
74
- "chatglm_6b.cc100-en": {
75
- "vocab_size": 150344,
76
- "n_bytes": 1124813,
77
- "n_tokens": 284761,
78
- "n_chars": 1121360
79
- },
80
- "chatyuan_large_v2.cc100-en": {
81
- "vocab_size": 32128,
82
- "n_bytes": 1124813,
83
- "n_tokens": 536033,
84
- "n_chars": 1121360
85
- },
86
- "chinese_llama.cc100-en": {
87
- "vocab_size": 49953,
88
- "n_bytes": 1124813,
89
- "n_tokens": 291514,
90
- "n_chars": 1121360
91
- },
92
- "chinese_llama2.cc100-en": {
93
- "vocab_size": 55296,
94
- "n_bytes": 1124813,
95
- "n_tokens": 294627,
96
- "n_chars": 1121360
97
- },
98
- "code_davinci_002.cc100-en": {
99
- "vocab_size": 50281,
100
- "n_bytes": 1124813,
101
- "n_tokens": 258403,
102
- "n_chars": 1121360
103
- },
104
- "crystal_coder.cc100-en": {
105
- "vocab_size": 32022,
106
- "n_bytes": 1124813,
107
- "n_tokens": 284627,
108
- "n_chars": 1121360
109
- },
110
- "dbrx_instruct.cc100-en": {
111
- "vocab_size": 100280,
112
- "n_bytes": 1124813,
113
- "n_tokens": 254985,
114
- "n_chars": 1121360
115
- },
116
- "deepseek_coder_33b_instruct.cc100-en": {
117
- "vocab_size": 32022,
118
- "n_bytes": 1124813,
119
- "n_tokens": 287408,
120
- "n_chars": 1121360
121
- },
122
- "deepseek_llm_7b_base.cc100-en": {
123
- "vocab_size": 100015,
124
- "n_bytes": 1124813,
125
- "n_tokens": 272324,
126
- "n_chars": 1121360
127
- },
128
- "falcon_180b.cc100-en": {
129
- "vocab_size": 65024,
130
- "n_bytes": 1124813,
131
- "n_tokens": 262509,
132
- "n_chars": 1121360
133
- },
134
- "falcon_7b.cc100-en": {
135
- "vocab_size": 65024,
136
- "n_bytes": 1124813,
137
- "n_tokens": 262509,
138
- "n_chars": 1121360
139
- },
140
- "fastchat_t5_3b.cc100-en": {
141
- "vocab_size": 32110,
142
- "n_bytes": 1124813,
143
- "n_tokens": 484941,
144
- "n_chars": 1121360
145
- },
146
- "flan_t5_base.cc100-en": {
147
- "vocab_size": 32100,
148
- "n_bytes": 1124813,
149
- "n_tokens": 290104,
150
- "n_chars": 1121360
151
- },
152
- "gemma_7b.cc100-en": {
153
- "vocab_size": 256000,
154
- "n_bytes": 1124813,
155
- "n_tokens": 268010,
156
- "n_chars": 1121360
157
- },
158
- "gpt2.cc100-en": {
159
- "vocab_size": 50257,
160
- "n_bytes": 1124813,
161
- "n_tokens": 258428,
162
- "n_chars": 1121360
163
- },
164
- "gpt2_chinese.cc100-en": {
165
- "vocab_size": 21128,
166
- "n_bytes": 1124813,
167
- "n_tokens": 392641,
168
- "n_chars": 1121360
169
- },
170
- "gpt_35_turbo.cc100-en": {
171
- "vocab_size": 100277,
172
- "n_bytes": 1124813,
173
- "n_tokens": 254985,
174
- "n_chars": 1121360
175
- },
176
- "gpt_4.cc100-en": {
177
- "vocab_size": 100277,
178
- "n_bytes": 1124813,
179
- "n_tokens": 254985,
180
- "n_chars": 1121360
181
- },
182
- "gpt_nexo_20b.cc100-en": {
183
- "vocab_size": 50277,
184
- "n_bytes": 1124813,
185
- "n_tokens": 259357,
186
- "n_chars": 1121360
187
- },
188
- "grok_1.cc100-en": {
189
- "vocab_size": 131072,
190
- "n_bytes": 1124813,
191
- "n_tokens": 258048,
192
- "n_chars": 1121360
193
- },
194
- "internlm2_chat_7b.cc100-en": {
195
- "vocab_size": 92544,
196
- "n_bytes": 1124813,
197
- "n_tokens": 271583,
198
- "n_chars": 1121360
199
- },
200
- "internlm2_math_7b.cc100-en": {
201
- "vocab_size": 92544,
202
- "n_bytes": 1124813,
203
- "n_tokens": 271583,
204
- "n_chars": 1121360
205
- },
206
- "internlm_chat_7b.cc100-en": {
207
- "vocab_size": 103168,
208
- "n_bytes": 1124813,
209
- "n_tokens": 271293,
210
- "n_chars": 1121360
211
- },
212
- "internlm_xcomposer_7b.cc100-en": {
213
- "vocab_size": 103168,
214
- "n_bytes": 1124813,
215
- "n_tokens": 271293,
216
- "n_chars": 1121360
217
- },
218
- "jamba_v0_1.cc100-en": {
219
- "vocab_size": 65536,
220
- "n_bytes": 1124813,
221
- "n_tokens": 274242,
222
- "n_chars": 1121360
223
- },
224
- "kplug.cc100-en": {
225
- "vocab_size": 10261,
226
- "n_bytes": 1124813,
227
- "n_tokens": 393564,
228
- "n_chars": 1121360
229
- },
230
- "llama.cc100-en": {
231
- "vocab_size": 32000,
232
- "n_bytes": 1124813,
233
- "n_tokens": 294627,
234
- "n_chars": 1121360
235
- },
236
- "llama2.cc100-en": {
237
- "vocab_size": 32001,
238
- "n_bytes": 1124813,
239
- "n_tokens": 294627,
240
- "n_chars": 1121360
241
- },
242
- "llama3.cc100-en": {
243
- "vocab_size": 128256,
244
- "n_bytes": 1124813,
245
- "n_tokens": 254944,
246
- "n_chars": 1121360
247
- },
248
- "mistral_7b.cc100-en": {
249
- "vocab_size": 32000,
250
- "n_bytes": 1124813,
251
- "n_tokens": 285801,
252
- "n_chars": 1121360
253
- },
254
- "mixtral_8_7b.cc100-en": {
255
- "vocab_size": 32000,
256
- "n_bytes": 1124813,
257
- "n_tokens": 285801,
258
- "n_chars": 1121360
259
- },
260
- "mobilebert_uncased.cc100-en": {
261
- "vocab_size": 30522,
262
- "n_bytes": 1124813,
263
- "n_tokens": 280575,
264
- "n_chars": 1121360
265
- },
266
- "moss.cc100-en": {
267
- "vocab_size": 106072,
268
- "n_bytes": 1124813,
269
- "n_tokens": 257070,
270
- "n_chars": 1121360
271
- },
272
- "mt5_large.cc100-en": {
273
- "vocab_size": 250100,
274
- "n_bytes": 1124813,
275
- "n_tokens": 317881,
276
- "n_chars": 1121360
277
- },
278
- "olmo_7b.cc100-en": {
279
- "vocab_size": 50280,
280
- "n_bytes": 1124813,
281
- "n_tokens": 259357,
282
- "n_chars": 1121360
283
- },
284
- "orion_14b_chat.cc100-en": {
285
- "vocab_size": 84608,
286
- "n_bytes": 1124813,
287
- "n_tokens": 265948,
288
- "n_chars": 1121360
289
- },
290
- "phi_1.cc100-en": {
291
- "vocab_size": 50295,
292
- "n_bytes": 1124813,
293
- "n_tokens": 258409,
294
- "n_chars": 1121360
295
- },
296
- "phi_2.cc100-en": {
297
- "vocab_size": 50295,
298
- "n_bytes": 1124813,
299
- "n_tokens": 258409,
300
- "n_chars": 1121360
301
- },
302
- "phi_3_mini.cc100-en": {
303
- "vocab_size": 32011,
304
- "n_bytes": 1124813,
305
- "n_tokens": 294627,
306
- "n_chars": 1121360
307
- },
308
- "pko_t5_large.cc100-en": {
309
- "vocab_size": 50358,
310
- "n_bytes": 1124813,
311
- "n_tokens": 658985,
312
- "n_chars": 1121360
313
- },
314
- "prompt_clue.cc100-en": {
315
- "vocab_size": 32128,
316
- "n_bytes": 1124813,
317
- "n_tokens": 536033,
318
- "n_chars": 1121360
319
- },
320
- "qwen1_5_14b_chat.cc100-en": {
321
- "vocab_size": 151646,
322
- "n_bytes": 1124813,
323
- "n_tokens": 257983,
324
- "n_chars": 1121360
325
- },
326
- "qwen_1_8b_chat.cc100-en": {
327
- "vocab_size": 151851,
328
- "n_bytes": 1124813,
329
- "n_tokens": 257983,
330
- "n_chars": 1121360
331
- },
332
- "qwen_72b_chat.cc100-en": {
333
- "vocab_size": 151851,
334
- "n_bytes": 1124813,
335
- "n_tokens": 257983,
336
- "n_chars": 1121360
337
- },
338
- "qwen_7b_chat.cc100-en": {
339
- "vocab_size": 151851,
340
- "n_bytes": 1124813,
341
- "n_tokens": 257983,
342
- "n_chars": 1121360
343
- },
344
- "roberta_chinese_clue.cc100-en": {
345
- "vocab_size": 8021,
346
- "n_bytes": 1124813,
347
- "n_tokens": 583058,
348
- "n_chars": 1121360
349
- },
350
- "skywork_13b_base.cc100-en": {
351
- "vocab_size": 65519,
352
- "n_bytes": 1124813,
353
- "n_tokens": 294617,
354
- "n_chars": 1121360
355
- },
356
- "skywork_13b_math.cc100-en": {
357
- "vocab_size": 65519,
358
- "n_bytes": 1124813,
359
- "n_tokens": 294617,
360
- "n_chars": 1121360
361
- },
362
- "solar_10_7b.cc100-en": {
363
- "vocab_size": 32000,
364
- "n_bytes": 1124813,
365
- "n_tokens": 285801,
366
- "n_chars": 1121360
367
- },
368
- "starchat_alpha.cc100-en": {
369
- "vocab_size": 49156,
370
- "n_bytes": 1124813,
371
- "n_tokens": 288965,
372
- "n_chars": 1121360
373
- },
374
- "switch_c_2048.cc100-en": {
375
- "vocab_size": 32100,
376
- "n_bytes": 1124813,
377
- "n_tokens": 290104,
378
- "n_chars": 1121360
379
- },
380
- "t5_base.cc100-en": {
381
- "vocab_size": 32100,
382
- "n_bytes": 1124813,
383
- "n_tokens": 290104,
384
- "n_chars": 1121360
385
- },
386
- "t5_large.cc100-en": {
387
- "vocab_size": 32100,
388
- "n_bytes": 1124813,
389
- "n_tokens": 290104,
390
- "n_chars": 1121360
391
- },
392
- "t5_small.cc100-en": {
393
- "vocab_size": 32100,
394
- "n_bytes": 1124813,
395
- "n_tokens": 290104,
396
- "n_chars": 1121360
397
- },
398
- "text_davinci_003.cc100-en": {
399
- "vocab_size": 50281,
400
- "n_bytes": 1124813,
401
- "n_tokens": 258403,
402
- "n_chars": 1121360
403
- },
404
- "tigerbot_13b_chat_v2.cc100-en": {
405
- "vocab_size": 60515,
406
- "n_bytes": 1124813,
407
- "n_tokens": 285652,
408
- "n_chars": 1121360
409
- },
410
- "tigerbot_70b_chat_v4_4k.cc100-en": {
411
- "vocab_size": 65110,
412
- "n_bytes": 1124813,
413
- "n_tokens": 286946,
414
- "n_chars": 1121360
415
- },
416
- "wizardcoder_15b_v1.cc100-en": {
417
- "vocab_size": 49153,
418
- "n_bytes": 1124813,
419
- "n_tokens": 288965,
420
- "n_chars": 1121360
421
- },
422
- "wizardcoder_python_7b_v1.cc100-en": {
423
- "vocab_size": 32001,
424
- "n_bytes": 1124813,
425
- "n_tokens": 294627,
426
- "n_chars": 1121360
427
- },
428
- "wizardlm_7b_v1.cc100-en": {
429
- "vocab_size": 32001,
430
- "n_bytes": 1124813,
431
- "n_tokens": 294627,
432
- "n_chars": 1121360
433
- },
434
- "wizardmath_70b_v1.cc100-en": {
435
- "vocab_size": 32002,
436
- "n_bytes": 1124813,
437
- "n_tokens": 294627,
438
- "n_chars": 1121360
439
- },
440
- "xlm_roberta.cc100-en": {
441
- "vocab_size": 250002,
442
- "n_bytes": 1124813,
443
- "n_tokens": 300026,
444
- "n_chars": 1121360
445
- },
446
- "yi_34b.cc100-en": {
447
- "vocab_size": 64000,
448
- "n_bytes": 1124813,
449
- "n_tokens": 270400,
450
- "n_chars": 1121360
451
- },
452
- "yi_6b.cc100-en": {
453
- "vocab_size": 64000,
454
- "n_bytes": 1124813,
455
- "n_tokens": 270400,
456
- "n_chars": 1121360
457
- },
458
- "yi_vl34b.cc100-en": {
459
- "vocab_size": 64000,
460
- "n_bytes": 1124813,
461
- "n_tokens": 269738,
462
- "n_chars": 1121360
463
- },
464
- "zephyr_7b_beta.cc100-en": {
465
- "vocab_size": 32000,
466
- "n_bytes": 1124813,
467
- "n_tokens": 285801,
468
- "n_chars": 1121360
469
- },
470
- "amber.cc100-zh-Hans": {
471
- "vocab_size": 32000,
472
- "n_bytes": 2633047,
473
- "n_tokens": 1330093,
474
- "n_chars": 927311
475
- },
476
- "aya_101.cc100-zh-Hans": {
477
- "vocab_size": 250100,
478
- "n_bytes": 2633047,
479
- "n_tokens": 631182,
480
- "n_chars": 927311
481
- },
482
- "baichuan.cc100-zh-Hans": {
483
- "vocab_size": 64000,
484
- "n_bytes": 2633047,
485
- "n_tokens": 626117,
486
- "n_chars": 927311
487
- },
488
- "baichuan2.cc100-zh-Hans": {
489
- "vocab_size": 125696,
490
- "n_bytes": 2633047,
491
- "n_tokens": 541464,
492
- "n_chars": 927311
493
- },
494
- "bert_base_cased.cc100-zh-Hans": {
495
- "vocab_size": 28996,
496
- "n_bytes": 2633047,
497
- "n_tokens": 899709,
498
- "n_chars": 927311
499
- },
500
- "bert_base_chinese.cc100-zh-Hans": {
501
- "vocab_size": 21128,
502
- "n_bytes": 2633047,
503
- "n_tokens": 896599,
504
- "n_chars": 927311
505
- },
506
- "bert_base_uncased.cc100-zh-Hans": {
507
- "vocab_size": 30522,
508
- "n_bytes": 2633047,
509
- "n_tokens": 898554,
510
- "n_chars": 927311
511
- },
512
- "bloom.cc100-zh-Hans": {
513
- "vocab_size": 250680,
514
- "n_bytes": 2633047,
515
- "n_tokens": 573008,
516
- "n_chars": 927311
517
- },
518
- "byt5_small.cc100-zh-Hans": {
519
- "vocab_size": 384,
520
- "n_bytes": 2633047,
521
- "n_tokens": 2643047,
522
- "n_chars": 927311
523
- },
524
- "character_glm_6b.cc100-zh-Hans": {
525
- "vocab_size": 64789,
526
- "n_bytes": 2633047,
527
- "n_tokens": 583646,
528
- "n_chars": 927311
529
- },
530
- "chatglm2_6b.cc100-zh-Hans": {
531
- "vocab_size": 64787,
532
- "n_bytes": 2633047,
533
- "n_tokens": 583646,
534
- "n_chars": 927311
535
- },
536
- "chatglm3_6b.cc100-zh-Hans": {
537
- "vocab_size": 64796,
538
- "n_bytes": 2633047,
539
- "n_tokens": 583646,
540
- "n_chars": 927311
541
- },
542
- "chatglm_6b.cc100-zh-Hans": {
543
- "vocab_size": 150344,
544
- "n_bytes": 2633047,
545
- "n_tokens": 527384,
546
- "n_chars": 927311
547
- },
548
- "chatyuan_large_v2.cc100-zh-Hans": {
549
- "vocab_size": 32128,
550
- "n_bytes": 2633047,
551
- "n_tokens": 564905,
552
- "n_chars": 927311
553
- },
554
- "chinese_llama.cc100-zh-Hans": {
555
- "vocab_size": 49953,
556
- "n_bytes": 2633047,
557
- "n_tokens": 623219,
558
- "n_chars": 927311
559
- },
560
- "chinese_llama2.cc100-zh-Hans": {
561
- "vocab_size": 55296,
562
- "n_bytes": 2633047,
563
- "n_tokens": 625766,
564
- "n_chars": 927311
565
- },
566
- "code_davinci_002.cc100-zh-Hans": {
567
- "vocab_size": 50281,
568
- "n_bytes": 2633047,
569
- "n_tokens": 1876809,
570
- "n_chars": 927311
571
- },
572
- "crystal_coder.cc100-zh-Hans": {
573
- "vocab_size": 32022,
574
- "n_bytes": 2633047,
575
- "n_tokens": 1320093,
576
- "n_chars": 927311
577
- },
578
- "dbrx_instruct.cc100-zh-Hans": {
579
- "vocab_size": 100280,
580
- "n_bytes": 2633047,
581
- "n_tokens": 1084939,
582
- "n_chars": 927311
583
- },
584
- "deepseek_coder_33b_instruct.cc100-zh-Hans": {
585
- "vocab_size": 32022,
586
- "n_bytes": 2633047,
587
- "n_tokens": 720577,
588
- "n_chars": 927311
589
- },
590
- "deepseek_llm_7b_base.cc100-zh-Hans": {
591
- "vocab_size": 100015,
592
- "n_bytes": 2633047,
593
- "n_tokens": 605081,
594
- "n_chars": 927311
595
- },
596
- "falcon_180b.cc100-zh-Hans": {
597
- "vocab_size": 65024,
598
- "n_bytes": 2633047,
599
- "n_tokens": 1124681,
600
- "n_chars": 927311
601
- },
602
- "falcon_7b.cc100-zh-Hans": {
603
- "vocab_size": 65024,
604
- "n_bytes": 2633047,
605
- "n_tokens": 1124681,
606
- "n_chars": 927311
607
- },
608
- "fastchat_t5_3b.cc100-zh-Hans": {
609
- "vocab_size": 32110,
610
- "n_bytes": 2633047,
611
- "n_tokens": 178974,
612
- "n_chars": 927311
613
- },
614
- "flan_t5_base.cc100-zh-Hans": {
615
- "vocab_size": 32100,
616
- "n_bytes": 2633047,
617
- "n_tokens": 173520,
618
- "n_chars": 927311
619
- },
620
- "gemma_7b.cc100-zh-Hans": {
621
- "vocab_size": 256000,
622
- "n_bytes": 2633047,
623
- "n_tokens": 641795,
624
- "n_chars": 927311
625
- },
626
- "gpt2.cc100-zh-Hans": {
627
- "vocab_size": 50257,
628
- "n_bytes": 2633047,
629
- "n_tokens": 1876809,
630
- "n_chars": 927311
631
- },
632
- "gpt2_chinese.cc100-zh-Hans": {
633
- "vocab_size": 21128,
634
- "n_bytes": 2633047,
635
- "n_tokens": 899506,
636
- "n_chars": 927311
637
- },
638
- "gpt_35_turbo.cc100-zh-Hans": {
639
- "vocab_size": 100277,
640
- "n_bytes": 2633047,
641
- "n_tokens": 1084939,
642
- "n_chars": 927311
643
- },
644
- "gpt_4.cc100-zh-Hans": {
645
- "vocab_size": 100277,
646
- "n_bytes": 2633047,
647
- "n_tokens": 1084939,
648
- "n_chars": 927311
649
- },
650
- "gpt_nexo_20b.cc100-zh-Hans": {
651
- "vocab_size": 50277,
652
- "n_bytes": 2633047,
653
- "n_tokens": 1220529,
654
- "n_chars": 927311
655
- },
656
- "grok_1.cc100-zh-Hans": {
657
- "vocab_size": 131072,
658
- "n_bytes": 2633047,
659
- "n_tokens": 1414508,
660
- "n_chars": 927311
661
- },
662
- "internlm2_chat_7b.cc100-zh-Hans": {
663
- "vocab_size": 92544,
664
- "n_bytes": 2633047,
665
- "n_tokens": 579976,
666
- "n_chars": 927311
667
- },
668
- "internlm2_math_7b.cc100-zh-Hans": {
669
- "vocab_size": 92544,
670
- "n_bytes": 2633047,
671
- "n_tokens": 579976,
672
- "n_chars": 927311
673
- },
674
- "internlm_chat_7b.cc100-zh-Hans": {
675
- "vocab_size": 103168,
676
- "n_bytes": 2633047,
677
- "n_tokens": 579109,
678
- "n_chars": 927311
679
- },
680
- "internlm_xcomposer_7b.cc100-zh-Hans": {
681
- "vocab_size": 103168,
682
- "n_bytes": 2633047,
683
- "n_tokens": 579109,
684
- "n_chars": 927311
685
- },
686
- "jamba_v0_1.cc100-zh-Hans": {
687
- "vocab_size": 65536,
688
- "n_bytes": 2633047,
689
- "n_tokens": 1067054,
690
- "n_chars": 927311
691
- },
692
- "kplug.cc100-zh-Hans": {
693
- "vocab_size": 10261,
694
- "n_bytes": 2633047,
695
- "n_tokens": 902451,
696
- "n_chars": 927311
697
- },
698
- "llama.cc100-zh-Hans": {
699
- "vocab_size": 32000,
700
- "n_bytes": 2633047,
701
- "n_tokens": 1330093,
702
- "n_chars": 927311
703
- },
704
- "llama2.cc100-zh-Hans": {
705
- "vocab_size": 32001,
706
- "n_bytes": 2633047,
707
- "n_tokens": 1330093,
708
- "n_chars": 927311
709
- },
710
- "llama3.cc100-zh-Hans": {
711
- "vocab_size": 128256,
712
- "n_bytes": 2633047,
713
- "n_tokens": 747405,
714
- "n_chars": 927311
715
- },
716
- "mistral_7b.cc100-zh-Hans": {
717
- "vocab_size": 32000,
718
- "n_bytes": 2633047,
719
- "n_tokens": 1041023,
720
- "n_chars": 927311
721
- },
722
- "mixtral_8_7b.cc100-zh-Hans": {
723
- "vocab_size": 32000,
724
- "n_bytes": 2633047,
725
- "n_tokens": 1041023,
726
- "n_chars": 927311
727
- },
728
- "mobilebert_uncased.cc100-zh-Hans": {
729
- "vocab_size": 30522,
730
- "n_bytes": 2633047,
731
- "n_tokens": 898554,
732
- "n_chars": 927311
733
- },
734
- "moss.cc100-zh-Hans": {
735
- "vocab_size": 106072,
736
- "n_bytes": 2633047,
737
- "n_tokens": 557455,
738
- "n_chars": 927311
739
- },
740
- "mt5_large.cc100-zh-Hans": {
741
- "vocab_size": 250100,
742
- "n_bytes": 2633047,
743
- "n_tokens": 631182,
744
- "n_chars": 927311
745
- },
746
- "olmo_7b.cc100-zh-Hans": {
747
- "vocab_size": 50280,
748
- "n_bytes": 2633047,
749
- "n_tokens": 1220529,
750
- "n_chars": 927311
751
- },
752
- "orion_14b_chat.cc100-zh-Hans": {
753
- "vocab_size": 84608,
754
- "n_bytes": 2633047,
755
- "n_tokens": 529926,
756
- "n_chars": 927311
757
- },
758
- "phi_1.cc100-zh-Hans": {
759
- "vocab_size": 50295,
760
- "n_bytes": 2633047,
761
- "n_tokens": 1876809,
762
- "n_chars": 927311
763
- },
764
- "phi_2.cc100-zh-Hans": {
765
- "vocab_size": 50295,
766
- "n_bytes": 2633047,
767
- "n_tokens": 1876809,
768
- "n_chars": 927311
769
- },
770
- "phi_3_mini.cc100-zh-Hans": {
771
- "vocab_size": 32011,
772
- "n_bytes": 2633047,
773
- "n_tokens": 1330093,
774
- "n_chars": 927311
775
- },
776
- "pko_t5_large.cc100-zh-Hans": {
777
- "vocab_size": 50358,
778
- "n_bytes": 2633047,
779
- "n_tokens": 2533519,
780
- "n_chars": 927311
781
- },
782
- "prompt_clue.cc100-zh-Hans": {
783
- "vocab_size": 32128,
784
- "n_bytes": 2633047,
785
- "n_tokens": 564905,
786
- "n_chars": 927311
787
- },
788
- "qwen1_5_14b_chat.cc100-zh-Hans": {
789
- "vocab_size": 151646,
790
- "n_bytes": 2633047,
791
- "n_tokens": 589211,
792
- "n_chars": 927311
793
- },
794
- "qwen_1_8b_chat.cc100-zh-Hans": {
795
- "vocab_size": 151851,
796
- "n_bytes": 2633047,
797
- "n_tokens": 589211,
798
- "n_chars": 927311
799
- },
800
- "qwen_72b_chat.cc100-zh-Hans": {
801
- "vocab_size": 151851,
802
- "n_bytes": 2633047,
803
- "n_tokens": 589211,
804
- "n_chars": 927311
805
- },
806
- "qwen_7b_chat.cc100-zh-Hans": {
807
- "vocab_size": 151851,
808
- "n_bytes": 2633047,
809
- "n_tokens": 589211,
810
- "n_chars": 927311
811
- },
812
- "roberta_chinese_clue.cc100-zh-Hans": {
813
- "vocab_size": 8021,
814
- "n_bytes": 2633047,
815
- "n_tokens": 907144,
816
- "n_chars": 927311
817
- },
818
- "skywork_13b_base.cc100-zh-Hans": {
819
- "vocab_size": 65519,
820
- "n_bytes": 2633047,
821
- "n_tokens": 663923,
822
- "n_chars": 927311
823
- },
824
- "skywork_13b_math.cc100-zh-Hans": {
825
- "vocab_size": 65519,
826
- "n_bytes": 2633047,
827
- "n_tokens": 663923,
828
- "n_chars": 927311
829
- },
830
- "solar_10_7b.cc100-zh-Hans": {
831
- "vocab_size": 32000,
832
- "n_bytes": 2633047,
833
- "n_tokens": 1041023,
834
- "n_chars": 927311
835
- },
836
- "starchat_alpha.cc100-zh-Hans": {
837
- "vocab_size": 49156,
838
- "n_bytes": 2633047,
839
- "n_tokens": 882018,
840
- "n_chars": 927311
841
- },
842
- "switch_c_2048.cc100-zh-Hans": {
843
- "vocab_size": 32100,
844
- "n_bytes": 2633047,
845
- "n_tokens": 173519,
846
- "n_chars": 927311
847
- },
848
- "t5_base.cc100-zh-Hans": {
849
- "vocab_size": 32100,
850
- "n_bytes": 2633047,
851
- "n_tokens": 173519,
852
- "n_chars": 927311
853
- },
854
- "t5_large.cc100-zh-Hans": {
855
- "vocab_size": 32100,
856
- "n_bytes": 2633047,
857
- "n_tokens": 173519,
858
- "n_chars": 927311
859
- },
860
- "t5_small.cc100-zh-Hans": {
861
- "vocab_size": 32100,
862
- "n_bytes": 2633047,
863
- "n_tokens": 173519,
864
- "n_chars": 927311
865
- },
866
- "text_davinci_003.cc100-zh-Hans": {
867
- "vocab_size": 50281,
868
- "n_bytes": 2633047,
869
- "n_tokens": 1876809,
870
- "n_chars": 927311
871
- },
872
- "tigerbot_13b_chat_v2.cc100-zh-Hans": {
873
- "vocab_size": 60515,
874
- "n_bytes": 2633047,
875
- "n_tokens": 577385,
876
- "n_chars": 927311
877
- },
878
- "tigerbot_70b_chat_v4_4k.cc100-zh-Hans": {
879
- "vocab_size": 65110,
880
- "n_bytes": 2633047,
881
- "n_tokens": 577211,
882
- "n_chars": 927311
883
- },
884
- "wizardcoder_15b_v1.cc100-zh-Hans": {
885
- "vocab_size": 49153,
886
- "n_bytes": 2633047,
887
- "n_tokens": 882018,
888
- "n_chars": 927311
889
- },
890
- "wizardcoder_python_7b_v1.cc100-zh-Hans": {
891
- "vocab_size": 32001,
892
- "n_bytes": 2633047,
893
- "n_tokens": 1330093,
894
- "n_chars": 927311
895
- },
896
- "wizardlm_7b_v1.cc100-zh-Hans": {
897
- "vocab_size": 32001,
898
- "n_bytes": 2633047,
899
- "n_tokens": 1330093,
900
- "n_chars": 927311
901
- },
902
- "wizardmath_70b_v1.cc100-zh-Hans": {
903
- "vocab_size": 32002,
904
- "n_bytes": 2633047,
905
- "n_tokens": 1330093,
906
- "n_chars": 927311
907
- },
908
- "xlm_roberta.cc100-zh-Hans": {
909
- "vocab_size": 250002,
910
- "n_bytes": 2633047,
911
- "n_tokens": 619844,
912
- "n_chars": 927311
913
- },
914
- "yi_34b.cc100-zh-Hans": {
915
- "vocab_size": 64000,
916
- "n_bytes": 2633047,
917
- "n_tokens": 588729,
918
- "n_chars": 927311
919
- },
920
- "yi_6b.cc100-zh-Hans": {
921
- "vocab_size": 64000,
922
- "n_bytes": 2633047,
923
- "n_tokens": 588729,
924
- "n_chars": 927311
925
- },
926
- "yi_vl34b.cc100-zh-Hans": {
927
- "vocab_size": 64000,
928
- "n_bytes": 2633047,
929
- "n_tokens": 596166,
930
- "n_chars": 927311
931
- },
932
- "zephyr_7b_beta.cc100-zh-Hans": {
933
- "vocab_size": 32000,
934
- "n_bytes": 2633047,
935
- "n_tokens": 1041023,
936
- "n_chars": 927311
937
- },
938
- "amber.cc100-es": {
939
- "vocab_size": 32000,
940
- "n_bytes": 1664455,
941
- "n_tokens": 492235,
942
- "n_chars": 1630297
943
- },
944
- "aya_101.cc100-es": {
945
- "vocab_size": 250100,
946
- "n_bytes": 1664455,
947
- "n_tokens": 472231,
948
- "n_chars": 1630297
949
- },
950
- "baichuan.cc100-es": {
951
- "vocab_size": 64000,
952
- "n_bytes": 1664455,
953
- "n_tokens": 585804,
954
- "n_chars": 1630297
955
- },
956
- "baichuan2.cc100-es": {
957
- "vocab_size": 125696,
958
- "n_bytes": 1664455,
959
- "n_tokens": 551326,
960
- "n_chars": 1630297
961
- },
962
- "bert_base_cased.cc100-es": {
963
- "vocab_size": 28996,
964
- "n_bytes": 1664455,
965
- "n_tokens": 630231,
966
- "n_chars": 1630297
967
- },
968
- "bert_base_chinese.cc100-es": {
969
- "vocab_size": 21128,
970
- "n_bytes": 1664455,
971
- "n_tokens": 609419,
972
- "n_chars": 1630297
973
- },
974
- "bert_base_uncased.cc100-es": {
975
- "vocab_size": 30522,
976
- "n_bytes": 1664455,
977
- "n_tokens": 558042,
978
- "n_chars": 1630297
979
- },
980
- "bloom.cc100-es": {
981
- "vocab_size": 250680,
982
- "n_bytes": 1664455,
983
- "n_tokens": 350793,
984
- "n_chars": 1630297
985
- },
986
- "byt5_small.cc100-es": {
987
- "vocab_size": 384,
988
- "n_bytes": 1664455,
989
- "n_tokens": 1674455,
990
- "n_chars": 1630297
991
- },
992
- "character_glm_6b.cc100-es": {
993
- "vocab_size": 64789,
994
- "n_bytes": 1664455,
995
- "n_tokens": 566501,
996
- "n_chars": 1630297
997
- },
998
- "chatglm2_6b.cc100-es": {
999
- "vocab_size": 64787,
1000
- "n_bytes": 1664455,
1001
- "n_tokens": 566476,
1002
- "n_chars": 1630297
1003
- },
1004
- "chatglm3_6b.cc100-es": {
1005
- "vocab_size": 64796,
1006
- "n_bytes": 1664455,
1007
- "n_tokens": 566501,
1008
- "n_chars": 1630297
1009
- },
1010
- "chatglm_6b.cc100-es": {
1011
- "vocab_size": 150344,
1012
- "n_bytes": 1664455,
1013
- "n_tokens": 514848,
1014
- "n_chars": 1630297
1015
- },
1016
- "chatyuan_large_v2.cc100-es": {
1017
- "vocab_size": 32128,
1018
- "n_bytes": 1664455,
1019
- "n_tokens": 889530,
1020
- "n_chars": 1630297
1021
- },
1022
- "chinese_llama.cc100-es": {
1023
- "vocab_size": 49953,
1024
- "n_bytes": 1664455,
1025
- "n_tokens": 486672,
1026
- "n_chars": 1630297
1027
- },
1028
- "chinese_llama2.cc100-es": {
1029
- "vocab_size": 55296,
1030
- "n_bytes": 1664455,
1031
- "n_tokens": 492235,
1032
- "n_chars": 1630297
1033
- },
1034
- "code_davinci_002.cc100-es": {
1035
- "vocab_size": 50281,
1036
- "n_bytes": 1664455,
1037
- "n_tokens": 569853,
1038
- "n_chars": 1630297
1039
- },
1040
- "crystal_coder.cc100-es": {
1041
- "vocab_size": 32022,
1042
- "n_bytes": 1664455,
1043
- "n_tokens": 482235,
1044
- "n_chars": 1630297
1045
- },
1046
- "dbrx_instruct.cc100-es": {
1047
- "vocab_size": 100280,
1048
- "n_bytes": 1664455,
1049
- "n_tokens": 433875,
1050
- "n_chars": 1630297
1051
- },
1052
- "deepseek_coder_33b_instruct.cc100-es": {
1053
- "vocab_size": 32022,
1054
- "n_bytes": 1664455,
1055
- "n_tokens": 523884,
1056
- "n_chars": 1630297
1057
- },
1058
- "deepseek_llm_7b_base.cc100-es": {
1059
- "vocab_size": 100015,
1060
- "n_bytes": 1664455,
1061
- "n_tokens": 480877,
1062
- "n_chars": 1630297
1063
- },
1064
- "falcon_180b.cc100-es": {
1065
- "vocab_size": 65024,
1066
- "n_bytes": 1664455,
1067
- "n_tokens": 442138,
1068
- "n_chars": 1630297
1069
- },
1070
- "falcon_7b.cc100-es": {
1071
- "vocab_size": 65024,
1072
- "n_bytes": 1664455,
1073
- "n_tokens": 442138,
1074
- "n_chars": 1630297
1075
- },
1076
- "fastchat_t5_3b.cc100-es": {
1077
- "vocab_size": 32110,
1078
- "n_bytes": 1664455,
1079
- "n_tokens": 970105,
1080
- "n_chars": 1630297
1081
- },
1082
- "flan_t5_base.cc100-es": {
1083
- "vocab_size": 32100,
1084
- "n_bytes": 1664455,
1085
- "n_tokens": 706405,
1086
- "n_chars": 1630297
1087
- },
1088
- "gemma_7b.cc100-es": {
1089
- "vocab_size": 256000,
1090
- "n_bytes": 1664455,
1091
- "n_tokens": 371321,
1092
- "n_chars": 1630297
1093
- },
1094
- "gpt2.cc100-es": {
1095
- "vocab_size": 50257,
1096
- "n_bytes": 1664455,
1097
- "n_tokens": 569853,
1098
- "n_chars": 1630297
1099
- },
1100
- "gpt2_chinese.cc100-es": {
1101
- "vocab_size": 21128,
1102
- "n_bytes": 1664455,
1103
- "n_tokens": 703390,
1104
- "n_chars": 1630297
1105
- },
1106
- "gpt_35_turbo.cc100-es": {
1107
- "vocab_size": 100277,
1108
- "n_bytes": 1664455,
1109
- "n_tokens": 433875,
1110
- "n_chars": 1630297
1111
- },
1112
- "gpt_4.cc100-es": {
1113
- "vocab_size": 100277,
1114
- "n_bytes": 1664455,
1115
- "n_tokens": 433875,
1116
- "n_chars": 1630297
1117
- },
1118
- "gpt_nexo_20b.cc100-es": {
1119
- "vocab_size": 50277,
1120
- "n_bytes": 1664455,
1121
- "n_tokens": 494577,
1122
- "n_chars": 1630297
1123
- },
1124
- "grok_1.cc100-es": {
1125
- "vocab_size": 131072,
1126
- "n_bytes": 1664455,
1127
- "n_tokens": 449392,
1128
- "n_chars": 1630297
1129
- },
1130
- "internlm2_chat_7b.cc100-es": {
1131
- "vocab_size": 92544,
1132
- "n_bytes": 1664455,
1133
- "n_tokens": 518871,
1134
- "n_chars": 1630297
1135
- },
1136
- "internlm2_math_7b.cc100-es": {
1137
- "vocab_size": 92544,
1138
- "n_bytes": 1664455,
1139
- "n_tokens": 518871,
1140
- "n_chars": 1630297
1141
- },
1142
- "internlm_chat_7b.cc100-es": {
1143
- "vocab_size": 103168,
1144
- "n_bytes": 1664455,
1145
- "n_tokens": 516572,
1146
- "n_chars": 1630297
1147
- },
1148
- "internlm_xcomposer_7b.cc100-es": {
1149
- "vocab_size": 103168,
1150
- "n_bytes": 1664455,
1151
- "n_tokens": 516572,
1152
- "n_chars": 1630297
1153
- },
1154
- "jamba_v0_1.cc100-es": {
1155
- "vocab_size": 65536,
1156
- "n_bytes": 1664455,
1157
- "n_tokens": 420883,
1158
- "n_chars": 1630297
1159
- },
1160
- "kplug.cc100-es": {
1161
- "vocab_size": 10261,
1162
- "n_bytes": 1664455,
1163
- "n_tokens": 704804,
1164
- "n_chars": 1630297
1165
- },
1166
- "llama.cc100-es": {
1167
- "vocab_size": 32000,
1168
- "n_bytes": 1664455,
1169
- "n_tokens": 492235,
1170
- "n_chars": 1630297
1171
- },
1172
- "llama2.cc100-es": {
1173
- "vocab_size": 32001,
1174
- "n_bytes": 1664455,
1175
- "n_tokens": 492235,
1176
- "n_chars": 1630297
1177
- },
1178
- "llama3.cc100-es": {
1179
- "vocab_size": 128256,
1180
- "n_bytes": 1664455,
1181
- "n_tokens": 433289,
1182
- "n_chars": 1630297
1183
- },
1184
- "mistral_7b.cc100-es": {
1185
- "vocab_size": 32000,
1186
- "n_bytes": 1664455,
1187
- "n_tokens": 513915,
1188
- "n_chars": 1630297
1189
- },
1190
- "mixtral_8_7b.cc100-es": {
1191
- "vocab_size": 32000,
1192
- "n_bytes": 1664455,
1193
- "n_tokens": 513915,
1194
- "n_chars": 1630297
1195
- },
1196
- "mobilebert_uncased.cc100-es": {
1197
- "vocab_size": 30522,
1198
- "n_bytes": 1664455,
1199
- "n_tokens": 558042,
1200
- "n_chars": 1630297
1201
- },
1202
- "moss.cc100-es": {
1203
- "vocab_size": 106072,
1204
- "n_bytes": 1664455,
1205
- "n_tokens": 568539,
1206
- "n_chars": 1630297
1207
- },
1208
- "mt5_large.cc100-es": {
1209
- "vocab_size": 250100,
1210
- "n_bytes": 1664455,
1211
- "n_tokens": 472231,
1212
- "n_chars": 1630297
1213
- },
1214
- "olmo_7b.cc100-es": {
1215
- "vocab_size": 50280,
1216
- "n_bytes": 1664455,
1217
- "n_tokens": 494577,
1218
- "n_chars": 1630297
1219
- },
1220
- "orion_14b_chat.cc100-es": {
1221
- "vocab_size": 84608,
1222
- "n_bytes": 1664455,
1223
- "n_tokens": 628571,
1224
- "n_chars": 1630297
1225
- },
1226
- "phi_1.cc100-es": {
1227
- "vocab_size": 50295,
1228
- "n_bytes": 1664455,
1229
- "n_tokens": 569853,
1230
- "n_chars": 1630297
1231
- },
1232
- "phi_2.cc100-es": {
1233
- "vocab_size": 50295,
1234
- "n_bytes": 1664455,
1235
- "n_tokens": 569853,
1236
- "n_chars": 1630297
1237
- },
1238
- "phi_3_mini.cc100-es": {
1239
- "vocab_size": 32011,
1240
- "n_bytes": 1664455,
1241
- "n_tokens": 492235,
1242
- "n_chars": 1630297
1243
- },
1244
- "pko_t5_large.cc100-es": {
1245
- "vocab_size": 50358,
1246
- "n_bytes": 1664455,
1247
- "n_tokens": 1134056,
1248
- "n_chars": 1630297
1249
- },
1250
- "prompt_clue.cc100-es": {
1251
- "vocab_size": 32128,
1252
- "n_bytes": 1664455,
1253
- "n_tokens": 889530,
1254
- "n_chars": 1630297
1255
- },
1256
- "qwen1_5_14b_chat.cc100-es": {
1257
- "vocab_size": 151646,
1258
- "n_bytes": 1664455,
1259
- "n_tokens": 434264,
1260
- "n_chars": 1630297
1261
- },
1262
- "qwen_1_8b_chat.cc100-es": {
1263
- "vocab_size": 151851,
1264
- "n_bytes": 1664455,
1265
- "n_tokens": 434264,
1266
- "n_chars": 1630297
1267
- },
1268
- "qwen_72b_chat.cc100-es": {
1269
- "vocab_size": 151851,
1270
- "n_bytes": 1664455,
1271
- "n_tokens": 434264,
1272
- "n_chars": 1630297
1273
- },
1274
- "qwen_7b_chat.cc100-es": {
1275
- "vocab_size": 151851,
1276
- "n_bytes": 1664455,
1277
- "n_tokens": 434264,
1278
- "n_chars": 1630297
1279
- },
1280
- "roberta_chinese_clue.cc100-es": {
1281
- "vocab_size": 8021,
1282
- "n_bytes": 1664455,
1283
- "n_tokens": 866564,
1284
- "n_chars": 1630297
1285
- },
1286
- "skywork_13b_base.cc100-es": {
1287
- "vocab_size": 65519,
1288
- "n_bytes": 1664455,
1289
- "n_tokens": 492211,
1290
- "n_chars": 1630297
1291
- },
1292
- "skywork_13b_math.cc100-es": {
1293
- "vocab_size": 65519,
1294
- "n_bytes": 1664455,
1295
- "n_tokens": 492211,
1296
- "n_chars": 1630297
1297
- },
1298
- "solar_10_7b.cc100-es": {
1299
- "vocab_size": 32000,
1300
- "n_bytes": 1664455,
1301
- "n_tokens": 513915,
1302
- "n_chars": 1630297
1303
- },
1304
- "starchat_alpha.cc100-es": {
1305
- "vocab_size": 49156,
1306
- "n_bytes": 1664455,
1307
- "n_tokens": 530592,
1308
- "n_chars": 1630297
1309
- },
1310
- "switch_c_2048.cc100-es": {
1311
- "vocab_size": 32100,
1312
- "n_bytes": 1664455,
1313
- "n_tokens": 706400,
1314
- "n_chars": 1630297
1315
- },
1316
- "t5_base.cc100-es": {
1317
- "vocab_size": 32100,
1318
- "n_bytes": 1664455,
1319
- "n_tokens": 706400,
1320
- "n_chars": 1630297
1321
- },
1322
- "t5_large.cc100-es": {
1323
- "vocab_size": 32100,
1324
- "n_bytes": 1664455,
1325
- "n_tokens": 706400,
1326
- "n_chars": 1630297
1327
- },
1328
- "t5_small.cc100-es": {
1329
- "vocab_size": 32100,
1330
- "n_bytes": 1664455,
1331
- "n_tokens": 706400,
1332
- "n_chars": 1630297
1333
- },
1334
- "text_davinci_003.cc100-es": {
1335
- "vocab_size": 50281,
1336
- "n_bytes": 1664455,
1337
- "n_tokens": 569853,
1338
- "n_chars": 1630297
1339
- },
1340
- "tigerbot_13b_chat_v2.cc100-es": {
1341
- "vocab_size": 60515,
1342
- "n_bytes": 1664455,
1343
- "n_tokens": 482553,
1344
- "n_chars": 1630297
1345
- },
1346
- "tigerbot_70b_chat_v4_4k.cc100-es": {
1347
- "vocab_size": 65110,
1348
- "n_bytes": 1664455,
1349
- "n_tokens": 484099,
1350
- "n_chars": 1630297
1351
- },
1352
- "wizardcoder_15b_v1.cc100-es": {
1353
- "vocab_size": 49153,
1354
- "n_bytes": 1664455,
1355
- "n_tokens": 530592,
1356
- "n_chars": 1630297
1357
- },
1358
- "wizardcoder_python_7b_v1.cc100-es": {
1359
- "vocab_size": 32001,
1360
- "n_bytes": 1664455,
1361
- "n_tokens": 492235,
1362
- "n_chars": 1630297
1363
- },
1364
- "wizardlm_7b_v1.cc100-es": {
1365
- "vocab_size": 32001,
1366
- "n_bytes": 1664455,
1367
- "n_tokens": 492235,
1368
- "n_chars": 1630297
1369
- },
1370
- "wizardmath_70b_v1.cc100-es": {
1371
- "vocab_size": 32002,
1372
- "n_bytes": 1664455,
1373
- "n_tokens": 492235,
1374
- "n_chars": 1630297
1375
- },
1376
- "xlm_roberta.cc100-es": {
1377
- "vocab_size": 250002,
1378
- "n_bytes": 1664455,
1379
- "n_tokens": 399850,
1380
- "n_chars": 1630297
1381
- },
1382
- "yi_34b.cc100-es": {
1383
- "vocab_size": 64000,
1384
- "n_bytes": 1664455,
1385
- "n_tokens": 577018,
1386
- "n_chars": 1630297
1387
- },
1388
- "yi_6b.cc100-es": {
1389
- "vocab_size": 64000,
1390
- "n_bytes": 1664455,
1391
- "n_tokens": 577018,
1392
- "n_chars": 1630297
1393
- },
1394
- "yi_vl34b.cc100-es": {
1395
- "vocab_size": 64000,
1396
- "n_bytes": 1664455,
1397
- "n_tokens": 576794,
1398
- "n_chars": 1630297
1399
- },
1400
- "zephyr_7b_beta.cc100-es": {
1401
- "vocab_size": 32000,
1402
- "n_bytes": 1664455,
1403
- "n_tokens": 513915,
1404
- "n_chars": 1630297
1405
- },
1406
- "aya_101.cc100-fr": {
1407
- "vocab_size": 250100,
1408
- "n_bytes": 1540504,
1409
- "n_tokens": 470944,
1410
- "n_chars": 1484970
1411
- },
1412
- "baichuan.cc100-fr": {
1413
- "vocab_size": 64000,
1414
- "n_bytes": 1540504,
1415
- "n_tokens": 540430,
1416
- "n_chars": 1484970
1417
- },
1418
- "baichuan2.cc100-fr": {
1419
- "vocab_size": 125696,
1420
- "n_bytes": 1540504,
1421
- "n_tokens": 512313,
1422
- "n_chars": 1484970
1423
- },
1424
- "bert_base_cased.cc100-fr": {
1425
- "vocab_size": 28996,
1426
- "n_bytes": 1540504,
1427
- "n_tokens": 583210,
1428
- "n_chars": 1484970
1429
- },
1430
- "bert_base_chinese.cc100-fr": {
1431
- "vocab_size": 21128,
1432
- "n_bytes": 1540504,
1433
- "n_tokens": 553134,
1434
- "n_chars": 1484970
1435
- },
1436
- "bert_base_uncased.cc100-fr": {
1437
- "vocab_size": 30522,
1438
- "n_bytes": 1540504,
1439
- "n_tokens": 504075,
1440
- "n_chars": 1484970
1441
- },
1442
- "bloom.cc100-fr": {
1443
- "vocab_size": 250680,
1444
- "n_bytes": 1540504,
1445
- "n_tokens": 321639,
1446
- "n_chars": 1484970
1447
- },
1448
- "byt5_small.cc100-fr": {
1449
- "vocab_size": 384,
1450
- "n_bytes": 1540504,
1451
- "n_tokens": 1550504,
1452
- "n_chars": 1484970
1453
- },
1454
- "character_glm_6b.cc100-fr": {
1455
- "vocab_size": 64789,
1456
- "n_bytes": 1540504,
1457
- "n_tokens": 515052,
1458
- "n_chars": 1484970
1459
- },
1460
- "chatglm2_6b.cc100-fr": {
1461
- "vocab_size": 64787,
1462
- "n_bytes": 1540504,
1463
- "n_tokens": 515028,
1464
- "n_chars": 1484970
1465
- },
1466
- "chatglm3_6b.cc100-fr": {
1467
- "vocab_size": 64796,
1468
- "n_bytes": 1540504,
1469
- "n_tokens": 515052,
1470
- "n_chars": 1484970
1471
- },
1472
- "chatglm_6b.cc100-fr": {
1473
- "vocab_size": 150344,
1474
- "n_bytes": 1540504,
1475
- "n_tokens": 499261,
1476
- "n_chars": 1484970
1477
- },
1478
- "chatyuan_large_v2.cc100-fr": {
1479
- "vocab_size": 32128,
1480
- "n_bytes": 1540504,
1481
- "n_tokens": 822012,
1482
- "n_chars": 1484970
1483
- },
1484
- "chinese_llama.cc100-fr": {
1485
- "vocab_size": 49953,
1486
- "n_bytes": 1540504,
1487
- "n_tokens": 450352,
1488
- "n_chars": 1484970
1489
- },
1490
- "chinese_llama2.cc100-fr": {
1491
- "vocab_size": 55296,
1492
- "n_bytes": 1540504,
1493
- "n_tokens": 457243,
1494
- "n_chars": 1484970
1495
- },
1496
- "code_davinci_002.cc100-fr": {
1497
- "vocab_size": 50281,
1498
- "n_bytes": 1540504,
1499
- "n_tokens": 521776,
1500
- "n_chars": 1484970
1501
- },
1502
- "crystal_coder.cc100-fr": {
1503
- "vocab_size": 32022,
1504
- "n_bytes": 1540504,
1505
- "n_tokens": 447243,
1506
- "n_chars": 1484970
1507
- },
1508
- "dbrx_instruct.cc100-fr": {
1509
- "vocab_size": 100280,
1510
- "n_bytes": 1540504,
1511
- "n_tokens": 412685,
1512
- "n_chars": 1484970
1513
- },
1514
- "deepseek_coder_33b_instruct.cc100-fr": {
1515
- "vocab_size": 32022,
1516
- "n_bytes": 1540504,
1517
- "n_tokens": 537538,
1518
- "n_chars": 1484970
1519
- },
1520
- "deepseek_llm_7b_base.cc100-fr": {
1521
- "vocab_size": 100015,
1522
- "n_bytes": 1540504,
1523
- "n_tokens": 507693,
1524
- "n_chars": 1484970
1525
- },
1526
- "falcon_180b.cc100-fr": {
1527
- "vocab_size": 65024,
1528
- "n_bytes": 1540504,
1529
- "n_tokens": 407853,
1530
- "n_chars": 1484970
1531
- },
1532
- "falcon_7b.cc100-fr": {
1533
- "vocab_size": 65024,
1534
- "n_bytes": 1540504,
1535
- "n_tokens": 407853,
1536
- "n_chars": 1484970
1537
- },
1538
- "fastchat_t5_3b.cc100-fr": {
1539
- "vocab_size": 32110,
1540
- "n_bytes": 1540504,
1541
- "n_tokens": 717675,
1542
- "n_chars": 1484970
1543
- },
1544
- "flan_t5_base.cc100-fr": {
1545
- "vocab_size": 32100,
1546
- "n_bytes": 1540504,
1547
- "n_tokens": 476135,
1548
- "n_chars": 1484970
1549
- },
1550
- "gemma_7b.cc100-fr": {
1551
- "vocab_size": 256000,
1552
- "n_bytes": 1540504,
1553
- "n_tokens": 374551,
1554
- "n_chars": 1484970
1555
- },
1556
- "gpt2.cc100-fr": {
1557
- "vocab_size": 50257,
1558
- "n_bytes": 1540504,
1559
- "n_tokens": 521776,
1560
- "n_chars": 1484970
1561
- },
1562
- "gpt2_chinese.cc100-fr": {
1563
- "vocab_size": 21128,
1564
- "n_bytes": 1540504,
1565
- "n_tokens": 636442,
1566
- "n_chars": 1484970
1567
- },
1568
- "gpt_35_turbo.cc100-fr": {
1569
- "vocab_size": 100277,
1570
- "n_bytes": 1540504,
1571
- "n_tokens": 412685,
1572
- "n_chars": 1484970
1573
- },
1574
- "gpt_4.cc100-fr": {
1575
- "vocab_size": 100277,
1576
- "n_bytes": 1540504,
1577
- "n_tokens": 412685,
1578
- "n_chars": 1484970
1579
- },
1580
- "gpt_nexo_20b.cc100-fr": {
1581
- "vocab_size": 50277,
1582
- "n_bytes": 1540504,
1583
- "n_tokens": 458961,
1584
- "n_chars": 1484970
1585
- },
1586
- "grok_1.cc100-fr": {
1587
- "vocab_size": 131072,
1588
- "n_bytes": 1540504,
1589
- "n_tokens": 428298,
1590
- "n_chars": 1484970
1591
- },
1592
- "internlm2_chat_7b.cc100-fr": {
1593
- "vocab_size": 92544,
1594
- "n_bytes": 1540504,
1595
- "n_tokens": 496629,
1596
- "n_chars": 1484970
1597
- },
1598
- "internlm2_math_7b.cc100-fr": {
1599
- "vocab_size": 92544,
1600
- "n_bytes": 1540504,
1601
- "n_tokens": 496629,
1602
- "n_chars": 1484970
1603
- },
1604
- "internlm_chat_7b.cc100-fr": {
1605
- "vocab_size": 103168,
1606
- "n_bytes": 1540504,
1607
- "n_tokens": 495045,
1608
- "n_chars": 1484970
1609
- },
1610
- "internlm_xcomposer_7b.cc100-fr": {
1611
- "vocab_size": 103168,
1612
- "n_bytes": 1540504,
1613
- "n_tokens": 495045,
1614
- "n_chars": 1484970
1615
- },
1616
- "jamba_v0_1.cc100-fr": {
1617
- "vocab_size": 65536,
1618
- "n_bytes": 1540504,
1619
- "n_tokens": 412899,
1620
- "n_chars": 1484970
1621
- },
1622
- "kplug.cc100-fr": {
1623
- "vocab_size": 10261,
1624
- "n_bytes": 1540504,
1625
- "n_tokens": 638107,
1626
- "n_chars": 1484970
1627
- },
1628
- "llama.cc100-fr": {
1629
- "vocab_size": 32000,
1630
- "n_bytes": 1540504,
1631
- "n_tokens": 457243,
1632
- "n_chars": 1484970
1633
- },
1634
- "llama2.cc100-fr": {
1635
- "vocab_size": 32001,
1636
- "n_bytes": 1540504,
1637
- "n_tokens": 457243,
1638
- "n_chars": 1484970
1639
- },
1640
- "llama3.cc100-fr": {
1641
- "vocab_size": 128256,
1642
- "n_bytes": 1540504,
1643
- "n_tokens": 412146,
1644
- "n_chars": 1484970
1645
- },
1646
- "mistral_7b.cc100-fr": {
1647
- "vocab_size": 32000,
1648
- "n_bytes": 1540504,
1649
- "n_tokens": 476666,
1650
- "n_chars": 1484970
1651
- },
1652
- "mixtral_8_7b.cc100-fr": {
1653
- "vocab_size": 32000,
1654
- "n_bytes": 1540504,
1655
- "n_tokens": 476666,
1656
- "n_chars": 1484970
1657
- },
1658
- "mobilebert_uncased.cc100-fr": {
1659
- "vocab_size": 30522,
1660
- "n_bytes": 1540504,
1661
- "n_tokens": 504075,
1662
- "n_chars": 1484970
1663
- },
1664
- "moss.cc100-fr": {
1665
- "vocab_size": 106072,
1666
- "n_bytes": 1540504,
1667
- "n_tokens": 515669,
1668
- "n_chars": 1484970
1669
- },
1670
- "mt5_large.cc100-fr": {
1671
- "vocab_size": 250100,
1672
- "n_bytes": 1540504,
1673
- "n_tokens": 470944,
1674
- "n_chars": 1484970
1675
- },
1676
- "olmo_7b.cc100-fr": {
1677
- "vocab_size": 50280,
1678
- "n_bytes": 1540504,
1679
- "n_tokens": 458961,
1680
- "n_chars": 1484970
1681
- },
1682
- "orion_14b_chat.cc100-fr": {
1683
- "vocab_size": 84608,
1684
- "n_bytes": 1540504,
1685
- "n_tokens": 564107,
1686
- "n_chars": 1484970
1687
- },
1688
- "phi_1.cc100-fr": {
1689
- "vocab_size": 50295,
1690
- "n_bytes": 1540504,
1691
- "n_tokens": 521776,
1692
- "n_chars": 1484970
1693
- },
1694
- "phi_2.cc100-fr": {
1695
- "vocab_size": 50295,
1696
- "n_bytes": 1540504,
1697
- "n_tokens": 521776,
1698
- "n_chars": 1484970
1699
- },
1700
- "phi_3_mini.cc100-fr": {
1701
- "vocab_size": 32011,
1702
- "n_bytes": 1540504,
1703
- "n_tokens": 457243,
1704
- "n_chars": 1484970
1705
- },
1706
- "pko_t5_large.cc100-fr": {
1707
- "vocab_size": 50358,
1708
- "n_bytes": 1540504,
1709
- "n_tokens": 1044665,
1710
- "n_chars": 1484970
1711
- },
1712
- "prompt_clue.cc100-fr": {
1713
- "vocab_size": 32128,
1714
- "n_bytes": 1540504,
1715
- "n_tokens": 822012,
1716
- "n_chars": 1484970
1717
- },
1718
- "qwen1_5_14b_chat.cc100-fr": {
1719
- "vocab_size": 151646,
1720
- "n_bytes": 1540504,
1721
- "n_tokens": 413637,
1722
- "n_chars": 1484970
1723
- },
1724
- "qwen_1_8b_chat.cc100-fr": {
1725
- "vocab_size": 151851,
1726
- "n_bytes": 1540504,
1727
- "n_tokens": 413637,
1728
- "n_chars": 1484970
1729
- },
1730
- "qwen_72b_chat.cc100-fr": {
1731
- "vocab_size": 151851,
1732
- "n_bytes": 1540504,
1733
- "n_tokens": 413637,
1734
- "n_chars": 1484970
1735
- },
1736
- "qwen_7b_chat.cc100-fr": {
1737
- "vocab_size": 151851,
1738
- "n_bytes": 1540504,
1739
- "n_tokens": 413637,
1740
- "n_chars": 1484970
1741
- },
1742
- "roberta_chinese_clue.cc100-fr": {
1743
- "vocab_size": 8021,
1744
- "n_bytes": 1540504,
1745
- "n_tokens": 787363,
1746
- "n_chars": 1484970
1747
- },
1748
- "skywork_13b_base.cc100-fr": {
1749
- "vocab_size": 65519,
1750
- "n_bytes": 1540504,
1751
- "n_tokens": 457233,
1752
- "n_chars": 1484970
1753
- },
1754
- "skywork_13b_math.cc100-fr": {
1755
- "vocab_size": 65519,
1756
- "n_bytes": 1540504,
1757
- "n_tokens": 457233,
1758
- "n_chars": 1484970
1759
- },
1760
- "solar_10_7b.cc100-fr": {
1761
- "vocab_size": 32000,
1762
- "n_bytes": 1540504,
1763
- "n_tokens": 476666,
1764
- "n_chars": 1484970
1765
- },
1766
- "starchat_alpha.cc100-fr": {
1767
- "vocab_size": 49156,
1768
- "n_bytes": 1540504,
1769
- "n_tokens": 509958,
1770
- "n_chars": 1484970
1771
- },
1772
- "switch_c_2048.cc100-fr": {
1773
- "vocab_size": 32100,
1774
- "n_bytes": 1540504,
1775
- "n_tokens": 476133,
1776
- "n_chars": 1484970
1777
- },
1778
- "t5_base.cc100-fr": {
1779
- "vocab_size": 32100,
1780
- "n_bytes": 1540504,
1781
- "n_tokens": 476133,
1782
- "n_chars": 1484970
1783
- },
1784
- "t5_large.cc100-fr": {
1785
- "vocab_size": 32100,
1786
- "n_bytes": 1540504,
1787
- "n_tokens": 476133,
1788
- "n_chars": 1484970
1789
- },
1790
- "t5_small.cc100-fr": {
1791
- "vocab_size": 32100,
1792
- "n_bytes": 1540504,
1793
- "n_tokens": 476133,
1794
- "n_chars": 1484970
1795
- },
1796
- "text_davinci_003.cc100-fr": {
1797
- "vocab_size": 50281,
1798
- "n_bytes": 1540504,
1799
- "n_tokens": 521776,
1800
- "n_chars": 1484970
1801
- },
1802
- "tigerbot_13b_chat_v2.cc100-fr": {
1803
- "vocab_size": 60515,
1804
- "n_bytes": 1540504,
1805
- "n_tokens": 447372,
1806
- "n_chars": 1484970
1807
- },
1808
- "tigerbot_70b_chat_v4_4k.cc100-fr": {
1809
- "vocab_size": 65110,
1810
- "n_bytes": 1540504,
1811
- "n_tokens": 448567,
1812
- "n_chars": 1484970
1813
- },
1814
- "wizardcoder_15b_v1.cc100-fr": {
1815
- "vocab_size": 49153,
1816
- "n_bytes": 1540504,
1817
- "n_tokens": 509958,
1818
- "n_chars": 1484970
1819
- },
1820
- "wizardcoder_python_7b_v1.cc100-fr": {
1821
- "vocab_size": 32001,
1822
- "n_bytes": 1540504,
1823
- "n_tokens": 457243,
1824
- "n_chars": 1484970
1825
- },
1826
- "wizardlm_7b_v1.cc100-fr": {
1827
- "vocab_size": 32001,
1828
- "n_bytes": 1540504,
1829
- "n_tokens": 457243,
1830
- "n_chars": 1484970
1831
- },
1832
- "wizardmath_70b_v1.cc100-fr": {
1833
- "vocab_size": 32002,
1834
- "n_bytes": 1540504,
1835
- "n_tokens": 457243,
1836
- "n_chars": 1484970
1837
- },
1838
- "xlm_roberta.cc100-fr": {
1839
- "vocab_size": 250002,
1840
- "n_bytes": 1540504,
1841
- "n_tokens": 405041,
1842
- "n_chars": 1484970
1843
- },
1844
- "yi_34b.cc100-fr": {
1845
- "vocab_size": 64000,
1846
- "n_bytes": 1540504,
1847
- "n_tokens": 533106,
1848
- "n_chars": 1484970
1849
- },
1850
- "yi_6b.cc100-fr": {
1851
- "vocab_size": 64000,
1852
- "n_bytes": 1540504,
1853
- "n_tokens": 533106,
1854
- "n_chars": 1484970
1855
- },
1856
- "yi_vl34b.cc100-fr": {
1857
- "vocab_size": 64000,
1858
- "n_bytes": 1540504,
1859
- "n_tokens": 532288,
1860
- "n_chars": 1484970
1861
- },
1862
- "zephyr_7b_beta.cc100-fr": {
1863
- "vocab_size": 32000,
1864
- "n_bytes": 1540504,
1865
- "n_tokens": 476666,
1866
- "n_chars": 1484970
1867
- },
1868
- "gpt_neox_japanese_2_7b.cc100-en": {
1869
- "vocab_size": 32000,
1870
- "n_bytes": 1124813,
1871
- "n_tokens": 1121413,
1872
- "n_chars": 1121360
1873
- },
1874
- "gpt_neox_japanese_2_7b.cc100-zh-Hans": {
1875
- "vocab_size": 32000,
1876
- "n_bytes": 2633047,
1877
- "n_tokens": 1049033,
1878
- "n_chars": 927311
1879
- },
1880
- "aya_101.cc100-ja": {
1881
- "vocab_size": 250100,
1882
- "n_bytes": 1774770,
1883
- "n_tokens": 300542,
1884
- "n_chars": 603065
1885
- },
1886
- "baichuan.cc100-ja": {
1887
- "vocab_size": 64000,
1888
- "n_bytes": 1774770,
1889
- "n_tokens": 591656,
1890
- "n_chars": 603065
1891
- },
1892
- "baichuan2.cc100-ja": {
1893
- "vocab_size": 125696,
1894
- "n_bytes": 1774770,
1895
- "n_tokens": 554936,
1896
- "n_chars": 603065
1897
- },
1898
- "bert_base_cased.cc100-ja": {
1899
- "vocab_size": 28996,
1900
- "n_bytes": 1774770,
1901
- "n_tokens": 410492,
1902
- "n_chars": 603065
1903
- },
1904
- "bert_base_chinese.cc100-ja": {
1905
- "vocab_size": 21128,
1906
- "n_bytes": 1774770,
1907
- "n_tokens": 396831,
1908
- "n_chars": 603065
1909
- },
1910
- "bert_base_uncased.cc100-ja": {
1911
- "vocab_size": 30522,
1912
- "n_bytes": 1774770,
1913
- "n_tokens": 580634,
1914
- "n_chars": 603065
1915
- },
1916
- "bloom.cc100-ja": {
1917
- "vocab_size": 250680,
1918
- "n_bytes": 1774770,
1919
- "n_tokens": 523592,
1920
- "n_chars": 603065
1921
- },
1922
- "byt5_small.cc100-ja": {
1923
- "vocab_size": 384,
1924
- "n_bytes": 1774770,
1925
- "n_tokens": 1784770,
1926
- "n_chars": 603065
1927
- },
1928
- "aya_101.cc100-ar": {
1929
- "vocab_size": 250100,
1930
- "n_bytes": 2813283,
1931
- "n_tokens": 631736,
1932
- "n_chars": 1560987
1933
- },
1934
- "baichuan.cc100-ar": {
1935
- "vocab_size": 64000,
1936
- "n_bytes": 2813283,
1937
- "n_tokens": 1422976,
1938
- "n_chars": 1560987
1939
- },
1940
- "baichuan2.cc100-ar": {
1941
- "vocab_size": 125696,
1942
- "n_bytes": 2813283,
1943
- "n_tokens": 1337285,
1944
- "n_chars": 1560987
1945
- },
1946
- "bert_base_cased.cc100-ar": {
1947
- "vocab_size": 28996,
1948
- "n_bytes": 2813283,
1949
- "n_tokens": 1232449,
1950
- "n_chars": 1560987
1951
- },
1952
- "bert_base_chinese.cc100-ar": {
1953
- "vocab_size": 21128,
1954
- "n_bytes": 2813283,
1955
- "n_tokens": 536389,
1956
- "n_chars": 1560987
1957
- },
1958
- "bert_base_uncased.cc100-ar": {
1959
- "vocab_size": 30522,
1960
- "n_bytes": 2813283,
1961
- "n_tokens": 1269370,
1962
- "n_chars": 1560987
1963
- },
1964
- "bloom.cc100-ar": {
1965
- "vocab_size": 250680,
1966
- "n_bytes": 2813283,
1967
- "n_tokens": 427489,
1968
- "n_chars": 1560987
1969
- },
1970
- "byt5_small.cc100-ar": {
1971
- "vocab_size": 384,
1972
- "n_bytes": 2813283,
1973
- "n_tokens": 2823283,
1974
- "n_chars": 1560987
1975
- },
1976
- "character_glm_6b.cc100-ar": {
1977
- "vocab_size": 64789,
1978
- "n_bytes": 2813283,
1979
- "n_tokens": 1441847,
1980
- "n_chars": 1560987
1981
- },
1982
- "chatglm2_6b.cc100-ar": {
1983
- "vocab_size": 64787,
1984
- "n_bytes": 2813283,
1985
- "n_tokens": 1441847,
1986
- "n_chars": 1560987
1987
- },
1988
- "chatglm3_6b.cc100-ar": {
1989
- "vocab_size": 64796,
1990
- "n_bytes": 2813283,
1991
- "n_tokens": 1441847,
1992
- "n_chars": 1560987
1993
- },
1994
- "chatglm_6b.cc100-ar": {
1995
- "vocab_size": 150344,
1996
- "n_bytes": 2813283,
1997
- "n_tokens": 1097200,
1998
- "n_chars": 1560987
1999
- },
2000
- "chatyuan_large_v2.cc100-ar": {
2001
- "vocab_size": 32128,
2002
- "n_bytes": 2813283,
2003
- "n_tokens": 1006313,
2004
- "n_chars": 1560987
2005
- },
2006
- "chinese_llama.cc100-ar": {
2007
- "vocab_size": 49953,
2008
- "n_bytes": 2813283,
2009
- "n_tokens": 1421625,
2010
- "n_chars": 1560987
2011
- },
2012
- "chinese_llama2.cc100-ar": {
2013
- "vocab_size": 55296,
2014
- "n_bytes": 2813283,
2015
- "n_tokens": 1432081,
2016
- "n_chars": 1560987
2017
- },
2018
- "code_davinci_002.cc100-ar": {
2019
- "vocab_size": 50281,
2020
- "n_bytes": 2813283,
2021
- "n_tokens": 1558111,
2022
- "n_chars": 1560987
2023
- },
2024
- "crystal_coder.cc100-ar": {
2025
- "vocab_size": 32022,
2026
- "n_bytes": 2813283,
2027
- "n_tokens": 1422081,
2028
- "n_chars": 1560987
2029
- },
2030
- "dbrx_instruct.cc100-ar": {
2031
- "vocab_size": 100280,
2032
- "n_bytes": 2813283,
2033
- "n_tokens": 1105640,
2034
- "n_chars": 1560987
2035
- },
2036
- "deepseek_coder_33b_instruct.cc100-ar": {
2037
- "vocab_size": 32022,
2038
- "n_bytes": 2813283,
2039
- "n_tokens": 1958863,
2040
- "n_chars": 1560987
2041
- },
2042
- "deepseek_llm_7b_base.cc100-ar": {
2043
- "vocab_size": 100015,
2044
- "n_bytes": 2813283,
2045
- "n_tokens": 1426103,
2046
- "n_chars": 1560987
2047
- },
2048
- "falcon_180b.cc100-ar": {
2049
- "vocab_size": 65024,
2050
- "n_bytes": 2813283,
2051
- "n_tokens": 1597443,
2052
- "n_chars": 1560987
2053
- },
2054
- "falcon_7b.cc100-ar": {
2055
- "vocab_size": 65024,
2056
- "n_bytes": 2813283,
2057
- "n_tokens": 1597443,
2058
- "n_chars": 1560987
2059
- },
2060
- "fastchat_t5_3b.cc100-ar": {
2061
- "vocab_size": 32110,
2062
- "n_bytes": 2813283,
2063
- "n_tokens": 832267,
2064
- "n_chars": 1560987
2065
- },
2066
- "flan_t5_base.cc100-ar": {
2067
- "vocab_size": 32100,
2068
- "n_bytes": 2813283,
2069
- "n_tokens": 568957,
2070
- "n_chars": 1560987
2071
- },
2072
- "gemma_7b.cc100-ar": {
2073
- "vocab_size": 256000,
2074
- "n_bytes": 2813283,
2075
- "n_tokens": 573788,
2076
- "n_chars": 1560987
2077
- },
2078
- "gpt2.cc100-ar": {
2079
- "vocab_size": 50257,
2080
- "n_bytes": 2813283,
2081
- "n_tokens": 1558111,
2082
- "n_chars": 1560987
2083
- },
2084
- "gpt2_chinese.cc100-ar": {
2085
- "vocab_size": 21128,
2086
- "n_bytes": 2813283,
2087
- "n_tokens": 617677,
2088
- "n_chars": 1560987
2089
- },
2090
- "gpt_35_turbo.cc100-ar": {
2091
- "vocab_size": 100277,
2092
- "n_bytes": 2813283,
2093
- "n_tokens": 1105640,
2094
- "n_chars": 1560987
2095
- },
2096
- "gpt_4.cc100-ar": {
2097
- "vocab_size": 100277,
2098
- "n_bytes": 2813283,
2099
- "n_tokens": 1105640,
2100
- "n_chars": 1560987
2101
- },
2102
- "gpt_neox_japanese_2_7b.cc100-ar": {
2103
- "vocab_size": 32000,
2104
- "n_bytes": 2813283,
2105
- "n_tokens": 2809195,
2106
- "n_chars": 1560987
2107
- },
2108
- "gpt_nexo_20b.cc100-ar": {
2109
- "vocab_size": 50277,
2110
- "n_bytes": 2813283,
2111
- "n_tokens": 1106277,
2112
- "n_chars": 1560987
2113
- },
2114
- "grok_1.cc100-ar": {
2115
- "vocab_size": 131072,
2116
- "n_bytes": 2813283,
2117
- "n_tokens": 1392088,
2118
- "n_chars": 1560987
2119
- },
2120
- "internlm2_chat_7b.cc100-ar": {
2121
- "vocab_size": 92544,
2122
- "n_bytes": 2813283,
2123
- "n_tokens": 1635378,
2124
- "n_chars": 1560987
2125
- },
2126
- "internlm2_math_7b.cc100-ar": {
2127
- "vocab_size": 92544,
2128
- "n_bytes": 2813283,
2129
- "n_tokens": 1635378,
2130
- "n_chars": 1560987
2131
- },
2132
- "internlm_chat_7b.cc100-ar": {
2133
- "vocab_size": 103168,
2134
- "n_bytes": 2813283,
2135
- "n_tokens": 532046,
2136
- "n_chars": 1560987
2137
- },
2138
- "internlm_xcomposer_7b.cc100-ar": {
2139
- "vocab_size": 103168,
2140
- "n_bytes": 2813283,
2141
- "n_tokens": 532046,
2142
- "n_chars": 1560987
2143
- },
2144
- "jamba_v0_1.cc100-ar": {
2145
- "vocab_size": 65536,
2146
- "n_bytes": 2813283,
2147
- "n_tokens": 727886,
2148
- "n_chars": 1560987
2149
- },
2150
- "kplug.cc100-ar": {
2151
- "vocab_size": 10261,
2152
- "n_bytes": 2813283,
2153
- "n_tokens": 331987,
2154
- "n_chars": 1560987
2155
- },
2156
- "llama.cc100-ar": {
2157
- "vocab_size": 32000,
2158
- "n_bytes": 2813283,
2159
- "n_tokens": 1432081,
2160
- "n_chars": 1560987
2161
- },
2162
- "llama2.cc100-ar": {
2163
- "vocab_size": 32001,
2164
- "n_bytes": 2813283,
2165
- "n_tokens": 1432081,
2166
- "n_chars": 1560987
2167
- },
2168
- "llama3.cc100-ar": {
2169
- "vocab_size": 128256,
2170
- "n_bytes": 2813283,
2171
- "n_tokens": 615514,
2172
- "n_chars": 1560987
2173
- },
2174
- "mistral_7b.cc100-ar": {
2175
- "vocab_size": 32000,
2176
- "n_bytes": 2813283,
2177
- "n_tokens": 1406319,
2178
- "n_chars": 1560987
2179
- },
2180
- "mixtral_8_7b.cc100-ar": {
2181
- "vocab_size": 32000,
2182
- "n_bytes": 2813283,
2183
- "n_tokens": 1406319,
2184
- "n_chars": 1560987
2185
- },
2186
- "mobilebert_uncased.cc100-ar": {
2187
- "vocab_size": 30522,
2188
- "n_bytes": 2813283,
2189
- "n_tokens": 1269370,
2190
- "n_chars": 1560987
2191
- },
2192
- "moss.cc100-ar": {
2193
- "vocab_size": 106072,
2194
- "n_bytes": 2813283,
2195
- "n_tokens": 1557671,
2196
- "n_chars": 1560987
2197
- },
2198
- "mt5_large.cc100-ar": {
2199
- "vocab_size": 250100,
2200
- "n_bytes": 2813283,
2201
- "n_tokens": 631736,
2202
- "n_chars": 1560987
2203
- },
2204
- "olmo_7b.cc100-ar": {
2205
- "vocab_size": 50280,
2206
- "n_bytes": 2813283,
2207
- "n_tokens": 1106277,
2208
- "n_chars": 1560987
2209
- },
2210
- "orion_14b_chat.cc100-ar": {
2211
- "vocab_size": 84608,
2212
- "n_bytes": 2813283,
2213
- "n_tokens": 1531053,
2214
- "n_chars": 1560987
2215
- },
2216
- "phi_1.cc100-ar": {
2217
- "vocab_size": 50295,
2218
- "n_bytes": 2813283,
2219
- "n_tokens": 1558111,
2220
- "n_chars": 1560987
2221
- },
2222
- "phi_2.cc100-ar": {
2223
- "vocab_size": 50295,
2224
- "n_bytes": 2813283,
2225
- "n_tokens": 1558111,
2226
- "n_chars": 1560987
2227
- },
2228
- "phi_3_mini.cc100-ar": {
2229
- "vocab_size": 32011,
2230
- "n_bytes": 2813283,
2231
- "n_tokens": 1432081,
2232
- "n_chars": 1560987
2233
- },
2234
- "pko_t5_large.cc100-ar": {
2235
- "vocab_size": 50358,
2236
- "n_bytes": 2813283,
2237
- "n_tokens": 2815586,
2238
- "n_chars": 1560987
2239
- },
2240
- "prompt_clue.cc100-ar": {
2241
- "vocab_size": 32128,
2242
- "n_bytes": 2813283,
2243
- "n_tokens": 1006313,
2244
- "n_chars": 1560987
2245
- },
2246
- "qwen1_5_14b_chat.cc100-ar": {
2247
- "vocab_size": 151646,
2248
- "n_bytes": 2813283,
2249
- "n_tokens": 614959,
2250
- "n_chars": 1560987
2251
- },
2252
- "qwen_1_8b_chat.cc100-ar": {
2253
- "vocab_size": 151851,
2254
- "n_bytes": 2813283,
2255
- "n_tokens": 614959,
2256
- "n_chars": 1560987
2257
- },
2258
- "qwen_72b_chat.cc100-ar": {
2259
- "vocab_size": 151851,
2260
- "n_bytes": 2813283,
2261
- "n_tokens": 614959,
2262
- "n_chars": 1560987
2263
- },
2264
- "qwen_7b_chat.cc100-ar": {
2265
- "vocab_size": 151851,
2266
- "n_bytes": 2813283,
2267
- "n_tokens": 614959,
2268
- "n_chars": 1560987
2269
- },
2270
- "roberta_chinese_clue.cc100-ar": {
2271
- "vocab_size": 8021,
2272
- "n_bytes": 2813283,
2273
- "n_tokens": 621762,
2274
- "n_chars": 1560987
2275
- },
2276
- "skywork_13b_base.cc100-ar": {
2277
- "vocab_size": 65519,
2278
- "n_bytes": 2813283,
2279
- "n_tokens": 1432065,
2280
- "n_chars": 1560987
2281
- },
2282
- "skywork_13b_math.cc100-ar": {
2283
- "vocab_size": 65519,
2284
- "n_bytes": 2813283,
2285
- "n_tokens": 1432065,
2286
- "n_chars": 1560987
2287
- },
2288
- "solar_10_7b.cc100-ar": {
2289
- "vocab_size": 32000,
2290
- "n_bytes": 2813283,
2291
- "n_tokens": 1406319,
2292
- "n_chars": 1560987
2293
- },
2294
- "starchat_alpha.cc100-ar": {
2295
- "vocab_size": 49156,
2296
- "n_bytes": 2813283,
2297
- "n_tokens": 1195640,
2298
- "n_chars": 1560987
2299
- },
2300
- "switch_c_2048.cc100-ar": {
2301
- "vocab_size": 32100,
2302
- "n_bytes": 2813283,
2303
- "n_tokens": 568855,
2304
- "n_chars": 1560987
2305
- },
2306
- "t5_base.cc100-ar": {
2307
- "vocab_size": 32100,
2308
- "n_bytes": 2813283,
2309
- "n_tokens": 568855,
2310
- "n_chars": 1560987
2311
- },
2312
- "t5_large.cc100-ar": {
2313
- "vocab_size": 32100,
2314
- "n_bytes": 2813283,
2315
- "n_tokens": 568855,
2316
- "n_chars": 1560987
2317
- },
2318
- "t5_small.cc100-ar": {
2319
- "vocab_size": 32100,
2320
- "n_bytes": 2813283,
2321
- "n_tokens": 568855,
2322
- "n_chars": 1560987
2323
- },
2324
- "text_davinci_003.cc100-ar": {
2325
- "vocab_size": 50281,
2326
- "n_bytes": 2813283,
2327
- "n_tokens": 1558111,
2328
- "n_chars": 1560987
2329
- },
2330
- "tigerbot_13b_chat_v2.cc100-ar": {
2331
- "vocab_size": 60515,
2332
- "n_bytes": 2813283,
2333
- "n_tokens": 1422070,
2334
- "n_chars": 1560987
2335
- },
2336
- "tigerbot_70b_chat_v4_4k.cc100-ar": {
2337
- "vocab_size": 65110,
2338
- "n_bytes": 2813283,
2339
- "n_tokens": 1422073,
2340
- "n_chars": 1560987
2341
- },
2342
- "wizardcoder_15b_v1.cc100-ar": {
2343
- "vocab_size": 49153,
2344
- "n_bytes": 2813283,
2345
- "n_tokens": 1195640,
2346
- "n_chars": 1560987
2347
- },
2348
- "wizardcoder_python_7b_v1.cc100-ar": {
2349
- "vocab_size": 32001,
2350
- "n_bytes": 2813283,
2351
- "n_tokens": 1432081,
2352
- "n_chars": 1560987
2353
- },
2354
- "wizardlm_7b_v1.cc100-ar": {
2355
- "vocab_size": 32001,
2356
- "n_bytes": 2813283,
2357
- "n_tokens": 1432081,
2358
- "n_chars": 1560987
2359
- },
2360
- "wizardmath_70b_v1.cc100-ar": {
2361
- "vocab_size": 32002,
2362
- "n_bytes": 2813283,
2363
- "n_tokens": 1432081,
2364
- "n_chars": 1560987
2365
- },
2366
- "xlm_roberta.cc100-ar": {
2367
- "vocab_size": 250002,
2368
- "n_bytes": 2813283,
2369
- "n_tokens": 518287,
2370
- "n_chars": 1560987
2371
- },
2372
- "yi_34b.cc100-ar": {
2373
- "vocab_size": 64000,
2374
- "n_bytes": 2813283,
2375
- "n_tokens": 1795801,
2376
- "n_chars": 1560987
2377
- },
2378
- "yi_6b.cc100-ar": {
2379
- "vocab_size": 64000,
2380
- "n_bytes": 2813283,
2381
- "n_tokens": 1795801,
2382
- "n_chars": 1560987
2383
- },
2384
- "yi_vl34b.cc100-ar": {
2385
- "vocab_size": 64000,
2386
- "n_bytes": 2813283,
2387
- "n_tokens": 1803957,
2388
- "n_chars": 1560987
2389
- },
2390
- "zephyr_7b_beta.cc100-ar": {
2391
- "vocab_size": 32000,
2392
- "n_bytes": 2813283,
2393
- "n_tokens": 1406319,
2394
- "n_chars": 1560987
2395
- },
2396
- "aya_101.cc100-de": {
2397
- "vocab_size": 250100,
2398
- "n_bytes": 1814876,
2399
- "n_tokens": 480418,
2400
- "n_chars": 1784021
2401
- },
2402
- "baichuan.cc100-de": {
2403
- "vocab_size": 64000,
2404
- "n_bytes": 1814876,
2405
- "n_tokens": 680512,
2406
- "n_chars": 1784021
2407
- },
2408
- "baichuan2.cc100-de": {
2409
- "vocab_size": 125696,
2410
- "n_bytes": 1814876,
2411
- "n_tokens": 628063,
2412
- "n_chars": 1784021
2413
- },
2414
- "bert_base_cased.cc100-de": {
2415
- "vocab_size": 28996,
2416
- "n_bytes": 1814876,
2417
- "n_tokens": 731093,
2418
- "n_chars": 1784021
2419
- },
2420
- "bert_base_chinese.cc100-de": {
2421
- "vocab_size": 21128,
2422
- "n_bytes": 1814876,
2423
- "n_tokens": 561246,
2424
- "n_chars": 1784021
2425
- },
2426
- "bert_base_uncased.cc100-de": {
2427
- "vocab_size": 30522,
2428
- "n_bytes": 1814876,
2429
- "n_tokens": 646485,
2430
- "n_chars": 1784021
2431
- },
2432
- "bloom.cc100-de": {
2433
- "vocab_size": 250680,
2434
- "n_bytes": 1814876,
2435
- "n_tokens": 541170,
2436
- "n_chars": 1784021
2437
- },
2438
- "byt5_small.cc100-de": {
2439
- "vocab_size": 384,
2440
- "n_bytes": 1814876,
2441
- "n_tokens": 1824876,
2442
- "n_chars": 1784021
2443
- },
2444
- "character_glm_6b.cc100-de": {
2445
- "vocab_size": 64789,
2446
- "n_bytes": 1814876,
2447
- "n_tokens": 639822,
2448
- "n_chars": 1784021
2449
- },
2450
- "chatglm2_6b.cc100-de": {
2451
- "vocab_size": 64787,
2452
- "n_bytes": 1814876,
2453
- "n_tokens": 639757,
2454
- "n_chars": 1784021
2455
- },
2456
- "chatglm3_6b.cc100-de": {
2457
- "vocab_size": 64796,
2458
- "n_bytes": 1814876,
2459
- "n_tokens": 639822,
2460
- "n_chars": 1784021
2461
- },
2462
- "chatglm_6b.cc100-de": {
2463
- "vocab_size": 150344,
2464
- "n_bytes": 1814876,
2465
- "n_tokens": 589464,
2466
- "n_chars": 1784021
2467
- },
2468
- "chatyuan_large_v2.cc100-de": {
2469
- "vocab_size": 32128,
2470
- "n_bytes": 1814876,
2471
- "n_tokens": 970463,
2472
- "n_chars": 1784021
2473
- },
2474
- "chinese_llama.cc100-de": {
2475
- "vocab_size": 49953,
2476
- "n_bytes": 1814876,
2477
- "n_tokens": 523859,
2478
- "n_chars": 1784021
2479
- },
2480
- "chinese_llama2.cc100-de": {
2481
- "vocab_size": 55296,
2482
- "n_bytes": 1814876,
2483
- "n_tokens": 537318,
2484
- "n_chars": 1784021
2485
- },
2486
- "code_davinci_002.cc100-de": {
2487
- "vocab_size": 50281,
2488
- "n_bytes": 1814876,
2489
- "n_tokens": 684666,
2490
- "n_chars": 1784021
2491
- },
2492
- "crystal_coder.cc100-de": {
2493
- "vocab_size": 32022,
2494
- "n_bytes": 1814876,
2495
- "n_tokens": 527320,
2496
- "n_chars": 1784021
2497
- },
2498
- "dbrx_instruct.cc100-de": {
2499
- "vocab_size": 100280,
2500
- "n_bytes": 1814876,
2501
- "n_tokens": 500870,
2502
- "n_chars": 1784021
2503
- },
2504
- "deepseek_coder_33b_instruct.cc100-de": {
2505
- "vocab_size": 32022,
2506
- "n_bytes": 1814876,
2507
- "n_tokens": 745618,
2508
- "n_chars": 1784021
2509
- },
2510
- "deepseek_llm_7b_base.cc100-de": {
2511
- "vocab_size": 100015,
2512
- "n_bytes": 1814876,
2513
- "n_tokens": 642573,
2514
- "n_chars": 1784021
2515
- },
2516
- "falcon_180b.cc100-de": {
2517
- "vocab_size": 65024,
2518
- "n_bytes": 1814876,
2519
- "n_tokens": 497054,
2520
- "n_chars": 1784021
2521
- },
2522
- "falcon_7b.cc100-de": {
2523
- "vocab_size": 65024,
2524
- "n_bytes": 1814876,
2525
- "n_tokens": 497054,
2526
- "n_chars": 1784021
2527
- },
2528
- "fastchat_t5_3b.cc100-de": {
2529
- "vocab_size": 32110,
2530
- "n_bytes": 1814876,
2531
- "n_tokens": 736989,
2532
- "n_chars": 1784021
2533
- },
2534
- "flan_t5_base.cc100-de": {
2535
- "vocab_size": 32100,
2536
- "n_bytes": 1814876,
2537
- "n_tokens": 480254,
2538
- "n_chars": 1784021
2539
- },
2540
- "gemma_7b.cc100-de": {
2541
- "vocab_size": 256000,
2542
- "n_bytes": 1814876,
2543
- "n_tokens": 416876,
2544
- "n_chars": 1784021
2545
- },
2546
- "gpt2.cc100-de": {
2547
- "vocab_size": 50257,
2548
- "n_bytes": 1814876,
2549
- "n_tokens": 684669,
2550
- "n_chars": 1784021
2551
- },
2552
- "gpt2_chinese.cc100-de": {
2553
- "vocab_size": 21128,
2554
- "n_bytes": 1814876,
2555
- "n_tokens": 786497,
2556
- "n_chars": 1784021
2557
- },
2558
- "gpt_35_turbo.cc100-de": {
2559
- "vocab_size": 100277,
2560
- "n_bytes": 1814876,
2561
- "n_tokens": 500870,
2562
- "n_chars": 1784021
2563
- },
2564
- "gpt_4.cc100-de": {
2565
- "vocab_size": 100277,
2566
- "n_bytes": 1814876,
2567
- "n_tokens": 500870,
2568
- "n_chars": 1784021
2569
- },
2570
- "gpt_neox_japanese_2_7b.cc100-de": {
2571
- "vocab_size": 32000,
2572
- "n_bytes": 1814876,
2573
- "n_tokens": 1807780,
2574
- "n_chars": 1784021
2575
- },
2576
- "gpt_nexo_20b.cc100-de": {
2577
- "vocab_size": 50277,
2578
- "n_bytes": 1814876,
2579
- "n_tokens": 583628,
2580
- "n_chars": 1784021
2581
- },
2582
- "grok_1.cc100-de": {
2583
- "vocab_size": 131072,
2584
- "n_bytes": 1814876,
2585
- "n_tokens": 505220,
2586
- "n_chars": 1784021
2587
- },
2588
- "internlm2_chat_7b.cc100-de": {
2589
- "vocab_size": 92544,
2590
- "n_bytes": 1814876,
2591
- "n_tokens": 583917,
2592
- "n_chars": 1784021
2593
- },
2594
- "internlm2_math_7b.cc100-de": {
2595
- "vocab_size": 92544,
2596
- "n_bytes": 1814876,
2597
- "n_tokens": 583917,
2598
- "n_chars": 1784021
2599
- },
2600
- "internlm_chat_7b.cc100-de": {
2601
- "vocab_size": 103168,
2602
- "n_bytes": 1814876,
2603
- "n_tokens": 580489,
2604
- "n_chars": 1784021
2605
- },
2606
- "internlm_xcomposer_7b.cc100-de": {
2607
- "vocab_size": 103168,
2608
- "n_bytes": 1814876,
2609
- "n_tokens": 580489,
2610
- "n_chars": 1784021
2611
- },
2612
- "jamba_v0_1.cc100-de": {
2613
- "vocab_size": 65536,
2614
- "n_bytes": 1814876,
2615
- "n_tokens": 535856,
2616
- "n_chars": 1784021
2617
- },
2618
- "kplug.cc100-de": {
2619
- "vocab_size": 10261,
2620
- "n_bytes": 1814876,
2621
- "n_tokens": 789053,
2622
- "n_chars": 1784021
2623
- },
2624
- "llama.cc100-de": {
2625
- "vocab_size": 32000,
2626
- "n_bytes": 1814876,
2627
- "n_tokens": 537320,
2628
- "n_chars": 1784021
2629
- },
2630
- "llama2.cc100-de": {
2631
- "vocab_size": 32001,
2632
- "n_bytes": 1814876,
2633
- "n_tokens": 537320,
2634
- "n_chars": 1784021
2635
- },
2636
- "llama3.cc100-de": {
2637
- "vocab_size": 128256,
2638
- "n_bytes": 1814876,
2639
- "n_tokens": 499766,
2640
- "n_chars": 1784021
2641
- },
2642
- "mistral_7b.cc100-de": {
2643
- "vocab_size": 32000,
2644
- "n_bytes": 1814876,
2645
- "n_tokens": 577526,
2646
- "n_chars": 1784021
2647
- },
2648
- "mixtral_8_7b.cc100-de": {
2649
- "vocab_size": 32000,
2650
- "n_bytes": 1814876,
2651
- "n_tokens": 577526,
2652
- "n_chars": 1784021
2653
- },
2654
- "mobilebert_uncased.cc100-de": {
2655
- "vocab_size": 30522,
2656
- "n_bytes": 1814876,
2657
- "n_tokens": 646485,
2658
- "n_chars": 1784021
2659
- },
2660
- "moss.cc100-de": {
2661
- "vocab_size": 106072,
2662
- "n_bytes": 1814876,
2663
- "n_tokens": 683401,
2664
- "n_chars": 1784021
2665
- },
2666
- "mt5_large.cc100-de": {
2667
- "vocab_size": 250100,
2668
- "n_bytes": 1814876,
2669
- "n_tokens": 480418,
2670
- "n_chars": 1784021
2671
- },
2672
- "olmo_7b.cc100-de": {
2673
- "vocab_size": 50280,
2674
- "n_bytes": 1814876,
2675
- "n_tokens": 583628,
2676
- "n_chars": 1784021
2677
- },
2678
- "orion_14b_chat.cc100-de": {
2679
- "vocab_size": 84608,
2680
- "n_bytes": 1814876,
2681
- "n_tokens": 744404,
2682
- "n_chars": 1784021
2683
- },
2684
- "phi_1.cc100-de": {
2685
- "vocab_size": 50295,
2686
- "n_bytes": 1814876,
2687
- "n_tokens": 684665,
2688
- "n_chars": 1784021
2689
- },
2690
- "phi_2.cc100-de": {
2691
- "vocab_size": 50295,
2692
- "n_bytes": 1814876,
2693
- "n_tokens": 684665,
2694
- "n_chars": 1784021
2695
- },
2696
- "phi_3_mini.cc100-de": {
2697
- "vocab_size": 32011,
2698
- "n_bytes": 1814876,
2699
- "n_tokens": 537320,
2700
- "n_chars": 1784021
2701
- },
2702
- "pko_t5_large.cc100-de": {
2703
- "vocab_size": 50358,
2704
- "n_bytes": 1814876,
2705
- "n_tokens": 1254350,
2706
- "n_chars": 1784021
2707
- },
2708
- "prompt_clue.cc100-de": {
2709
- "vocab_size": 32128,
2710
- "n_bytes": 1814876,
2711
- "n_tokens": 970463,
2712
- "n_chars": 1784021
2713
- },
2714
- "qwen1_5_14b_chat.cc100-de": {
2715
- "vocab_size": 151646,
2716
- "n_bytes": 1814876,
2717
- "n_tokens": 503561,
2718
- "n_chars": 1784021
2719
- },
2720
- "qwen_1_8b_chat.cc100-de": {
2721
- "vocab_size": 151851,
2722
- "n_bytes": 1814876,
2723
- "n_tokens": 503561,
2724
- "n_chars": 1784021
2725
- },
2726
- "qwen_72b_chat.cc100-de": {
2727
- "vocab_size": 151851,
2728
- "n_bytes": 1814876,
2729
- "n_tokens": 503561,
2730
- "n_chars": 1784021
2731
- },
2732
- "qwen_7b_chat.cc100-de": {
2733
- "vocab_size": 151851,
2734
- "n_bytes": 1814876,
2735
- "n_tokens": 503561,
2736
- "n_chars": 1784021
2737
- },
2738
- "roberta_chinese_clue.cc100-de": {
2739
- "vocab_size": 8021,
2740
- "n_bytes": 1814876,
2741
- "n_tokens": 915612,
2742
- "n_chars": 1784021
2743
- },
2744
- "skywork_13b_base.cc100-de": {
2745
- "vocab_size": 65519,
2746
- "n_bytes": 1814876,
2747
- "n_tokens": 537308,
2748
- "n_chars": 1784021
2749
- },
2750
- "skywork_13b_math.cc100-de": {
2751
- "vocab_size": 65519,
2752
- "n_bytes": 1814876,
2753
- "n_tokens": 537308,
2754
- "n_chars": 1784021
2755
- },
2756
- "solar_10_7b.cc100-de": {
2757
- "vocab_size": 32000,
2758
- "n_bytes": 1814876,
2759
- "n_tokens": 577526,
2760
- "n_chars": 1784021
2761
- },
2762
- "starchat_alpha.cc100-de": {
2763
- "vocab_size": 49156,
2764
- "n_bytes": 1814876,
2765
- "n_tokens": 620541,
2766
- "n_chars": 1784021
2767
- },
2768
- "switch_c_2048.cc100-de": {
2769
- "vocab_size": 32100,
2770
- "n_bytes": 1814876,
2771
- "n_tokens": 480254,
2772
- "n_chars": 1784021
2773
- },
2774
- "t5_base.cc100-de": {
2775
- "vocab_size": 32100,
2776
- "n_bytes": 1814876,
2777
- "n_tokens": 480254,
2778
- "n_chars": 1784021
2779
- },
2780
- "t5_large.cc100-de": {
2781
- "vocab_size": 32100,
2782
- "n_bytes": 1814876,
2783
- "n_tokens": 480254,
2784
- "n_chars": 1784021
2785
- },
2786
- "t5_small.cc100-de": {
2787
- "vocab_size": 32100,
2788
- "n_bytes": 1814876,
2789
- "n_tokens": 480254,
2790
- "n_chars": 1784021
2791
- },
2792
- "text_davinci_003.cc100-de": {
2793
- "vocab_size": 50281,
2794
- "n_bytes": 1814876,
2795
- "n_tokens": 684666,
2796
- "n_chars": 1784021
2797
- },
2798
- "tigerbot_13b_chat_v2.cc100-de": {
2799
- "vocab_size": 60515,
2800
- "n_bytes": 1814876,
2801
- "n_tokens": 528918,
2802
- "n_chars": 1784021
2803
- },
2804
- "tigerbot_70b_chat_v4_4k.cc100-de": {
2805
- "vocab_size": 65110,
2806
- "n_bytes": 1814876,
2807
- "n_tokens": 529170,
2808
- "n_chars": 1784021
2809
- },
2810
- "wizardcoder_15b_v1.cc100-de": {
2811
- "vocab_size": 49153,
2812
- "n_bytes": 1814876,
2813
- "n_tokens": 620541,
2814
- "n_chars": 1784021
2815
- },
2816
- "wizardcoder_python_7b_v1.cc100-de": {
2817
- "vocab_size": 32001,
2818
- "n_bytes": 1814876,
2819
- "n_tokens": 537320,
2820
- "n_chars": 1784021
2821
- },
2822
- "wizardlm_7b_v1.cc100-de": {
2823
- "vocab_size": 32001,
2824
- "n_bytes": 1814876,
2825
- "n_tokens": 537320,
2826
- "n_chars": 1784021
2827
- },
2828
- "wizardmath_70b_v1.cc100-de": {
2829
- "vocab_size": 32002,
2830
- "n_bytes": 1814876,
2831
- "n_tokens": 537320,
2832
- "n_chars": 1784021
2833
- },
2834
- "xlm_roberta.cc100-de": {
2835
- "vocab_size": 250002,
2836
- "n_bytes": 1814876,
2837
- "n_tokens": 432571,
2838
- "n_chars": 1784021
2839
- },
2840
- "yi_34b.cc100-de": {
2841
- "vocab_size": 64000,
2842
- "n_bytes": 1814876,
2843
- "n_tokens": 698366,
2844
- "n_chars": 1784021
2845
- },
2846
- "yi_6b.cc100-de": {
2847
- "vocab_size": 64000,
2848
- "n_bytes": 1814876,
2849
- "n_tokens": 698366,
2850
- "n_chars": 1784021
2851
- },
2852
- "yi_vl34b.cc100-de": {
2853
- "vocab_size": 64000,
2854
- "n_bytes": 1814876,
2855
- "n_tokens": 697065,
2856
- "n_chars": 1784021
2857
- },
2858
- "zephyr_7b_beta.cc100-de": {
2859
- "vocab_size": 32000,
2860
- "n_bytes": 1814876,
2861
- "n_tokens": 577526,
2862
- "n_chars": 1784021
2863
- },
2864
- "gpt_neox_japanese_2_7b.cc100-es": {
2865
- "vocab_size": 32000,
2866
- "n_bytes": 1664455,
2867
- "n_tokens": 1658946,
2868
- "n_chars": 1630297
2869
- },
2870
- "gpt_neox_japanese_2_7b.cc100-fr": {
2871
- "vocab_size": 32000,
2872
- "n_bytes": 1540504,
2873
- "n_tokens": 1524129,
2874
- "n_chars": 1484970
2875
- },
2876
- "character_glm_6b.cc100-ja": {
2877
- "vocab_size": 64789,
2878
- "n_bytes": 1774770,
2879
- "n_tokens": 601380,
2880
- "n_chars": 603065
2881
- },
2882
- "chatglm2_6b.cc100-ja": {
2883
- "vocab_size": 64787,
2884
- "n_bytes": 1774770,
2885
- "n_tokens": 601380,
2886
- "n_chars": 603065
2887
- },
2888
- "chatglm3_6b.cc100-ja": {
2889
- "vocab_size": 64796,
2890
- "n_bytes": 1774770,
2891
- "n_tokens": 601380,
2892
- "n_chars": 603065
2893
- },
2894
- "chatglm_6b.cc100-ja": {
2895
- "vocab_size": 150344,
2896
- "n_bytes": 1774770,
2897
- "n_tokens": 489930,
2898
- "n_chars": 603065
2899
- },
2900
- "chatyuan_large_v2.cc100-ja": {
2901
- "vocab_size": 32128,
2902
- "n_bytes": 1774770,
2903
- "n_tokens": 575118,
2904
- "n_chars": 603065
2905
- },
2906
- "chinese_llama.cc100-ja": {
2907
- "vocab_size": 49953,
2908
- "n_bytes": 1774770,
2909
- "n_tokens": 614177,
2910
- "n_chars": 603065
2911
- },
2912
- "chinese_llama2.cc100-ja": {
2913
- "vocab_size": 55296,
2914
- "n_bytes": 1774770,
2915
- "n_tokens": 624362,
2916
- "n_chars": 603065
2917
- },
2918
- "code_davinci_002.cc100-ja": {
2919
- "vocab_size": 50281,
2920
- "n_bytes": 1774770,
2921
- "n_tokens": 844362,
2922
- "n_chars": 603065
2923
- },
2924
- "crystal_coder.cc100-ja": {
2925
- "vocab_size": 32022,
2926
- "n_bytes": 1774770,
2927
- "n_tokens": 718461,
2928
- "n_chars": 603065
2929
- },
2930
- "dbrx_instruct.cc100-ja": {
2931
- "vocab_size": 100280,
2932
- "n_bytes": 1774770,
2933
- "n_tokens": 630348,
2934
- "n_chars": 603065
2935
- },
2936
- "deepseek_coder_33b_instruct.cc100-ja": {
2937
- "vocab_size": 32022,
2938
- "n_bytes": 1774770,
2939
- "n_tokens": 1018060,
2940
- "n_chars": 603065
2941
- },
2942
- "deepseek_llm_7b_base.cc100-ja": {
2943
- "vocab_size": 100015,
2944
- "n_bytes": 1774770,
2945
- "n_tokens": 761467,
2946
- "n_chars": 603065
2947
- },
2948
- "falcon_180b.cc100-ja": {
2949
- "vocab_size": 65024,
2950
- "n_bytes": 1774770,
2951
- "n_tokens": 842458,
2952
- "n_chars": 603065
2953
- },
2954
- "falcon_7b.cc100-ja": {
2955
- "vocab_size": 65024,
2956
- "n_bytes": 1774770,
2957
- "n_tokens": 842458,
2958
- "n_chars": 603065
2959
- },
2960
- "fastchat_t5_3b.cc100-ja": {
2961
- "vocab_size": 32110,
2962
- "n_bytes": 1774770,
2963
- "n_tokens": 53915,
2964
- "n_chars": 603065
2965
- },
2966
- "flan_t5_base.cc100-ja": {
2967
- "vocab_size": 32100,
2968
- "n_bytes": 1774770,
2969
- "n_tokens": 51999,
2970
- "n_chars": 603065
2971
- },
2972
- "gemma_7b.cc100-ja": {
2973
- "vocab_size": 256000,
2974
- "n_bytes": 1774770,
2975
- "n_tokens": 317873,
2976
- "n_chars": 603065
2977
- },
2978
- "gpt2.cc100-ja": {
2979
- "vocab_size": 50257,
2980
- "n_bytes": 1774770,
2981
- "n_tokens": 844362,
2982
- "n_chars": 603065
2983
- },
2984
- "gpt2_chinese.cc100-ja": {
2985
- "vocab_size": 21128,
2986
- "n_bytes": 1774770,
2987
- "n_tokens": 503085,
2988
- "n_chars": 603065
2989
- },
2990
- "gpt_35_turbo.cc100-ja": {
2991
- "vocab_size": 100277,
2992
- "n_bytes": 1774770,
2993
- "n_tokens": 630348,
2994
- "n_chars": 603065
2995
- },
2996
- "gpt_4.cc100-ja": {
2997
- "vocab_size": 100277,
2998
- "n_bytes": 1774770,
2999
- "n_tokens": 630348,
3000
- "n_chars": 603065
3001
- },
3002
- "gpt_neox_japanese_2_7b.cc100-ja": {
3003
- "vocab_size": 32000,
3004
- "n_bytes": 1774770,
3005
- "n_tokens": 410803,
3006
- "n_chars": 603065
3007
- },
3008
- "gpt_nexo_20b.cc100-ja": {
3009
- "vocab_size": 50277,
3010
- "n_bytes": 1774770,
3011
- "n_tokens": 605168,
3012
- "n_chars": 603065
3013
- },
3014
- "grok_1.cc100-ja": {
3015
- "vocab_size": 131072,
3016
- "n_bytes": 1774770,
3017
- "n_tokens": 497590,
3018
- "n_chars": 603065
3019
- },
3020
- "internlm2_chat_7b.cc100-ja": {
3021
- "vocab_size": 92544,
3022
- "n_bytes": 1774770,
3023
- "n_tokens": 595803,
3024
- "n_chars": 603065
3025
- },
3026
- "internlm2_math_7b.cc100-ja": {
3027
- "vocab_size": 92544,
3028
- "n_bytes": 1774770,
3029
- "n_tokens": 595803,
3030
- "n_chars": 603065
3031
- },
3032
- "internlm_chat_7b.cc100-ja": {
3033
- "vocab_size": 103168,
3034
- "n_bytes": 1774770,
3035
- "n_tokens": 448212,
3036
- "n_chars": 603065
3037
- },
3038
- "internlm_xcomposer_7b.cc100-ja": {
3039
- "vocab_size": 103168,
3040
- "n_bytes": 1774770,
3041
- "n_tokens": 448212,
3042
- "n_chars": 603065
3043
- },
3044
- "jamba_v0_1.cc100-ja": {
3045
- "vocab_size": 65536,
3046
- "n_bytes": 1774770,
3047
- "n_tokens": 683256,
3048
- "n_chars": 603065
3049
- },
3050
- "kplug.cc100-ja": {
3051
- "vocab_size": 10261,
3052
- "n_bytes": 1774770,
3053
- "n_tokens": 338023,
3054
- "n_chars": 603065
3055
- },
3056
- "llama.cc100-ja": {
3057
- "vocab_size": 32000,
3058
- "n_bytes": 1774770,
3059
- "n_tokens": 728461,
3060
- "n_chars": 603065
3061
- },
3062
- "llama2.cc100-ja": {
3063
- "vocab_size": 32001,
3064
- "n_bytes": 1774770,
3065
- "n_tokens": 728461,
3066
- "n_chars": 603065
3067
- },
3068
- "llama3.cc100-ja": {
3069
- "vocab_size": 128256,
3070
- "n_bytes": 1774770,
3071
- "n_tokens": 414715,
3072
- "n_chars": 603065
3073
- },
3074
- "mistral_7b.cc100-ja": {
3075
- "vocab_size": 32000,
3076
- "n_bytes": 1774770,
3077
- "n_tokens": 685134,
3078
- "n_chars": 603065
3079
- },
3080
- "mixtral_8_7b.cc100-ja": {
3081
- "vocab_size": 32000,
3082
- "n_bytes": 1774770,
3083
- "n_tokens": 685134,
3084
- "n_chars": 603065
3085
- },
3086
- "mobilebert_uncased.cc100-ja": {
3087
- "vocab_size": 30522,
3088
- "n_bytes": 1774770,
3089
- "n_tokens": 580634,
3090
- "n_chars": 603065
3091
- },
3092
- "moss.cc100-ja": {
3093
- "vocab_size": 106072,
3094
- "n_bytes": 1774770,
3095
- "n_tokens": 600011,
3096
- "n_chars": 603065
3097
- },
3098
- "mt5_large.cc100-ja": {
3099
- "vocab_size": 250100,
3100
- "n_bytes": 1774770,
3101
- "n_tokens": 300542,
3102
- "n_chars": 603065
3103
- },
3104
- "olmo_7b.cc100-ja": {
3105
- "vocab_size": 50280,
3106
- "n_bytes": 1774770,
3107
- "n_tokens": 605168,
3108
- "n_chars": 603065
3109
- },
3110
- "orion_14b_chat.cc100-ja": {
3111
- "vocab_size": 84608,
3112
- "n_bytes": 1774770,
3113
- "n_tokens": 324956,
3114
- "n_chars": 603065
3115
- },
3116
- "phi_1.cc100-ja": {
3117
- "vocab_size": 50295,
3118
- "n_bytes": 1774770,
3119
- "n_tokens": 844362,
3120
- "n_chars": 603065
3121
- },
3122
- "phi_2.cc100-ja": {
3123
- "vocab_size": 50295,
3124
- "n_bytes": 1774770,
3125
- "n_tokens": 844362,
3126
- "n_chars": 603065
3127
- },
3128
- "phi_3_mini.cc100-ja": {
3129
- "vocab_size": 32011,
3130
- "n_bytes": 1774770,
3131
- "n_tokens": 728461,
3132
- "n_chars": 603065
3133
- },
3134
- "pko_t5_large.cc100-ja": {
3135
- "vocab_size": 50358,
3136
- "n_bytes": 1774770,
3137
- "n_tokens": 1766950,
3138
- "n_chars": 603065
3139
- },
3140
- "prompt_clue.cc100-ja": {
3141
- "vocab_size": 32128,
3142
- "n_bytes": 1774770,
3143
- "n_tokens": 575118,
3144
- "n_chars": 603065
3145
- },
3146
- "qwen1_5_14b_chat.cc100-ja": {
3147
- "vocab_size": 151646,
3148
- "n_bytes": 1774770,
3149
- "n_tokens": 377144,
3150
- "n_chars": 603065
3151
- },
3152
- "qwen_1_8b_chat.cc100-ja": {
3153
- "vocab_size": 151851,
3154
- "n_bytes": 1774770,
3155
- "n_tokens": 377144,
3156
- "n_chars": 603065
3157
- },
3158
- "qwen_72b_chat.cc100-ja": {
3159
- "vocab_size": 151851,
3160
- "n_bytes": 1774770,
3161
- "n_tokens": 377144,
3162
- "n_chars": 603065
3163
- },
3164
- "qwen_7b_chat.cc100-ja": {
3165
- "vocab_size": 151851,
3166
- "n_bytes": 1774770,
3167
- "n_tokens": 377144,
3168
- "n_chars": 603065
3169
- },
3170
- "roberta_chinese_clue.cc100-ja": {
3171
- "vocab_size": 8021,
3172
- "n_bytes": 1774770,
3173
- "n_tokens": 339411,
3174
- "n_chars": 603065
3175
- },
3176
- "skywork_13b_base.cc100-ja": {
3177
- "vocab_size": 65519,
3178
- "n_bytes": 1774770,
3179
- "n_tokens": 603613,
3180
- "n_chars": 603065
3181
- },
3182
- "skywork_13b_math.cc100-ja": {
3183
- "vocab_size": 65519,
3184
- "n_bytes": 1774770,
3185
- "n_tokens": 603613,
3186
- "n_chars": 603065
3187
- },
3188
- "solar_10_7b.cc100-ja": {
3189
- "vocab_size": 32000,
3190
- "n_bytes": 1774770,
3191
- "n_tokens": 685134,
3192
- "n_chars": 603065
3193
- },
3194
- "starchat_alpha.cc100-ja": {
3195
- "vocab_size": 49156,
3196
- "n_bytes": 1774770,
3197
- "n_tokens": 546876,
3198
- "n_chars": 603065
3199
- },
3200
- "switch_c_2048.cc100-ja": {
3201
- "vocab_size": 32100,
3202
- "n_bytes": 1774770,
3203
- "n_tokens": 51947,
3204
- "n_chars": 603065
3205
- },
3206
- "t5_base.cc100-ja": {
3207
- "vocab_size": 32100,
3208
- "n_bytes": 1774770,
3209
- "n_tokens": 51947,
3210
- "n_chars": 603065
3211
- },
3212
- "t5_large.cc100-ja": {
3213
- "vocab_size": 32100,
3214
- "n_bytes": 1774770,
3215
- "n_tokens": 51947,
3216
- "n_chars": 603065
3217
- },
3218
- "t5_small.cc100-ja": {
3219
- "vocab_size": 32100,
3220
- "n_bytes": 1774770,
3221
- "n_tokens": 51947,
3222
- "n_chars": 603065
3223
- },
3224
- "text_davinci_003.cc100-ja": {
3225
- "vocab_size": 50281,
3226
- "n_bytes": 1774770,
3227
- "n_tokens": 844362,
3228
- "n_chars": 603065
3229
- },
3230
- "tigerbot_13b_chat_v2.cc100-ja": {
3231
- "vocab_size": 60515,
3232
- "n_bytes": 1774770,
3233
- "n_tokens": 567792,
3234
- "n_chars": 603065
3235
- },
3236
- "tigerbot_70b_chat_v4_4k.cc100-ja": {
3237
- "vocab_size": 65110,
3238
- "n_bytes": 1774770,
3239
- "n_tokens": 406571,
3240
- "n_chars": 603065
3241
- },
3242
- "wizardcoder_15b_v1.cc100-ja": {
3243
- "vocab_size": 49153,
3244
- "n_bytes": 1774770,
3245
- "n_tokens": 546876,
3246
- "n_chars": 603065
3247
- },
3248
- "wizardcoder_python_7b_v1.cc100-ja": {
3249
- "vocab_size": 32001,
3250
- "n_bytes": 1774770,
3251
- "n_tokens": 728461,
3252
- "n_chars": 603065
3253
- },
3254
- "wizardlm_7b_v1.cc100-ja": {
3255
- "vocab_size": 32001,
3256
- "n_bytes": 1774770,
3257
- "n_tokens": 728461,
3258
- "n_chars": 603065
3259
- },
3260
- "wizardmath_70b_v1.cc100-ja": {
3261
- "vocab_size": 32002,
3262
- "n_bytes": 1774770,
3263
- "n_tokens": 728461,
3264
- "n_chars": 603065
3265
- },
3266
- "xlm_roberta.cc100-ja": {
3267
- "vocab_size": 250002,
3268
- "n_bytes": 1774770,
3269
- "n_tokens": 344820,
3270
- "n_chars": 603065
3271
- },
3272
- "yi_34b.cc100-ja": {
3273
- "vocab_size": 64000,
3274
- "n_bytes": 1774770,
3275
- "n_tokens": 740791,
3276
- "n_chars": 603065
3277
- },
3278
- "yi_6b.cc100-ja": {
3279
- "vocab_size": 64000,
3280
- "n_bytes": 1774770,
3281
- "n_tokens": 740791,
3282
- "n_chars": 603065
3283
- },
3284
- "yi_vl34b.cc100-ja": {
3285
- "vocab_size": 64000,
3286
- "n_bytes": 1774770,
3287
- "n_tokens": 749927,
3288
- "n_chars": 603065
3289
- },
3290
- "zephyr_7b_beta.cc100-ja": {
3291
- "vocab_size": 32000,
3292
- "n_bytes": 1774770,
3293
- "n_tokens": 685134,
3294
- "n_chars": 603065
3295
- },
3296
- "llama_3_chinese_8b.cc100-ar": {
3297
- "vocab_size": 128256,
3298
- "n_bytes": 2813283,
3299
- "n_tokens": 625514,
3300
- "n_chars": 1560987
3301
- },
3302
- "llama_3_chinese_8b.cc100-de": {
3303
- "vocab_size": 128256,
3304
- "n_bytes": 1814876,
3305
- "n_tokens": 509766,
3306
- "n_chars": 1784021
3307
- },
3308
- "llama_3_chinese_8b.cc100-en": {
3309
- "vocab_size": 128256,
3310
- "n_bytes": 1124813,
3311
- "n_tokens": 264944,
3312
- "n_chars": 1121360
3313
- },
3314
- "llama_3_chinese_8b.cc100-es": {
3315
- "vocab_size": 128256,
3316
- "n_bytes": 1664455,
3317
- "n_tokens": 443289,
3318
- "n_chars": 1630297
3319
- },
3320
- "aya_101.cc100-fa": {
3321
- "vocab_size": 250100,
3322
- "n_bytes": 2054052,
3323
- "n_tokens": 429922,
3324
- "n_chars": 1145876
3325
- },
3326
- "baichuan.cc100-fa": {
3327
- "vocab_size": 64000,
3328
- "n_bytes": 2054052,
3329
- "n_tokens": 1142057,
3330
- "n_chars": 1145876
3331
- },
3332
- "baichuan2.cc100-fa": {
3333
- "vocab_size": 125696,
3334
- "n_bytes": 2054052,
3335
- "n_tokens": 1052077,
3336
- "n_chars": 1145876
3337
- },
3338
- "bert_base_cased.cc100-fa": {
3339
- "vocab_size": 28996,
3340
- "n_bytes": 2054052,
3341
- "n_tokens": 903078,
3342
- "n_chars": 1145876
3343
- },
3344
- "bert_base_chinese.cc100-fa": {
3345
- "vocab_size": 21128,
3346
- "n_bytes": 2054052,
3347
- "n_tokens": 396414,
3348
- "n_chars": 1145876
3349
- },
3350
- "bert_base_uncased.cc100-fa": {
3351
- "vocab_size": 30522,
3352
- "n_bytes": 2054052,
3353
- "n_tokens": 910783,
3354
- "n_chars": 1145876
3355
- },
3356
- "bloom.cc100-fa": {
3357
- "vocab_size": 250680,
3358
- "n_bytes": 2054052,
3359
- "n_tokens": 434406,
3360
- "n_chars": 1145876
3361
- },
3362
- "byt5_small.cc100-fa": {
3363
- "vocab_size": 384,
3364
- "n_bytes": 2054052,
3365
- "n_tokens": 2064052,
3366
- "n_chars": 1145876
3367
- },
3368
- "character_glm_6b.cc100-fa": {
3369
- "vocab_size": 64789,
3370
- "n_bytes": 2054052,
3371
- "n_tokens": 1165051,
3372
- "n_chars": 1145876
3373
- },
3374
- "chatglm2_6b.cc100-fa": {
3375
- "vocab_size": 64787,
3376
- "n_bytes": 2054052,
3377
- "n_tokens": 1165051,
3378
- "n_chars": 1145876
3379
- },
3380
- "chatglm3_6b.cc100-fa": {
3381
- "vocab_size": 64796,
3382
- "n_bytes": 2054052,
3383
- "n_tokens": 1165051,
3384
- "n_chars": 1145876
3385
- },
3386
- "chatglm_6b.cc100-fa": {
3387
- "vocab_size": 150344,
3388
- "n_bytes": 2054052,
3389
- "n_tokens": 910808,
3390
- "n_chars": 1145876
3391
- },
3392
- "chatyuan_large_v2.cc100-fa": {
3393
- "vocab_size": 32128,
3394
- "n_bytes": 2054052,
3395
- "n_tokens": 740377,
3396
- "n_chars": 1145876
3397
- },
3398
- "chinese_llama.cc100-fa": {
3399
- "vocab_size": 49953,
3400
- "n_bytes": 2054052,
3401
- "n_tokens": 1150750,
3402
- "n_chars": 1145876
3403
- },
3404
- "chinese_llama2.cc100-fa": {
3405
- "vocab_size": 55296,
3406
- "n_bytes": 2054052,
3407
- "n_tokens": 1155078,
3408
- "n_chars": 1145876
3409
- },
3410
- "code_davinci_002.cc100-fa": {
3411
- "vocab_size": 50281,
3412
- "n_bytes": 2054052,
3413
- "n_tokens": 1292300,
3414
- "n_chars": 1145876
3415
- },
3416
- "crystal_coder.cc100-fa": {
3417
- "vocab_size": 32022,
3418
- "n_bytes": 2054052,
3419
- "n_tokens": 1145076,
3420
- "n_chars": 1145876
3421
- },
3422
- "dbrx_instruct.cc100-fa": {
3423
- "vocab_size": 100280,
3424
- "n_bytes": 2054052,
3425
- "n_tokens": 818067,
3426
- "n_chars": 1145876
3427
- },
3428
- "deepseek_coder_33b_instruct.cc100-fa": {
3429
- "vocab_size": 32022,
3430
- "n_bytes": 2054052,
3431
- "n_tokens": 1326109,
3432
- "n_chars": 1145876
3433
- },
3434
- "deepseek_llm_7b_base.cc100-fa": {
3435
- "vocab_size": 100015,
3436
- "n_bytes": 2054052,
3437
- "n_tokens": 973451,
3438
- "n_chars": 1145876
3439
- },
3440
- "falcon_180b.cc100-fa": {
3441
- "vocab_size": 65024,
3442
- "n_bytes": 2054052,
3443
- "n_tokens": 1246580,
3444
- "n_chars": 1145876
3445
- },
3446
- "falcon_7b.cc100-fa": {
3447
- "vocab_size": 65024,
3448
- "n_bytes": 2054052,
3449
- "n_tokens": 1246580,
3450
- "n_chars": 1145876
3451
- },
3452
- "fastchat_t5_3b.cc100-fa": {
3453
- "vocab_size": 32110,
3454
- "n_bytes": 2054052,
3455
- "n_tokens": 712443,
3456
- "n_chars": 1145876
3457
- },
3458
- "flan_t5_base.cc100-fa": {
3459
- "vocab_size": 32100,
3460
- "n_bytes": 2054052,
3461
- "n_tokens": 493779,
3462
- "n_chars": 1145876
3463
- },
3464
- "gemma_7b.cc100-fa": {
3465
- "vocab_size": 256000,
3466
- "n_bytes": 2054052,
3467
- "n_tokens": 373762,
3468
- "n_chars": 1145876
3469
- },
3470
- "gpt2.cc100-fa": {
3471
- "vocab_size": 50257,
3472
- "n_bytes": 2054052,
3473
- "n_tokens": 1292300,
3474
- "n_chars": 1145876
3475
- },
3476
- "gpt2_chinese.cc100-fa": {
3477
- "vocab_size": 21128,
3478
- "n_bytes": 2054052,
3479
- "n_tokens": 406174,
3480
- "n_chars": 1145876
3481
- },
3482
- "gpt_35_turbo.cc100-fa": {
3483
- "vocab_size": 100277,
3484
- "n_bytes": 2054052,
3485
- "n_tokens": 818067,
3486
- "n_chars": 1145876
3487
- },
3488
- "gpt_4.cc100-fa": {
3489
- "vocab_size": 100277,
3490
- "n_bytes": 2054052,
3491
- "n_tokens": 818067,
3492
- "n_chars": 1145876
3493
- },
3494
- "gpt_neox_japanese_2_7b.cc100-fa": {
3495
- "vocab_size": 32000,
3496
- "n_bytes": 2054052,
3497
- "n_tokens": 2036715,
3498
- "n_chars": 1145876
3499
- },
3500
- "gpt_nexo_20b.cc100-fa": {
3501
- "vocab_size": 50277,
3502
- "n_bytes": 2054052,
3503
- "n_tokens": 866434,
3504
- "n_chars": 1145876
3505
- },
3506
- "grok_1.cc100-fa": {
3507
- "vocab_size": 131072,
3508
- "n_bytes": 2054052,
3509
- "n_tokens": 1073281,
3510
- "n_chars": 1145876
3511
- },
3512
- "internlm2_chat_7b.cc100-fa": {
3513
- "vocab_size": 92544,
3514
- "n_bytes": 2054052,
3515
- "n_tokens": 1195032,
3516
- "n_chars": 1145876
3517
- },
3518
- "internlm2_math_7b.cc100-fa": {
3519
- "vocab_size": 92544,
3520
- "n_bytes": 2054052,
3521
- "n_tokens": 1195032,
3522
- "n_chars": 1145876
3523
- },
3524
- "internlm_chat_7b.cc100-fa": {
3525
- "vocab_size": 103168,
3526
- "n_bytes": 2054052,
3527
- "n_tokens": 640945,
3528
- "n_chars": 1145876
3529
- },
3530
- "internlm_xcomposer_7b.cc100-fa": {
3531
- "vocab_size": 103168,
3532
- "n_bytes": 2054052,
3533
- "n_tokens": 640945,
3534
- "n_chars": 1145876
3535
- },
3536
- "jamba_v0_1.cc100-fa": {
3537
- "vocab_size": 65536,
3538
- "n_bytes": 2054052,
3539
- "n_tokens": 732550,
3540
- "n_chars": 1145876
3541
- },
3542
- "kplug.cc100-fa": {
3543
- "vocab_size": 10261,
3544
- "n_bytes": 2054052,
3545
- "n_tokens": 274671,
3546
- "n_chars": 1145876
3547
- },
3548
- "llama.cc100-fa": {
3549
- "vocab_size": 32000,
3550
- "n_bytes": 2054052,
3551
- "n_tokens": 1155076,
3552
- "n_chars": 1145876
3553
- },
3554
- "llama2.cc100-fa": {
3555
- "vocab_size": 32001,
3556
- "n_bytes": 2054052,
3557
- "n_tokens": 1155076,
3558
- "n_chars": 1145876
3559
- },
3560
- "llama3.cc100-fa": {
3561
- "vocab_size": 128256,
3562
- "n_bytes": 2054052,
3563
- "n_tokens": 387448,
3564
- "n_chars": 1145876
3565
- },
3566
- "llama_3_chinese_8b.cc100-fa": {
3567
- "vocab_size": 128256,
3568
- "n_bytes": 2054052,
3569
- "n_tokens": 397448,
3570
- "n_chars": 1145876
3571
- },
3572
- "mistral_7b.cc100-fa": {
3573
- "vocab_size": 32000,
3574
- "n_bytes": 2054052,
3575
- "n_tokens": 1133278,
3576
- "n_chars": 1145876
3577
- },
3578
- "mixtral_8_7b.cc100-fa": {
3579
- "vocab_size": 32000,
3580
- "n_bytes": 2054052,
3581
- "n_tokens": 1133278,
3582
- "n_chars": 1145876
3583
- },
3584
- "mobilebert_uncased.cc100-fa": {
3585
- "vocab_size": 30522,
3586
- "n_bytes": 2054052,
3587
- "n_tokens": 910783,
3588
- "n_chars": 1145876
3589
- },
3590
- "moss.cc100-fa": {
3591
- "vocab_size": 106072,
3592
- "n_bytes": 2054052,
3593
- "n_tokens": 1285426,
3594
- "n_chars": 1145876
3595
- },
3596
- "mt5_large.cc100-fa": {
3597
- "vocab_size": 250100,
3598
- "n_bytes": 2054052,
3599
- "n_tokens": 429922,
3600
- "n_chars": 1145876
3601
- },
3602
- "olmo_7b.cc100-fa": {
3603
- "vocab_size": 50280,
3604
- "n_bytes": 2054052,
3605
- "n_tokens": 866434,
3606
- "n_chars": 1145876
3607
- },
3608
- "orion_14b_chat.cc100-fa": {
3609
- "vocab_size": 84608,
3610
- "n_bytes": 2054052,
3611
- "n_tokens": 1131108,
3612
- "n_chars": 1145876
3613
- },
3614
- "phi_1.cc100-fa": {
3615
- "vocab_size": 50295,
3616
- "n_bytes": 2054052,
3617
- "n_tokens": 1292300,
3618
- "n_chars": 1145876
3619
- },
3620
- "phi_2.cc100-fa": {
3621
- "vocab_size": 50295,
3622
- "n_bytes": 2054052,
3623
- "n_tokens": 1292300,
3624
- "n_chars": 1145876
3625
- },
3626
- "phi_3_mini.cc100-fa": {
3627
- "vocab_size": 32011,
3628
- "n_bytes": 2054052,
3629
- "n_tokens": 1155076,
3630
- "n_chars": 1145876
3631
- },
3632
- "pko_t5_large.cc100-fa": {
3633
- "vocab_size": 50358,
3634
- "n_bytes": 2054052,
3635
- "n_tokens": 2061040,
3636
- "n_chars": 1145876
3637
- },
3638
- "prompt_clue.cc100-fa": {
3639
- "vocab_size": 32128,
3640
- "n_bytes": 2054052,
3641
- "n_tokens": 740377,
3642
- "n_chars": 1145876
3643
- },
3644
- "qwen1_5_14b_chat.cc100-fa": {
3645
- "vocab_size": 151646,
3646
- "n_bytes": 2054052,
3647
- "n_tokens": 643421,
3648
- "n_chars": 1145876
3649
- },
3650
- "qwen_1_8b_chat.cc100-fa": {
3651
- "vocab_size": 151851,
3652
- "n_bytes": 2054052,
3653
- "n_tokens": 643421,
3654
- "n_chars": 1145876
3655
- },
3656
- "qwen_72b_chat.cc100-fa": {
3657
- "vocab_size": 151851,
3658
- "n_bytes": 2054052,
3659
- "n_tokens": 643421,
3660
- "n_chars": 1145876
3661
- },
3662
- "qwen_7b_chat.cc100-fa": {
3663
- "vocab_size": 151851,
3664
- "n_bytes": 2054052,
3665
- "n_tokens": 643421,
3666
- "n_chars": 1145876
3667
- },
3668
- "roberta_chinese_clue.cc100-fa": {
3669
- "vocab_size": 8021,
3670
- "n_bytes": 2054052,
3671
- "n_tokens": 407763,
3672
- "n_chars": 1145876
3673
- },
3674
- "skywork_13b_base.cc100-fa": {
3675
- "vocab_size": 65519,
3676
- "n_bytes": 2054052,
3677
- "n_tokens": 1155072,
3678
- "n_chars": 1145876
3679
- },
3680
- "skywork_13b_math.cc100-fa": {
3681
- "vocab_size": 65519,
3682
- "n_bytes": 2054052,
3683
- "n_tokens": 1155072,
3684
- "n_chars": 1145876
3685
- },
3686
- "solar_10_7b.cc100-fa": {
3687
- "vocab_size": 32000,
3688
- "n_bytes": 2054052,
3689
- "n_tokens": 1133278,
3690
- "n_chars": 1145876
3691
- },
3692
- "starchat_alpha.cc100-fa": {
3693
- "vocab_size": 49156,
3694
- "n_bytes": 2054052,
3695
- "n_tokens": 851630,
3696
- "n_chars": 1145876
3697
- },
3698
- "switch_c_2048.cc100-fa": {
3699
- "vocab_size": 32100,
3700
- "n_bytes": 2054052,
3701
- "n_tokens": 493767,
3702
- "n_chars": 1145876
3703
- },
3704
- "t5_base.cc100-fa": {
3705
- "vocab_size": 32100,
3706
- "n_bytes": 2054052,
3707
- "n_tokens": 493767,
3708
- "n_chars": 1145876
3709
- },
3710
- "t5_large.cc100-fa": {
3711
- "vocab_size": 32100,
3712
- "n_bytes": 2054052,
3713
- "n_tokens": 493767,
3714
- "n_chars": 1145876
3715
- },
3716
- "t5_small.cc100-fa": {
3717
- "vocab_size": 32100,
3718
- "n_bytes": 2054052,
3719
- "n_tokens": 493767,
3720
- "n_chars": 1145876
3721
- },
3722
- "text_davinci_003.cc100-fa": {
3723
- "vocab_size": 50281,
3724
- "n_bytes": 2054052,
3725
- "n_tokens": 1292300,
3726
- "n_chars": 1145876
3727
- },
3728
- "tigerbot_13b_chat_v2.cc100-fa": {
3729
- "vocab_size": 60515,
3730
- "n_bytes": 2054052,
3731
- "n_tokens": 1145046,
3732
- "n_chars": 1145876
3733
- },
3734
- "tigerbot_70b_chat_v4_4k.cc100-fa": {
3735
- "vocab_size": 65110,
3736
- "n_bytes": 2054052,
3737
- "n_tokens": 1145048,
3738
- "n_chars": 1145876
3739
- },
3740
- "wizardcoder_15b_v1.cc100-fa": {
3741
- "vocab_size": 49153,
3742
- "n_bytes": 2054052,
3743
- "n_tokens": 851630,
3744
- "n_chars": 1145876
3745
- },
3746
- "wizardcoder_python_7b_v1.cc100-fa": {
3747
- "vocab_size": 32001,
3748
- "n_bytes": 2054052,
3749
- "n_tokens": 1155076,
3750
- "n_chars": 1145876
3751
- },
3752
- "wizardlm_7b_v1.cc100-fa": {
3753
- "vocab_size": 32001,
3754
- "n_bytes": 2054052,
3755
- "n_tokens": 1155076,
3756
- "n_chars": 1145876
3757
- },
3758
- "wizardmath_70b_v1.cc100-fa": {
3759
- "vocab_size": 32002,
3760
- "n_bytes": 2054052,
3761
- "n_tokens": 1155076,
3762
- "n_chars": 1145876
3763
- },
3764
- "xlm_roberta.cc100-fa": {
3765
- "vocab_size": 250002,
3766
- "n_bytes": 2054052,
3767
- "n_tokens": 330926,
3768
- "n_chars": 1145876
3769
- },
3770
- "yi_34b.cc100-fa": {
3771
- "vocab_size": 64000,
3772
- "n_bytes": 2054052,
3773
- "n_tokens": 1337264,
3774
- "n_chars": 1145876
3775
- },
3776
- "yi_6b.cc100-fa": {
3777
- "vocab_size": 64000,
3778
- "n_bytes": 2054052,
3779
- "n_tokens": 1337264,
3780
- "n_chars": 1145876
3781
- },
3782
- "yi_vl34b.cc100-fa": {
3783
- "vocab_size": 64000,
3784
- "n_bytes": 2054052,
3785
- "n_tokens": 1346819,
3786
- "n_chars": 1145876
3787
- },
3788
- "zephyr_7b_beta.cc100-fa": {
3789
- "vocab_size": 32000,
3790
- "n_bytes": 2054052,
3791
- "n_tokens": 1133278,
3792
- "n_chars": 1145876
3793
- },
3794
- "llama_3_chinese_8b.cc100-fr": {
3795
- "vocab_size": 128256,
3796
- "n_bytes": 1540504,
3797
- "n_tokens": 422146,
3798
- "n_chars": 1484970
3799
- },
3800
- "llama_3_chinese_8b.cc100-ja": {
3801
- "vocab_size": 128256,
3802
- "n_bytes": 1774770,
3803
- "n_tokens": 424715,
3804
- "n_chars": 603065
3805
- },
3806
- "aya_101.cc100-ko": {
3807
- "vocab_size": 250100,
3808
- "n_bytes": 1524839,
3809
- "n_tokens": 434586,
3810
- "n_chars": 655190
3811
- },
3812
- "baichuan.cc100-ko": {
3813
- "vocab_size": 64000,
3814
- "n_bytes": 1524839,
3815
- "n_tokens": 639258,
3816
- "n_chars": 655190
3817
- },
3818
- "baichuan2.cc100-ko": {
3819
- "vocab_size": 125696,
3820
- "n_bytes": 1524839,
3821
- "n_tokens": 623358,
3822
- "n_chars": 655190
3823
- },
3824
- "bert_base_cased.cc100-ko": {
3825
- "vocab_size": 28996,
3826
- "n_bytes": 1524839,
3827
- "n_tokens": 222828,
3828
- "n_chars": 655190
3829
- },
3830
- "bert_base_chinese.cc100-ko": {
3831
- "vocab_size": 21128,
3832
- "n_bytes": 1524839,
3833
- "n_tokens": 219752,
3834
- "n_chars": 655190
3835
- },
3836
- "bert_base_uncased.cc100-ko": {
3837
- "vocab_size": 30522,
3838
- "n_bytes": 1524839,
3839
- "n_tokens": 904756,
3840
- "n_chars": 655190
3841
- },
3842
- "bloom.cc100-ko": {
3843
- "vocab_size": 250680,
3844
- "n_bytes": 1524839,
3845
- "n_tokens": 742111,
3846
- "n_chars": 655190
3847
- },
3848
- "byt5_small.cc100-ko": {
3849
- "vocab_size": 384,
3850
- "n_bytes": 1524839,
3851
- "n_tokens": 1534839,
3852
- "n_chars": 655190
3853
- },
3854
- "character_glm_6b.cc100-ko": {
3855
- "vocab_size": 64789,
3856
- "n_bytes": 1524839,
3857
- "n_tokens": 672160,
3858
- "n_chars": 655190
3859
- },
3860
- "chatglm2_6b.cc100-ko": {
3861
- "vocab_size": 64787,
3862
- "n_bytes": 1524839,
3863
- "n_tokens": 672156,
3864
- "n_chars": 655190
3865
- },
3866
- "chatglm3_6b.cc100-ko": {
3867
- "vocab_size": 64796,
3868
- "n_bytes": 1524839,
3869
- "n_tokens": 672160,
3870
- "n_chars": 655190
3871
- },
3872
- "chatglm_6b.cc100-ko": {
3873
- "vocab_size": 150344,
3874
- "n_bytes": 1524839,
3875
- "n_tokens": 939630,
3876
- "n_chars": 655190
3877
- },
3878
- "chatyuan_large_v2.cc100-ko": {
3879
- "vocab_size": 32128,
3880
- "n_bytes": 1524839,
3881
- "n_tokens": 354411,
3882
- "n_chars": 655190
3883
- },
3884
- "chinese_llama.cc100-ko": {
3885
- "vocab_size": 49953,
3886
- "n_bytes": 1524839,
3887
- "n_tokens": 913553,
3888
- "n_chars": 655190
3889
- },
3890
- "chinese_llama2.cc100-ko": {
3891
- "vocab_size": 55296,
3892
- "n_bytes": 1524839,
3893
- "n_tokens": 963427,
3894
- "n_chars": 655190
3895
- },
3896
- "code_davinci_002.cc100-ko": {
3897
- "vocab_size": 50281,
3898
- "n_bytes": 1524839,
3899
- "n_tokens": 1308993,
3900
- "n_chars": 655190
3901
- },
3902
- "crystal_coder.cc100-ko": {
3903
- "vocab_size": 32022,
3904
- "n_bytes": 1524839,
3905
- "n_tokens": 954428,
3906
- "n_chars": 655190
3907
- },
3908
- "dbrx_instruct.cc100-ko": {
3909
- "vocab_size": 100280,
3910
- "n_bytes": 1524839,
3911
- "n_tokens": 652277,
3912
- "n_chars": 655190
3913
- },
3914
- "deepseek_coder_33b_instruct.cc100-ko": {
3915
- "vocab_size": 32022,
3916
- "n_bytes": 1524839,
3917
- "n_tokens": 1454805,
3918
- "n_chars": 655190
3919
- },
3920
- "deepseek_llm_7b_base.cc100-ko": {
3921
- "vocab_size": 100015,
3922
- "n_bytes": 1524839,
3923
- "n_tokens": 1081983,
3924
- "n_chars": 655190
3925
- },
3926
- "falcon_180b.cc100-ko": {
3927
- "vocab_size": 65024,
3928
- "n_bytes": 1524839,
3929
- "n_tokens": 1330568,
3930
- "n_chars": 655190
3931
- },
3932
- "falcon_7b.cc100-ko": {
3933
- "vocab_size": 65024,
3934
- "n_bytes": 1524839,
3935
- "n_tokens": 1330568,
3936
- "n_chars": 655190
3937
- },
3938
- "fastchat_t5_3b.cc100-ko": {
3939
- "vocab_size": 32110,
3940
- "n_bytes": 1524839,
3941
- "n_tokens": 484953,
3942
- "n_chars": 655190
3943
- },
3944
- "flan_t5_base.cc100-ko": {
3945
- "vocab_size": 32100,
3946
- "n_bytes": 1524839,
3947
- "n_tokens": 344457,
3948
- "n_chars": 655190
3949
- },
3950
- "gemma_7b.cc100-ko": {
3951
- "vocab_size": 256000,
3952
- "n_bytes": 1524839,
3953
- "n_tokens": 464410,
3954
- "n_chars": 655190
3955
- },
3956
- "gpt2.cc100-ko": {
3957
- "vocab_size": 50257,
3958
- "n_bytes": 1524839,
3959
- "n_tokens": 1309029,
3960
- "n_chars": 655190
3961
- },
3962
- "gpt2_chinese.cc100-ko": {
3963
- "vocab_size": 21128,
3964
- "n_bytes": 1524839,
3965
- "n_tokens": 1055974,
3966
- "n_chars": 655190
3967
- },
3968
- "gpt_35_turbo.cc100-ko": {
3969
- "vocab_size": 100277,
3970
- "n_bytes": 1524839,
3971
- "n_tokens": 652277,
3972
- "n_chars": 655190
3973
- },
3974
- "gpt_4.cc100-ko": {
3975
- "vocab_size": 100277,
3976
- "n_bytes": 1524839,
3977
- "n_tokens": 652277,
3978
- "n_chars": 655190
3979
- },
3980
- "gpt_neox_japanese_2_7b.cc100-ko": {
3981
- "vocab_size": 32000,
3982
- "n_bytes": 1524839,
3983
- "n_tokens": 1512832,
3984
- "n_chars": 655190
3985
- },
3986
- "gpt_nexo_20b.cc100-ko": {
3987
- "vocab_size": 50277,
3988
- "n_bytes": 1524839,
3989
- "n_tokens": 973288,
3990
- "n_chars": 655190
3991
- },
3992
- "grok_1.cc100-ko": {
3993
- "vocab_size": 131072,
3994
- "n_bytes": 1524839,
3995
- "n_tokens": 1152005,
3996
- "n_chars": 655190
3997
- },
3998
- "internlm2_chat_7b.cc100-ko": {
3999
- "vocab_size": 92544,
4000
- "n_bytes": 1524839,
4001
- "n_tokens": 1008524,
4002
- "n_chars": 655190
4003
- },
4004
- "internlm2_math_7b.cc100-ko": {
4005
- "vocab_size": 92544,
4006
- "n_bytes": 1524839,
4007
- "n_tokens": 1008524,
4008
- "n_chars": 655190
4009
- },
4010
- "internlm_chat_7b.cc100-ko": {
4011
- "vocab_size": 103168,
4012
- "n_bytes": 1524839,
4013
- "n_tokens": 839609,
4014
- "n_chars": 655190
4015
- },
4016
- "internlm_xcomposer_7b.cc100-ko": {
4017
- "vocab_size": 103168,
4018
- "n_bytes": 1524839,
4019
- "n_tokens": 839609,
4020
- "n_chars": 655190
4021
- },
4022
- "jamba_v0_1.cc100-ko": {
4023
- "vocab_size": 65536,
4024
- "n_bytes": 1524839,
4025
- "n_tokens": 715688,
4026
- "n_chars": 655190
4027
- },
4028
- "kplug.cc100-ko": {
4029
- "vocab_size": 10261,
4030
- "n_bytes": 1524839,
4031
- "n_tokens": 222771,
4032
- "n_chars": 655190
4033
- },
4034
- "llama.cc100-ko": {
4035
- "vocab_size": 32000,
4036
- "n_bytes": 1524839,
4037
- "n_tokens": 964428,
4038
- "n_chars": 655190
4039
- },
4040
- "llama2.cc100-ko": {
4041
- "vocab_size": 32001,
4042
- "n_bytes": 1524839,
4043
- "n_tokens": 964428,
4044
- "n_chars": 655190
4045
- },
4046
- "llama3.cc100-ko": {
4047
- "vocab_size": 128256,
4048
- "n_bytes": 1524839,
4049
- "n_tokens": 412595,
4050
- "n_chars": 655190
4051
- },
4052
- "llama_3_chinese_8b.cc100-ko": {
4053
- "vocab_size": 128256,
4054
- "n_bytes": 1524839,
4055
- "n_tokens": 422595,
4056
- "n_chars": 655190
4057
- },
4058
- "mistral_7b.cc100-ko": {
4059
- "vocab_size": 32000,
4060
- "n_bytes": 1524839,
4061
- "n_tokens": 728766,
4062
- "n_chars": 655190
4063
- },
4064
- "mixtral_8_7b.cc100-ko": {
4065
- "vocab_size": 32000,
4066
- "n_bytes": 1524839,
4067
- "n_tokens": 728766,
4068
- "n_chars": 655190
4069
- },
4070
- "mobilebert_uncased.cc100-ko": {
4071
- "vocab_size": 30522,
4072
- "n_bytes": 1524839,
4073
- "n_tokens": 904756,
4074
- "n_chars": 655190
4075
- },
4076
- "moss.cc100-ko": {
4077
- "vocab_size": 106072,
4078
- "n_bytes": 1524839,
4079
- "n_tokens": 1305249,
4080
- "n_chars": 655190
4081
- },
4082
- "mt5_large.cc100-ko": {
4083
- "vocab_size": 250100,
4084
- "n_bytes": 1524839,
4085
- "n_tokens": 434586,
4086
- "n_chars": 655190
4087
- },
4088
- "olmo_7b.cc100-ko": {
4089
- "vocab_size": 50280,
4090
- "n_bytes": 1524839,
4091
- "n_tokens": 973288,
4092
- "n_chars": 655190
4093
- },
4094
- "orion_14b_chat.cc100-ko": {
4095
- "vocab_size": 84608,
4096
- "n_bytes": 1524839,
4097
- "n_tokens": 351149,
4098
- "n_chars": 655190
4099
- },
4100
- "phi_1.cc100-ko": {
4101
- "vocab_size": 50295,
4102
- "n_bytes": 1524839,
4103
- "n_tokens": 1308988,
4104
- "n_chars": 655190
4105
- },
4106
- "phi_2.cc100-ko": {
4107
- "vocab_size": 50295,
4108
- "n_bytes": 1524839,
4109
- "n_tokens": 1308988,
4110
- "n_chars": 655190
4111
- },
4112
- "phi_3_mini.cc100-ko": {
4113
- "vocab_size": 32011,
4114
- "n_bytes": 1524839,
4115
- "n_tokens": 964428,
4116
- "n_chars": 655190
4117
- },
4118
- "pko_t5_large.cc100-ko": {
4119
- "vocab_size": 50358,
4120
- "n_bytes": 1524839,
4121
- "n_tokens": 471643,
4122
- "n_chars": 655190
4123
- },
4124
- "prompt_clue.cc100-ko": {
4125
- "vocab_size": 32128,
4126
- "n_bytes": 1524839,
4127
- "n_tokens": 354411,
4128
- "n_chars": 655190
4129
- },
4130
- "qwen1_5_14b_chat.cc100-ko": {
4131
- "vocab_size": 151646,
4132
- "n_bytes": 1524839,
4133
- "n_tokens": 457492,
4134
- "n_chars": 655190
4135
- },
4136
- "qwen_1_8b_chat.cc100-ko": {
4137
- "vocab_size": 151851,
4138
- "n_bytes": 1524839,
4139
- "n_tokens": 457492,
4140
- "n_chars": 655190
4141
- },
4142
- "qwen_72b_chat.cc100-ko": {
4143
- "vocab_size": 151851,
4144
- "n_bytes": 1524839,
4145
- "n_tokens": 457492,
4146
- "n_chars": 655190
4147
- },
4148
- "qwen_7b_chat.cc100-ko": {
4149
- "vocab_size": 151851,
4150
- "n_bytes": 1524839,
4151
- "n_tokens": 457492,
4152
- "n_chars": 655190
4153
- },
4154
- "roberta_chinese_clue.cc100-ko": {
4155
- "vocab_size": 8021,
4156
- "n_bytes": 1524839,
4157
- "n_tokens": 226812,
4158
- "n_chars": 655190
4159
- },
4160
- "skywork_13b_base.cc100-ko": {
4161
- "vocab_size": 65519,
4162
- "n_bytes": 1524839,
4163
- "n_tokens": 962744,
4164
- "n_chars": 655190
4165
- },
4166
- "skywork_13b_math.cc100-ko": {
4167
- "vocab_size": 65519,
4168
- "n_bytes": 1524839,
4169
- "n_tokens": 962744,
4170
- "n_chars": 655190
4171
- },
4172
- "solar_10_7b.cc100-ko": {
4173
- "vocab_size": 32000,
4174
- "n_bytes": 1524839,
4175
- "n_tokens": 728766,
4176
- "n_chars": 655190
4177
- },
4178
- "starchat_alpha.cc100-ko": {
4179
- "vocab_size": 49156,
4180
- "n_bytes": 1524839,
4181
- "n_tokens": 580873,
4182
- "n_chars": 655190
4183
- },
4184
- "switch_c_2048.cc100-ko": {
4185
- "vocab_size": 32100,
4186
- "n_bytes": 1524839,
4187
- "n_tokens": 344457,
4188
- "n_chars": 655190
4189
- },
4190
- "t5_base.cc100-ko": {
4191
- "vocab_size": 32100,
4192
- "n_bytes": 1524839,
4193
- "n_tokens": 344457,
4194
- "n_chars": 655190
4195
- },
4196
- "t5_large.cc100-ko": {
4197
- "vocab_size": 32100,
4198
- "n_bytes": 1524839,
4199
- "n_tokens": 344457,
4200
- "n_chars": 655190
4201
- },
4202
- "t5_small.cc100-ko": {
4203
- "vocab_size": 32100,
4204
- "n_bytes": 1524839,
4205
- "n_tokens": 344457,
4206
- "n_chars": 655190
4207
- },
4208
- "text_davinci_003.cc100-ko": {
4209
- "vocab_size": 50281,
4210
- "n_bytes": 1524839,
4211
- "n_tokens": 1308993,
4212
- "n_chars": 655190
4213
- },
4214
- "tigerbot_13b_chat_v2.cc100-ko": {
4215
- "vocab_size": 60515,
4216
- "n_bytes": 1524839,
4217
- "n_tokens": 793053,
4218
- "n_chars": 655190
4219
- },
4220
- "tigerbot_70b_chat_v4_4k.cc100-ko": {
4221
- "vocab_size": 65110,
4222
- "n_bytes": 1524839,
4223
- "n_tokens": 484082,
4224
- "n_chars": 655190
4225
- },
4226
- "wizardcoder_15b_v1.cc100-ko": {
4227
- "vocab_size": 49153,
4228
- "n_bytes": 1524839,
4229
- "n_tokens": 580873,
4230
- "n_chars": 655190
4231
- },
4232
- "wizardcoder_python_7b_v1.cc100-ko": {
4233
- "vocab_size": 32001,
4234
- "n_bytes": 1524839,
4235
- "n_tokens": 964428,
4236
- "n_chars": 655190
4237
- },
4238
- "wizardlm_7b_v1.cc100-ko": {
4239
- "vocab_size": 32001,
4240
- "n_bytes": 1524839,
4241
- "n_tokens": 964428,
4242
- "n_chars": 655190
4243
- },
4244
- "wizardmath_70b_v1.cc100-ko": {
4245
- "vocab_size": 32002,
4246
- "n_bytes": 1524839,
4247
- "n_tokens": 964428,
4248
- "n_chars": 655190
4249
- },
4250
- "xlm_roberta.cc100-ko": {
4251
- "vocab_size": 250002,
4252
- "n_bytes": 1524839,
4253
- "n_tokens": 374571,
4254
- "n_chars": 655190
4255
- },
4256
- "yi_34b.cc100-ko": {
4257
- "vocab_size": 64000,
4258
- "n_bytes": 1524839,
4259
- "n_tokens": 1203134,
4260
- "n_chars": 655190
4261
- },
4262
- "yi_6b.cc100-ko": {
4263
- "vocab_size": 64000,
4264
- "n_bytes": 1524839,
4265
- "n_tokens": 1203134,
4266
- "n_chars": 655190
4267
- },
4268
- "yi_vl34b.cc100-ko": {
4269
- "vocab_size": 64000,
4270
- "n_bytes": 1524839,
4271
- "n_tokens": 1210021,
4272
- "n_chars": 655190
4273
- },
4274
- "zephyr_7b_beta.cc100-ko": {
4275
- "vocab_size": 32000,
4276
- "n_bytes": 1524839,
4277
- "n_tokens": 728766,
4278
- "n_chars": 655190
4279
- },
4280
- "llama_3_chinese_8b.cc100-zh-Hans": {
4281
- "vocab_size": 128256,
4282
- "n_bytes": 2633047,
4283
- "n_tokens": 757405,
4284
- "n_chars": 927311
4285
- }
4286
- }
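The deleted records above are the raw counts behind the compression comparison. A minimal sketch of how per-corpus compression measures can be derived from one such record, assuming only the JSON schema shown (the helper names are illustrative, not from this repo):

```python
# Sketch only: deriving compression measures from one record of the deleted
# stats file. The numbers are the llama3.cc100-ko entry above.
record = {"vocab_size": 128256, "n_bytes": 1524839, "n_tokens": 412595, "n_chars": 655190}

def bytes_per_token(r):
    """Average UTF-8 bytes covered by one token (higher = stronger compression)."""
    return r["n_bytes"] / r["n_tokens"]

def chars_per_token(r):
    """Average characters covered by one token."""
    return r["n_chars"] / r["n_tokens"]

print(f"{bytes_per_token(record):.2f} bytes/token, {chars_per_token(record):.2f} chars/token")
# -> 3.70 bytes/token, 1.59 chars/token
```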
stats/compression_rate.json ADDED
The diff for this file is too large to render. See raw diff
 
utils/byte_util.py DELETED
File without changes
utils/character_util.py DELETED
@@ -1,231 +0,0 @@
- """
- TODO: traditional vs. simplified forms, language coverage, ...
- """
- import os
- import json
- from collections import Counter
- from vocab import load_tokener
- from utils.log_util import logger
- from utils.text_util import is_all_digit, has_digit, get_digit_count, get_space_count
- from utils.lang_util import detect_language
- from utils.lang_util_2 import is_zh_char, is_all_zh, get_zh_count
-
- CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
-
- zh_tokens = [line.strip() for line in open(os.path.join(CURRENT_DIR, "vocab.jd.txt.v2"), "r", encoding="utf-8") if
-              is_zh_char(line.strip())]
-
-
- def digit_():
-     """
-     qwen segments numbers by single digits.
-     """
-     pass
-
-
- def to_unicode(text):
-     return ''.join(r'\u{:04X}'.format(ord(chr)) for chr in text)
-
- def zh_iterator():
-     for idx in range(ord(u'\u4e00'), ord(u'\u9fa5')):
-         yield (chr(idx))
-
-
- def get_coding_length(tokenizer, vocab, filter=None):
-     """
-     Compute the encoding length (some Chinese characters are encoded into multiple tokens).
-     """
-     all_length = []
-     for word in vocab:
-         if len(word) > 1:
-             continue
-         if filter is not None and filter(word):
-             continue
-         try:
-             tokens = tokenizer.encode(word)
-         except Exception as e:
-             print(e)
-
-         all_length.append(len(tokens))
-         # if len(tokens.ids) > 1:
-         # if len(tokens) > 3:
-         #     print(word, tokens)
-
-     dist_length = Counter(all_length)
-     mean_length = round(sum(all_length) / len(all_length), 2)
-     return dist_length, mean_length
-
-
- def remove_special_char():
-     """
-     :return:
-     """
-     # BERT vocabularies contain tokens starting with ##
-     # byte-BPE vocabularies contain tokens with embedded spaces
-     # decode_str = decode_str.strip().replace("#", "")  # TODO: handle by tokenizer type
-     pass
-
-
- cache = {}
-
- def _mean(datas):
-     return sum(datas) / len(datas)
-
- def iter_vocab(tokenizer_name, from_cache=True, cache_dir="stats/iter_vocab"):
-     """
-     Fast enough that a file cache is not recommended.
-     :param tokenizer:
-     :param from_cache:
-     :return:
-     """
-     cache_dir = os.path.join(CURRENT_DIR, f"../{cache_dir}")
-     os.makedirs(cache_dir, exist_ok=True)
-
-     tokenizer = load_tokener(tokenizer_name)
-
-     # load from cache
-     if from_cache and tokenizer_name in cache:
-         logger.info(f"load {tokenizer_name} from cache")
-         return cache[tokenizer_name]
-
-     has_zh_tokens = []
-     all_zh_tokens = []
-     has_digit_tokens = []
-     all_digit_tokens = []
-     has_space_tokens = []
-     all_space_tokens = []
-
-     # zh_tags = ["all_zh", "has_zh"]
-     # digit_tags = ["all_digit", "has_digit"]
-
-     # zh_token_count = {"total": 0, "包含1个中文单字": 0, "中文多字": 0}
-
-     # symbol_count = 0
-
-     all_single_zh_tokens = set()
-     zh_symbol_count = 0
-     buffer = []
-     for token_id in range(tokenizer.vocab_size):
-         decode_str = tokenizer.decode([token_id], skip_special_tokens=False)
-         token = tokenizer.convert_ids_to_tokens([token_id], skip_special_tokens=False)[0]
-         # tokenizer.convert_tokens_to_string(tokens)
-
-         tags = []
-
-         if token is None:  # some vocabularies have empty (non-contiguous) ids
-             continue
-         if isinstance(token, bytes):
-             token = token.decode("utf-8", errors="ignore")
-
-         digit_count = get_digit_count(decode_str)
-         language_tags = detect_language(decode_str)
-
-         if "Chinese" in language_tags:
-             has_zh_tokens.append(decode_str)
-
-         if is_all_zh(decode_str):
-             tags.append("all_zh")
-             all_zh_tokens.append(decode_str)
-
-         if is_all_digit(decode_str):
-             tags.append("all_digit")
-             all_digit_tokens.append(decode_str)
-         if has_digit(decode_str):
-             tags.append("has_digit")
-             has_digit_tokens.append(decode_str)
-
-         space_count = get_space_count(decode_str)
-         if space_count > 0:
-             has_space_tokens.append(decode_str)
-         if space_count == len(decode_str):
-             all_space_tokens.append(decode_str)
-
-         zh_count = get_zh_count(decode_str)
-
-         buffer.append(json.dumps(
-             {"id": token_id,
-              "token": token,
-              "token_decode": decode_str,
-              "token_dumps": json.dumps(token),
-              "token_unicode": to_unicode(token),
-              "token_len": len(decode_str),
-              "zh_count": zh_count,  # number of Chinese characters in the token
-              # "zh-smpli": zh_hans_count,  # simplified Chinese, zh-Hans
-              "tags": tags,
-              "zh_symbol_count": zh_symbol_count,
-              },
-             ensure_ascii=False) + "\n")
-
-         # if zh_count >= 1:
-         #     zh_token_count["total"] += 1
-         #     if zh_count > 1:
-         #         zh_token_count["中文多字"] += 1
-         #     else:
-         #         zh_token_count["中文单字"] += 1
-         #         all_single_zh_tokens.add(decode_str.strip().replace("#", ""))
-         #
-         # zh_token_count["中文单字-去重后"] = len(all_single_zh_tokens)
-
-     dist_length, mean_length = get_coding_length(tokenizer, zh_tokens, filter=lambda k: not is_zh_char(k))
-
-     # TODO: traditional vs. simplified characters
-
-     result = {
-         "name": tokenizer_name,
-         "impl": str(tokenizer.__class__),
-         "vocab_size": len(tokenizer),
-         "中文token数": len(has_zh_tokens),
-         "中文token的平均长度": None,
-         "纯中文token的平均长度": None,
-         "中文标点数": zh_symbol_count,
-         "中文汉字编码长度均值": mean_length,
-         "中文汉字编码长度分布": json.dumps(dist_length),
-         "纯数字token数": len(all_digit_tokens),
-         "包含数字token数": len(has_digit_tokens),
-         "纯数字token的平均长度": round(_mean([len(item) for item in all_digit_tokens]), 2),
-         "纯中文token数": None,  # all_zh
-         "纯space的token数": len(all_space_tokens),
-         "纯space的token数": len(all_space_tokens),  # "#"
-         "纯space的token的平均长度": None,  # avg_len(tokens_contains_space)
-         "contains_korea": None,
-     }
-     out_path = os.path.join(cache_dir, f"{tokenizer_name}.vocab.jsonl")
-     logger.info(f"saving vocab to {out_path}")
-     with open(out_path, "w", encoding="utf-8") as f_out:
-         f_out.write(json.dumps(result, ensure_ascii=False) + "\n")
-         for line in buffer:
-             f_out.write(line)
-     cache[tokenizer_name] = result
-     return result
-
-
- if __name__ == "__main__":
-     # test_coding_length(jd_vocab_tokens, filter=lambda k: not is_chinese(k))
-     # test_coding_length(zh_punc)
-     # test_coding_length(zh_iterator())
-
-     # from vocab.chatglm2_6b import tokenizer; name = "chatglm2_6b"
-     # from vocab.chatglm_6b import tokenizer; name="chatglm_6b"
-     # from vocab.baichuan2 import tokenizer; name="baichuan2"
-     name = "gpt_4"
-     # name="gpt2"
-     # name="qwen1_5_14b_chat"
-     # name="gpt_nexo_20b"
-     # name="fastchat_t5_3b"
-
-     print(iter_vocab(name))
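The deleted `get_coding_length` above measures how many tokens a single CJK character costs under a given tokenizer. A hedged illustration of the same measurement with an off-the-shelf tokenizer (not this repo's code; the model name is only an example):

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("gpt2")  # example model only
for ch in ["中", "国", "镕"]:
    ids = tok.encode(ch)
    # Byte-level BPE often needs several tokens for one CJK character;
    # the mean/distribution computed above summarizes exactly this cost.
    print(ch, ids, len(ids))
```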
utils/convert_sp_to_json.py DELETED
@@ -1,4 +0,0 @@
-
- from vocab.baichuan_7b import tokenizer
-
- tokenizer.sp
utils/fn_util.py DELETED
File without changes
utils/lang_util.py CHANGED
@@ -18,43 +18,39 @@ import re
  # Most tokens are 'latin', so that script is not counted here.
  common = ['Chinese', 'Japanese-Kana', 'Korean', 'Arabic', 'number']
 
+ language_ranges = {
+     ('Arabic', 'ar'): r'[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\uFB50-\uFDFF\uFE70-\uFEFF]',
+     # 'CJK' https://en.wikipedia.org/wiki/CJK_Unified_Ideographs
+     ('Chinese', 'zh'): r'[\u4e00-\u9fff]',
+     ('Japanese', 'ja'): r'[\u3040-\u309F\u30A0-\u30FF\u4E00-\u9FFF\u3400-\u4DBF]',
+     # https://stackoverflow.com/questions/19899554/unicode-range-for-japanese
+     # Kana type refers to Japanese hiragana and katakana characters that represent phonetic sounds in the Japanese language.
+     ('Japanese-Kana', 'ja-kana'): r'[\u3040-\u309F\u30A0-\u30FF]',  # Hiragana & Katakana
+     ('Korean', 'ko'): r'[\uac00-\ud7a3]',
+
+     # Latin scripts
+     # ('Latin', 'la'): r'[\u0000-\u007F\u0080-\u00FF]',
+     # ('English', 'en'): r'[A-Za-z]',  # may overlap with other languages that use basic Latin letters
+     # ('French', 'fr'): r'[\u00C0-\u00FF]',
+     # ('German', 'de'): r'[\u00C4\u00D6\u00DC\u00E4\u00F6\u00FC\u00DF]',
+     # ('Spanish-specific'): r'[\u00C1\u00E1\u00C9\u00E9\u00CD\u00ED\u00D3\u00F3\u00DA\u00FA\u00D1\u00F1\u00FC]',  # characters specific to Spanish
+
+     # Slavic scripts
+     # ('Cyrillic', ''): r'[\u0400-\u04FF\u0500-\u052F\u2DE0-\u2DFF\uA640-\uA69F]',
+
+     #
+     # 'Greek': r'[\u0370-\u03FF\u1F00-\u1FFF]',  # Greek alphabet
+     # 'Hebrew': r'[\u0590-\u05FF\uFB1D-\uFB4F]',  # Hebrew
+ }
+
  def detect_language(s):
      # Unicode character ranges for each language
-     language_ranges = {
-         'Arabic': r'[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\uFB50-\uFDFF\uFE70-\uFEFF]',
-         # 'CJK' https://en.wikipedia.org/wiki/CJK_Unified_Ideographs
-         'Chinese': r'[\u4e00-\u9fff]',
-         'Japanese': r'[\u3040-\u309F\u30A0-\u30FF\u4E00-\u9FFF\u3400-\u4DBF]',  # https://stackoverflow.com/questions/19899554/unicode-range-for-japanese
-         'Japanese-Kana': r'[\u3040-\u309F\u30A0-\u30FF]',  # Hiragana & Katakana
-         # 'Korean': r'[\uac00-\ud7a3]',
-         'Hangul': r'[\uac00-\ud7a3]',
-
-         # Latin scripts
-         'Latin': r'[\u0000-\u007F\u0080-\u00FF]',
-         'English': r'[A-Za-z]',  # may overlap with other languages that use basic Latin letters
-         'French': r'[\u00C0-\u00FF]',
-         'German': r'[\u00C4\u00D6\u00DC\u00E4\u00F6\u00FC\u00DF]',
-         'Spanish-': r'[\u00C1\u00E1\u00C9\u00E9\u00CD\u00ED\u00D3\u00F3\u00DA\u00FA\u00D1\u00F1\u00FC]',  # characters specific to Spanish
-
-         # Slavic scripts
-         'Cyrillic': r'[\u0400-\u04FF\u0500-\u052F\u2DE0-\u2DFF\uA640-\uA69F]',
-
-         #
-         'Greek': r'[\u0370-\u03FF\u1F00-\u1FFF]',  # Greek alphabet
-         'Hebrew': r'[\u0590-\u05FF\uFB1D-\uFB4F]',  # Hebrew
-     }
-
      detected_languages = []
-
      for language, pattern in language_ranges.items():
          if re.search(pattern, s):
              detected_languages.append(language)
 
-     return detected_languages if detected_languages else ['Unknown']
+     return detected_languages
 
 
  if __name__ == "__main__":
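A minimal usage sketch of the refactored function (my example, not from the repo). Note that the dictionary keys are now `(name, code)` tuples and that unmatched text now yields an empty list instead of `['Unknown']`:

```python
from utils.lang_util import detect_language

print(detect_language("こんにちは世界"))  # e.g. [('Chinese', 'zh'), ('Japanese', 'ja'), ('Japanese-Kana', 'ja-kana')]
print(detect_language("hello"))          # [] -- the Latin ranges are commented out, so nothing matches
```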
utils/lang_util_2.py DELETED
@@ -1,115 +0,0 @@
- """
- Japanese, Korean, etc.
- https://www.cnblogs.com/luoganttcc/p/16605150.html
- https://zhuanlan.zhihu.com/p/618684374
- - https://zhuanlan.zhihu.com/p/84625185 (recommended)
-
-
- ## Related packages
-
- import opencc
- import langid
- import langdetect
- https://github.com/pemistahl/lingua-py
- - how it works:
-
-
- """
-
-
- from zhon.hanzi import punctuation as zh_punc
-
- def is_zh_char(uchar):
-     """
-     https://github.com/fxsjy/jieba/blob/master/jieba/__init__.py#L48
-     re.compile("([\u4E00-\u9FD5]+)", re.U)
-     """
-     return u'\u4e00' <= uchar <= u'\u9fa5'
-
- def has_zh_punc(text):
-     """
-     Whether the text contains Chinese punctuation.
-     """
-     return any(ch in zh_punc for ch in text)
-
-
- def has_zh(text):
-     """ contains Chinese characters """
-     return any(is_zh_char(ch) for ch in text)
-
-
- def get_zh_count(text):
-     return sum([is_zh_char(uchar) for uchar in text])
-
-
- def is_all_zh(text):
-     return all(is_zh_char(char) for char in text)
-
-
- def is_all_en(text):
-     return text.encode('utf-8').isalpha()
-
-
- ranges = [
-     {"from": ord(u"\u3300"), "to": ord(u"\u33ff")},  # compatibility ideographs
-     {"from": ord(u"\ufe30"), "to": ord(u"\ufe4f")},  # compatibility ideographs
-     {"from": ord(u"\uf900"), "to": ord(u"\ufaff")},  # compatibility ideographs
-     {"from": ord(u"\U0002F800"), "to": ord(u"\U0002fa1f")},  # compatibility ideographs
-     {'from': ord(u'\u3040'), 'to': ord(u'\u309f')},  # Japanese Hiragana (96 code points)
-     {"from": ord(u"\u30a0"), "to": ord(u"\u30ff")},  # Japanese Katakana (96 code points)
-     {"from": ord(u"\u2e80"), "to": ord(u"\u2eff")},  # cjk radicals supplement
-     {"from": ord(u"\u4e00"), "to": ord(u"\u9fff")},  # Chinese, u"\u4e00"-'\u9fa5'
-     {"from": ord(u"\u3400"), "to": ord(u"\u4dbf")},  #
-     {"from": ord(u"\U00020000"), "to": ord(u"\U0002a6df")},
-     {"from": ord(u"\U0002a700"), "to": ord(u"\U0002b73f")},
-     {"from": ord(u"\U0002b740"), "to": ord(u"\U0002b81f")},
-     {"from": ord(u"\U0002b820"), "to": ord(u"\U0002ceaf")}  # included as of Unicode 8.0
- ]
-
- # Korean: [\uac00-\ud7ff]
-
-
- def is_cjk(char):
-     """
-     CJK (Chinese, Japanese, Korean).
-     Japanese uses many Han characters; there are more than 20,000 Japanese kanji.
-     Korean has Hangul (50+ letters) and more than 20,000 Korean hanja.
-     """
-     return any([range["from"] <= ord(char) <= range["to"] for range in ranges])
-
-
- def cjk_substrings(string):
-     i = 0
-     while i < len(string):
-         if is_cjk(string[i]):
-             start = i
-             while is_cjk(string[i]): i += 1
-             yield string[start:i]
-         i += 1
-
-
- def aa():
-     # string = "sdf344asfasf天地方益3権sdfsdf".decode("utf-8")
-     for idx, item in enumerate(ranges):
-         print(idx, end=": ")
-         for j in range(10):
-             print(chr(item["from"] + j), end=", ")
-         print("")
-     # for sub in cjk_substrings(string):
-     #     string = string.replace(sub, "(" + sub + ")")
-     # print(string)
-
-
- def is_traditional_chinese(text):
-     cc = opencc.OpenCC('t2s')
-     converted_text = cc.convert(text)
-     if converted_text != text:
-         return True
-     return False
-
-
- # aa()
utils/oov.md ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ ```sh
3
+ ###################################
4
+ ClueAI/ChatYuan-large-v2, <class 'tokenizers.models.Unigram'>
5
+ reversible: false; unk_token: <unk>, 2, unk_ratio: 0.2000; oov: []
6
+ text[7] = "Замглавы управления развития; 특히 주소 15~17번 홀에선 3연속; 確実に春が近づいてること; a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך ",
7
+ decoding[7] = "<unk>амглав<unk> у<unk>равления развития; <unk> <unk> 15~17<unk> <unk> 3<unk>; 確実に春が近づいてること; a közoktatással? _ Belföld; pum<unk>, i vjet<unk>r, vjeç; <unk>ا<unk> <unk> <unk>ا<unk> ; <unk> <unk> <unk> <unk> <unk> <unk>; <unk> <unk> ; <unk>зейн<unk>я асо<unk>:; <unk> <unk> <unk> <unk>; <unk>;<unk>"
8
+
9
+
10
+ ###################################
11
+ ClueAI/PromptCLUE-base, <class 'tokenizers.models.Unigram'>
12
+ reversible: false; unk_token: <unk>, 2, unk_ratio: 0.2000; oov: []
13
+ text[7] = "Замглавы управления развития; 특히 주소 15~17번 홀에선 3연속; 確実に春が近づいてること; a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך ",
14
+ decoding[7] = "<unk>амглав<unk> у<unk>равления развития; <unk> <unk> 15~17<unk> <unk> 3<unk>; 確実に春が近づいてること; a közoktatással? _ Belföld; pum<unk>, i vjet<unk>r, vjeç; <unk>ا<unk> <unk> <unk>ا<unk> ; <unk> <unk> <unk> <unk> <unk> <unk>; <unk> <unk> ; <unk>зейн<unk>я асо<unk>:; <unk> <unk> <unk> <unk>; <unk>;<unk>"
15
+ ###################################
16
+ CohereForAI/aya-101, <class 'tokenizers.models.Unigram'>
17
+ reversible: false; unk_token: <unk>, 2, unk_ratio: 0.0079; oov: []
18
+ text[73] = " a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך ",
19
+ decoding[73] = "a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; <unk>❤❥웃유♋☮✊;װיקיװערטערבוך"
20
+ ###################################
21
+ FacebookAI/xlm-roberta-base, <class 'tokenizers.models.Unigram'>
22
+ reversible: false; unk_token: <unk>, 3, unk_ratio: 0.0096; oov: []
23
+ text[73] = " a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך ",
24
+ decoding[73] = "a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; <unk>❤❥웃유♋☮✊;װיקיװערטערבוך"
25
+ ###################################
26
+ OrionStarAI/Orion-14B-Chat, sp_model, byte_num: 0
27
+ reversible: false; unk_token: <unk>, 0, unk_ratio: 0.0495; oov: []
28
+ text[71] = "; a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך ",
29
+ decoding[71] = "; a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئ<unk> ⁇ ردوغان <unk> ⁇ قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለ<unk> ⁇ ጭ የግድግ<unk> ⁇ ; Дзейныя асобы:; « <unk> ⁇ <unk> ⁇ <unk> ⁇ ; \t\n <unk> ⁇ ❤❥웃유♋☮✊; <unk> ⁇ יקי<unk> ⁇ ערטערבוך "
30
+ ###################################
31
+ THUDM/chatglm-6b, byte_num: 256
32
+ reversible: false; unk_token: <unk>, 0, unk_ratio: 0.0000; oov: []
33
+ text[237] = "\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך ",
34
+ decoding[237] = " 🦙❤❥웃유♋☮✊;װיקיװערטערבוך"
35
+ ###################################
36
+ abeja/gpt-neox-japanese-2.7b, japanese-bpe: https://github.com/tanreinama/Japanese-BPEEncoder_V2
37
+ reversible: false; unk_token: <|endoftext|>, 31999, unk_ratio: 0.0000; oov: []
38
+ text[7] = "Замглавы управления развития; 특히 주소 15~17번 홀에선 3연속; 確実に春が近づいてること; a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך ",
39
+ decoding[7] = "���������������� �������������������� ����������������; ������ ������ 15~17��� ��������� 3������; 確実に春が近づいてること; a k��zoktat��ssal? _ Belf��ld; pum��, i vjet��r, vje��; ���������������� ���� ���������������������� ; ��������������� ��������� ������ ��������� ������ ������������������������; ��������������� ��������������� ; �������������� ����������:; ǀ ��������������������������� ��������������� ���������������; \t\n\n🐯❤‖������🟥🟥🤚;��������������������������"
40
+
41
+
42
+ ###################################
43
+ baichuan-inc/Baichuan-7B, sp_model, byte_num: 256
44
+ reversible: false; unk_token: <unk>, 0, unk_ratio: 0.0000; oov: []
45
+ text[237] = "\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך ",
46
+ decoding[237] = " 🦙❤❥웃유♋☮✊;װיקיװערטערבוך "
47
+ ###################################
48
+ ckiplab/gpt2-base-chinese, <class 'tokenizers.models.WordPiece'>
49
+ reversible: false; unk_token: [UNK], 100, unk_ratio: 0.1185; oov: []
50
+ text[5] = "; Замглавы управления развития; 특히 주소 15~17번 홀에선 3연속; 確実に春が近づいてること; a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך ",
51
+ decoding[5] = " ; замглавы управления развития ; 특히 주소 15 ~ 17번 홀에선 3연속 ; 確 実 に 春 か 近 ついてること ; a kozoktatassal? _ belfold ; pume, i vjeter, vjec ; [UNK] [UNK] [UNK] ; [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] ; [UNK] [UNK] ; дзеиныя асобы : ; « [UNK] [UNK] [UNK] ; [UNK] ; [UNK]"
52
+
53
+
54
+ ###################################
55
+ cl-tohoku/bert-base-japanese, wordpiece.MecabTokenizer, 支持byte-level https://github.com/polm/fugashi
56
+ reversible: false; unk_token: [UNK], 1, unk_ratio: 0.3951; oov: []
57
+ text[5] = "; Замглавы управления развития; 특히 주소 15~17번 홀에선 3연속; 確実に春が近づいてること; a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך ",
58
+ decoding[5] = " ; [UNK] [UNK] [UNK] ; [UNK] [UNK] 15 ~ 17 [UNK] [UNK] 3 [UNK] ; 確実 に 春 が 近づい てる こと ; a közoktatással? _ Belföld ; [UNK], i [UNK], vjeç ; [UNK] [UNK] [UNK] ; [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] ; [UNK] [UNK] ; [UNK] [UNK] :; [UNK] [UNK] [UNK] [UNK] ; [UNK] [UNK] [UNK] [UNK] [UNK]"
59
+
60
+
61
+ ###################################
62
+ clue/roberta_chinese_clue_tiny, <class 'tokenizers.models.WordPiece'>
63
+ reversible: false; unk_token: [UNK], 100, unk_ratio: 0.3580; oov: []
64
+ text[5] = "; Замглавы управления развития; 특히 주소 15~17번 홀에선 3연속; 確実に春が近づいてること; a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך ",
65
+ decoding[5] = " ; [UNK] [UNK] [UNK] ; [UNK] [UNK] 15 ~ [UNK] [UNK] [UNK] ; [UNK] 実 [UNK] 春 [UNK] 近 [UNK] ; a kozoktatassal? _ belfold ; pume, i vjeter, vjec ; [UNK] [UNK] [UNK] ; [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] ; [UNK] [UNK] ; [UNK] асобы : ; « [UNK] [UNK] [UNK] ; [UNK] ; [UNK]"
66
+
67
+
68
+ ###################################
69
+ dbmdz/bert-base-german-uncased, <class 'tokenizers.models.WordPiece'>
70
+ reversible: false; unk_token: [UNK], 101, unk_ratio: 0.4459; oov: []
71
+ text[5] = "; Замглавы управления развития; 특히 주소 15~17번 홀에선 3연속; 確実に春が近づいてること; a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך ",
72
+ decoding[5] = " ; [UNK] [UNK] [UNK] ; [UNK] [UNK] 15 ~ [UNK] [UNK] [UNK] ; [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] ; a kozoktatassal? _ belfold ; pume, i vjeter, vjec ; [UNK] [UNK] [UNK] ; [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] ; [UNK] [UNK] ; [UNK] [UNK] : ; « [UNK] [UNK] [UNK] ; [UNK] ; [UNK]"
73
+ ###################################
74
+ deepseek-ai/deepseek-coder-33b-instruct, <class 'tokenizers.models.BPE'>
75
+ reversible: false; unk_token: None, None, unk_ratio: 0.0000; oov: []
76
+ text[77] = "özoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך ",
77
+ decoding[77] = "�zoktatással? _ Belf�ld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך "
78
+ Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
79
+ [2024-05-12 00:30:36] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer eson/kplug-base-encoder
80
+ ###################################
81
+ deepseek-ai/deepseek-llm-7b-base, <class 'tokenizers.models.BPE'>
82
+ reversible: false; unk_token: None, None, unk_ratio: 0.0000; oov: []
83
+ text[77] = "özoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך ",
84
+ decoding[77] = "�zoktatással? _ Belf�ld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך "
85
+ [2024-05-12 00:30:56] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer fnlp/moss-moon-003-sft
86
+ ###################################
87
+ eson/kplug-base-encoder, <class 'tokenizers.models.WordPiece'>
88
+ reversible: false; unk_token: [UNK], 100, unk_ratio: 0.3625; oov: []
89
+ text[5] = "; Замглавы управления развития; 특히 주소 15~17번 홀에선 3연속; 確実に春が近づいてること; a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך ",
90
+ decoding[5] = " ; [UNK] [UNK] [UNK] ; [UNK] [UNK] 15 ~ [UNK] [UNK] [UNK] ; [UNK] 実 [UNK] 春 [UNK] 近 [UNK] ; a kozoktatassal? _ belfold ; pume, i vjeter, vjec ; [UNK] [UNK] [UNK] ; [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] ; [UNK] [UNK] ; [UNK] асобы : ; « [UNK] [UNK] [UNK] ; [UNK] ; [UNK]"
91
+ Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
92
+ [2024-05-12 00:31:36] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer google-bert/bert-base-cased
93
+ ###################################
94
+ fnlp/moss-moon-003-sft, 应该是 sentencepiece.byte_bpe,待确认
95
+ reversible: false; unk_token: <|endoftext|>, 106028, unk_ratio: 0.0000; oov: []
96
+ text[74] = "a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך ",
97
+ decoding[74] = " a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך "
98
+ ###################################
99
+ google-bert/bert-base-cased, <class 'tokenizers.models.WordPiece'>
100
+ reversible: false; unk_token: [UNK], 100, unk_ratio: 0.1732; oov: []
101
+ text[5] = "; Замглавы управления развития; 특히 주소 15~17번 홀에선 3연속; 確実に春が近づいてること; a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך ",
102
+ decoding[5] = " ; Замглавы управления развития ; [UNK] [UNK] 15 ~ [UNK] [UNK] [UNK] ; [UNK] [UNK] に [UNK] [UNK] [UNK] [UNK] ; a közoktatással? _ Belföld ; pumë, i vjetër, vjeç ; [UNK] [UNK] قىرغىزىستان ; निम्न में से [UNK] सा [UNK] ; [UNK] [UNK] ; Дзейныя асобы : ; « [UNK] [UNK] [UNK] ; [UNK] ; [UNK]"
103
+ [2024-05-12 00:31:56] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer google-bert/bert-base-chinese
104
+ [2024-05-12 00:32:16] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer google-bert/bert-base-german-cased
105
+ ###################################
106
+ google-bert/bert-base-chinese, <class 'tokenizers.models.WordPiece'>
107
+ reversible: false; unk_token: [UNK], 100, unk_ratio: 0.3704; oov: []
108
+ text[5] = "; Замглавы управления развития; 특히 주소 15~17번 홀에선 3연속; 確実に春が近づいてること; a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך ",
109
+ decoding[5] = " ; [UNK] управления развития ; [UNK] [UNK] 15 ~ [UNK] [UNK] [UNK] ; 確 実 に 春 [UNK] 近 [UNK] ; a [UNK]? _ [UNK] ; [UNK], i [UNK], [UNK] ; [UNK] [UNK] [UNK] ; [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] ; [UNK] [UNK] ; [UNK] асобы : ; « [UNK] [UNK] [UNK] ; [UNK] ; [UNK]"
110
+ ###################################
111
+ google-bert/bert-base-german-cased, <class 'tokenizers.models.WordPiece'>
112
+ reversible: false; unk_token: [UNK], 2, unk_ratio: 0.5938; oov: []
113
+ text[5] = "; Замглавы управления развития; 특히 주소 15~17번 홀에선 3연속; 確実に春が近づいてること; a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך ",
114
+ decoding[5] = " ; [UNK] [UNK] [UNK] ; [UNK] [UNK] 15 ~ [UNK] [UNK] [UNK] ; [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] ; a [UNK]? _ Belföld ; [UNK], i [UNK], [UNK] ; [UNK] [UNK] [UNK] ; [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] ; [UNK] [UNK] ; [UNK] [UNK] : ; [UNK] [UNK] [UNK] [UNK] ; [UNK] ; [UNK]"
115
+ [2024-05-12 00:32:36] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer google-bert/bert-base-multilingual-cased
116
+ [2024-05-12 00:32:57] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer google-bert/bert-base-multilingual-uncased
117
+ ###################################
118
+ google-bert/bert-base-multilingual-cased, <class 'tokenizers.models.WordPiece'>
119
+ reversible: false; unk_token: [UNK], 100, unk_ratio: 0.0531; oov: []
120
+ text[5] = "; Замглавы управления развития; 특히 주소 15~17번 홀에선 3연속; 確実に春が近づいてること; a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך ",
121
+ decoding[5] = " ; Замглавы управления развития ; 특히 주소 15 ~ 17번 홀에선 3연속 ; 確 実 に 春 が 近 づいてること ; a közoktatással? _ Belföld ; pumë, i vjetër, vjeç ; [UNK] [UNK] قىرغىزىستان ; निम्न में से कौन सा हारडवेयर ; [UNK] [UNK] ; Дзейныя асобы : ; « અમરેલીનાં મહિલા વિકાસ ; [UNK] ; [UNK]"
122
+ [2024-05-12 00:33:17] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer google-bert/bert-base-uncased
123
+ ###################################
124
+ google-bert/bert-base-multilingual-uncased, <class 'tokenizers.models.WordPiece'>
125
+ reversible: false; unk_token: [UNK], 100, unk_ratio: 0.0360; oov: []
126
+ text[5] = "; Замглавы управления развития; 특히 주소 15~17번 홀에선 3연속; 確実に春が近づいてること; a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך ",
127
+ decoding[5] = " ; замглавы управления развития ; 특히 주소 15 ~ 17번 홀에선 3연속 ; 確 実 に 春 か 近 ついてること ; a kozoktatassal? _ belfold ; pume, i vjeter, vjec ; يەردوغان ۋە قىرغىزىستان ; निमन म स कौन सा हारडवयर ; [UNK] [UNK] ; дзеиныя асобы : ; « અમરલીના મહિલા વિકાસ ; [UNK] ; [UNK]"
128
+ [2024-05-12 00:33:37] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer google-t5/t5-large
129
+ ###################################
130
+ google-bert/bert-base-uncased, <class 'tokenizers.models.WordPiece'>
131
+ reversible: false; unk_token: [UNK], 100, unk_ratio: 0.0867; oov: []
132
+ text[5] = "; Замглавы управления развития; 특히 주소 15~17번 홀에선 3연속; 確実に春が近づいてること; a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך ",
133
+ decoding[5] = " ; замглавы управления развития ; 특히 주소 15 ~ 17번 홀에선 3연속 ; [UNK] [UNK] に 春 か [UNK] ついてること ; a kozoktatassal? _ belfold ; pume, i vjeter, vjec ; [UNK] [UNK] قىرغىزىستان ; निमन म स [UNK] सा हारडवयर ; [UNK] [UNK] ; дзеиныя асобы : ; « [UNK] [UNK] [UNK] ; [UNK] ; [UNK]"
134
+ ###################################
135
+ google-t5/t5-large, <class 'tokenizers.models.Unigram'>
136
+ reversible: false; unk_token: <unk>, 2, unk_ratio: 0.2769; oov: []
137
+ text[7] = "Замглавы управления развития; 특히 주소 15~17번 홀에선 3연속; 確実に春が近づいてること; a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך ",
138
+ decoding[7] = "<unk>ам<unk>лав<unk> у<unk>равлени<unk> ра<unk>вити<unk>; <unk> <unk> 15<unk>17<unk> <unk> 3<unk>; <unk>; a közoktatással? _ Belföld; pum<unk>, i vjet<unk>r, vjeç; <unk> <unk> <unk> ; <unk> <unk> <unk> <unk> <unk> <unk>; <unk> <unk> ; <unk>е<unk>н<unk> асо<unk>:; « <unk> <unk> <unk>; <unk>;<unk>"
139
+ [2024-05-12 00:34:57] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer google/byt5-small
140
+ [2024-05-12 00:35:18] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer google/gemma-7b
141
+ [2024-05-12 00:35:39] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer google/mobilebert-uncased
142
+ [2024-05-12 00:36:59] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer google/mt5-large
143
+ ###################################
144
+ google/mobilebert-uncased, <class 'tokenizers.models.WordPiece'>
145
+ reversible: false; unk_token: [UNK], 100, unk_ratio: 0.0867; oov: []
146
+ text[5] = "; Замглавы управления развития; 특히 주소 15~17번 홀에선 3연속; 確実に春が近づいてること; a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך ",
147
+ decoding[5] = " ; замглавы управления развития ; 특히 주소 15 ~ 17번 홀에선 3연속 ; [UNK] [UNK] に 春 か [UNK] ついてること ; a kozoktatassal? _ belfold ; pume, i vjeter, vjec ; [UNK] [UNK] قىرغىزىستان ; निमन म स [UNK] सा हारडवयर ; [UNK] [UNK] ; дзеиныя асобы : ; « [UNK] [UNK] [UNK] ; [UNK] ; [UNK]"
148
+ C:\Users\xusong28\Miniconda3\envs\py3.10-torch1.13-hf.latest\lib\site-packages\transformers\convert_slow_tokenizer.py:560: UserWarning: The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option which is not implemented in the fast tokenizers. In practice this means that the fast version of the tokenizer can produce unknown tokens whereas the sentencepiece version would have converted these unknown tokens into a sequence of byte tokens matching the original piece of text.
149
+ warnings.warn(
150
+ [2024-05-12 00:37:23] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer google/switch-c-2048
151
+ ###################################
152
+ google/mt5-large, <class 'tokenizers.models.Unigram'>
153
+ reversible: false; unk_token: <unk>, 2, unk_ratio: 0.0079; oov: []
154
+ text[73] = " a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך ",
155
+ decoding[73] = "a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; <unk>❤❥웃유♋☮✊;װיקיװערטערבוך"
156
+ [2024-05-12 00:37:43] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer hfl/chinese-alpaca-lora-7b
157
+ ###################################
158
+ google/switch-c-2048, <class 'tokenizers.models.Unigram'>
159
+ reversible: false; unk_token: <unk>, 2, unk_ratio: 0.2769; oov: []
160
+ text[7] = "Замглавы управления развития; 특히 주소 15~17번 홀에선 3연속; 確実に春が近づいてること; a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך ",
161
+ decoding[7] = "<unk>ам<unk>лав<unk> у<unk>равлени<unk> ра<unk>вити<unk>; <unk> <unk> 15<unk>17<unk> <unk> 3<unk>; <unk>; a közoktatással? _ Belföld; pum<unk>, i vjet<unk>r, vjeç; <unk> <unk> <unk> ; <unk> <unk> <unk> <unk> <unk> <unk>; <unk> <unk> ; <unk>е<unk>н<unk> асо<unk>:; « <unk> <unk> <unk>; <unk>;<unk>"
162
+ You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
163
+ [2024-05-12 00:38:04] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer hfl/chinese-llama-2-7b
164
+ [2024-05-12 00:38:25] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer hfl/chinese-llama-lora-7b
165
+ [2024-05-12 00:38:46] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer hfl/llama-3-chinese-8b
166
+ Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
167
+ [2024-05-12 00:39:07] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer hpcai-tech/grok-1
168
+ [2024-05-12 00:39:28] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer internlm/internlm-chat-7b
169
+ [2024-05-12 00:40:09] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer internlm/internlm-xcomposer-7b
170
+ [2024-05-12 00:40:31] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer internlm/internlm2-chat-7b
171
+ [2024-05-12 00:41:13] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer internlm/internlm2-math-7b
172
+ [2024-05-12 00:41:35] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer lmsys/fastchat-t5-3b-v1.0
173
+ Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
174
+ ###################################
175
+ [2024-05-12 00:41:55] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer meta-llama/Llama-2-7b-chat
176
+ lmsys/fastchat-t5-3b-v1.0, sp_model, byte_num: 0
177
+ reversible: false; unk_token: <unk>, 2, unk_ratio: 0.2105; oov: []
178
+ text[7] = "Замглавы управления развития; 특히 주소 15~17번 홀에선 3연속; 確実に春が近づいてること; a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך ",
179
+ decoding[7] = " <unk> ам<unk> лав<unk> у<unk> равлени<unk> ра<unk> вити<unk>; <unk> <unk> 15<unk> 17<unk> <unk> 3<unk>; <unk>; a közoktatással? _ Belföld; pum<unk>, i vjet<unk>r, vjeç; <unk> <unk> <unk> ; <unk> <unk> <unk> <unk> <unk> <unk>; <unk> <unk> ; <unk> е<unk> н<unk> асо<unk>:; « <unk> <unk> <unk>; \t \n <unk> ;<unk> "
180
+ [2024-05-12 00:41:55] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer meta-llama/Meta-Llama-3-8B
181
+ [2024-05-12 00:41:55] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer microsoft/Phi-3-mini-4k-instruct
182
+ Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
183
+ [2024-05-12 00:42:16] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer microsoft/phi-1
184
+ [2024-05-12 00:42:36] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer microsoft/phi-2
185
+ Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
186
+ [2024-05-12 00:42:56] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer mistralai/Mistral-7B-v0.1
187
+ [2024-05-12 00:43:16] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer mistralai/Mixtral-8x7B-v0.1
188
+ [2024-05-12 00:43:37] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer openai-community/gpt2
189
+ [2024-05-12 00:43:57] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer openai/code-davinci-002
190
+ [2024-05-12 00:43:57] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer openai/gpt-3.5-turbo
191
+ [2024-05-12 00:43:57] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer openai/gpt-4
192
+ [2024-05-12 00:43:57] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer openai/text-davinci-003
193
+ [2024-05-12 00:43:57] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer paust/pko-t5-large
194
+ Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
195
+ [2024-05-12 00:44:18] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer thu-coai/CharacterGLM-6B
196
+ [2024-05-12 00:44:58] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer tiiuae/falcon-180b
197
+ [2024-05-12 00:45:19] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer tiiuae/falcon-7b
198
+
199
+ Process finished with exit code 0
200
+
201
+
202
+ ```
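The `byte_num: 0` reported above for lmsys/fastchat-t5-3b-v1.0 is the root cause of the irreversible decode: a sentencepiece model with no byte-fallback pieces can only map out-of-vocabulary characters to `<unk>`. A minimal sketch of that byte-piece count, assuming `sentencepiece` is installed and `tokenizer.model` is a local model file (hypothetical path):

```python
# Count byte-fallback pieces in a sentencepiece model (sketch).
import sentencepiece as spm

sp = spm.SentencePieceProcessor(model_file="tokenizer.model")  # hypothetical path
byte_num = sum(sp.is_byte(i) for i in range(sp.get_piece_size()))
print(f"sp_model, byte_num: {byte_num}")  # 0 => OOV characters can only become <unk>
```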
utils/oov_util.py CHANGED
@@ -2,11 +2,117 @@
2
 
3
 
4
  import os
 
 
5
 
6
- CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
7
 
8
- space_tokens = ["空格 ,两个空格 ,三个空格 ,制表符\t,换行符\n"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
 
10
 
11
 
12
- docs = [line.strip() for line in open(os.path.join(CURRENT_DIR, "test.txt"), "r", encoding="utf-8")]
 
2
 
3
 
4
  import os
5
+ import json
6
+ from vocab import all_tokenizer_config, load_tokenizer, TokenizerImpl
7
 
 
8
 
9
+ text = "hello; Замглавы управления развития; 특히 주소 15~17번 홀에선 3연속;" \
10
+ " 確実に春が近づいてること; a közoktatással? _ Belföld;" \
11
+ " pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ;" \
12
+ " निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:;" \
13
+ " « અમરેલીનાં મહિલા વિકાસ; 🦙❤❥웃유♋☮✊;" \
14
+ "װיקיװערטערבוך "
15
+ whitespace = "\t \n\n\r "
16
+ bytes = b"\x00\x01\x02\x03\x04".decode('utf-8')
17
+
18
+ text += whitespace
19
+
20
+
21
+ def get_unk(tokenizer_config):
22
+ tokenizer = load_tokenizer(tokenizer_config)
23
+ if hasattr(tokenizer, "unk_token"):
24
+ return f"{tokenizer.unk_token}, {tokenizer.unk_token_id}"
25
+ else:
26
+ return "unk_token not found"
27
+
28
+
29
+ # def infer_tokenizer_impl(tokenizer_config):
30
+ def infer_tokenizer_type(tokenizer_config):
31
+ tokenizer = load_tokenizer(tokenizer_config)
32
+ if tokenizer_config.impl == TokenizerImpl.TikToken:
33
+ return "tiktoken"
34
+ if hasattr(tokenizer, "backend_tokenizer"):
35
+ return str(type(tokenizer.backend_tokenizer.model)) # type(tokenizer._tokenizer.model))
36
+ # orion: sp_model.Load(vocab_file), inherits from PreTrainedTokenizer
37
+ elif hasattr(tokenizer, "sp_model"): # 基于 sentencepiece 包
38
+ # for i in range(tokenizer.sp_model.piece_size()):
39
+ # if tokenizer.sp_model.is_byte(i):
40
+ # print("")
41
+ return f"sp_model, byte_num: {sum([tokenizer.sp_model.is_byte(i) for i in range(tokenizer.sp_model.piece_size())])}"
42
+
43
+ # sp.Load(model_path), and includes an image_tokenizer
44
+ elif "glm-" in tokenizer_config.name_or_path:
45
+ return f"byte_num: {sum([tokenizer.sp_tokenizer.text_tokenizer.sp.is_byte(i) for i in range(tokenizer.sp_tokenizer.text_tokenizer.sp.piece_size())])}"
46
+ # sp.Load(model_path), without an image_tokenizer
47
+ elif "glm2-" in tokenizer_config.name_or_path \
48
+ or "glm3-" in tokenizer_config.name_or_path \
49
+ or "CharacterGLM-6B" in tokenizer_config.name_or_path:
50
+ return f"byte_num: {sum([tokenizer.tokenizer.sp_model.is_byte(i) for i in range(tokenizer.tokenizer.sp_model.piece_size())])}"
51
+ elif "abeja/gpt-neox-japanese-2.7b" == tokenizer_config.name_or_path: # 支持 byte-level,解决oov问题
52
+ return f"japanese-bpe: https://github.com/tanreinama/Japanese-BPEEncoder_V2"
53
+ # bert-base-japanese is special in that "word_tokenizer_type": "mecab", see https://huggingface.co/tohoku-nlp/bert-base-japanese/blob/main/tokenizer_config.json
54
+ elif "bert-base-japanese" in tokenizer_config.name_or_path:
55
+ return "wordpiece.MecabTokenizer, 支持byte-level https://taku910.github.io/mecab/"
56
+ elif "moss" in tokenizer_config.name_or_path:
57
+ return "应该是 sentencepiece.byte_bpe,待确认"
58
+ elif "byt5" in tokenizer_config.name_or_path:
59
+ return "未知,待定"
60
+ else:
61
+ print("catch", tokenizer_config.name_or_path)
62
+ raise "error"
63
+
64
+
65
+
66
+
67
+
68
+ def test_reversible(tokenizer_config):
69
+ """
70
+ Why does xlm-roberta-base have so few OOVs? Is it because it has byte pieces?
71
+ :param tokenizer_config:
72
+ :return:
73
+ """
74
+ tokenizer = load_tokenizer(tokenizer_config)
75
+ encoding = tokenizer.encode(text, add_special_tokens=False)
76
+ decoding = tokenizer.decode(encoding)
77
+
78
+ if text in decoding:
79
+ # print(tokenizer_config.name, tokenizer_config.impl, "reversible: true")
80
+ pass
81
+ else:
82
+ unk_count = sum([1 for token_id in encoding if token_id == tokenizer.unk_token_id])
83
+ oov_tokens = []
84
+ # if tokenizer_config.impl == TokenizerImpl.SentencePiece:
85
+ # print(sum([tokenizer.is_byte(i) for i in range(tokenizer.piece_size())]))
86
+
87
+ print("#######"*5)
88
+ print(f"{tokenizer_config.name_or_path}, {infer_tokenizer_type(tokenizer_config)}\n"
89
+ f"reversible: false; unk_token: {get_unk(tokenizer_config)},"
90
+ f" unk_ratio: {unk_count/len(encoding):.4f}; oov: []")
91
+ for i in range(len(text)):
92
+ if text[i] != decoding[i]:
93
+ # print(f"text[{i}] = {str(bytes(text[i:], 'utf-8'))}\n"
94
+ # f"decoding[{i}] = {str(bytes(decoding[i:], 'utf-8'))}")
95
+ print(f"text[{i}] = {json.dumps(text[i:], ensure_ascii=False)}, \n"
96
+ f"decoding[{i}] = {json.dumps(decoding[i:], ensure_ascii=False)}")
97
+
98
+ break
99
+
100
+
101
+
102
+ for config in all_tokenizer_config:
103
+ # if "xlm-roberta-base" in config.name:
104
+ # if "xlm-roberta-base" in config.name:
105
+ # if "chatglm3-6b" in config.name:
106
+ # if "bert-base-japanese" in config.name:
107
+ # if "moss" in config.name:
108
+ # if "byt5" in config.name:
109
+ if "baichuan" in config.name_or_path:
110
+ # if "CharacterGLM-6B" in config.name:
111
+ # if "fastchat-t5" in config.name: # 报错 pyo3_runtime.PanicException: AddedVocabulary bad split
112
+ # if True:
113
+ # test_unk(config)
114
+ test_reversible(config)
115
+
116
 
117
 
118
 
 
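The `test_reversible` check above can also be sketched against a single Hugging Face tokenizer, outside the project's `vocab` registry; the model name below is only an example:

```python
# Standalone reversibility sketch: encode, decode, and compare.
from transformers import AutoTokenizer

sample = "hello; Замглавы управления развития; 🦙❤"
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")  # example model
ids = tokenizer.encode(sample, add_special_tokens=False)
decoded = tokenizer.decode(ids)

unk_count = sum(1 for i in ids if i == tokenizer.unk_token_id)
print("reversible:", sample in decoded)
print(f"unk_ratio: {unk_count / len(ids):.4f}")
```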
utils/speed_util.py DELETED
@@ -1,9 +0,0 @@
1
- """
2
- encode speed
3
- decode speed
4
-
5
- ## examples
6
-
7
- qwen's encode is somewhat slow
8
-
9
- """
 
 
 
 
 
 
 
 
 
 
utils/symbol.py DELETED
@@ -1,35 +0,0 @@
1
- """
2
- special_symbols: https://github.com/google/sentencepiece/blob/master/doc/special_symbols.md
3
- emoji:
4
- """
5
-
6
- import sys
7
-
8
-
9
- # from https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/tokenizer/gpt2_tokenization.py#L55
10
- # what does this mean?
11
- def bytes_to_unicode():
12
- """
13
- Returns list of utf-8 byte and a corresponding list of unicode strings.
14
- The reversible bpe codes work on unicode strings.
15
- This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
16
- When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
17
- This is a significant percentage of your normal, say, 32K bpe vocab.
18
- To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
19
- And avoids mapping to whitespace/control characters the bpe code barfs on.
20
- """
21
- _chr = unichr if sys.version_info[0] == 2 else chr
22
- bs = list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + \
23
- list(range(ord("®"), ord("ÿ") + 1))
24
- cs = bs[:]
25
- n = 0
26
- for b in range(2**8):
27
- if b not in bs:
28
- bs.append(b)
29
- cs.append(2**8 + n)
30
- n += 1
31
- cs = [_chr(n) for n in cs]
32
- return dict(zip(bs, cs))
33
-
34
- aa = bytes_to_unicode()
35
- print(aa)
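For reference, a round-trip sketch of the table built by `bytes_to_unicode` above (assuming the function is in scope): every raw byte gets a printable unicode stand-in, which is why byte-level BPE vocabularies can avoid `<unk>` entirely.

```python
# Round-trip: raw bytes -> visible unicode stand-ins -> raw bytes.
byte_to_char = bytes_to_unicode()
char_to_byte = {c: b for b, c in byte_to_char.items()}

visible = "".join(byte_to_char[b] for b in "空格 ok".encode("utf-8"))
restored = bytes(char_to_byte[c] for c in visible).decode("utf-8")
assert restored == "空格 ok"
```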
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
utils/text_util.py CHANGED
@@ -1,12 +1,23 @@
 
 
 
 
 
 
 
 
1
 
2
  def is_digit_char(uchar):
3
  return uchar in "0123456789"
4
 
5
 
6
- def has_digit(text):
7
  return any(is_digit_char(ch) for ch in text)
8
 
9
 
 
 
 
10
  def is_all_digit(text):
11
  return all(is_digit_char(char) for char in text)
12
 
 
1
+ """
2
+ char_
3
+ """
4
+
5
+
6
+ def detect_lang_from_unicode():
7
+ pass
8
+
9
 
10
  def is_digit_char(uchar):
11
  return uchar in "0123456789"
12
 
13
 
14
+ def contains_digit(text):
15
  return any(is_digit_char(ch) for ch in text)
16
 
17
 
18
+ def get_digit_count(text):
19
+ return sum(1 for ch in text if is_digit_char(ch))
20
+
21
  def is_all_digit(text):
22
  return all(is_digit_char(char) for char in text)
23
 
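A quick usage sketch for the digit helpers above (values are illustrative only):

```python
assert is_digit_char("7")
assert contains_digit("chatglm3-6b")
assert get_digit_count("llama-3-8b") == 2
assert is_all_digit("2024") and not is_all_digit("v0.1")
```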
utils/vocab.jd.txt.v2 DELETED
@@ -1,10268 +0,0 @@
1
- [PAD]
2
- [unused1]
3
- [unused2]
4
- [unused3]
5
- [unused4]
6
- [unused5]
7
- [unused6]
8
- [unused7]
9
- [unused8]
10
- [unused9]
11
- [unused10]
12
- [unused11]
13
- [unused12]
14
- [unused13]
15
- [unused14]
16
- [unused15]
17
- [unused16]
18
- [unused17]
19
- [unused18]
20
- [unused19]
21
- [unused20]
22
- [unused21]
23
- [unused22]
24
- [unused23]
25
- [unused24]
26
- [unused25]
27
- [unused26]
28
- [unused27]
29
- [unused28]
30
- [unused29]
31
- [unused30]
32
- [unused31]
33
- [unused32]
34
- [unused33]
35
- [unused34]
36
- [unused35]
37
- [unused36]
38
- [unused37]
39
- [unused38]
40
- [unused39]
41
- [unused40]
42
- [unused41]
43
- [unused42]
44
- [unused43]
45
- [unused44]
46
- [unused45]
47
- [unused46]
48
- [unused47]
49
- [unused48]
50
- [unused49]
51
- [unused50]
52
- [unused51]
53
- [unused52]
54
- [unused53]
55
- [unused54]
56
- [unused55]
57
- [unused56]
58
- [unused57]
59
- [unused58]
60
- [unused59]
61
- [unused60]
62
- [unused61]
63
- [unused62]
64
- [unused63]
65
- [unused64]
66
- [unused65]
67
- [unused66]
68
- [unused67]
69
- [unused68]
70
- [unused69]
71
- [unused70]
72
- [unused71]
73
- [unused72]
74
- [unused73]
75
- [unused74]
76
- [unused75]
77
- [unused76]
78
- [unused77]
79
- [unused78]
80
- [unused79]
81
- [unused80]
82
- [unused81]
83
- [unused82]
84
- [unused83]
85
- [unused84]
86
- [unused85]
87
- [unused86]
88
- [unused87]
89
- [unused88]
90
- [unused89]
91
- [unused90]
92
- [unused91]
93
- [unused92]
94
- [unused93]
95
- [unused94]
96
- [unused95]
97
- [unused96]
98
- [unused97]
99
- [unused98]
100
- [unused99]
101
- [UNK]
102
- [CLS]
103
- [SEP]
104
- [MASK]
105
- <S>
106
- <T>
107
- !
108
- "
109
-
110
-
111
-
112
-
113
-
114
-
115
-
116
- #
117
- $
118
- %
119
- &
120
- '
121
- (
122
- )
123
- *
124
- +
125
- ,
126
- -
127
- .
128
- /
129
- 0
130
- 1
131
- 2
132
- 3
133
- 4
134
- 5
135
- 6
136
- 7
137
- 8
138
- 9
139
- 10
140
- 11
141
- 12
142
- 13
143
- 14
144
- 15
145
- 16
146
- 17
147
- 18
148
- 19
149
- 20
150
- 21
151
- 22
152
- 23
153
- 24
154
- 25
155
- 26
156
- 27
157
- 28
158
- 29
159
- 30
160
- 31
161
- 32
162
- 33
163
- 34
164
- 35
165
- 36
166
- 37
167
- 38
168
- 39
169
- 40
170
- 41
171
- 42
172
- 43
173
- 44
174
- 45
175
- 46
176
- 47
177
- 48
178
- 49
179
- 50
180
- 51
181
- 52
182
- 53
183
- 54
184
- 55
185
- 56
186
- 57
187
- 58
188
- 59
189
- 60
190
- 61
191
- 62
192
- 63
193
- 64
194
- 65
195
- 66
196
- 67
197
- 68
198
- 69
199
- 70
200
- 71
201
- 72
202
- 73
203
- 74
204
- 75
205
- 76
206
- 77
207
- 78
208
- 79
209
- 80
210
- 81
211
- 82
212
- 83
213
- 84
214
- 85
215
- 86
216
- 87
217
- 88
218
- 89
219
- 90
220
- 91
221
- 92
222
- 93
223
- 94
224
- 95
225
- 96
226
- 97
227
- 98
228
- 99
229
- 100
230
- 120
231
- 128
232
- 180
233
- 200
234
- 256
235
- 304
236
- 360
237
- 500
238
- 512
239
- 1000
240
- 1080
241
- 2000
242
- 2014
243
- 2015
244
- 2016
245
- 2017
246
- 2018
247
- 2019
248
- 2020
249
- 2021
250
- 2022
251
- :
252
- ;
253
- <
254
- =
255
- >
256
- ?
257
- @
258
- [
259
- \
260
- ]
261
- ^
262
- _
263
- a
264
- b
265
- c
266
- d
267
- e
268
- f
269
- g
270
- h
271
- i
272
- j
273
- k
274
- l
275
- m
276
- n
277
- o
278
- p
279
- q
280
- r
281
- s
282
- t
283
- u
284
- v
285
- w
286
- x
287
- y
288
- z
289
- {
290
- |
291
- }
292
- ~
293
- £
294
- ¤
295
- ¥
296
- §
297
- «
298
- °
299
- ±
300
- ²
301
- ³
302
- µ
303
- ·
304
- ¹
305
- º
306
- »
307
- ¼
308
- ×
309
- ß
310
- æ
311
- ÷
312
- ø
313
- đ
314
- ŋ
315
- ɔ
316
- ə
317
- ɡ
318
- ʰ
319
- ˇ
320
- ˈ
321
- ˊ
322
- ˋ
323
- ˍ
324
- ː
325
- ˙
326
- ˚
327
- ˢ
328
- α
329
- β
330
- γ
331
- δ
332
- ε
333
- η
334
- θ
335
- ι
336
- κ
337
- λ
338
- μ
339
- ν
340
- ο
341
- π
342
- ρ
343
- ς
344
- σ
345
- τ
346
- υ
347
- φ
348
- χ
349
- ψ
350
- ω
351
- а
352
- б
353
- в
354
- г
355
- д
356
- е
357
- ж
358
- з
359
- и
360
- к
361
- л
362
- м
363
- н
364
- о
365
- п
366
- р
367
- с
368
- т
369
- у
370
- ф
371
- х
372
- ц
373
- ч
374
- ш
375
- ы
376
- ь
377
- я
378
- і
379
-
380
-
381
-
382
-
383
-
384
-
385
-
386
-
387
-
388
-
389
-
390
-
391
-
392
-
393
-
394
-
395
-
396
-
397
-
398
-
399
-
400
-
401
-
402
-
403
-
404
-
405
-
406
-
407
-
408
-
409
-
410
-
411
-
412
-
413
-
414
-
415
-
416
-
417
-
418
-
419
-
420
-
421
-
422
-
423
-
424
-
425
-
426
-
427
-
428
-
429
-
430
-
431
-
432
-
433
-
434
-
435
-
436
-
437
-
438
-
439
-
440
-
441
-
442
-
443
-
444
-
445
-
446
-
447
-
448
-
449
-
450
-
451
-
452
-
453
-
454
-
455
-
456
-
457
-
458
-
459
-
460
-
461
-
462
-
463
-
464
-
465
-
466
-
467
-
468
-
469
-
470
-
471
-
472
-
473
-
474
-
475
-
476
-
477
-
478
-
479
-
480
-
481
-
482
-
483
-
484
-
485
-
486
-
487
-
488
-
489
-
490
-
491
-
492
-
493
-
494
-
495
-
496
-
497
-
498
-
499
-
500
-
501
-
502
-
503
-
504
-
505
-
506
-
507
-
508
-
509
-
510
-
511
-
512
-
513
-
514
-
515
-
516
-
517
-
518
-
519
-
520
-
521
-
522
-
523
-
524
-
525
-
526
-
527
-
528
-
529
-
530
-
531
-
532
-
533
-
534
-
535
-
536
-
537
-
538
-
539
-
540
-
541
-
542
-
543
-
544
-
545
-
546
-
547
-
548
- ⦿
549
-
550
-
551
-
552
-
553
-
554
-
555
-
556
-
557
-
558
-
559
-
560
-
561
-
562
-
563
-
564
-
565
-
566
-
567
-
568
-
569
-
570
-
571
-
572
-
573
-
574
-
575
-
576
-
577
-
578
-
579
-
580
-
581
-
582
-
583
-
584
-
585
-
586
-
587
-
588
-
589
-
590
-
591
-
592
-
593
-
594
-
595
-
596
-
597
-
598
-
599
-
600
-
601
-
602
-
603
-
604
-
605
-
606
-
607
-
608
-
609
-
610
-
611
-
612
-
613
-
614
-
615
-
616
-
617
-
618
-
619
-
620
-
621
- 丿
622
-
623
-
624
-
625
-
626
-
627
-
628
-
629
-
630
-
631
-
632
-
633
-
634
-
635
-
636
-
637
-
638
-
639
-
640
-
641
-
642
-
643
-
644
-
645
-
646
-
647
-
648
-
649
-
650
-
651
-
652
-
653
-
654
-
655
-
656
-
657
-
658
-
659
-
660
-
661
-
662
-
663
-
664
-
665
-
666
-
667
-
668
-
669
-
670
-
671
-
672
-
673
-
674
-
675
-
676
-
677
-
678
-
679
-
680
-
681
-
682
-
683
-
684
- 亿
685
-
686
-
687
-
688
-
689
-
690
-
691
-
692
-
693
-
694
-
695
-
696
-
697
-
698
-
699
-
700
-
701
-
702
-
703
-
704
-
705
-
706
-
707
-
708
-
709
-
710
-
711
-
712
-
713
-
714
-
715
-
716
-
717
-
718
-
719
-
720
-
721
- 仿
722
-
723
-
724
-
725
-
726
-
727
-
728
-
729
-
730
-
731
-
732
-
733
-
734
-
735
-
736
-
737
-
738
-
739
-
740
-
741
-
742
-
743
-
744
-
745
-
746
-
747
-
748
-
749
-
750
-
751
-
752
-
753
-
754
-
755
-
756
-
757
-
758
-
759
-
760
-
761
-
762
-
763
-
764
-
765
-
766
-
767
-
768
-
769
-
770
-
771
-
772
-
773
-
774
-
775
-
776
-
777
-
778
-
779
-
780
-
781
-
782
-
783
-
784
- 使
785
-
786
-
787
-
788
-
789
-
790
-
791
-
792
-
793
-
794
-
795
-
796
-
797
-
798
-
799
-
800
-
801
-
802
-
803
-
804
-
805
-
806
-
807
- 便
808
-
809
-
810
-
811
-
812
-
813
-
814
-
815
-
816
-
817
-
818
-
819
-
820
-
821
-
822
-
823
-
824
-
825
-
826
-
827
-
828
-
829
-
830
-
831
-
832
-
833
-
834
-
835
-
836
-
837
-
838
-
839
-
840
-
841
-
842
-
843
-
844
-
845
-
846
-
847
-
848
-
849
-
850
-
851
-
852
-
853
-
854
-
855
-
856
-
857
-
858
-
859
-
860
-
861
-
862
-
863
-
864
-
865
-
866
-
867
-
868
-
869
-
870
-
871
-
872
-
873
-
874
-
875
-
876
-
877
-
878
-
879
-
880
-
881
-
882
-
883
-
884
-
885
-
886
-
887
-
888
-
889
-
890
-
891
-
892
-
893
-
894
-
895
-
896
-
897
-
898
-
899
-
900
-
901
-
902
-
903
-
904
-
905
-
906
-
907
-
908
-
909
-
910
-
911
-
912
-
913
-
914
-
915
-
916
-
917
-
918
-
919
-
920
-
921
-
922
-
923
-
924
-
925
-
926
-
927
-
928
-
929
-
930
-
931
-
932
-
933
-
934
-
935
-
936
-
937
-
938
-
939
-
940
-
941
- ��
942
-
943
-
944
-
945
-
946
-
947
-
948
-
949
-
950
-
951
-
952
-
953
-
954
-
955
-
956
-
957
-
958
-
959
-
960
-
961
-
962
-
963
-
964
-
965
-
966
-
967
-
968
-
969
-
970
-
971
-
972
-
973
-
974
-
975
-
976
-
977
-
978
-
979
-
980
-
981
-
982
-
983
-
984
-
985
-
986
-
987
-
988
-
989
-
990
-
991
-
992
-
993
-
994
-
995
-
996
-
997
-
998
-
999
-
1000
-
1001
-
1002
-
1003
-
1004
-
1005
-
1006
-
1007
-
1008
-
1009
-
1010
-
1011
-
1012
-
1013
-
1014
-
1015
-
1016
-
1017
-
1018
-
1019
-
1020
-
1021
-
1022
-
1023
-
1024
-
1025
-
1026
-
1027
-
1028
-
1029
-
1030
-
1031
-
1032
-
1033
-
1034
-
1035
-
1036
-
1037
-
1038
-
1039
-
1040
-
1041
-
1042
-
1043
-
1044
-
1045
-
1046
-
1047
-
1048
-
1049
-
1050
-
1051
-
1052
-
1053
-
1054
-
1055
-
1056
-
1057
-
1058
-
1059
-
1060
-
1061
-
1062
-
1063
-
1064
-
1065
-
1066
-
1067
-
1068
-
1069
-
1070
-
1071
-
1072
-
1073
-
1074
-
1075
-
1076
-
1077
-
1078
-
1079
-
1080
-
1081
-
1082
-
1083
-
1084
-
1085
-
1086
-
1087
-
1088
-
1089
-
1090
-
1091
-
1092
-
1093
-
1094
-
1095
-
1096
-
1097
-
1098
-
1099
-
1100
-
1101
-
1102
-
1103
-
1104
-
1105
-
1106
-
1107
-
1108
-
1109
-
1110
-
1111
-
1112
-
1113
-
1114
-
1115
-
1116
-
1117
-
1118
-
1119
-
1120
-
1121
-
1122
-
1123
-
1124
-
1125
-
1126
-
1127
-
1128
-
1129
-
1130
-
1131
-
1132
-
1133
-
1134
-
1135
-
1136
-
1137
-
1138
-
1139
-
1140
-
1141
-
1142
-
1143
-
1144
-
1145
-
1146
-
1147
-
1148
-
1149
-
1150
-
1151
-
1152
-
1153
-
1154
-
1155
-
1156
-
1157
-
1158
-
1159
-
1160
-
1161
-
1162
-
1163
-
1164
-
1165
-
1166
-
1167
-
1168
-
1169
-
1170
-
1171
-
1172
-
1173
-
1174
-
1175
-
1176
-
1177
-
1178
-
1179
-
1180
-
1181
-
1182
-
1183
-
1184
-
1185
-
1186
-
1187
-
1188
-
1189
-
1190
-
1191
-
1192
-
1193
-
1194
-
1195
-
1196
-
1197
-
1198
-
1199
-
1200
-
1201
-
1202
-
1203
-
1204
-
1205
-
1206
-
1207
-
1208
-
1209
-
1210
-
1211
-
1212
-
1213
-
1214
-
1215
-
1216
-
1217
-
1218
-
1219
-
1220
-
1221
-
1222
-
1223
-
1224
-
1225
-
1226
-
1227
-
1228
-
1229
-
1230
-
1231
-
1232
-
1233
-
1234
-
1235
-
1236
-
1237
-
1238
-
1239
-
1240
-
1241
-
1242
-
1243
-
1244
-
1245
-
1246
-
1247
-
1248
-
1249
-
1250
-
1251
-
1252
-
1253
-
1254
-
1255
-
1256
-
1257
-
1258
-
1259
-
1260
-
1261
-
1262
-
1263
-
1264
-
1265
-
1266
-
1267
-
1268
-
1269
-
1270
-
1271
-
1272
-
1273
-
1274
-
1275
-
1276
-
1277
-
1278
-
1279
-
1280
-
1281
-
1282
-
1283
-
1284
-
1285
-
1286
-
1287
-
1288
-
1289
-
1290
-
1291
-
1292
-
1293
-
1294
-
1295
-
1296
-
1297
-
1298
-
1299
-
1300
-
1301
-
1302
-
1303
-
1304
-
1305
-
1306
-
1307
-
1308
-
1309
-
1310
-
1311
-
1312
-
1313
-
1314
-
1315
-
1316
-
1317
-
1318
-
1319
-
1320
-
1321
-
1322
-
1323
-
1324
-
1325
-
1326
-
1327
-
1328
-
1329
-
1330
-
1331
-
1332
-
1333
-
1334
-
1335
-
1336
-
1337
-
1338
-
1339
-
1340
-
1341
-
1342
-
1343
-
1344
-
1345
-
1346
-
1347
-
1348
-
1349
-
1350
-
1351
-
1352
-
1353
-
1354
-
1355
-
1356
-
1357
-
1358
-
1359
-
1360
-
1361
-
1362
-
1363
-
1364
-
1365
-
1366
-
1367
-
1368
-
1369
-
1370
-
1371
-
1372
-
1373
-
1374
-
1375
-
1376
-
1377
-
1378
-
1379
-
1380
-
1381
-
1382
-
1383
-
1384
-
1385
-
1386
-
1387
-
1388
-
1389
-
1390
-
1391
-
1392
-
1393
-
1394
-
1395
-
1396
-
1397
-
1398
-
1399
-
1400
-
1401
-
1402
-
1403
-
1404
-
1405
-
1406
-
1407
-
1408
-
1409
-
1410
-
1411
-
1412
-
1413
-
1414
-
1415
-
1416
-
1417
-
1418
-
1419
-
1420
-
1421
-
1422
-
1423
-
1424
-
1425
-
1426
-
1427
-
1428
-
1429
-
1430
-
1431
-
1432
-
1433
-
1434
-
1435
-
1436
-
1437
-
1438
-
1439
-
1440
-
1441
-
1442
-
1443
-
1444
-
1445
-
1446
-
1447
-
1448
-
1449
-
1450
-
1451
-
1452
-
1453
-
1454
-
1455
-
1456
-
1457
-
1458
-
1459
-
1460
-
1461
-
1462
-
1463
-
1464
-
1465
-
1466
-
1467
-
1468
-
1469
-
1470
-
1471
-
1472
-
1473
-
1474
-
1475
-
1476
-
1477
-
1478
-
1479
-
1480
-
1481
-
1482
-
1483
-
1484
-
1485
-
1486
-
1487
-
1488
-
1489
-
1490
-
1491
-
1492
-
1493
-
1494
-
1495
-
1496
-
1497
-
1498
-
1499
-
1500
-
1501
-
1502
-
1503
-
1504
-
1505
-
1506
-
1507
-
1508
-
1509
-
1510
-
1511
-
1512
-
1513
-
1514
-
1515
-
1516
-
1517
-
1518
-
1519
-
1520
-
1521
-
1522
-
1523
-
1524
-
1525
-
1526
-
1527
-
1528
-
1529
-
1530
-
1531
-
1532
-
1533
-
1534
-
1535
-
1536
-
1537
-
1538
-
1539
-
1540
-
1541
-
1542
-
1543
-
1544
-
1545
-
1546
-
1547
-
1548
-
1549
-
1550
-
1551
-
1552
-
1553
-
1554
-
1555
-
1556
-
1557
-
1558
-
1559
-
1560
-
1561
-
1562
-
1563
-
1564
-
1565
-
1566
-
1567
-
1568
-
1569
-
1570
-
1571
-
1572
-
1573
-
1574
-
1575
-
1576
-
1577
-
1578
-
1579
-
1580
-
1581
-
1582
-
1583
-
1584
-
1585
-
1586
-
1587
-
1588
-
1589
-
1590
-
1591
-
1592
-
1593
-
1594
-
1595
-
1596
-
1597
-
1598
-
1599
-
1600
-
1601
-
1602
-
1603
-
1604
-
1605
-
1606
-
1607
-
1608
-
1609
-
1610
-
1611
-
1612
-
1613
-
1614
-
1615
-
1616
-
1617
-
1618
-
1619
-
1620
-
1621
-
1622
-
1623
-
1624
-
1625
-
1626
-
1627
-
1628
-
1629
-
1630
-
1631
-
1632
-
1633
-
1634
-
1635
-
1636
-
1637
-
1638
-
1639
-
1640
-
1641
-
1642
-
1643
-
1644
-
1645
-
1646
-
1647
-
1648
-
1649
-
1650
-
1651
-
1652
-
1653
-
1654
-
1655
-
1656
-
1657
-
1658
-
1659
-
1660
-
1661
-
1662
-
1663
-
1664
-
1665
-
1666
-
1667
-
1668
-
1669
-
1670
-
1671
-
1672
-
1673
-
1674
-
1675
-
1676
-
1677
-
1678
-
1679
-
1680
-
1681
-
1682
-
1683
-
1684
-
1685
-
1686
-
1687
-
1688
-
1689
-
1690
-
1691
-
1692
-
1693
-
1694
-
1695
-
1696
-
1697
-
1698
-
1699
-
1700
-
1701
-
1702
-
1703
-
1704
-
1705
-
1706
-
1707
-
1708
-
1709
-
1710
-
1711
-
1712
-
1713
-
1714
-
1715
-
1716
-
1717
-
1718
-
1719
-
1720
-
1721
-
1722
-
1723
-
1724
-
1725
-
1726
-
1727
-
1728
-
1729
-
1730
-
1731
-
1732
-
1733
-
1734
-
1735
-
1736
- 姿
1737
-
1738
-
1739
-
1740
-
1741
-
1742
-
1743
-
1744
-
1745
-
1746
-
1747
-
1748
-
1749
-
1750
-
1751
-
1752
-
1753
-
1754
-
1755
-
1756
-
1757
-
1758
-
1759
-
1760
-
1761
-
1762
-
1763
-
1764
-
1765
-
1766
-
1767
-
1768
-
1769
-
1770
-
1771
-
1772
- 婿
1773
-
1774
-
1775
-
1776
-
1777
-
1778
-
1779
-
1780
-
1781
-
1782
-
1783
-
1784
-
1785
-
1786
-
1787
-
1788
-
1789
-
1790
-
1791
-
1792
-
1793
-
1794
-
1795
-
1796
-
1797
-
1798
-
1799
-
1800
-
1801
- 嬿
1802
-
1803
-
1804
-
1805
-
1806
-
1807
-
1808
-
1809
-
1810
-
1811
-
1812
-
1813
-
1814
-
1815
-
1816
-
1817
-
1818
-
1819
-
1820
-
1821
-
1822
-
1823
-
1824
-
1825
-
1826
-
1827
-
1828
-
1829
-
1830
-
1831
-
1832
-
1833
-
1834
-
1835
-
1836
-
1837
-
1838
-
1839
-
1840
-
1841
-
1842
-
1843
-
1844
-
1845
-
1846
-
1847
-
1848
-
1849
-
1850
-
1851
-
1852
-
1853
-
1854
-
1855
-
1856
-
1857
-
1858
-
1859
-
1860
-
1861
-
1862
-
1863
-
1864
-
1865
-
1866
-
1867
-
1868
- 宿
1869
-
1870
-
1871
-
1872
-
1873
-
1874
-
1875
-
1876
-
1877
-
1878
-
1879
-
1880
-
1881
-
1882
-
1883
-
1884
-
1885
-
1886
-
1887
-
1888
-
1889
-
1890
-
1891
-
1892
-
1893
- 寿
1894
-
1895
-
1896
-
1897
-
1898
-
1899
-
1900
-
1901
-
1902
-
1903
-
1904
-
1905
-
1906
-
1907
-
1908
-
1909
-
1910
-
1911
-
1912
-
1913
-
1914
-
1915
-
1916
-
1917
-
1918
-
1919
-
1920
- 尿
1921
-
1922
-
1923
-
1924
-
1925
-
1926
-
1927
-
1928
-
1929
-
1930
-
1931
-
1932
-
1933
-
1934
-
1935
-
1936
-
1937
-
1938
-
1939
-
1940
-
1941
-
1942
-
1943
- 屿
1944
-
1945
-
1946
-
1947
-
1948
-
1949
-
1950
-
1951
-
1952
-
1953
-
1954
-
1955
-
1956
-
1957
-
1958
-
1959
-
1960
-
1961
-
1962
-
1963
-
1964
-
1965
- ��
1966
-
1967
-
1968
-
1969
-
1970
-
1971
-
1972
-
1973
-
1974
-
1975
-
1976
-
1977
-
1978
-
1979
-
1980
-
1981
-
1982
-
1983
-
1984
-
1985
-
1986
-
1987
-
1988
-
1989
-
1990
-
1991
-
1992
-
1993
-
1994
-
1995
-
1996
-
1997
-
1998
-
1999
-
2000
-
2001
-
2002
-
2003
-
2004
-
2005
-
2006
-
2007
-
2008
-
2009
-
2010
-
2011
-
2012
-
2013
-
2014
-
2015
-
2016
-
2017
-
2018
-
2019
-
2020
-
2021
-
2022
-
2023
-
2024
-
2025
-
2026
-
2027
- 巿
2028
-
2029
-
2030
-
2031
-
2032
-
2033
-
2034
-
2035
-
2036
-
2037
-
2038
-
2039
-
2040
-
2041
-
2042
-
2043
-
2044
-
2045
-
2046
-
2047
-
2048
-
2049
-
2050
-
2051
-
2052
-
2053
-
2054
-
2055
-
2056
-
2057
-
2058
-
2059
-
2060
-
2061
-
2062
-
2063
-
2064
-
2065
-
2066
-
2067
-
2068
-
2069
-
2070
- 广
2071
-
2072
-
2073
-
2074
-
2075
-
2076
-
2077
-
2078
-
2079
-
2080
-
2081
-
2082
-
2083
-
2084
-
2085
-
2086
-
2087
-
2088
-
2089
-
2090
-
2091
-
2092
-
2093
-
2094
-
2095
-
2096
-
2097
-
2098
-
2099
-
2100
-
2101
-
2102
-
2103
-
2104
-
2105
-
2106
-
2107
- 廿
2108
-
2109
-
2110
-
2111
-
2112
-
2113
-
2114
-
2115
-
2116
-
2117
-
2118
-
2119
-
2120
-
2121
-
2122
-
2123
-
2124
-
2125
-
2126
-
2127
-
2128
-
2129
-
2130
-
2131
-
2132
-
2133
-
2134
-
2135
-
2136
-
2137
-
2138
-
2139
-
2140
-
2141
-
2142
-
2143
-
2144
-
2145
-
2146
-
2147
-
2148
-
2149
-
2150
-
2151
-
2152
-
2153
-
2154
-
2155
-
2156
-
2157
-
2158
- 彿
2159
-
2160
-
2161
-
2162
-
2163
-
2164
-
2165
-
2166
-
2167
-
2168
-
2169
-
2170
-
2171
-
2172
-
2173
-
2174
-
2175
-
2176
-
2177
-
2178
-
2179
-
2180
-
2181
-
2182
-
2183
-
2184
-
2185
-
2186
-
2187
-
2188
-
2189
-
2190
-
2191
-
2192
-
2193
-
2194
-
2195
-
2196
-
2197
-
2198
-
2199
-
2200
-
2201
-
2202
-
2203
-
2204
-
2205
-
2206
-
2207
-
2208
-
2209
-
2210
- 忿
2211
- 怀
2212
-
2213
-
2214
-
2215
-
2216
-
2217
-
2218
-
2219
-
2220
-
2221
-
2222
-
2223
-
2224
-
2225
-
2226
-
2227
-
2228
-
2229
-
2230
-
2231
-
2232
-
2233
-
2234
-
2235
-
2236
-
2237
-
2238
-
2239
-
2240
-
2241
-
2242
-
2243
-
2244
-
2245
-
2246
-
2247
-
2248
-
2249
-
2250
-
2251
-
2252
-
2253
-
2254
-
2255
-
2256
-
2257
-
2258
-
2259
-
2260
-
2261
-
2262
-
2263
-
2264
-
2265
-
2266
-
2267
-
2268
-
2269
-
2270
-
2271
-
2272
-
2273
-
2274
-
2275
-
2276
-
2277
-
2278
-
2279
-
2280
-
2281
-
2282
-
2283
-
2284
-
2285
-
2286
-
2287
-
2288
-
2289
-
2290
-
2291
-
2292
-
2293
-
2294
-
2295
-
2296
-
2297
-
2298
-
2299
-
2300
-
2301
-
2302
-
2303
-
2304
-
2305
-
2306
-
2307
-
2308
-
2309
-
2310
-
2311
-
2312
-
2313
-
2314
-
2315
-
2316
-
2317
-
2318
-
2319
-
2320
-
2321
-
2322
-
2323
-
2324
-
2325
-
2326
-
2327
-
2328
-
2329
-
2330
-
2331
-
2332
-
2333
-
2334
-
2335
-
2336
-
2337
-
2338
-
2339
-
2340
-
2341
-
2342
-
2343
-
2344
-
2345
-
2346
-
2347
-
2348
-
2349
-
2350
-
2351
-
2352
-
2353
-
2354
-
2355
-
2356
-
2357
-
2358
-
2359
-
2360
-
2361
-
2362
-
2363
-
2364
-
2365
-
2366
-
2367
-
2368
-
2369
-
2370
-
2371
-
2372
-
2373
-
2374
-
2375
-
2376
-
2377
-
2378
-
2379
-
2380
-
2381
-
2382
-
2383
-
2384
-
2385
-
2386
-
2387
-
2388
-
2389
-
2390
-
2391
-
2392
-
2393
-
2394
-
2395
-
2396
-
2397
-
2398
-
2399
-
2400
-
2401
-
2402
-
2403
-
2404
-
2405
-
2406
-
2407
-
2408
-
2409
-
2410
-
2411
-
2412
-
2413
-
2414
-
2415
-
2416
-
2417
-
2418
-
2419
-
2420
-
2421
-
2422
-
2423
-
2424
-
2425
-
2426
-
2427
-
2428
-
2429
-
2430
-
2431
-
2432
-
2433
-
2434
-
2435
-
2436
-
2437
-
2438
-
2439
-
2440
-
2441
-
2442
-
2443
-
2444
-
2445
-
2446
-
2447
-
2448
-
2449
-
2450
-
2451
-
2452
-
2453
-
2454
-
2455
-
2456
-
2457
-
2458
-
2459
-
2460
-
2461
-
2462
-
2463
-
2464
-
2465
-
2466
-
2467
-
2468
-
2469
-
2470
-
2471
-
2472
-
2473
-
2474
-
2475
-
2476
-
2477
-
2478
-
2479
-
2480
-
2481
-
2482
-
2483
-
2484
-
2485
-
2486
-
2487
-
2488
-
2489
-
2490
-
2491
-
2492
-
2493
-
2494
-
2495
-
2496
-
2497
-
2498
-
2499
-
2500
-
2501
-
2502
-
2503
-
2504
-
2505
-
2506
-
2507
-
2508
-
2509
-
2510
-
2511
-
2512
-
2513
-
2514
-
2515
-
2516
-
2517
-
2518
-
2519
-
2520
-
2521
-
2522
-
2523
-
2524
-
2525
-
2526
-
2527
-
2528
-
2529
-
2530
-
2531
-
2532
-
2533
-
2534
-
2535
-
2536
-
2537
-
2538
-
2539
-
2540
-
2541
-
2542
-
2543
-
2544
-
2545
-
2546
-
2547
-
2548
-
2549
-
2550
-
2551
-
2552
-
2553
-
2554
-
2555
-
2556
-
2557
-
2558
-
2559
-
2560
-
2561
-
2562
-
2563
-
2564
-
2565
-
2566
-
2567
-
2568
-
2569
-
2570
-
2571
-
2572
-
2573
-
2574
-
2575
-
2576
-
2577
-
2578
-
2579
-
2580
-
2581
-
2582
-
2583
-
2584
-
2585
-
2586
-
2587
-
2588
-
2589
-
2590
-
2591
-
2592
-
2593
-
2594
-
2595
-
2596
-
2597
-
2598
-
2599
-
2600
-
2601
-
2602
-
2603
-
2604
-
2605
-
2606
-
2607
-
2608
-
2609
-
2610
-
2611
-
2612
-
2613
-
2614
-
2615
-
2616
-
2617
-
2618
-
2619
-
2620
-
2621
-
2622
-
2623
-
2624
-
2625
-
2626
-
2627
-
2628
-
2629
-
2630
-
2631
-
2632
-
2633
-
2634
-
2635
-
2636
-
2637
-
2638
-
2639
-
2640
-
2641
-
2642
-
2643
-
2644
-
2645
-
2646
-
2647
-
2648
-
2649
-
2650
-
2651
-
2652
-
2653
-
2654
-
2655
-
2656
-
2657
-
2658
-
2659
-
2660
-
2661
-
2662
-
2663
-
2664
-
2665
-
2666
-
2667
-
2668
-
2669
-
2670
-
2671
-
2672
-
2673
-
2674
-
2675
-
2676
-
2677
-
2678
-
2679
-
2680
-
2681
-
2682
-
2683
-
2684
-
2685
-
2686
-
2687
-
2688
-
2689
-
2690
-
2691
-
2692
-
2693
-
2694
-
2695
-
2696
-
2697
-
2698
-
2699
-
2700
-
2701
-
2702
-
2703
-
2704
-
2705
-
2706
-
2707
-
2708
-
2709
-
2710
-
2711
-
2712
-
2713
-
2714
-
2715
-
2716
-
2717
-
2718
-
2719
-
2720
-
2721
-
2722
-
2723
-
2724
-
2725
-
2726
-
2727
-
2728
-
2729
-
2730
-
2731
-
2732
-
2733
-
2734
-
2735
-
2736
-
2737
-
2738
-
2739
-
2740
-
2741
-
2742
-
2743
-
2744
-
2745
-
2746
-
2747
-
2748
-
2749
-
2750
-
2751
-
2752
-
2753
-
2754
-
2755
-
2756
-
2757
-
2758
-
2759
-
2760
-
2761
-
2762
-
2763
-
2764
-
2765
-
2766
-
2767
-
2768
-
2769
-
2770
-
2771
-
2772
-
2773
-
2774
-
2775
-
2776
-
2777
-
2778
-
2779
-
2780
-
2781
-
2782
-
2783
-
2784
-
2785
-
2786
-
2787
-
2788
-
2789
-
2790
-
2791
-
2792
-
2793
-
2794
-
2795
-
2796
-
2797
-
2798
-
2799
-
2800
-
2801
-
2802
-
2803
-
2804
-
2805
-
2806
-
2807
-
2808
-
2809
-
2810
-
2811
-
2812
-
2813
-
2814
-
2815
-
2816
-
2817
-
2818
-
2819
-
2820
-
2821
-
2822
-
2823
-
2824
-
2825
-
2826
-
2827
-
2828
-
2829
-
2830
-
2831
-
2832
-
2833
-
2834
-
2835
-
2836
-
2837
-
2838
-
2839
-
2840
-
2841
-
2842
-
2843
-
2844
-
2845
-
2846
-
2847
-
2848
-
2849
-
2850
-
2851
-
2852
-
2853
-
2854
-
2855
-
2856
-
2857
-
2858
-
2859
-
2860
-
2861
-
2862
-
2863
-
2864
-
2865
-
2866
-
2867
-
2868
-
2869
-
2870
-
2871
-
2872
-
2873
-
2874
-
2875
-
2876
-
2877
-
2878
-
2879
-
2880
-
2881
-
2882
-
2883
-
2884
-
2885
-
2886
-
2887
-
2888
-
2889
-
2890
-
2891
-
2892
-
2893
-
2894
-
2895
-
2896
-
2897
-
2898
-
2899
-
2900
-
2901
-
2902
-
2903
-
2904
-
2905
-
2906
-
2907
-
2908
-
2909
-
2910
-
2911
-
2912
-
2913
-
2914
-
2915
-
2916
-
2917
-
2918
-
2919
-
2920
-
2921
-
2922
-
2923
-
2924
-
2925
-
2926
-
2927
-
2928
-
2929
-
2930
-
2931
-
2932
-
2933
-
2934
-
2935
-
2936
-
2937
-
2938
-
2939
-
2940
-
2941
-
2942
-
2943
-
2944
-
2945
-
2946
-
2947
-
2948
-
2949
-
2950
-
2951
-
2952
-
2953
-
2954
-
2955
-
2956
-
2957
-
2958
-
2959
-
2960
-
2961
-
2962
-
2963
-
2964
-
2965
-
2966
-
2967
-
2968
-
2969
-
2970
-
2971
-
2972
-
2973
-
2974
-
2975
-
2976
-
2977
-
2978
-
2979
-
2980
-
2981
-
2982
-
2983
-
2984
-
2985
-
2986
-
2987
-
2988
-
2989
- ��
2990
-
2991
-
2992
-
2993
-
2994
-
2995
-
2996
-
2997
-
2998
- 椿
2999
-
3000
-
3001
-
3002
-
3003
-
3004
-
3005
-
3006
-
3007
-
3008
-
3009
-
3010
-
3011
-
3012
-
3013
-
3014
-
3015
-
3016
-
3017
-
3018
-
3019
-
3020
-
3021
-
3022
-
3023
-
3024
-
3025
-
3026
-
3027
-
3028
-
3029
-
3030
-
3031
-
3032
-
3033
-
3034
-
3035
-
3036
-
3037
-
3038
-
3039
-
3040
-
3041
-
3042
-
3043
- 槿
3044
-
3045
-
3046
-
3047
-
3048
-
3049
-
3050
-
3051
-
3052
-
3053
-
3054
-
3055
-
3056
-
3057
-
3058
-
3059
-
3060
-
3061
-
3062
-
3063
-
3064
-
3065
-
3066
-
3067
-
3068
-
3069
-
3070
-
3071
-
3072
-
3073
-
3074
-
3075
-
3076
-
3077
-
3078
-
3079
-
3080
-
3081
-
3082
-
3083
-
3084
-
3085
-
3086
-
3087
-
3088
-
3089
-
3090
-
3091
-
3092
-
3093
-
3094
-
3095
-
3096
-
3097
-
3098
-
3099
-
3100
-
3101
-
3102
-
3103
-
3104
-
3105
-
3106
-
3107
-
3108
-
3109
-
3110
-
3111
-
3112
-
3113
-
3114
-
3115
-
3116
-
3117
- 殿
3118
-
3119
-
3120
-
3121
-
3122
-
3123
-
3124
-
3125
-
3126
-
3127
-
3128
-
3129
-
3130
-
3131
-
3132
-
3133
-
3134
-
3135
-
3136
-
3137
-
3138
-
3139
-
3140
-
3141
-
3142
-
3143
-
3144
-
3145
-
3146
-
3147
-
3148
-
3149
-
3150
-
3151
-
3152
-
3153
-
3154
-
3155
-
3156
-
3157
-
3158
-
3159
-
3160
-
3161
-
3162
-
3163
-
3164
-
3165
-
3166
-
3167
-
3168
-
3169
-
3170
-
3171
-
3172
-
3173
-
3174
-
3175
-
3176
-
3177
-
3178
-
3179
-
3180
-
3181
-
3182
-
3183
-
3184
-
3185
-
3186
-
3187
-
3188
-
3189
-
3190
-
3191
-
3192
-
3193
-
3194
-
3195
-
3196
-
3197
-
3198
-
3199
-
3200
-
3201
-
3202
-
3203
-
3204
-
3205
-
3206
-
3207
-
3208
-
3209
-
3210
-
3211
-
3212
-
3213
-
3214
-
3215
-
3216
-
3217
-
3218
-
3219
-
3220
-
3221
- 沿
3222
-
3223
-
3224
-
3225
-
3226
-
3227
-
3228
-
3229
-
3230
-
3231
-
3232
-
3233
-
3234
-
3235
-
3236
-
3237
-
3238
-
3239
-
3240
-
3241
-
3242
-
3243
-
3244
-
3245
-
3246
-
3247
-
3248
-
3249
-
3250
-
3251
-
3252
-
3253
-
3254
-
3255
-
3256
-
3257
-
3258
-
3259
-
3260
-
3261
-
3262
-
3263
-
3264
-
3265
-
3266
-
3267
-
3268
-
3269
-
3270
-
3271
-
3272
-
3273
-
3274
-
3275
-
3276
-
3277
-
3278
-
3279
-
3280
-
3281
-
3282
-
3283
-
3284
-
3285
-
3286
-
3287
-
3288
-
3289
-
3290
-
3291
-
3292
-
3293
-
3294
-
3295
-
3296
-
3297
-
3298
-
3299
-
3300
-
3301
-
3302
-
3303
-
3304
-
3305
-
3306
-
3307
-
3308
-
3309
-
3310
-
3311
-
3312
-
3313
-
3314
-
3315
-
3316
-
3317
-
3318
-
3319
-
3320
-
3321
-
3322
-
3323
-
3324
-
3325
-
3326
-
3327
- 涿
3328
-
3329
-
3330
-
3331
-
3332
-
3333
-
3334
-
3335
-
3336
-
3337
-
3338
-
3339
-
3340
-
3341
-
3342
-
3343
-
3344
-
3345
-
3346
-
3347
-
3348
-
3349
-
3350
-
3351
-
3352
-
3353
-
3354
-
3355
-
3356
-
3357
-
3358
-
3359
-
3360
-
3361
-
3362
-
3363
-
3364
-
3365
-
3366
-
3367
-
3368
-
3369
-
3370
-
3371
-
3372
-
3373
-
3374
-
3375
-
3376
-
3377
-
3378
-
3379
-
3380
-
3381
-
3382
-
3383
-
3384
-
3385
-
3386
-
3387
-
3388
- 湿
3389
-
3390
-
3391
-
3392
-
3393
-
3394
-
3395
-
3396
-
3397
-
3398
-
3399
-
3400
-
3401
-
3402
-
3403
-
3404
-
3405
-
3406
-
3407
-
3408
-
3409
-
3410
-
3411
-
3412
-
3413
-
3414
-
3415
-
3416
-
3417
-
3418
-
3419
-
3420
-
3421
-
3422
-
3423
-
3424
-
3425
-
3426
-
3427
-
3428
-
3429
-
3430
-
3431
-
3432
-
3433
-
3434
-
3435
-
3436
-
3437
-
3438
-
3439
-
3440
-
3441
-
3442
-
3443
-
3444
-
3445
-
3446
-
3447
-
3448
-
3449
-
3450
-
3451
-
3452
-
3453
-
3454
-
3455
-
3456
-
3457
-
3458
-
3459
-
3460
-
3461
-
3462
-
3463
-
3464
-
3465
-
3466
-
3467
-
3468
-
3469
-
3470
-
3471
-
3472
-
3473
-
3474
-
3475
-
3476
-
3477
-
3478
-
3479
-
3480
-
3481
-
3482
-
3483
-
3484
-
3485
-
3486
-
3487
-
3488
-
3489
-
3490
-
3491
-
3492
-
3493
-
3494
-
3495
-
3496
-
3497
-
3498
-
3499
-
3500
-
3501
-
3502
-
3503
-
3504
-
3505
-
3506
-
3507
-
3508
-
3509
-
3510
-
3511
-
3512
-
3513
-
3514
-
3515
-
3516
-
3517
-
3518
-
3519
-
3520
-
3521
-
3522
-
3523
-
3524
-
3525
-
3526
-
3527
-
3528
-
3529
-
3530
-
3531
-
3532
-
3533
-
3534
-
3535
-
3536
-
3537
-
3538
-
3539
-
3540
-
3541
-
3542
-
3543
-
3544
-
3545
-
3546
-
3547
-
3548
-
3549
-
3550
-
3551
-
3552
-
3553
-
3554
-
3555
-
3556
-
3557
-
3558
-
3559
-
3560
-
3561
-
3562
-
3563
-
3564
-
3565
-
3566
-
3567
-
3568
-
3569
-
3570
-
3571
-
3572
-
3573
-
3574
-
3575
-
3576
-
3577
-
3578
-
3579
-
3580
-
3581
-
3582
-
3583
-
3584
-
3585
-
3586
-
3587
-
3588
-
3589
-
3590
-
3591
-
3592
-
3593
-
3594
-
3595
-
3596
-
3597
-
3598
-
3599
-
3600
-
3601
-
3602
-
3603
-
3604
-
3605
-
3606
-
3607
-
3608
-
3609
-
3610
-
3611
-
3612
-
3613
-
3614
-
3615
-
3616
-
3617
-
3618
-
3619
-
3620
-
3621
-
3622
-
3623
-
3624
-
3625
-
3626
-
3627
-
3628
-
3629
-
3630
-
3631
-
3632
-
3633
-
3634
-
3635
-
3636
-
3637
-
3638
-
3639
-
3640
-
3641
-
3642
-
3643
-
3644
-
3645
-
3646
-
3647
-
3648
-
3649
-
3650
-
3651
-
3652
-
3653
-
3654
-
3655
-
3656
-
3657
-
3658
-
3659
-
3660
-
3661
-
3662
-
3663
-
3664
-
3665
-
3666
-
3667
-
3668
-
3669
-
3670
-
3671
-
3672
-
3673
-
3674
-
3675
-
3676
-
3677
-
3678
-
3679
-
3680
-
3681
-
3682
-
3683
-
3684
-
3685
-
3686
-
3687
-
3688
-
3689
-
3690
-
3691
-
3692
-
3693
-
3694
-
3695
-
3696
-
3697
-
3698
-
3699
-
3700
-
3701
-
3702
-
3703
-
3704
-
3705
-
3706
-
3707
-
3708
-
3709
-
3710
-
3711
-
3712
-
3713
-
3714
-
3715
-
3716
-
3717
-
3718
-
3719
-
3720
-
3721
-
3722
-
3723
-
3724
-
3725
-
3726
-
3727
-
3728
-
3729
-
3730
-
3731
-
3732
-
3733
-
3734
-
3735
-
3736
-
3737
-
3738
-
3739
-
3740
-
3741
-
3742
-
3743
-
3744
-
3745
-
3746
-
3747
-
3748
-
3749
-
3750
-
3751
-
3752
-
3753
-
3754
-
3755
-
3756
-
3757
-
3758
-
3759
-
3760
-
3761
-
3762
-
3763
-
3764
-
3765
-
3766
-
3767
-
3768
-
3769
-
3770
-
3771
-
3772
-
3773
-
3774
-
3775
-
3776
-
3777
-
3778
-
3779
-
3780
-
3781
-
3782
-
3783
-
3784
-
3785
-
3786
-
3787
-
3788
-
3789
-
3790
-
3791
-
3792
-
3793
-
3794
-
3795
-
3796
-
3797
-
3798
-
3799
-
3800
-
3801
-
3802
-
3803
-
3804
-
3805
-
3806
-
3807
-
3808
-
3809
-
3810
-
3811
-
3812
-
3813
-
3814
-
3815
-
3816
-
3817
-
3818
-
3819
-
3820
-
3821
-
3822
-
3823
-
3824
-
3825
-
3826
-
3827
-
3828
-
3829
-
3830
-
3831
-
3832
-
3833
-
3834
-
3835
-
3836
-
3837
-
3838
-
3839
-
3840
-
3841
-
3842
-
3843
-
3844
-
3845
-
3846
-
3847
-
3848
-
3849
-
3850
-
3851
-
3852
-
3853
-
3854
-
3855
-
3856
-
3857
-
3858
-
3859
-
3860
-
3861
-
3862
-
3863
-
3864
-
3865
-
3866
-
3867
-
3868
-
3869
-
3870
-
3871
-
3872
-
3873
-
3874
-
3875
-
3876
-
3877
-
3878
-
3879
-
3880
-
3881
-
3882
-
3883
-
3884
-
3885
-
3886
-
3887
-
3888
-
3889
-
3890
-
3891
-
3892
-
3893
-
3894
-
3895
-
3896
-
3897
-
3898
-
3899
-
3900
-
3901
-
3902
-
3903
-
3904
-
3905
-
3906
-
3907
-
3908
-
3909
-
3910
-
3911
-
3912
-
3913
-
3914
-
3915
-
3916
-
3917
-
3918
-
3919
-
3920
-
3921
-
3922
-
3923
-
3924
-
3925
-
3926
-
3927
-
3928
-
3929
-
3930
-
3931
-
3932
-
3933
-
3934
-
3935
-
3936
-
3937
-
3938
-
3939
-
3940
-
3941
-
3942
-
3943
-
3944
-
3945
-
3946
-
3947
-
3948
-
3949
-
3950
-
3951
-
3952
-
3953
-
3954
-
3955
-
3956
-
3957
-
3958
-
3959
-
3960
-
3961
-
3962
-
3963
-
3964
-
3965
-
3966
-
3967
-
3968
-
3969
-
3970
-
3971
-
3972
-
3973
-
3974
-
3975
-
3976
-
3977
-
3978
-
3979
-
3980
-
3981
-
3982
-
3983
-
3984
-
3985
-
3986
-
3987
-
3988
-
3989
-
3990
-
3991
-
3992
-
3993
-
3994
-
3995
-
3996
-
3997
-
3998
-
3999
-
4000
-
4001
-
4002
-
4003
-
4004
-
4005
-
4006
-
4007
-
4008
-
4009
-
4010
-
4011
-
4012
-
4013
- ��
4014
-
4015
-
4016
-
4017
-
4018
-
4019
-
4020
-
4021
-
4022
-
4023
-
4024
-
4025
-
4026
-
4027
-
4028
-
4029
-
4030
-
4031
-
4032
-
4033
-
4034
-
4035
-
4036
-
4037
-
4038
-
4039
-
4040
-
4041
-
4042
-
4043
-
4044
-
4045
-
4046
-
4047
-
4048
-
4049
-
4050
-
4051
-
4052
-
4053
-
4054
-
4055
-
4056
-
4057
-
4058
-
4059
-
4060
-
4061
-
4062
-
4063
-
4064
-
4065
-
4066
-
4067
-
4068
-
4069
-
4070
-
4071
-
4072
-
4073
-
4074
-
4075
-
4076
-
4077
-
4078
-
4079
-
4080
-
4081
-
4082
-
4083
-
4084
-
4085
-
4086
-
4087
-
4088
-
4089
-
4090
-
4091
-
4092
-
4093
-
4094
-
4095
-
4096
-
4097
-
4098
-
4099
-
4100
-
4101
-
4102
-
4103
-
4104
-
4105
-
4106
-
4107
-
4108
-
4109
-
4110
-
4111
-
4112
-
4113
-
4114
-
4115
-
4116
-
4117
-
4118
-
4119
-
4120
-
4121
-
4122
-
4123
-
4124
-
4125
-
4126
-
4127
-
4128
-
4129
-
4130
-
4131
-
4132
-
4133
-
4134
-
4135
-
4136
-
4137
-
4138
-
4139
-
4140
-
4141
-
4142
-
4143
-
4144
-
4145
-
4146
-
4147
-
4148
-
4149
-
4150
-
4151
-
4152
-
4153
-
4154
-
4155
-
4156
-
4157
-
4158
-
4159
-
4160
-
4161
-
4162
-
4163
-
4164
-
4165
-
4166
-
4167
-
4168
-
4169
-
4170
-
4171
-
4172
- 稿
4173
-
4174
-
4175
-
4176
-
4177
-
4178
-
4179
-
4180
-
4181
- 穿
4182
-
4183
-
4184
-
4185
-
4186
-
4187
-
4188
-
4189
-
4190
-
4191
-
4192
-
4193
-
4194
-
4195
-
4196
-
4197
-
4198
-
4199
-
4200
-
4201
- 窿
4202
-
4203
-
4204
-
4205
-
4206
-
4207
-
4208
-
4209
-
4210
-
4211
-
4212
-
4213
-
4214
-
4215
-
4216
- 竿
4217
-
4218
-
4219
-
4220
-
4221
-
4222
-
4223
-
4224
-
4225
-
4226
-
4227
-
4228
-
4229
-
4230
-
4231
-
4232
-
4233
-
4234
-
4235
-
4236
-
4237
-
4238
-
4239
-
4240
-
4241
-
4242
-
4243
-
4244
-
4245
-
4246
-
4247
-
4248
-
4249
-
4250
-
4251
-
4252
-
4253
-
4254
-
4255
-
4256
-
4257
-
4258
-
4259
-
4260
-
4261
-
4262
-
4263
-
4264
-
4265
-
4266
-
4267
-
4268
-
4269
-
4270
-
4271
-
4272
-
4273
-
4274
-
4275
-
4276
-
4277
-
4278
-
4279
-
4280
-
4281
-
4282
-
4283
-
4284
-
4285
-
4286
-
4287
- 簿
4288
-
4289
-
4290
-
4291
-
4292
-
4293
-
4294
-
4295
-
4296
-
4297
-
4298
-
4299
-
4300
-
4301
-
4302
-
4303
-
4304
-
4305
-
4306
-
4307
-
4308
-
4309
-
4310
-
4311
-
4312
-
4313
-
4314
-
4315
-
4316
- 粿
4317
-
4318
-
4319
-
4320
-
4321
-
4322
-
4323
-
4324
-
4325
-
4326
-
4327
-
4328
-
4329
-
4330
-
4331
-
4332
-
4333
-
4334
-
4335
-
4336
-
4337
-
4338
-
4339
-
4340
-
4341
-
4342
-
4343
-
4344
-
4345
-
4346
-
4347
-
4348
-
4349
-
4350
-
4351
-
4352
-
4353
-
4354
-
4355
-
4356
-
4357
-
4358
-
4359
-
4360
-
4361
-
4362
-
4363
-
4364
-
4365
-
4366
-
4367
-
4368
-
4369
-
4370
-
4371
-
4372
-
4373
-
4374
-
4375
-
4376
-
4377
- 线
4378
-
4379
-
4380
-
4381
-
4382
-
4383
-
4384
-
4385
-
4386
-
4387
-
4388
-
4389
-
4390
-
4391
-
4392
-
4393
-
4394
-
4395
-
4396
-
4397
-
4398
-
4399
-
4400
-
4401
-
4402
-
4403
-
4404
-
4405
-
4406
-
4407
-
4408
-
4409
-
4410
-
4411
-
4412
-
4413
-
4414
-
4415
-
4416
-
4417
-
4418
-
4419
-
4420
-
4421
-
4422
-
4423
-
4424
-
4425
- 绿
4426
-
4427
-
4428
-
4429
-
4430
-
4431
-
4432
-
4433
-
4434
-
4435
-
4436
-
4437
-
4438
-
4439
-
4440
-
4441
-
4442
-
4443
-
4444
-
4445
-
4446
-
4447
-
4448
-
4449
-
4450
-
4451
-
4452
-
4453
-
4454
-
4455
-
4456
-
4457
-
4458
-
4459
-
4460
-
4461
-
4462
-
4463
-
4464
-
4465
-
4466
-
4467
-
4468
-
4469
-
4470
-
4471
-
4472
-
4473
-
4474
-
4475
-
4476
-
4477
-
4478
-
4479
-
4480
-
4481
-
4482
-
4483
-
4484
-
4485
-
4486
-
4487
-
4488
-
4489
-
4490
- 羿
4491
-
4492
-
4493
-
4494
-
4495
-
4496
-
4497
-
4498
-
4499
-
4500
-
4501
-
4502
-
4503
-
4504
-
4505
-
4506
-
4507
-
4508
- 耀
4509
-
4510
-
4511
-
4512
-
4513
-
4514
-
4515
-
4516
-
4517
-
4518
-
4519
-
4520
-
4521
-
4522
-
4523
-
4524
-
4525
-
4526
-
4527
-
4528
-
4529
-
4530
-
4531
-
4532
-
4533
-
4534
-
4535
-
4536
-
4537
-
4538
-
4539
-
4540
-
4541
-
4542
-
4543
-
4544
-
4545
-
4546
-
4547
-
4548
-
4549
-
4550
-
4551
-
4552
-
4553
-
4554
-
4555
-
4556
-
4557
-
4558
-
4559
-
4560
-
4561
-
4562
-
4563
-
4564
-
4565
-
4566
-
4567
-
4568
-
4569
-
4570
-
4571
-
4572
-
4573
-
4574
-
4575
-
4576
-
4577
-
4578
-
4579
-
4580
-
4581
-
4582
-
4583
-
4584
-
4585
-
4586
-
4587
-
4588
-
4589
-
4590
-
4591
-
4592
-
4593
-
4594
-
4595
-
4596
-
4597
-
4598
-
4599
-
4600
-
4601
-
4602
-
4603
-
4604
-
4605
-
4606
-
4607
-
4608
-
4609
-
4610
-
4611
-
4612
-
4613
-
4614
-
4615
-
4616
-
4617
-
4618
-
4619
-
4620
-
4621
-
4622
-
4623
-
4624
-
4625
-
4626
-
4627
-
4628
-
4629
-
4630
-
4631
-
4632
-
4633
-
4634
-
4635
-
4636
-
4637
-
4638
-
4639
-
4640
-
4641
-
4642
-
4643
-
4644
-
4645
-
4646
-
4647
-
4648
-
4649
-
4650
-
4651
-
4652
-
4653
-
4654
-
4655
-
4656
-
4657
-
4658
-
4659
-
4660
-
4661
-
4662
-
4663
-
4664
-
4665
-
4666
-
4667
-
4668
-
4669
-
4670
-
4671
-
4672
-
4673
-
4674
-
4675
-
4676
-
4677
-
4678
-
4679
-
4680
-
4681
-
4682
-
4683
-
4684
-
4685
-
4686
-
4687
-
4688
-
4689
-
4690
-
4691
-
4692
-
4693
-
4694
-
4695
-
4696
-
4697
-
4698
-
4699
-
4700
-
4701
-
4702
-
4703
-
4704
-
4705
-
4706
-
4707
-
4708
-
4709
-
4710
-
4711
-
4712
-
4713
-
4714
-
4715
-
4716
-
4717
-
4718
-
4719
-
4720
-
4721
-
4722
-
4723
-
4724
-
4725
-
4726
-
4727
-
4728
-
4729
-
4730
-
4731
-
4732
-
4733
-
4734
-
4735
-
4736
-
4737
-
4738
-
4739
-
4740
-
4741
-
4742
-
4743
-
4744
-
4745
-
4746
-
4747
-
4748
-
4749
-
4750
-
4751
-
4752
-
4753
-
4754
-
4755
-
4756
-
4757
-
4758
-
4759
-
4760
-
4761
-
4762
-
4763
-
4764
-
4765
-
4766
-
4767
-
4768
-
4769
-
4770
-
4771
-
4772
-
4773
-
4774
-
4775
-
4776
-
4777
-
4778
-
4779
-
4780
-
4781
-
4782
-
4783
-
4784
-
4785
-
4786
-
4787
-
4788
-
4789
-
4790
-
4791
-
4792
-
4793
-
4794
-
4795
-
4796
-
4797
-
4798
-
4799
-
4800
-
4801
-
4802
-
4803
-
4804
-
4805
-
4806
-
4807
-
4808
-
4809
-
4810
-
4811
-
4812
-
4813
-
4814
-
4815
-
4816
-
4817
-
4818
-
4819
-
4820
-
4821
-
4822
-
4823
-
4824
-
4825
-
4826
-
4827
-
4828
-
4829
-
4830
-
4831
-
4832
-
4833
-
4834
-
4835
-
4836
-
4837
-
4838
-
4839
-
4840
-
4841
-
4842
-
4843
-
4844
-
4845
-
4846
-
4847
-
4848
-
4849
-
4850
-
4851
-
4852
-
4853
-
4854
-
4855
-
4856
-
4857
-
4858
-
4859
-
4860
-
4861
-
4862
-
4863
-
4864
-
4865
-
4866
-
4867
-
4868
-
4869
-
4870
-
4871
-
4872
-
4873
-
4874
-
4875
-
4876
-
4877
-
4878
-
4879
-
4880
-
4881
-
4882
-
4883
-
4884
-
4885
-
4886
-
4887
-
4888
-
4889
-
4890
-
4891
-
4892
-
4893
-
4894
-
4895
-
4896
-
4897
-
4898
-
4899
-
4900
-
4901
-
4902
-
4903
-
4904
-
4905
-
4906
-
4907
-
4908
-
4909
-
4910
-
4911
-
4912
-
4913
-
4914
-
4915
-
4916
-
4917
-
4918
-
4919
-
4920
-
4921
-
4922
-
4923
-
4924
-
4925
-
4926
-
4927
-
4928
-
4929
-
4930
-
4931
-
4932
-
4933
-
4934
-
4935
-
4936
-
4937
-
4938
-
4939
-
4940
-
4941
-
4942
-
4943
-
4944
-
4945
-
4946
-
4947
-
4948
-
4949
-
4950
-
4951
-
4952
-
4953
-
4954
-
4955
-
4956
-
4957
-
4958
-
4959
-
4960
-
4961
-
4962
-
4963
-
4964
-
4965
-
4966
-
4967
-
4968
-
4969
-
4970
-
4971
-
4972
-
4973
-
4974
-
4975
-
4976
-
4977
-
4978
-
4979
-
4980
-
4981
-
4982
-
4983
-
4984
-
4985
-
4986
-
4987
-
4988
-
4989
-
4990
-
4991
-
4992
-
4993
-
4994
-
4995
-
4996
-
4997
-
4998
-
4999
-
5000
-
5001
-
5002
-
5003
-
5004
-
5005
-
5006
-
5007
-
5008
-
5009
-
5010
-
5011
-
5012
-
5013
-
5014
-
5015
-
5016
-
5017
-
5018
-
5019
-
5020
-
5021
-
5022
-
5023
-
5024
-
5025
-
5026
-
5027
-
5028
-
5029
-
5030
-
5031
-
5032
-
5033
-
5034
-
5035
-
5036
-
5037
- ��
5038
-
5039
-
5040
-
5041
-
5042
-
5043
-
5044
-
5045
-
5046
-
5047
-
5048
-
5049
-
5050
-
5051
-
5052
-
5053
-
5054
-
5055
-
5056
-
5057
-
5058
-
5059
-
5060
-
5061
-
5062
-
5063
-
5064
-
5065
-
5066
-
5067
-
5068
-
5069
-
5070
-
5071
-
5072
-
5073
-
5074
-
5075
-
5076
-
5077
-
5078
-
5079
-
5080
-
5081
-
5082
-
5083
-
5084
-
5085
-
5086
-
5087
-
5088
-
5089
-
5090
-
5091
-
5092
-
5093
-
5094
-
5095
-
5096
-
5097
-
5098
-
5099
-
5100
-
5101
-
5102
-
5103
-
5104
-
5105
-
5106
-
5107
-
5108
-
5109
-
5110
-
5111
-
5112
-
5113
-
5114
-
5115
-
5116
-
5117
-
5118
-
5119
-
5120
-
5121
-
5122
-
5123
-
5124
-
5125
-
5126
-
5127
-
5128
-
5129
-
5130
-
5131
-
5132
-
5133
-
5134
-
5135
-
5136
-
5137
-
5138
-
5139
-
5140
-
5141
-
5142
-
5143
- 西
5144
-
5145
-
5146
-
5147
-
5148
-
5149
-
5150
-
5151
-
5152
-
5153
-
5154
-
5155
-
5156
-
5157
-
5158
-
5159
-
5160
-
5161
-
5162
-
5163
-
5164
-
5165
-
5166
-
5167
-
5168
-
5169
-
5170
-
5171
-
5172
-
5173
-
5174
-
5175
-
5176
-
5177
-
5178
-
5179
-
5180
-
5181
-
5182
-
5183
-
5184
-
5185
-
5186
-
5187
-
5188
-
5189
-
5190
-
5191
-
5192
-
5193
-
5194
-
5195
-
5196
-
5197
-
5198
-
5199
-
5200
-
5201
-
5202
-
5203
-
5204
- 访
5205
-
5206
-
5207
-
5208
-
5209
-
5210
-
5211
-
5212
-
5213
-
5214
-
5215
-
5216
-
5217
-
5218
-
5219
-
5220
-
5221
-
5222
-
5223
-
5224
-
5225
-
5226
-
5227
-
5228
-
5229
-
5230
-
5231
-
5232
-
5233
-
5234
-
5235
-
5236
-
5237
-
5238
-
5239
-
5240
-
5241
-
5242
-
5243
-
5244
-
5245
-
5246
-
5247
-
5248
-
5249
-
5250
-
5251
-
5252
- 诿
5253
-
5254
-
5255
-
5256
-
5257
-
5258
-
5259
-
5260
-
5261
-
5262
-
5263
-
5264
-
5265
-
5266
-
5267
-
5268
-
5269
-
5270
-
5271
-
5272
-
5273
-
5274
-
5275
-
5276
-
5277
-
5278
-
5279
-
5280
-
5281
-
5282
-
5283
-
5284
-
5285
-
5286
-
5287
-
5288
-
5289
-
5290
-
5291
-
5292
-
5293
-
5294
-
5295
-
5296
-
5297
-
5298
-
5299
-
5300
-
5301
-
5302
-
5303
-
5304
-
5305
-
5306
-
5307
-
5308
-
5309
-
5310
-
5311
-
5312
-
5313
-
5314
-
5315
-
5316
-
5317
-
5318
-
5319
-
5320
-
5321
-
5322
-
5323
-
5324
-
5325
-
5326
-
5327
-
5328
-
5329
-
5330
-
5331
-
5332
-
5333
-
5334
-
5335
-
5336
-
5337
-
5338
-
5339
-
5340
-
5341
-
5342
-
5343
-
5344
-
5345
- 贿
5346
-
5347
-
5348
-
5349
-
5350
-
5351
-
5352
-
5353
-
5354
-
5355
-
5356
-
5357
-
5358
-
5359
-
5360
-
5361
-
5362
-
5363
-
5364
-
5365
-
5366
-
5367
-
5368
-
5369
-
5370
-
5371
-
5372
-
5373
-
5374
-
5375
-
5376
-
5377
-
5378
-
5379
-
5380
-
5381
-
5382
-
5383
-
5384
-
5385
-
5386
-
5387
-
5388
-
5389
-
5390
-
5391
-
5392
-
5393
-
5394
-
5395
-
5396
-
5397
-
5398
-
5399
-
5400
-
5401
-
5402
-
5403
-
5404
-
5405
-
5406
-
5407
-
5408
-
5409
-
5410
-
5411
-
5412
-
5413
-
5414
-
5415
-
5416
-
5417
-
5418
-
5419
-
5420
-
5421
-
5422
-
5423
-
5424
-
5425
-
5426
-
5427
-
5428
-
5429
-
5430
-
5431
-
5432
-
5433
-
5434
-
5435
-
5436
-
5437
-
5438
-
5439
-
5440
-
5441
-
5442
-
5443
-
5444
-
5445
-
5446
-
5447
-
5448
-
5449
- 蹿
5450
-
5451
-
5452
-
5453
-
5454
-
5455
-
5456
-
5457
-
5458
-
5459
-
5460
-
5461
-
5462
-
5463
-
5464
-
5465
-
5466
-
5467
-
5468
-
5469
-
5470
-
5471
-
5472
-
5473
-
5474
-
5475
- 轿
5476
-
5477
-
5478
-
5479
-
5480
-
5481
-
5482
-
5483
-
5484
-
5485
-
5486
-
5487
-
5488
-
5489
-
5490
-
5491
-
5492
-
5493
-
5494
-
5495
-
5496
-
5497
-
5498
-
5499
-
5500
-
5501
-
5502
-
5503
-
5504
-
5505
-
5506
-
5507
-
5508
-
5509
-
5510
-
5511
-
5512
-
5513
-
5514
-
5515
-
5516
-
5517
-
5518
-
5519
-
5520
-
5521
-
5522
-
5523
-
5524
-
5525
-
5526
-
5527
-
5528
-
5529
-
5530
-
5531
-
5532
-
5533
-
5534
-
5535
-
5536
-
5537
-
5538
-
5539
-
5540
- 退
5541
-
5542
-
5543
-
5544
-
5545
-
5546
-
5547
-
5548
-
5549
-
5550
-
5551
-
5552
-
5553
-
5554
-
5555
-
5556
-
5557
-
5558
-
5559
-
5560
-
5561
-
5562
-
5563
-
5564
-
5565
-
5566
-
5567
-
5568
-
5569
-
5570
-
5571
-
5572
-
5573
-
5574
-
5575
-
5576
-
5577
-
5578
-
5579
-
5580
-
5581
-
5582
-
5583
-
5584
-
5585
-
5586
-
5587
-
5588
-
5589
-
5590
-
5591
-
5592
-
5593
-
5594
-
5595
-
5596
-
5597
-
5598
-
5599
-
5600
-
5601
-
5602
-
5603
-
5604
-
5605
-
5606
-
5607
-
5608
-
5609
-
5610
-
5611
-
5612
-
5613
-
5614
-
5615
-
5616
-
5617
-
5618
-
5619
-
5620
-
5621
-
5622
-
5623
-
5624
-
5625
-
5626
-
5627
-
5628
-
5629
-
5630
-
5631
-
5632
-
5633
-
5634
-
5635
-
5636
-
5637
-
5638
-
5639
-
5640
-
5641
-
5642
-
5643
-
5644
-
5645
-
5646
-
5647
-
5648
-
5649
-
5650
-
5651
-
5652
-
5653
-
5654
-
5655
-
5656
-
5657
-
5658
-
5659
-
5660
-
5661
-
5662
-
5663
-
5664
-
5665
-
5666
-
5667
-
5668
-
5669
-
5670
-
5671
-
5672
-
5673
-
5674
-
5675
-
5676
-
5677
-
5678
-
5679
-
5680
-
5681
-
5682
-
5683
-
5684
-
5685
-
5686
-
5687
-
5688
-
5689
-
5690
-
5691
-
5692
-
5693
-
5694
-
5695
-
5696
-
5697
-
5698
-
5699
-
5700
-
5701
-
5702
-
5703
-
5704
-
5705
-
5706
-
5707
-
5708
-
5709
-
5710
-
5711
-
5712
-
5713
-
5714
-
5715
-
5716
-
5717
-
5718
-
5719
-
5720
-
5721
-
5722
-
5723
-
5724
-
5725
-
5726
-
5727
-
5728
-
5729
-
5730
-
5731
-
5732
-
5733
-
5734
-
5735
-
5736
-
5737
-
5738
-
5739
-
5740
-
5741
-
5742
-
5743
-
5744
-
5745
-
5746
-
5747
-
5748
-
5749
-
5750
-
5751
-
5752
-
5753
-
5754
-
5755
-
5756
-
5757
-
5758
-
5759
-
5760
-
5761
-
5762
-
5763
-
5764
-
5765
-
5766
-
5767
-
5768
-
5769
-
5770
-
5771
-
5772
-
5773
-
5774
-
5775
-
5776
-
5777
-
5778
-
5779
-
5780
-
5781
-
5782
-
5783
-
5784
-
5785
-
5786
-
5787
-
5788
-
5789
-
5790
-
5791
-
5792
-
5793
-
5794
-
5795
-
5796
-
5797
-
5798
-
5799
-
5800
-
5801
-
5802
-
5803
-
5804
-
5805
-
5806
-
5807
-
5808
-
5809
-
5810
-
5811
-
5812
-
5813
-
5814
-
5815
-
5816
-
5817
-
5818
-
5819
-
5820
-
5821
-
5822
-
5823
-
5824
-
5825
-
5826
-
5827
-
5828
-
5829
-
5830
-
5831
-
5832
-
5833
-
5834
-
5835
-
5836
-
5837
-
5838
-
5839
-
5840
-
5841
-
5842
-
5843
-
5844
-
5845
-
5846
-
5847
-
5848
-
5849
-
5850
-
5851
-
5852
-
5853
-
5854
-
5855
-
5856
-
5857
-
5858
-
5859
-
5860
-
5861
-
5862
-
5863
-
5864
-
5865
-
5866
-
5867
-
5868
-
5869
-
5870
-
5871
-
5872
-
5873
-
5874
-
5875
-
5876
-
5877
-
5878
-
5879
-
5880
-
5881
-
5882
-
5883
-
5884
-
5885
-
5886
-
5887
-
5888
-
5889
-
5890
-
5891
-
5892
-
5893
-
5894
-
5895
-
5896
-
5897
-
5898
-
5899
-
5900
-
5901
-
5902
-
5903
-
5904
-
5905
-
5906
-
5907
-
5908
-
5909
-
5910
-
5911
-
5912
-
5913
-
5914
-
5915
-
5916
-
5917
-
5918
-
5919
-
5920
-
5921
-
5922
-
5923
-
5924
-
5925
-
5926
-
5927
-
5928
-
5929
-
5930
-
5931
-
5932
-
5933
-
5934
-
5935
-
5936
-
5937
-
5938
-
5939
-
5940
-
5941
-
5942
-
5943
-
5944
-
5945
-
5946
-
5947
-
5948
-
5949
-
5950
-
5951
-
5952
-
5953
-
5954
-
5955
-
5956
-
5957
-
5958
-
5959
-
5960
-
5961
-
5962
-
5963
-
5964
-
5965
-
5966
-
5967
-
5968
-
5969
-
5970
-
5971
-
5972
-
5973
-
5974
-
5975
-
5976
-
5977
-
5978
-
5979
-
5980
-
5981
-
5982
-
5983
-
5984
-
5985
-
5986
-
5987
-
5988
-
5989
-
5990
-
5991
-
5992
-
5993
-
5994
-
5995
-
5996
-
5997
-
5998
-
5999
-
6000
-
6001
-
6002
-
6003
-
6004
-
6005
-
6006
-
6007
-
6008
-
6009
-
6010
-
6011
-
6012
-
6013
-
6014
-
6015
-
6016
-
6017
-
6018
-
6019
-
6020
-
6021
-
6022
-
6023
-
6024
-
6025
-
6026
-
6027
-
6028
-
6029
-
6030
-
6031
-
6032
-
6033
-
6034
-
6035
-
6036
-
6037
-
6038
-
6039
-
6040
-
6041
-
6042
-
6043
-
6044
-
6045
-
6046
-
6047
-
6048
-
6049
-
6050
-
6051
-
6052
-
6053
-
6054
-
6055
-
6056
-
6057
-
6058
-
6059
-
6060
-
6061
- ��
6062
-
6063
-
6064
-
6065
- 饿
6066
-
6067
-
6068
-
6069
-
6070
-
6071
-
6072
-
6073
-
6074
-
6075
-
6076
-
6077
-
6078
-
6079
-
6080
-
6081
-
6082
-
6083
-
6084
-
6085
-
6086
-
6087
-
6088
-
6089
-
6090
-
6091
-
6092
-
6093
-
6094
-
6095
-
6096
-
6097
-
6098
-
6099
-
6100
-
6101
-
6102
-
6103
- 驿
6104
-
6105
-
6106
-
6107
-
6108
-
6109
-
6110
-
6111
-
6112
-
6113
-
6114
-
6115
-
6116
-
6117
-
6118
-
6119
-
6120
-
6121
-
6122
-
6123
-
6124
-
6125
-
6126
-
6127
-
6128
-
6129
-
6130
-
6131
-
6132
-
6133
-
6134
-
6135
-
6136
-
6137
-
6138
-
6139
-
6140
-
6141
-
6142
-
6143
-
6144
-
6145
-
6146
-
6147
-
6148
-
6149
-
6150
-
6151
-
6152
-
6153
-
6154
-
6155
-
6156
-
6157
- 鱿
6158
-
6159
-
6160
-
6161
-
6162
-
6163
-
6164
-
6165
-
6166
-
6167
-
6168
-
6169
-
6170
-
6171
-
6172
-
6173
-
6174
-
6175
-
6176
-
6177
-
6178
-
6179
-
6180
-
6181
-
6182
-
6183
-
6184
-
6185
-
6186
-
6187
-
6188
-
6189
-
6190
-
6191
-
6192
-
6193
-
6194
-
6195
-
6196
-
6197
-
6198
-
6199
-
6200
- 鸿
6201
-
6202
-
6203
-
6204
-
6205
-
6206
-
6207
-
6208
-
6209
-
6210
-
6211
-
6212
-
6213
-
6214
-
6215
-
6216
-
6217
-
6218
-
6219
-
6220
-
6221
-
6222
- 鹿
6223
-
6224
-
6225
-
6226
-
6227
-
6228
-
6229
-
6230
-
6231
-
6232
-
6233
-
6234
-
6235
-
6236
-
6237
-
6238
-
6239
-
6240
-
6241
-
6242
-
6243
-
6244
-
6245
-
6246
-
6247
-
6248
-
6249
-
[Deleted vocabulary file, tail: entries 6250–10268 (4,019 lines, all removed with the rest of the file). The diff viewer's gutter numbers are collapsed here. The span covers rare CJK characters (e.g. 齿), vertical/fullwidth punctuation (e.g. ︿), web-era English and brand tokens (yam, lofter, www, http, qq, google, facebook, tripadvisor, …), "##"-prefixed WordPiece subword continuations (##s, ##ing, ##tion, ##ment, …), and single-character "##" pieces spanning ASCII punctuation, Greek, Cyrillic, Thai, box-drawing, dingbat, and fullwidth symbols, ending at entry 10268, "##¥".]
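For readers skimming the removed listing: the leading "##" on many of these entries is the standard WordPiece continuation marker — a piece written "##ing" may only continue a word, never start one. Below is a minimal, self-contained sketch of greedy longest-match-first WordPiece tokenization over a toy vocabulary; the tiny vocab and the helper name `wordpiece_tokenize` are illustrative assumptions, not code from this repository, and a real vocab file like the one deleted here holds tens of thousands of such entries, one per line.

```python
# Minimal sketch of greedy longest-match-first WordPiece tokenization,
# illustrating the "##" continuation convention used in the removed file.
# The toy vocab is hypothetical; it stands in for the deleted vocab file.

def wordpiece_tokenize(word, vocab, unk="[UNK]", max_chars=100):
    """Split one whitespace-delimited word into WordPiece subwords."""
    if len(word) > max_chars:
        return [unk]
    pieces, start = [], 0
    while start < len(word):
        end, cur = len(word), None
        # Try the longest remaining substring first, shrinking until a hit.
        while start < end:
            piece = word[start:end]
            if start > 0:
                piece = "##" + piece  # continuation pieces carry the prefix
            if piece in vocab:
                cur = piece
                break
            end -= 1
        if cur is None:
            return [unk]  # no decomposition exists for this word
        pieces.append(cur)
        start = end
    return pieces

if __name__ == "__main__":
    vocab = {"lofter", "blog", "##s", "##ing", "load", "down", "##load"}
    print(wordpiece_tokenize("lofters", vocab))    # ['lofter', '##s']
    print(wordpiece_tokenize("downloads", vocab))  # ['down', '##load', '##s']
    print(wordpiece_tokenize("qqmei", vocab))      # ['[UNK]']
```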
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
vocab.py ADDED
@@ -0,0 +1,453 @@
+ from patcher import tiktoken_patch
+ import tiktoken
+ from transformers import AutoTokenizer
+ from enum import Enum, auto
+ from dataclasses import dataclass, field
+
+ from utils.log_util import logger
+ from typing import Dict, Any, Union
+
+ """Interface:
+ tokenizer.encode
+ tokenizer.decode
+ tokenizer.convert_tokens_to_string  # gpt4 does not have this method
+ tokenizer.convert_ids_to_tokens
+
+
+ tokenizer.parent = ""
+ tokenizer.vocab_size
+ tokenizer.get_vocab()  # gpt-neox-20b, llama
+ tokenizer.type = TokenizerType.ByteBPE.name
+ tokenizer.implementation = TokenizerImpl.SentencePiece.name  # https://github.com/facebookresearch/llama/blob/main/llama/tokenizer.py
+ "HFGPT2Tokenizer", "HFTokenizer", "GPT2BPETokenizer", "CharLevelTokenizer", "TiktokenTokenizer", "SPMTokenizer", https://github.com/EleutherAI/gpt-neox/blob/main/tools/preprocess_data.py
+
+
+ tokenizer.comments = "split all numbers into individual digits, " \
+                      "and fallback to bytes to decompose unknown UTF-8 characters"
+
+ tokenizer.all_special_tokens  # baichuan
+ tokenizer.special_tokens_set  # gpt3.5_turbo
+ tokenizer.special_tokens_map
+ """
+
+
+ class TokenizerImpl(Enum):
+     """
+     - https://github.com/huggingface/tokenizers/blob/main/bindings/python/py_src/tokenizers/implementations/__init__.py
+     - https://huggingface.co/docs/transformers/tokenizer_summary
+     - https://github.com/EleutherAI/gpt-neox/blob/main/megatron/tokenizer/tokenizer.py
+
+     ## google/BertTokenizer
+     - https://github.com/huggingface/tokenizers/blob/main/bindings/python/py_src/tokenizers/implementations/bert_wordpiece.py
+     - Features
+         - Algorithm: BERT's encoder uses BPE-WordPiece, splitting words into minimal subword units marked with a prefix (e.g. ## in BERT)
+         - Vocabulary: tokens starting with ## denote subwords
+         - Chinese is tokenized at character granularity
+         - English uses WordPiece
+
+
+     ## google/sentencepiece
+     - https://github.com/google/sentencepiece/blob/3863f7648e5d8edb571ac592f3ac4f5f0695275a/src/sentencepiece_model.proto#L48
+     - supports sentencepiece and wordpiece
+         - sentencepiece: does it have byte-bpe?
+             - UNIGRAM = 1;  // Unigram language model with dynamic algorithm
+             - BPE = 2;  // Byte Pair Encoding
+             - WORD = 3;  // Delimited by whitespace.
+             - CHAR = 4;  // tokenizes into character sequence
+         - wordpiece
+     - Features:
+         - Training: spm_train --model_type unigram/bpe/char/word
+         - Special symbol: Ġ
+         - Files: *.sp_model or *.model (optional .vocab file), "spm" for short (other formats such as tokenizer.json exist for hf_tokenizer compatibility)
+     - Implementation:
+         - Dependency: protobuf
+         - Training: `import sentencepiece as spm; spm.SentencePieceTrainer.train` or `spm_train`
+         - Loading: `import sentencepiece as spm; spm.SentencePieceProcessor().Load(vocab_file)`
+         - Methods: the object is a SentencePieceProcessor; sp_model.id_to_piece; ships tokenizer.json and tokenizer.model
+         - Tokenization:
+             - pre_tokenizers.ByteLevel(add_prefix_space=True, use_regex=False)
+     - Vocabulary: entries contain ▁ (U+2581), denoting a space or the start of a sentence.
+     - Examples: google-t5, llama, baichuan, orion
+         - llama: tokenizer.json (contains model.vocab and model.merges), tokenizer.model
+         - grok: originally a .model file, later converted to tokenizer.json
+         - google-t5: tokenizer.json, spiece.model
+         - Skywork-13B-Math: tokenizer.model
+         - xlm_roberta: sentencepiece.bpe.model
+     - GPT2Tokenizer
+         - tokenizer.json, vocab.json, merges.txt (https://huggingface.co/openai-community/gpt2)
+         - vocab.bpe, encoder.json, dict.txt (fairseq variant, rarely used; can be ignored)
+
+
+     ## thu/icetk
+     - icetk: a fork of sentencepiece with image_tokenizer support.
+     - glm, chatglm1, chatglm2
+
+     ## huggingface/tokenizers
+     - https://github.com/huggingface/tokenizers
+     - VS sentencepiece
+         - supports sentencepiece
+             - converts .model into (merges.txt + vocab.json) or tokenizer.json
+                 - https://github.com/huggingface/tokenizers/blob/main/bindings/python/scripts/sentencepiece_extractor.py
+             - loads merges.txt, vocab.json
+             - SentencePieceBPETokenizer https://github.com/huggingface/tokenizers/blob/v0.19.1/bindings/python/py_src/tokenizers/implementations/sentencepiece_bpe.py#L10
+         - on top of sentencepiece, hf_tokenizer adds regex-based pre-tokenization, better handling of tabs and newlines, and special-token support
+     - Types: supports BBPE, WordPiece or Unigram
+     - Features:
+         - Files: tokenizer.json (contains the content of the next two files), merges.txt, vocab.json
+         - added_tokens are not necessarily present in the vocab.
+     - Implementation:
+         - Training: `from tokenizers.trainers import BpeTrainer, UnigramTrainer, WordLevelTrainer, WordPieceTrainer`
+         - Loading:
+         - Methods: .model.from_file .model.save .model.token_to_id .model.tokenize
+             - .model is of type tokenizers.models.BPE
+         - Vocabulary entries start with Ġ ("\u0120")
+     - Examples: gpt2, gpt_neox_20b, moss, bloom, qwen2
+     - Advantages: relative to sentencepiece, see the pre-tokenization notes above
+
+     ## openai/tiktoken
+     - Features: a space is just a space
+     - Examples: gpt3.5, gpt4, qwen
+     """
+     """ Algorithm taxonomy https://www.huaxiaozhuan.com/%E5%B7%A5%E5%85%B7/huggingface_transformer/chapters/1_tokenizer.html
+     - word-based tokenizer:
+     - char-based tokenizer:
+     - subword-based tokenizer
+         - BPE
+             - byte-bpe: the base vocabulary has size 256
+         - WordPiece:
+             - compared with BPE, WordPiece only stores the final vocabulary, not the learned merge rules
+         - Unigram
+         - SentencePiece
+
+     """
+
+     # Taxonomy: https://github.com/huggingface/tokenizers/blob/main/bindings/python/py_src/tokenizers/implementations/
+     BertTokenizer = "wordpiece.BertTokenizer"
+     JapaneseTokenizer = ("wordpiece.MecabTokenizer", "https://github.com/polm/fugashi")  # common Japanese packages: ipadic, fugashi
+     ByteLevelBPETokenizer = "byte_level_bpe"  # BBPE
+     SentencePieceBPETokenizer = "sentencepiece_bpe"
+
+     # Taxonomy
+
+     # SentencePiece (BPE)
+     SentencePiece = auto()  # sentencepiece.bpe, sentencepiece.unigram, sentencepiece.char, sentencepiece.word,
+     byte_level_bpe = auto()
+     # HFTokenizer = auto()
+     TikToken = auto()
+     # subword-nmt
+     # WordPiece
+
+
+ # load_vocab_with_SPECIAL_TOKEN = True  # if special tokens are excluded, the vocab size is computed incorrectly and overlap_token counts become inconsistent.
+
+
+ @dataclass
+ class TokenizerConfig:
+     """
+     https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/blob/main/src/leaderboard/read_evals.py
+     """
+     name_or_path: str  # org/model (path on hub), as unique id
+     name_display: str = None  #
+     impl: TokenizerImpl = None  # implementation, tokenizer_class/type
+     org: str = None
+     link: str = None  # http://**
+     desc: str = None  # description
+     meta: str = None
+     level: str = None  # char-level, word-level, byte-level
+     init_kwargs: Dict[str, Any] = field(default_factory=dict, )
+
+     def __post_init__(self):
+         if self.link is None:
+             self.link = "https://huggingface.co/" + self.name_or_path  # TODO + revision
+         if self.name_display is None:
+             self.name_display = self.name_or_path
+
+     @classmethod
+     def init_from_json_file(cls, json_filepath: str) -> 'TokenizerConfig':
+         pass
+
+     def __eq__(self, other):
+         if isinstance(other, self.__class__):
+             return self.__dict__ == other.__dict__
+         else:
+             return False
+
+     def __hash__(self):
+         return hash(self.name_or_path)
+
+
+ # format: description, hf_path, tokenizer_class/type, comments, Organization
+ # TODO: append link and description to the end of dropdown button.
+ _all_tokenizer_config = [
+     ##### bert family
+     TokenizerConfig("google-bert/bert-base-cased", impl=TokenizerImpl.BertTokenizer, org="Google",
+                     desc="first add whitespace around any CJK character, then perform wordpiece tokenization."),
+     TokenizerConfig("google-bert/bert-base-uncased", impl=TokenizerImpl.BertTokenizer, org="Google",
+                     desc="first add whitespace around any CJK character, then perform wordpiece tokenization."),
+     TokenizerConfig("google-bert/bert-base-chinese", impl=TokenizerImpl.BertTokenizer, org="Google",
+                     desc="first add whitespace around any CJK character, then perform wordpiece tokenization."),
+     TokenizerConfig("google-bert/bert-base-german-cased", impl=TokenizerImpl.BertTokenizer, org="Google"),
+     TokenizerConfig("dbmdz/bert-base-german-uncased", impl=TokenizerImpl.BertTokenizer, org="dbmdz"),
+     TokenizerConfig("google-bert/bert-base-multilingual-uncased", impl=TokenizerImpl.BertTokenizer, org="Google"),
+     TokenizerConfig("google-bert/bert-base-multilingual-cased", impl=TokenizerImpl.BertTokenizer, org="Google"),
+     TokenizerConfig("tohoku-nlp/bert-base-japanese", impl=TokenizerImpl.BertTokenizer, org="Tohoku",
+                     desc="The texts are first tokenized by MeCab morphological parser with the IPA dictionary, "
+                          "then split into subwords by the WordPiece algorithm."),
+     TokenizerConfig("clue/roberta_chinese_clue_tiny", name_display="clue/roberta-chinese-clue",
+                     impl=TokenizerImpl.BertTokenizer, org="CLUE",
+                     init_kwargs={"revision": "refs/pr/1"},
+                     desc="",
+                     meta="traditional Chinese characters removed, https://github.com/CLUEbenchmark/CLUEPretrainedModels/blob/master/README.md"),
+     TokenizerConfig("eson/kplug-base-encoder", name_display="eson/kplug", impl=TokenizerImpl.BertTokenizer, org="JD"),
+     TokenizerConfig("ckiplab/gpt2-base-chinese", impl=TokenizerImpl.BertTokenizer, org="SINICA"),  # Academia Sinica (Taiwan)
+     # WoBERT
+     # WoBERT Plus https://github.com/ZhuiyiTechnology/WoBERT
+
+
+     ##### GPT2Tokenizer
+     TokenizerConfig("openai-community/gpt2", impl=TokenizerImpl.SentencePiece, org="OpenAI"),
+     # byte-level BPE; no bytes, is it unicode-level?
+     TokenizerConfig("ClassCat/gpt2-base-french", impl=TokenizerImpl.SentencePiece, org="ClassCat"),
+     TokenizerConfig("ClassCat/gpt2-base-spanish", impl=TokenizerImpl.SentencePiece, org="ClassCat"),
+     TokenizerConfig("fnlp/moss-moon-003-sft", impl=TokenizerImpl.SentencePiece, init_kwargs={"revision": "refs/pr/6"},
+                     org="Fudan",
+                     desc="This tokenizer has been trained to treat spaces like parts of the tokens "
+                          "(a bit like sentencepiece) so a word will be encoded differently whether "
+                          "it is at the beginning of the sentence (without space) or not",
+                     meta="extends the gpt2 vocab with 50k Chinese tokens"),
+     TokenizerConfig("bigscience/bloom", impl=TokenizerImpl.SentencePiece, org="BigScience",
+                     meta="better Chinese support than the gpt_neox vocab."),
+     # ("bloomz_6b4_zh",
+     # ("BelleGroup/BELLE-7B-2M",  # model and vocab are both based on bloom
+     #
+     TokenizerConfig("EleutherAI/gpt-neox-20b", impl=TokenizerImpl.SentencePiece, org="EleutherAI"),  # 50k
+     TokenizerConfig("cyberagent/open-calm-7b", impl=TokenizerImpl.SentencePiece, org="CyberAgent"),  # GPTNeoXTokenizer
+     TokenizerConfig("abeja/gpt-neox-japanese-2.7b", impl=TokenizerImpl.SentencePiece, org="ABEJA"),
+     TokenizerConfig("Qwen/Qwen1.5-14B-Chat", name_display="Qwen/Qwen1.5", impl=TokenizerImpl.SentencePiece, org="Alibaba"),  # 150k, a bit slow
+     TokenizerConfig("HuggingFaceH4/starchat-alpha", impl=TokenizerImpl.SentencePiece, org="-"),
+
+     ####### google/sentencepiece tokenizer:
+     # T5 llama internlm
+     TokenizerConfig("google-t5/t5-large", name_display="google-t5/t5", impl=TokenizerImpl.SentencePiece, org="Google"),
+     # t5_small, t5_base, t5_large, flan_t5_base,
+     # ("t5_base", "", "sentencepiece"),
+     # TokenizerConfig("google/flan-t5-base", impl=TokenizerImpl.SentencePiece, ),
+     TokenizerConfig("lmsys/fastchat-t5-3b-v1.0", impl=TokenizerImpl.SentencePiece,
+                     org="LMSYS",
+                     init_kwargs={"use_fast": False}  # works around pyo3_runtime.PanicException: AddedVocabulary bad split
+                     ),
+     TokenizerConfig("CohereForAI/aya-101", org="Cohere For AI"),  # "tokenizer_class": "T5Tokenizer",
+
+     TokenizerConfig("ClueAI/ChatYuan-large-v2", impl=TokenizerImpl.SentencePiece, org="CLUE"),
+     TokenizerConfig("ClueAI/PromptCLUE-base", impl=TokenizerImpl.SentencePiece, org="CLUE"),
+     TokenizerConfig("gradientai/Llama-3-8B-Instruct-Gradient-1048k", name_display="Meta/llama3",
+                     impl=TokenizerImpl.SentencePiece, org="Meta",
+                     desc="llama split all numbers into individual digits, and fallback to bytes to decompose unknown UTF-8 characters"),
+     # byte-level BPE
+     # 'single Chinese characters': 700, 'multi-character Chinese words': 0
+     TokenizerConfig("NousResearch/Llama-2-7b-chat-hf", name_display="Meta/llama2", impl=TokenizerImpl.SentencePiece,
+                     org="Meta"),
+     TokenizerConfig("huggyllama/llama-7b", name_display="Meta/llama", impl=TokenizerImpl.SentencePiece, org="Meta"),
+     TokenizerConfig("hpcai-tech/grok-1", name_display="xai-org/grok-1", impl=TokenizerImpl.SentencePiece, org="xAI"),
+     # converted from the original .model file
+     TokenizerConfig("hfl/chinese-llama-lora-7b", impl=TokenizerImpl.SentencePiece, org="-",
+                     meta="adds 20k Chinese tokens to the original LLaMA vocabulary, improving Chinese encoding/decoding efficiency"),
+     #
+     TokenizerConfig("hfl/chinese-llama-2-7b", impl=TokenizerImpl.SentencePiece, org="-",
+                     meta="redesigned vocabulary (size: 55296), further improving coverage of Chinese characters and words"),  #
+     TokenizerConfig("hfl/llama-3-chinese-8b", impl=TokenizerImpl.SentencePiece, org="-"),
+     TokenizerConfig("hfl/chinese-alpaca-lora-7b", impl=TokenizerImpl.SentencePiece, org="-"),
+     # The Chinese Alpaca model is further instruction-tuned on top of the Chinese LLaMA model above. "Has one extra `[PAD]` compared with the chinese_llama vocab; do not mix them."
+     #
+     # ("belle_llama_ext_7b",
+     # ("alpaca_7b",
+     TokenizerConfig("baichuan-inc/Baichuan-7B", name_display="baichuan-inc/baichuan",
+                     impl=TokenizerImpl.SentencePiece,
+                     level="byte-level", org="Baichuan"),
+     TokenizerConfig("baichuan-inc/Baichuan2-7B-Chat", name_display="baichuan-inc/baichuan2",
+                     impl=TokenizerImpl.SentencePiece, org="Baichuan",
+                     desc="expand the vocabulary size from 64000 in Baichuan1 to 125696"),
+     TokenizerConfig("internlm/internlm-chat-7b", impl=TokenizerImpl.SentencePiece, org="Shanghai AI Lab"),
+     # Shanghai AI Lab + SenseTime
+     TokenizerConfig("internlm/internlm2-chat-7b", impl=TokenizerImpl.SentencePiece, org="Shanghai AI Lab"),
+     TokenizerConfig("internlm/internlm2-math-7b", impl=TokenizerImpl.SentencePiece, org="Shanghai AI Lab"),
+     TokenizerConfig("internlm/internlm-xcomposer-7b", impl=TokenizerImpl.SentencePiece, org="Shanghai AI Lab"),
+     TokenizerConfig("tiiuae/falcon-7b", impl=TokenizerImpl.SentencePiece, org="TII"),
+     TokenizerConfig("tiiuae/falcon-180b", impl=TokenizerImpl.SentencePiece, org="TII"),
+     TokenizerConfig("Skywork/Skywork-13B-base", impl=TokenizerImpl.SentencePiece, org="Kunlun"),
+     TokenizerConfig("Skywork/Skywork-13B-Math", impl=TokenizerImpl.SentencePiece, org="Kunlun"),  # file: tokenizer.model
+     TokenizerConfig("FacebookAI/xlm-roberta-base", impl=TokenizerImpl.SentencePiece, org="Facebook"),
+     # why does this tokenizer.json have no merges? and why does its vocab contain probabilities?
+     # "goat",
+
+     # ##### glm family
+     # "glm_chinese",),
+     TokenizerConfig("THUDM/chatglm-6b", impl=TokenizerImpl.SentencePiece, org="Tsinghua",
+                     meta=f"num_image_tokens: {12}; num_image_tokens: {34} ",
+                     init_kwargs={"revision": "refs/pr/100"}),
+     TokenizerConfig("THUDM/chatglm2-6b", impl=TokenizerImpl.SentencePiece, org="Tsinghua", ),
+     TokenizerConfig("THUDM/chatglm3-6b", impl=TokenizerImpl.SentencePiece, org="Tsinghua", ),
+     TokenizerConfig("thu-coai/CharacterGLM-6B", impl=TokenizerImpl.SentencePiece, org="Tsinghua", ),
+
+     # tiktoken family
+     TokenizerConfig("openai/text-davinci-003", impl=TokenizerImpl.TikToken, org="OpenAI",
+                     link="https://github.com/openai/tiktoken"),
+     #
+     TokenizerConfig("openai/code-davinci-002", impl=TokenizerImpl.TikToken, org="OpenAI",
+                     link="https://github.com/openai/tiktoken"),
+     TokenizerConfig("openai/gpt-3.5-turbo", impl=TokenizerImpl.TikToken, org="OpenAI",
+                     link="https://github.com/openai/tiktoken",
+                     desc="tiktoken is a fast BPE tokeniser for use with OpenAI's models. There are 16 tokens KeyError"),
+     TokenizerConfig("openai/gpt-4", impl=TokenizerImpl.TikToken, org="OpenAI",
+                     link="https://github.com/openai/tiktoken", ),
+     TokenizerConfig("openai/gpt-4o", impl=TokenizerImpl.TikToken, org="OpenAI",
+                     link="https://github.com/openai/tiktoken", ),
+     TokenizerConfig("Qwen/Qwen-7B-Chat", name_display="Qwen/Qwen", impl=TokenizerImpl.TikToken, org="Alibaba",
+                     init_kwargs={"revision": "refs/pr/56"},
+                     meta="based on the gpt4 vocab: removed 100 multi-digit tokens, added 10000 Chinese word tokens, and improved special_token splitting"),
+     # https://huggingface.co/Qwen/Qwen-7B-Chat#%E6%A8%A1%E5%9E%8B%E7%BB%86%E8%8A%82%EF%BC%88model%EF%BC%89
+     # This vocabulary builds on the cl100k_base BPE vocab used by GPT-4, optimized for Chinese and multilingual text. On top of efficient encoding of Chinese, English and code,
+     # it is friendlier to several other languages, letting users strengthen specific languages without extending the vocab. Numbers are split into individual digits.
+
+     # TokenizerConfig("Qwen/Qwen-72B-Chat", impl=TokenizerImpl.TikToken),
+
+     # Uncategorized
+     # ("amber", ""),
+     TokenizerConfig("LLM360/CrystalCoder", org="MBZUAI"),
+     TokenizerConfig("mistralai/Mistral-7B-v0.1", org="Mistral"),
+     TokenizerConfig("mistralai/Mixtral-8x7B-v0.1", org="Mistral"),
+
+     TokenizerConfig("paust/pko-t5-large", org="PAUST"),
+
+     TokenizerConfig("01-ai/Yi-6B", org="Yi"),
+     TokenizerConfig("01-ai/Yi-34B", org="Yi"),
+     TokenizerConfig("01-ai/Yi-VL-34B", org="Yi"),
+     TokenizerConfig("OrionStarAI/Orion-14B-Chat", org="OrionStar"),
+     TokenizerConfig("microsoft/phi-1", org="Microsoft"),
+     TokenizerConfig("microsoft/phi-2", org="Microsoft"),
+     TokenizerConfig("microsoft/Phi-3-mini-4k-instruct", org="Microsoft", meta="i.e. the llama vocab"),
+     TokenizerConfig("Upstage/SOLAR-10.7B-v1.0", org="-"),
+     TokenizerConfig("google/mobilebert-uncased", org="Google"),
+     # ("google/mobilenet_v2_1.0_224",),  # error
+     TokenizerConfig("google/switch-c-2048", org="Google"),
+     TokenizerConfig("google/byt5-small", org="Google"),
+     TokenizerConfig("google/mt5-large", org="Google"),
+     TokenizerConfig("WizardLM/WizardCoder-Python-7B-V1.0", org="Microsoft"),
+     TokenizerConfig("WizardLM/WizardCoder-15B-V1.0", org="Microsoft"),
+     TokenizerConfig("WizardLM/WizardLM-7B-V1.0", org="Microsoft"),
+     TokenizerConfig("WizardLM/WizardMath-70B-V1.0", org="Microsoft"),
+     TokenizerConfig("TigerResearch/tigerbot-70b-chat-v4-4k", org="Tigerobo"),
+     TokenizerConfig("TigerResearch/tigerbot-13b-chat-v2", org="Tigerobo"),
+     TokenizerConfig("deepseek-ai/deepseek-coder-33b-instruct", org="DeepSeek"),
+     TokenizerConfig("deepseek-ai/deepseek-llm-7b-base", org="DeepSeek"),
+     TokenizerConfig("deepseek-ai/DeepSeek-V2", org="DeepSeek"),
+     TokenizerConfig("google/gemma-7b", org="Google"),
+     TokenizerConfig("allenai/OLMo-7B", org="Allen AI"),
+     TokenizerConfig("HuggingFaceH4/zephyr-7b-beta", org="HuggingFace"),
+     TokenizerConfig("ai21labs/Jamba-v0.1", org="AI21"),
+     TokenizerConfig("databricks/dbrx-instruct", org="Databricks"),
+
+     # ("claude",),
+     # https://github.com/Duxiaoman-DI/XuanYuan
+
+     # https://huggingface.co/apple/OpenELM-3B-Instruct https://huggingface.co/apple/OpenELM-3B
+
+ ]
+
+ assert len(set([config.name_display for config in _all_tokenizer_config])) == len(_all_tokenizer_config)
+ assert len(set([config.name_or_path for config in _all_tokenizer_config])) == len(_all_tokenizer_config)
+ assert len(set([config.name_or_path.split("/")[-1] for config in _all_tokenizer_config])) == len(_all_tokenizer_config)
+
+
+ class TokenizerFactory:
+
+     def __init__(self):
+         self.all_tokenizer_configs = sorted(_all_tokenizer_config, key=lambda k: k.name_or_path)
+         self.all_tokenizer_names = [config.name_or_path for config in self.all_tokenizer_configs]
+         self.name_to_config_list = [
+             {config.name_or_path: config for config in self.all_tokenizer_configs},
+             {config.name_display: config for config in self.all_tokenizer_configs},
+             {config.name_display.split("/")[-1]: config for config in self.all_tokenizer_configs},
+         ]
+         self.tokenizer_cache = {}
+
+     def get_tokenizer_config(self, tokenizer_name: str) -> TokenizerConfig:
+         for name_to_config in self.name_to_config_list:
+             if tokenizer_name in name_to_config:
+                 return name_to_config[tokenizer_name]
+         return None
+
+     def get_tokenizer(self, tokenizer_name: str):
+         """
+         :param tokenizer_name:
+         :return:
+         """
+         tokenizer_config = self.get_tokenizer_config(tokenizer_name)
+
+         # 1. load from cache
+         if tokenizer_config in self.tokenizer_cache:
+             return self.tokenizer_cache[tokenizer_config]
+
+         # 2. load tokenizer
+         logger.info(f"loading tokenizer {tokenizer_config.name_or_path}")
+         if tokenizer_config.impl == TokenizerImpl.TikToken and "openai" in tokenizer_config.name_or_path:
+             tokenizer = tiktoken.encoding_for_model(tokenizer_config.name_or_path.replace("openai/", ""))
+         else:
+             tokenizer = AutoTokenizer.from_pretrained(
+                 tokenizer_config.name_or_path,
+                 trust_remote_code=True,
+                 **tokenizer_config.init_kwargs
+             )
+         self.tokenizer_cache[tokenizer_config] = tokenizer
+         return tokenizer
+
+     def get_name_with_hyperlink(self, tokenizer_name):
+         def model_hyperlink(link, model_name):
+             return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
+
+         tokenizer_config = self.get_tokenizer_config(tokenizer_name)
+         return model_hyperlink(tokenizer_config.link, tokenizer_config.name_display.split("/")[-1])
+
+
+ tokenizer_factory = TokenizerFactory()
+
+ # class TokenizerType(Enum):
+ #
+ #     # BERTTokenizer
+ #     # depends on a single txt file
+ #
+ #
+ #     # https://github.com/EleutherAI/gpt-neox/blob/v2.0/megatron/tokenizer/tokenizer.py#L231
+ #     # depends on a single json file, Tokenizer.from_file(vocab_file)
+ #     # example: gpt-neox-20B
+ #     HFTokenizer = auto()
+ #
+ #     # depends on: model_file, sentencepiece.SentencePieceProcessor(model_file)
+ #     # examples:
+ #     SentencePieceTokenizer = auto()
+ #
+ #
+ #     # depends on 3 json files: vocab.json, merges.txt, special_tokens.txt
+ #     # source:
+ #     #   - https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/tokenizer/gpt2_tokenization.py#L92
+ #     # Byte-level BPE
+ #     GPT2BPETokenizer = auto()
+
+
+ if __name__ == "__main__":
+
+     for tokenizer_config in tokenizer_factory.all_tokenizer_configs:
+         if True:
+             # if "t5" in tokenizer_config.name_or_path:
+             tokenizer1 = tokenizer_factory.get_tokenizer(tokenizer_config.name_or_path)
+             tokenizer2 = tokenizer_factory.get_tokenizer(tokenizer_config.name_display)
+             tokenizer3 = tokenizer_factory.get_tokenizer(tokenizer_config.name_display.split("/")[-1])
+             assert tokenizer1 == tokenizer2 == tokenizer3
+             print(tokenizer_config.name_or_path, len(tokenizer1))
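Taken together, the new `vocab.py` replaces the old per-model modules with a single registry plus factory. A minimal usage sketch, assuming `vocab.py` is importable from the Space root (the sample model name is just one entry from the registry above):

```python
# Minimal usage sketch for the new registry (vocab.py must be on the path).
from vocab import tokenizer_factory

# Any of the three registered aliases (full hub path, display name, short name)
# resolves to the same cached tokenizer instance.
tokenizer = tokenizer_factory.get_tokenizer("openai-community/gpt2")
ids = tokenizer.encode("a tokenizer playground")
print(ids)
print(tokenizer.decode(ids))
```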
vocab/Intern_gpt/README.md DELETED
File without changes
vocab/__init__.py DELETED
@@ -1,260 +0,0 @@
- import importlib
- from enum import Enum, auto
-
- """Interface:
- tokenizer.encode
- tokenizer.decode
- tokenizer.convert_tokens_to_string  # gpt4 does not have this method
- tokenizer.convert_ids_to_tokens
-
-
- tokenizer.parent = ""
- tokenizer.vocab_size
- tokenizer.get_vocab()  # gpt-neox-20b, llama
- tokenizer.type = TokenizerType.ByteBPE.name
- tokenizer.implementation = TokenizerImpl.SentencePiece.name  # https://github.com/facebookresearch/llama/blob/main/llama/tokenizer.py
- "HFGPT2Tokenizer", "HFTokenizer", "GPT2BPETokenizer", "CharLevelTokenizer", "TiktokenTokenizer", "SPMTokenizer", https://github.com/EleutherAI/gpt-neox/blob/main/tools/preprocess_data.py
-
- - google/bert
-     - Features
-         - Vocabulary: tokens starting with ## denote subwords
-     - Examples:
- - bpe-google/sentencepiece:
-     - Features:
-         - Training:
-         - Files: *.sp_model or *.model (optional .vocab file), "spm" for short
-     - Implementation:
-         - Dependency: protobuf
-         - Training: `import sentencepiece as spm; spm.SentencePieceTrainer.train` or `spm_train`
-         - Loading: `import sentencepiece as spm; spm.SentencePieceProcessor().Load(vocab_file)`
-         - Methods: the object is a SentencePieceProcessor; sp_model.id_to_piece; ships tokenizer.json and tokenizer.model
-         - Tokenization:
-             - pre_tokenizers.ByteLevel(add_prefix_space=True, use_regex=False)
-     - Vocabulary: entries contain ▁ (U+2581), denoting a space or the start of a sentence.
-     - Examples: google-t5, llama, baichuan, orion
- - icetk: a fork of sentencepiece with image_tokenizer support
-     - glm, chatglm1, chatglm2
- - openai/tiktoken
- - bpe-hf_tokenizer
-     - Features:
-         - Files: tokenizer.json (contains the content of the next two files), merges.txt, vocab.json
-         - added_tokens are not necessarily present in the vocab.
-     - Implementation:
-         - Training: `from tokenizers.trainers import BpeTrainer, UnigramTrainer, WordLevelTrainer, WordPieceTrainer`
-         - Loading:
-         - Methods: .model.from_file .model.save .model.token_to_id .model.tokenize
-             - .model is of type tokenizers.models.BPE
-         - Vocabulary entries start with Ġ ("\u0120")
-     - Examples: gpt2, gpt_neox_20b, moss, bloom, qwen2
-     - Advantages: relative to sentencepiece, hf_tokenizer supports regex pre-tokenization and handles tabs and newlines better
- - tiktoken
-     - Features: a space is just a space
-     - Examples: gpt3.5, gpt4, qwen
- tokenizer.comments = "split all numbers into individual digits, " \
-                      "and fallback to bytes to decompose unknown UTF-8 characters"
-
- tokenizer.all_special_tokens  # baichuan
- tokenizer.special_tokens_set  # gpt3.5_turbo
- tokenizer.special_tokens_map
-
- tokenizer.dependency [sentencepiece, tiktoken, icetk]
- """
-
- from utils.log_util import logger
-
- # Animal = Enum('Animal', 'ANT BEE CAT DOG')
-
- uniq_tokenizers = [
-     ""
- ]
-
- # format: alias/abbr, description, hf_path, tokenizer_class/type, comments, Organization
- # TODO: append link and description to the end of dropdown button.
- all_tokenizers = [
-     ##### bert family
-     ("bert_base_cased", "", "bert"),
-     ("bert_base_uncased", "", "bert"),
-     ("bert_base_chinese", "", "bert"),
-     ("roberta_chinese_clue", "", "bert"),
-     ("kplug",),
-     ("gpt2_chinese",),
-
-     ##### GPT2Tokenizer
-     ("gpt2", "", "GPT2Tokenizer",),  #
-     ("moss", "", "GPT2Tokenizer",),
-     ("bloom", "", "GPT2Tokenizer",),
-     # ("bloomz_6b4_zh",
-     # ("belle_7b_2m",  # model and vocab are both based on bloom
-     #
-     ("gpt_nexo_20b", "", "GPT2Tokenizer",),  # 50k
-     ("qwen1_5_14b_chat", "", "GPT2Tokenizer",),  # 150k, a bit slow
-     ("starchat_alpha", "", "GPT2Tokenizer",),
-
-     ####### google/sentencepiece tokenizer:
-     # T5 llama internlm
-     ("t5_small", "", "sentencepiece"),
-     ("t5_base", "", "sentencepiece"),
-     ("t5_large", "", "sentencepiece"),
-     ("chatyuan_large_v2", "", "sentencepiece"),
-     ("prompt_clue", "", "sentencepiece"),
-
-     ("llama", "", "sentencepiece", "llama use single digits and thus uses 4 tokens to encode the number 1000"),  # 'single Chinese chars': 700, 'multi-char Chinese words': 0
-     ("llama2", "", "sentencepiece"),
-     ("llama3", "", "sentencepiece"),
-     ("chinese_llama", "", "sentencepiece"),  #
-     ("chinese_llama2", "", "sentencepiece"),  #
-     ("llama_3_chinese_8b", "sentencepiece"),
-     # ("chinese_alpaca_lora_7b",  # the Chinese Alpaca model is further instruction-tuned on top of the Chinese LLaMA model above.
-     # ("belle_llama_ext_7b",
-     # ("alpaca_7b",
-     ("baichuan", "", "sentencepiece"),
-     ("baichuan2", "", "sentencepiece"),
-     ("internlm_chat_7b", "", "sentencepiece"),
-     ("internlm2_chat_7b", "", "sentencepiece"),
-     ("internlm2_math_7b", "", "sentencepiece"),
-     ("internlm_xcomposer_7b", "", "sentencepiece"),
-     ("falcon_7b", "", "sentencepiece"),
-     ("falcon_180b", "", "sentencepiece"),
-     ("skywork_13b_base",),
-     ("skywork_13b_math",),
-     ("xlm_roberta", ),
-     # "goat",
-
-     # ##### glm family
-     # "glm_chinese",),
-     ("chatglm_6b", "", "sentencepiece"),
-     ("chatglm2_6b", "", "sentencepiece"),
-     ("chatglm3_6b", "", "sentencepiece"),
-     ("character_glm_6b", "", "sentencepiece"),
-
-     # tiktoken family
-     ("qwen_1_8b_chat", "", "tiktoken"),
-     ("qwen_7b_chat", "", "tiktoken"),
-     ("qwen_72b_chat", "", "tiktoken"),
-     ("text_davinci_003", "", "tiktoken"),
-     ("code_davinci_002", "", "tiktoken"),
-     ("gpt_35_turbo", "", "tiktoken"),
-     ("gpt_4", "", "tiktoken"),
-
-     # Uncategorized
-     # ("amber", ""),
-     ("crystal_coder", ""),
-     ("mistral_7b",),
-     ("mixtral_8_7b",),
-
-
-     ("flan_t5_base",),
-     ("fastchat_t5_3b",),
-     ("pko_t5_large",),
-     ("wizardcoder_15b_v1",),
-     ("yi_6b",),
-     ("yi_34b",),
-     ("yi_vl34b",),
-     ("orion_14b_chat",),
-     ("phi_1",),
-     ("phi_2",),
-     ("phi_3_mini", "i.e. the llama vocab"),
-     ("solar_10_7b",),
-     ("mobilebert_uncased",),
-     # ("mobilenet_v2",),  # error
-     ("switch_c_2048",),
-     ("byt5_small",),
-     ("mt5_large",),
-     ("wizardcoder_python_7b_v1",),
-     ("wizardlm_7b_v1",),
-     ("wizardmath_70b_v1",),
-     ("tigerbot_70b_chat_v4_4k",),
-     ("tigerbot_13b_chat_v2",),
-     ("deepseek_coder_33b_instruct",),
-     ("deepseek_llm_7b_base",),
-     ("gemma_7b",),
-     ("olmo_7b",),
-     ("aya_101",),
-     ("zephyr_7b_beta",),
-     ("jamba_v0_1", ),
-     ("dbrx_instruct", ),
-     ("grok_1",),
-     # ("claude",),
-     ("gpt_nexo_20b", ),
-     ("gpt_neox_japanese_2_7b", ),
-
- ]
-
- all_tokenizers = [tokenizer[0] for tokenizer in all_tokenizers]
- all_tokenizers = sorted(all_tokenizers)
-
-
- class TokenizerType(Enum):
-     """
-     - https://huggingface.co/docs/transformers/tokenizer_summary
-     - https://github.com/EleutherAI/gpt-neox/blob/main/megatron/tokenizer/tokenizer.py
-     - https://github.com/google/sentencepiece/blob/3863f7648e5d8edb571ac592f3ac4f5f0695275a/src/sentencepiece_model.proto#L48
-         - UNIGRAM = 1;  // Unigram language model with dynamic algorithm
-         - BPE = 2;  // Byte Pair Encoding
-         - WORD = 3;  // Delimited by whitespace.
-         - CHAR = 4;  // tokenizes into character sequence
-     """
-     BPE = auto()
-     ByteBPE = auto()  # BBPE Byte-Level BPE
-     GPT2BPETokenizer = auto()  #
-     BERTTokenizer = auto()
-
-
- # class TokenizerType(Enum):
- #
- #     # BERTTokenizer
- #     # depends on a single txt file
- #
- #
- #     # https://github.com/EleutherAI/gpt-neox/blob/v2.0/megatron/tokenizer/tokenizer.py#L231
- #     # depends on a single json file, Tokenizer.from_file(vocab_file)
- #     # example: gpt-neox-20B
- #     HFTokenizer = auto()
- #
- #     # depends on: model_file, sentencepiece.SentencePieceProcessor(model_file)
- #     # examples:
- #     SentencePieceTokenizer = auto()
- #
- #
- #     # depends on 3 json files: vocab.json, merges.txt, special_tokens.txt
- #     # source:
- #     #   - https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/tokenizer/gpt2_tokenization.py#L92
- #     # Byte-level BPE
- #     GPT2BPETokenizer = auto()
-
-
- class TokenizerImpl(Enum):
-
-     """
-     https://github.com/google/sentencepiece, supports sentencepiece (BPE, unigram, char, word) and wordpiece,
-     spm_train --model_type unigram/bpe/char/word
-     """
-     SentencePiece = auto()
-
-     # https://github.com/huggingface/transformers/blob/v4.30.2/src/transformers/models/gpt2/tokenization_gpt2.py#L104
-     # vocabulary construction:
-     # GPT2Tokenizer = auto()
-     # BertTokenizer = auto()  #
-
-     """
-     """
-     HFTokenizer = auto()  # https://github.com/huggingface/tokenizers
-
-
- cache = {}
-
- def load_tokener(model_name):
-     if model_name in cache:
-         return cache[model_name]
-     logger.info(f"loading tokenizer {model_name}")
-     tokenizer = importlib.import_module("." + model_name, 'vocab').tokenizer
-     tokenizer.alias = model_name
-     return tokenizer
-
-
- if __name__ == "__main__":
-     pass
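For comparison, the removed registry resolved each alias to a submodule under `vocab/` via `importlib`. A sketch of how the old entry point was consumed (note that `load_tokener` as written checks `cache` but never writes to it, so every call re-imports the module):

```python
# Sketch of the old, now-deleted API: each alias mapped to vocab/<alias>/__init__.py
# exposing a module-level `tokenizer`.
from vocab import load_tokener

tokenizer = load_tokener("gpt2")  # imports vocab.gpt2 and returns its tokenizer
print(tokenizer.alias, tokenizer.encode("hello"))
```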
vocab/_alpaca_7b/README.md DELETED
File without changes
vocab/_goat/README.md DELETED
File without changes
vocab/_goat/__init__.py DELETED
File without changes
vocab/albert/__init__.py DELETED
@@ -1,6 +0,0 @@
- """
-
- SentencePiece(unigram)
-
- https://huggingface.co/docs/transformers/tokenizer_summary#sentencepiece
- """
vocab/aya_101/__init__.py DELETED
@@ -1,5 +0,0 @@
-
-
- from transformers import AutoTokenizer
-
- tokenizer = AutoTokenizer.from_pretrained("CohereForAI/aya-101")
vocab/baichuan/Baichuan-7B/config.json DELETED
@@ -1,26 +0,0 @@
- {
-   "architectures": [
-     "BaiChuanForCausalLM"
-   ],
-   "auto_map": {
-     "AutoConfig": "configuration_baichuan.BaiChuanConfig",
-     "AutoModelForCausalLM": "modeling_baichuan.BaiChuanForCausalLM"
-   },
-   "bos_token_id": 1,
-   "eos_token_id": 2,
-   "hidden_act": "silu",
-   "hidden_size": 4096,
-   "initializer_range": 0.02,
-   "intermediate_size": 11008,
-   "max_position_embeddings": 4096,
-   "model_type": "baichuan",
-   "num_attention_heads": 32,
-   "num_hidden_layers": 32,
-   "pad_token_id": 0,
-   "rms_norm_eps": 1e-06,
-   "tie_word_embeddings": false,
-   "torch_dtype": "float32",
-   "transformers_version": "4.29.1",
-   "use_cache": true,
-   "vocab_size": 64000
- }
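The `auto_map` entry above is what routes `AutoConfig` to the custom class when loading with `trust_remote_code`. A small sketch, assuming a local directory containing these files (the directory name is illustrative):

```python
# Sketch: AutoConfig resolves the custom BaiChuanConfig via the auto_map above.
from transformers import AutoConfig

cfg = AutoConfig.from_pretrained("Baichuan-7B", trust_remote_code=True)
print(cfg.model_type)  # "baichuan"
```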
vocab/baichuan/Baichuan-7B/configuration_baichuan.py DELETED
@@ -1,66 +0,0 @@
- # coding=utf-8
- # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
- #
- # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
- # and OPT implementations in this library. It has been modified from its
- # original forms to accommodate minor architectural differences compared
- # to GPT-NeoX and OPT used by the Meta AI team that trained the model.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- #     http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
-
- from transformers.configuration_utils import PretrainedConfig
- from transformers.utils import logging
-
-
- logger = logging.get_logger(__name__)
-
-
- class BaiChuanConfig(PretrainedConfig):
-     model_type = "baichuan"
-     keys_to_ignore_at_inference = ["past_key_values"]
-
-     def __init__(
-         self,
-         vocab_size=64000,
-         hidden_size=4096,
-         intermediate_size=11008,
-         num_hidden_layers=32,
-         num_attention_heads=32,
-         hidden_act="silu",
-         max_position_embeddings=4096,
-         initializer_range=0.02,
-         rms_norm_eps=1e-6,
-         use_cache=True,
-         pad_token_id=0,
-         bos_token_id=1,
-         eos_token_id=2,
-         tie_word_embeddings=False,
-         **kwargs,
-     ):
-         self.vocab_size = vocab_size
-         self.max_position_embeddings = max_position_embeddings
-         self.hidden_size = hidden_size
-         self.intermediate_size = intermediate_size
-         self.num_hidden_layers = num_hidden_layers
-         self.num_attention_heads = num_attention_heads
-         self.hidden_act = hidden_act
-         self.initializer_range = initializer_range
-         self.rms_norm_eps = rms_norm_eps
-         self.use_cache = use_cache
-         super().__init__(
-             pad_token_id=pad_token_id,
-             bos_token_id=bos_token_id,
-             eos_token_id=eos_token_id,
-             tie_word_embeddings=tie_word_embeddings,
-             **kwargs,
-         )
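Since `BaiChuanConfig` is a plain `PretrainedConfig` subclass, it can also be instantiated directly. A small sketch, with field values taken from the defaults above (the import path assumes the file sits next to the script):

```python
# Sketch: direct instantiation with one override; all other fields keep their defaults.
from configuration_baichuan import BaiChuanConfig

cfg = BaiChuanConfig(max_position_embeddings=2048)
print(cfg.model_type)   # "baichuan"
print(cfg.vocab_size)   # 64000
```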
vocab/baichuan/Baichuan-7B/special_tokens_map.json DELETED
@@ -1,23 +0,0 @@
- {
-   "bos_token": {
-     "content": "<s>",
-     "lstrip": false,
-     "normalized": true,
-     "rstrip": false,
-     "single_word": false
-   },
-   "eos_token": {
-     "content": "</s>",
-     "lstrip": false,
-     "normalized": true,
-     "rstrip": false,
-     "single_word": false
-   },
-   "unk_token": {
-     "content": "<unk>",
-     "lstrip": false,
-     "normalized": true,
-     "rstrip": false,
-     "single_word": false
-   }
- }
vocab/baichuan/Baichuan-7B/tokenization_baichuan.py DELETED
@@ -1,250 +0,0 @@
- # coding=utf-8
- # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
- #
- # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
- # and OPT implementations in this library. It has been modified from its
- # original forms to accommodate minor architectural differences compared
- # to GPT-NeoX and OPT used by the Meta AI team that trained the model.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- #     http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
-
- import os
- from shutil import copyfile
- from typing import Any, Dict, List, Optional, Tuple
-
- import sentencepiece as spm
-
- from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer
- from transformers.utils import logging
-
-
- logger = logging.get_logger(__name__)
-
- VOCAB_FILES_NAMES = {"vocab_file": "tokenizer.model"}
-
- PRETRAINED_VOCAB_FILES_MAP = {
-     "vocab_file": {},
-     "tokenizer_file": {},
- }
- PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {}
-
-
- class BaiChuanTokenizer(PreTrainedTokenizer):
-     """
-     Construct a BaiChuan tokenizer. Based on byte-level Byte-Pair-Encoding.
-
-     Args:
-         vocab_file (`str`):
-             Path to the vocabulary file.
-     """
-
-     vocab_files_names = VOCAB_FILES_NAMES
-     pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
-     max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
-     model_input_names = ["input_ids", "attention_mask"]
-
-     def __init__(
-         self,
-         vocab_file,
-         unk_token="<unk>",
-         bos_token="<s>",
-         eos_token="</s>",
-         pad_token=None,
-         sp_model_kwargs: Optional[Dict[str, Any]] = None,
-         add_bos_token=True,
-         add_eos_token=False,
-         clean_up_tokenization_spaces=False,
-         **kwargs,
-     ):
-         self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
-         bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
-         eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
-         unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
-         pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
-         self.vocab_file = vocab_file
-         self.add_bos_token = add_bos_token
-         self.add_eos_token = add_eos_token
-         self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
-         self.sp_model.Load(vocab_file)
-         super().__init__(
-             bos_token=bos_token,
-             eos_token=eos_token,
-             unk_token=unk_token,
-             pad_token=pad_token,
-             add_bos_token=add_bos_token,
-             add_eos_token=add_eos_token,
-             sp_model_kwargs=self.sp_model_kwargs,
-             clean_up_tokenization_spaces=clean_up_tokenization_spaces,
-             **kwargs,
-         )
-
-     def __getstate__(self):
-         state = self.__dict__.copy()
-         state["sp_model"] = None
-         return state
-
-     def __setstate__(self, d):
-         self.__dict__ = d
-         self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
-         self.sp_model.Load(self.vocab_file)
-
-     @property
-     def vocab_size(self):
-         """Returns vocab size"""
-         return self.sp_model.get_piece_size()
-
-     def get_vocab(self):
-         """Returns vocab as a dict"""
-         vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
-         vocab.update(self.added_tokens_encoder)
-         return vocab
-
-     def _tokenize(self, text):
-         """Returns a tokenized string."""
-         return self.sp_model.encode(text, out_type=str)
-
-     def _convert_token_to_id(self, token):
-         """Converts a token (str) in an id using the vocab."""
-         return self.sp_model.piece_to_id(token)
-
-     def _convert_id_to_token(self, index):
-         """Converts an index (integer) in a token (str) using the vocab."""
-         token = self.sp_model.IdToPiece(index)
-         return token
-
-     def convert_tokens_to_string(self, tokens):
-         """Converts a sequence of tokens (string) in a single string."""
-         current_sub_tokens = []
-         out_string = ""
-         prev_is_special = False
-         for i, token in enumerate(tokens):
-             # make sure that special tokens are not decoded using sentencepiece model
-             if token in self.all_special_tokens:
-                 if not prev_is_special and i != 0:
-                     out_string += " "
-                 out_string += self.sp_model.decode(current_sub_tokens) + token
-                 prev_is_special = True
-                 current_sub_tokens = []
-             else:
-                 current_sub_tokens.append(token)
-                 prev_is_special = False
-         out_string += self.sp_model.decode(current_sub_tokens)
-         return out_string
-
-     def save_vocabulary(self, save_directory, filename_prefix: Optional[str] = None) -> Tuple[str]:
-         """
-         Save the vocabulary and special tokens file to a directory.
-
-         Args:
-             save_directory (`str`):
-                 The directory in which to save the vocabulary.
-
-         Returns:
-             `Tuple(str)`: Paths to the files saved.
-         """
-         if not os.path.isdir(save_directory):
-             logger.error(f"Vocabulary path ({save_directory}) should be a directory")
-             return
-         out_vocab_file = os.path.join(
-             save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
-         )
-
-         if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
-             copyfile(self.vocab_file, out_vocab_file)
-         elif not os.path.isfile(self.vocab_file):
-             with open(out_vocab_file, "wb") as fi:
-                 content_spiece_model = self.sp_model.serialized_model_proto()
-                 fi.write(content_spiece_model)
-
-         return (out_vocab_file,)
-
-     def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
-         bos_token_id = [self.bos_token_id] if self.add_bos_token else []
-         eos_token_id = [self.eos_token_id] if self.add_eos_token else []
-
-         output = bos_token_id + token_ids_0 + eos_token_id
-
-         if token_ids_1 is not None:
-             output = output + bos_token_id + token_ids_1 + eos_token_id
-
-         return output
-
-     def get_special_tokens_mask(
-         self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
-     ) -> List[int]:
-         """
-         Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
-         special tokens using the tokenizer `prepare_for_model` method.
-
-         Args:
-             token_ids_0 (`List[int]`):
-                 List of IDs.
-             token_ids_1 (`List[int]`, *optional*):
-                 Optional second list of IDs for sequence pairs.
-             already_has_special_tokens (`bool`, *optional*, defaults to `False`):
-                 Whether or not the token list is already formatted with special tokens for the model.
-
-         Returns:
-             `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
-         """
-         if already_has_special_tokens:
-             return super().get_special_tokens_mask(
-                 token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
-             )
-
-         bos_token_id = [1] if self.add_bos_token else []
-         eos_token_id = [1] if self.add_eos_token else []
-
-         if token_ids_1 is None:
-             return bos_token_id + ([0] * len(token_ids_0)) + eos_token_id
-         return (
-             bos_token_id
-             + ([0] * len(token_ids_0))
-             + eos_token_id
-             + bos_token_id
-             + ([0] * len(token_ids_1))
-             + eos_token_id
-         )
-
-     def create_token_type_ids_from_sequences(
-         self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
-     ) -> List[int]:
-         """
-         Creates a mask from the two sequences passed to be used in a sequence-pair classification task. An ALBERT
-         sequence pair mask has the following format:
-
-         ```
-         0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
-         | first sequence    | second sequence |
-         ```
-
-         if token_ids_1 is None, only returns the first portion of the mask (0s).
-
-         Args:
-             token_ids_0 (`List[int]`):
-                 List of ids.
-             token_ids_1 (`List[int]`, *optional*):
-                 Optional second list of IDs for sequence pairs.
-
-         Returns:
-             `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
-         """
-         bos_token_id = [self.bos_token_id] if self.add_bos_token else []
-         eos_token_id = [self.eos_token_id] if self.add_eos_token else []
-
-         output = [0] * len(bos_token_id + token_ids_0 + eos_token_id)
-
-         if token_ids_1 is not None:
-             output += [1] * len(bos_token_id + token_ids_1 + eos_token_id)
-
-         return output
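The class above is a thin wrapper over a SentencePiece model, so a round trip only needs a local `tokenizer.model`. A sketch, assuming the file layout shown in this commit (the directory path is illustrative):

```python
# Round-trip sketch for the SentencePiece-backed tokenizer above.
from tokenization_baichuan import BaiChuanTokenizer

tokenizer = BaiChuanTokenizer(vocab_file="Baichuan-7B/tokenizer.model")
ids = tokenizer.encode("hello world")  # add_bos_token=True, so ids start with <s>
text = tokenizer.decode(ids, skip_special_tokens=True)
print(tokenizer.vocab_size, ids, text.strip())
```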
vocab/baichuan/Baichuan-7B/tokenizer.model DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:4be54af290d93c113bcbf421115ae9eed9d6340408f564898f1e966dc738ef01
- size 1136699
vocab/baichuan/Baichuan-7B/tokenizer_config.json DELETED
@@ -1,35 +0,0 @@
- {
-   "auto_map": {
-     "AutoTokenizer": ["tokenization_baichuan.BaiChuanTokenizer", null]
-   },
-   "add_bos_token": false,
-   "add_eos_token": false,
-   "bos_token": {
-     "__type": "AddedToken",
-     "content": "<s>",
-     "lstrip": false,
-     "normalized": true,
-     "rstrip": false,
-     "single_word": false
-   },
-   "clean_up_tokenization_spaces": false,
-   "eos_token": {
-     "__type": "AddedToken",
-     "content": "</s>",
-     "lstrip": false,
-     "normalized": true,
-     "rstrip": false,
-     "single_word": false
-   },
-   "model_max_length": 1000000000000000019884624838656,
-   "sp_model_kwargs": {},
-   "tokenizer_class": "BaiChuanTokenizer",
-   "unk_token": {
-     "__type": "AddedToken",
-     "content": "<unk>",
-     "lstrip": false,
-     "normalized": true,
-     "rstrip": false,
-     "single_word": false
-   }
- }
vocab/baichuan/__init__.py DELETED
@@ -1,19 +0,0 @@
- import os
- import config
- from transformers import AutoTokenizer
- from vocab import TokenizerType
-
-
- if config.USE_REMOTE:
-     tokenizer = AutoTokenizer.from_pretrained("baichuan-inc/Baichuan-7B", trust_remote_code=True)
- else:
-     CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
-     TOKENIZER_DIR = os.path.join(CURRENT_DIR, "Baichuan-7B")
-     tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_DIR, trust_remote_code=True)
-
-
- # byte-bpe sentencepiece
- tokenizer.type = TokenizerType.ByteBPE
-
- tokenizer.comments = "uses Byte-Pair Encoding (BPE) from SentencePiece as the tokenization algorithm"
-
vocab/baichuan/demo.py DELETED
@@ -1,6 +0,0 @@
-
- from vocab.baichuan import tokenizer
-
- id1 = tokenizer.encode("<pad>")
- token1 = tokenizer.decode(125696)
-
vocab/baichuan/error.md DELETED
@@ -1,8 +0,0 @@
-
-
- ## AttributeError: 'BaichuanTokenizer' object has no attribute 'sp_model'
-
- https://huggingface.co/baichuan-inc/Baichuan2-13B-Chat/discussions/18
-
-
- transformers 4.34 doesn't work for me either. Downgrading to 4.33.1 works in my case
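Given the note above, a defensive check before loading can fail fast instead of hitting the `AttributeError`. A sketch; 4.33.1 is simply the version the comment reports as working, not a verified upper bound:

```python
# Guard against the BaichuanTokenizer 'sp_model' regression reported above.
import transformers
from packaging import version

if version.parse(transformers.__version__) >= version.parse("4.34"):
    raise RuntimeError(
        "BaichuanTokenizer is reported to fail with an 'sp_model' AttributeError "
        "on transformers>=4.34; pin transformers==4.33.1"
    )
```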