Spaces:

eson
/

tokenizer-arena

Running

App Files Community

eson committed on May 1

Commit

1b7fc74

•

1 Parent(s): 367a536

add compression leaderboard

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

app.py +10 -249
app_compression.py +127 -0
app_playground.py +248 -0
css/style.css +24 -1
examples.py +1 -1
patcher/gr_interface.py +59 -0
tokenizer/sptokenizer_patch.py → patcher/sptokenizer_patch_deprecated.py +12 -4
patcher/sptokenizer_wrapper.py +61 -0
{tokenizer → patcher}/tiktoken_patch.py +5 -0
stats/compress_rate.json +1868 -0
stats/compress_rate/amber.en.json +0 -1
stats/compress_rate/amber.zh-Hans.json +0 -1
stats/compress_rate/aya_101.en.json +0 -1
stats/compress_rate/aya_101.zh-Hans.json +0 -1
stats/compress_rate/baichuan.en.json +0 -1
stats/compress_rate/baichuan.zh-Hans.json +0 -1
stats/compress_rate/baichuan2.en.json +0 -1
stats/compress_rate/baichuan2.zh-Hans.json +0 -1
stats/compress_rate/bert_base_cased.en.json +0 -1
stats/compress_rate/bert_base_cased.zh-Hans.json +0 -1
stats/compress_rate/bert_base_chinese.en.json +0 -1
stats/compress_rate/bert_base_chinese.zh-Hans.json +0 -1
stats/compress_rate/bert_base_uncased.en.json +0 -1
stats/compress_rate/bert_base_uncased.zh-Hans.json +0 -1
stats/compress_rate/bloom.en.json +0 -1
stats/compress_rate/bloom.zh-Hans.json +0 -1
stats/compress_rate/byt5_small.en.json +0 -1
stats/compress_rate/byt5_small.zh-Hans.json +0 -1
stats/compress_rate/character_glm_6b.en.json +0 -1
stats/compress_rate/character_glm_6b.zh-Hans.json +0 -1
stats/compress_rate/chatglm2_6b.en.json +0 -1
stats/compress_rate/chatglm2_6b.zh-Hans.json +0 -1
stats/compress_rate/chatglm3_6b.en.json +0 -1
stats/compress_rate/chatglm3_6b.zh-Hans.json +0 -1
stats/compress_rate/chatglm_6b.en.json +0 -1
stats/compress_rate/chatglm_6b.zh-Hans.json +0 -1
stats/compress_rate/chatyuan_large_v2.en.json +0 -1
stats/compress_rate/chatyuan_large_v2.zh-Hans.json +0 -1
stats/compress_rate/chinese_llama.en.json +0 -1
stats/compress_rate/chinese_llama.zh-Hans.json +0 -1
stats/compress_rate/chinese_llama2.en.json +0 -1
stats/compress_rate/chinese_llama2.zh-Hans.json +0 -1
stats/compress_rate/code_davinci_002.en.json +0 -1
stats/compress_rate/code_davinci_002.zh-Hans.json +0 -1
stats/compress_rate/crystal_coder.en.json +0 -1
stats/compress_rate/crystal_coder.zh-Hans.json +0 -1
stats/compress_rate/dbrx_instruct.en.json +0 -1
stats/compress_rate/dbrx_instruct.zh-Hans.json +0 -1
stats/compress_rate/deepseek_coder_33b_instruct.en.json +0 -1
stats/compress_rate/deepseek_coder_33b_instruct.zh-Hans.json +0 -1

app.py CHANGED Viewed

@@ -1,255 +1,16 @@
-# coding=utf-8
-# author: xusong
-# time: 2022/8/23 16:06
-"""
-## TODO:
-- i18 国际化  https://blog.csdn.net/qq_26212731/article/details/78457198   request.header中也有language
-- iter_vocab 的 warmup
-- 开关
-  - add_special_token 开关
-  - theme 开关 light/dark
-  - token_id/tokens/bytes 开关
-  - 中文字词统计，是否要包括 _ G 等字符
-- 评测
-  - OOV评测
-- 通过 javascript 添加 hover_text
-- 英文 utf-8编码
-- 词典支持下载，借用image下载的标签，
-- baichuan的单字数量怎么两万多个？
-- qwen:  ValueError: Unclosed image token
-- 路径修改为全path  meta-llama/Llama-2-13b-hf
-plots
-table
-## related demo
-- [](http://text-processing.com/demo/tokenize/)
-- [gpt-tokenizer](https://gpt-tokenizer.dev/)
-- [llama-tokenizer-js](https://belladoreai.github.io/llama-tokenizer-js/example-demo/build/)
-- [](https://huggingface.co/spaces/Xenova/the-tokenizer-playground)
-## 可视化
-[ The, 2, QUICK, Brown, Foxes, jumped, over, the, lazy, dog's, bone ]
-"""
 import gradio as gr
-from vocab import all_tokenizers
-from util import *
-from examples import example_fn, example_types
-from utils.compress_rate_util import common_units, common_corpuses
-get_window_url_params = """
-    function(url_params) {
-        const params = new URLSearchParams(window.location.search);
-        url_params = JSON.stringify(Object.fromEntries(params));
-        return url_params;
-        }
-    """
-with gr.Blocks(css="css/style.css", title="Tokenizer Arena") as demo:
-    gr.HTML("""<h1 align="center">Tokenizer Arena ⚔️</h1>""")
-    # links: https://www.coderstool.com/utf8-encoding-decoding
-    # 功能：输入文本，进行分词
-    # 分词器：常见的分词器有集中，
-    # 背景：方便分词、看词粒度、对比
-    with gr.Row():
-        gr.Markdown("## Input Text")
-        dropdown_examples = gr.Dropdown(
-            example_types,
-            type="index",
-            show_label=False,
-            container=False,
-            scale=0,
-            elem_classes="example-style"
-        )
-    user_input = gr.Textbox(
-        # value=default_user_input,
-        label="Input Text",
-        lines=5,
-        show_label=False,
-    )
-    gr.Markdown("## Tokenization")
-    # compress rate setting
-    with gr.Accordion("Compress Rate Setting", open=True):
-        gr.Markdown(
-            "Please select corpus and unit of compress rate, get more details at [github](https://github.com/xu-song/tokenizer-arena/). ")
-        with gr.Row():
-            compress_rate_corpus = gr.CheckboxGroup(
-                common_corpuses,  # , "code"
-                value=["cc100-en", "cc100-zh-Hans"],
-                label="corpus",
-                # info=""
-            )
-            compress_rate_unit = gr.Radio(
-                common_units,
-                value="b_tokens/g_bytes",
-                label="unit",
-            )
-    # TODO: Token Setting
-    # with gr.Accordion("Token Filter Setting", open=False):
-    #     gr.Markdown(
-    #         "Get total number of tokens which contain the following character)")
-    #     gr.Radio(
-    #         ["zh-Hans", "", "number", "space"],
-    #         value="zh",
-    #     )
-    with gr.Row():
-        with gr.Column(scale=6):
-            with gr.Group():
-                tokenizer_type_1 = gr.Dropdown(
-                    all_tokenizers,
-                    label="Tokenizer 1",
-                )
-                with gr.Group():
-                    """
-                    <div class="stat"><div class="stat-value">69</div><div class="stat-label">Characters</div></div>
-                    """
-                    with gr.Row():
-                        stats_vocab_size_1 = gr.TextArea(
-                            label="Vocab Size",
-                            lines=1,
-                            elem_classes="statistics"
-                        )
-                        stats_zh_token_size_1 = gr.TextArea(
-                            label="ZH char/word",
-                            lines=1,
-                            elem_classes="statistics",
-                            visible=False
-                        )
-                        stats_compress_rate_1 = gr.TextArea(
-                            label="Compress Rate",
-                            lines=1,
-                            elem_classes="statistics"
-                        )
-                        stats_overlap_token_size_1 = gr.TextArea(
-                            # value=default_stats_overlap_token_size,
-                            label="Overlap Tokens",
-                            lines=1,
-                            elem_classes="statistics"
-                        )
-                        # stats_3 = gr.TextArea(
-                        #     label="Compress Rate",
-                        #     lines=1,
-                        #     elem_classes="statistics"
-                        # )
-        # https://www.onlinewebfonts.com/icon/418591
-        gr.Image("images/VS.svg", scale=1, show_label=False,
-                 show_download_button=False, container=False,
-                 show_share_button=False)
-        with gr.Column(scale=6):
-            with gr.Group():
-                tokenizer_type_2 = gr.Dropdown(
-                    all_tokenizers,
-                    label="Tokenizer 2",
-                )
-                with gr.Group():
-                    with gr.Row():
-                        stats_vocab_size_2 = gr.TextArea(
-                            label="VocabSize",
-                            lines=1,
-                            elem_classes="statistics"
-                        )
-                        stats_zh_token_size_2 = gr.TextArea(
-                            label="ZH char/word",  # 中文字/词
-                            lines=1,
-                            elem_classes="statistics",
-                            visible=False
-                        )
-                        stats_compress_rate_2 = gr.TextArea(
-                            label="Compress Rate",
-                            lines=1,
-                            elem_classes="statistics"
-                        )
-                        stats_filtered_token_2 = gr.TextArea(
-                            label="filtered tokens",
-                            lines=1,
-                            elem_classes="statistics",
-                            visible=False
-                        )
-                        stats_overlap_token_size_2 = gr.TextArea(
-                            label="Overlap Tokens",
-                            lines=1,
-                            elem_classes="statistics"
-                        )
-    # TODO: 图 表 压缩率
-    with gr.Row():
-        # dynamic change label
-        with gr.Column():
-            output_text_1 = gr.Highlightedtext(
-                show_legend=True,
-                elem_classes="space-show"
-            )
-        with gr.Column():
-            output_text_2 = gr.Highlightedtext(
-                show_legend=True,
-                elem_classes="space-show"
-            )
-    with gr.Row():
-        output_table_1 = gr.Dataframe()
-        output_table_2 = gr.Dataframe()
-    # setting
-    # compress_rate_unit.change(compress_rate_unit_change, [compress_rate_unit],
-    #                             [stats_compress_rate_1, stats_compress_rate_2])
-    tokenizer_type_1.change(tokenize, [user_input, tokenizer_type_1],
-                            [output_text_1, output_table_1])
-    tokenizer_type_1.change(basic_count, [tokenizer_type_1], [stats_vocab_size_1, stats_zh_token_size_1])
-    tokenizer_type_1.change(get_overlap_token_size, [tokenizer_type_1, tokenizer_type_2],
-                            [stats_overlap_token_size_1, stats_overlap_token_size_2])
-    tokenizer_type_1.change(get_compress_rate, [tokenizer_type_1, compress_rate_corpus, compress_rate_unit],
-                            [stats_compress_rate_1])
-    # TODO: every=3
-    user_input.change(tokenize_pair,
-                      [user_input, tokenizer_type_1, tokenizer_type_2],
-                      [output_text_1, output_table_1, output_text_2, output_table_2])  # , pass_request=1
-    tokenizer_type_2.change(tokenize, [user_input, tokenizer_type_2],
-                            [output_text_2, output_table_2])
-    tokenizer_type_2.change(basic_count, [tokenizer_type_2], [stats_vocab_size_2, stats_zh_token_size_2])
-    tokenizer_type_2.change(get_overlap_token_size, [tokenizer_type_1, tokenizer_type_2],
-                            [stats_overlap_token_size_1, stats_overlap_token_size_2])
-    tokenizer_type_2.change(get_compress_rate,
-                            [tokenizer_type_2, compress_rate_corpus, compress_rate_unit],
-                            [stats_compress_rate_2])
-    compress_rate_unit.change(get_compress_rate,
-                              [tokenizer_type_1, compress_rate_corpus, compress_rate_unit],
-                              [stats_compress_rate_1])
-    compress_rate_unit.change(get_compress_rate,
-                              [tokenizer_type_2, compress_rate_corpus, compress_rate_unit],
-                              [stats_compress_rate_2])
-    compress_rate_corpus.change(get_compress_rate,
-                                [tokenizer_type_1, compress_rate_corpus, compress_rate_unit],
-                                [stats_compress_rate_1])
-    compress_rate_corpus.change(get_compress_rate,
-                                [tokenizer_type_2, compress_rate_corpus, compress_rate_unit],
-                                [stats_compress_rate_2])
-    dropdown_examples.change(
-        example_fn,
-        dropdown_examples,
-        [user_input, tokenizer_type_1, tokenizer_type_2]
-    )
-    demo.load(js=open("js/onload.js", "r", encoding="utf-8").read())
-    demo.load(
-        fn=on_load,
-        inputs=[user_input],  # 这里只需要传个空object即可。
-        outputs=[user_input, tokenizer_type_1, tokenizer_type_2],
-        js=get_window_url_params
-    )
 if __name__ == "__main__":
-    # demo.queue(max_size=20).launch()
-    demo.launch()
-    # demo.launch(share=True)

 import gradio as gr
+from app_playground import demo as tab_playground
+from app_compression import demo as tab_compression
+from patcher.gr_interface import TabbedInterface
+demo = TabbedInterface(
+    [tab_playground, tab_compression],
+    [" ⚔️Playground", "🏆 Compression Leaderboard",],  # 编码速度，解码速度，字符分类(zh、num等，支持正则)，支持的语言，机构，。
+    title='<div align="center">Tokenizer Arena ⚔️</div>',
+    css="css/style.css"
+)
 if __name__ == "__main__":
+    demo.launch()

app_compression.py ADDED Viewed

	@@ -0,0 +1,127 @@

+import gradio as gr
+from utils.compression_util import get_compression_leaderboard
+from utils.compression_util import common_corpuses
+with gr.Blocks() as demo:
+    # gr.Markdown("## Convertor")
+    # with gr.Accordion("Convertor", open=False):
+    #     gr.Markdown("Tokenize {} corpus")
+    #     with gr.Row(elem_classes="no-border"):
+    #         gr.Button("File Size", min_width=50)
+    #         file_size = gr.Textbox(
+    #             show_label=False,
+    #             min_width=50,
+    #             # elem_classes="textbox-as-text"
+    #         )
+    #         gr.Dropdown(
+    #             choices=['MB', 'GB', 'TB'],
+    #             show_label=False,
+    #             min_width=15,
+    #             # elem_classes="textbox-as-text"
+    #         )
+    #         # gr.Markdown('<h2 align="center">≈</h2>')
+    #         # gr.HTML('<h2 style="margin: auto;">≈</h2>')
+    #         gr.Button(
+    #             "≈",
+    #             min_width=10,
+    #             elem_classes="button-white h2-font"
+    #
+    #         )
+    #
+    #         gr.Button(
+    #             "Tokens",
+    #             min_width=50
+    #         )
+    #         gr.Textbox(
+    #             show_label=False,
+    #             min_width=50
+    #         )
+    #         gr.Dropdown(
+    #             ['million', 'billion', 'trillion'],
+    #             show_label=False,
+    #             min_width=15,
+    #             elem_classes="button-white"
+    #         )
+    gr.Markdown("## 🛠️ Setting")  # ⚙
+    with gr.Accordion("Please select corpus and measure of compression rate ...", open=True):
+        # file size 💽 🖴, tokens 🧮
+        # gr.Markdown(
+        #     "Please select corpus and measure of compression rate.\n"
+            #"`num_of_trillion_tokens`  `num_of_billion_tokens`\n"
+            # "- `b_tokens/g_bytes` measures how many billion tokens per gigabytes corpus. \n"
+            # "- `t_tokens/t_bytes` measures how many trillion tokens per terabytes corpus. \n"
+            # "- `n_chars/n_tokens` measures how many chars per token in the current corpus. \n\n"
+            # "All the above measures are depend on corpus. You can reproduce this "
+            # "procedure at [github](https://github.com/xu-song/tokenizer-arena/)."
+        # )
+        with gr.Row():
+            compress_rate_corpus = gr.Dropdown(
+                common_corpuses,  # , "code"
+                value=["cc100-en", "cc100-zh-Hans"],
+                label="corpus",
+                multiselect=True
+                # info=""
+            )
+            # unit of file_size: gigabyte terabyte
+            # unit of token_num: million billion trillion
+            # The most common units of measurement include length (meter, inch, foot), weight (gram, kilogram, pound), volume (liter, gallon, milliliter), time (second, minute, hour)
+            compress_rate_unit = gr.Radio(
+                ["b_tokens/g_bytes", "t_tokens/t_bytes"],
+                value="b_tokens/g_bytes",
+                label="measure",
+            )
+        gr.Markdown(
+            # "`num_of_trillion_tokens`  `num_of_billion_tokens`\n"
+            "- `b_tokens/g_bytes` measures how many billion tokens per gigabytes corpus. \n"
+            "- `t_tokens/t_bytes` measures how many trillion tokens per terabytes corpus. \n"
+            "- `n_chars/n_tokens` measures how many chars per token in the tokenized corpus. \n\n"
+            "All the above measures are depend on corpus. You can reproduce this "
+            "procedure at [github](https://github.com/xu-song/tokenizer-arena/)."
+        )
+    gr.Markdown("## 🏆 Compression Rate Leaderboard")
+    search_bar = gr.Textbox(
+        placeholder="🔍 Search tokenizers(e.g., 'llama') and press ENTER...",
+        show_label=False,
+        elem_id="search-bar",
+    )
+    compress_rate_table = gr.Dataframe()
+    # func call
+    compress_rate_corpus.change(
+        get_compression_leaderboard,
+        inputs=[compress_rate_corpus, compress_rate_unit],
+        outputs=compress_rate_table
+    )
+    compress_rate_unit.change(
+        get_compression_leaderboard,
+        inputs=[compress_rate_corpus, compress_rate_unit],
+        outputs=compress_rate_table
+    )
+    # file_size.change(
+    #     get_all_compress_rate,
+    #     outputs=compress_rate_table
+    # )
+    search_bar.submit(
+        get_compression_leaderboard,
+        inputs=[
+            compress_rate_corpus,
+            compress_rate_unit,
+            search_bar,
+        ],
+        outputs=compress_rate_table
+    )
+    demo.load(
+        get_compression_leaderboard,
+        inputs=[compress_rate_corpus, compress_rate_unit],
+        outputs=compress_rate_table
+    )
+if __name__ == "__main__":
+    demo.launch()

app_playground.py ADDED Viewed

	@@ -0,0 +1,248 @@

+# coding=utf-8
+# author: xusong
+# time: 2022/8/23 16:06
+"""
+## TODO:
+- i18 国际化  https://blog.csdn.net/qq_26212731/article/details/78457198   request.header中也有language
+- iter_vocab 的 warmup
+- 开关
+  - add_special_token 开关
+  - theme 开关 light/dark
+  - token_id/tokens/bytes 开关
+  - 中文字词统计，是否要包括 _ G 等字符
+- 评测
+  - OOV评测
+- 通过 javascript 添加 hover_text
+- 英文 utf-8编码
+- 词典支持下载，借用image下载的标签，
+- baichuan的单字数量怎么两万多个？
+- qwen:  ValueError: Unclosed image token
+- 路径修改为全path  meta-llama/Llama-2-13b-hf
+plots
+table
+## related demo
+- [](http://text-processing.com/demo/tokenize/)
+- [gpt-tokenizer](https://gpt-tokenizer.dev/)
+- [llama-tokenizer-js](https://belladoreai.github.io/llama-tokenizer-js/example-demo/build/)
+- [](https://huggingface.co/spaces/Xenova/the-tokenizer-playground)
+## 可视化
+[ The, 2, QUICK, Brown, Foxes, jumped, over, the, lazy, dog's, bone ]
+"""
+import gradio as gr
+from vocab import all_tokenizers
+from util import *
+from examples import example_fn, example_types
+get_window_url_params = """
+    function(url_params) {
+        const params = new URLSearchParams(window.location.search);
+        url_params = JSON.stringify(Object.fromEntries(params));
+        return url_params;
+        }
+    """
+with gr.Blocks() as demo:
+    # links: https://www.coderstool.com/utf8-encoding-decoding
+    # 功能：输入文本，进行分词
+    # 分词器：常见的分词器有集中，
+    # 背景：方便分词、看词粒度、对比
+    with gr.Row():
+        gr.Markdown("## Input Text")
+        dropdown_examples = gr.Dropdown(
+            example_types,
+            type="index",
+            show_label=False,
+            container=False,
+            scale=0,
+            elem_classes="example-style"
+        )
+    user_input = gr.Textbox(
+        # value=default_user_input,
+        label="Input Text",
+        lines=5,
+        show_label=False,
+    )
+    gr.Markdown("## Tokenization")
+    # compress rate setting TODO: 将 这个模块调整到下面
+    # with gr.Accordion("Compress Rate Setting", open=True):
+    #     gr.Markdown(
+    #         "Please select corpus and unit of compress rate, get more details at [github](https://github.com/xu-song/tokenizer-arena/). ")
+    #     with gr.Row():
+    #         compress_rate_corpus = gr.CheckboxGroup(
+    #             common_corpuses,  # , "code"
+    #             value=["cc100-en", "cc100-zh-Hans"],
+    #             label="corpus",
+    #             # info=""
+    #         )
+    #         compress_rate_unit = gr.Radio(
+    #             common_units,
+    #             value="b_tokens/g_bytes",
+    #             label="unit",
+    #         )
+    # TODO: Token Setting
+    # with gr.Accordion("Token Filter Setting", open=False):
+    #     gr.Markdown(
+    #         "Get total number of tokens which contain the following character)")
+    #     gr.Radio(
+    #         ["zh-Hans", "", "number", "space"],
+    #         value="zh",
+    #     )
+    with gr.Row():
+        with gr.Column(scale=6):
+            with gr.Group():
+                tokenizer_name_1 = gr.Dropdown(
+                    all_tokenizers,
+                    label="Tokenizer 1",
+                )
+                with gr.Group():
+                    with gr.Row():
+                        stats_vocab_size_1 = gr.TextArea(
+                            label="Vocab Size",
+                            lines=1,
+                            elem_classes="statistics"
+                        )
+                        stats_zh_token_size_1 = gr.TextArea(
+                            label="ZH char/word",
+                            lines=1,
+                            elem_classes="statistics",
+                        )
+                        # stats_compress_rate_1 = gr.TextArea(
+                        #     label="Compress Rate",
+                        #     lines=1,
+                        #     elem_classes="statistics",
+                        # )
+                        stats_overlap_token_size_1 = gr.TextArea(
+                            # value=default_stats_overlap_token_size,
+                            label="Overlap Tokens",
+                            lines=1,
+                            elem_classes="statistics"
+                        )
+                        # stats_3 = gr.TextArea(
+                        #     label="Compress Rate",
+                        #     lines=1,
+                        #     elem_classes="statistics"
+                        # )
+        # https://www.onlinewebfonts.com/icon/418591
+        gr.Image("images/VS.svg", scale=1, show_label=False,
+                 show_download_button=False, container=False,
+                 show_share_button=False)
+        with gr.Column(scale=6):
+            with gr.Group():
+                tokenizer_name_2 = gr.Dropdown(
+                    all_tokenizers,
+                    label="Tokenizer 2",
+                )
+                with gr.Group():
+                    with gr.Row():
+                        stats_vocab_size_2 = gr.TextArea(
+                            label="VocabSize",
+                            lines=1,
+                            elem_classes="statistics"
+                        )
+                        stats_zh_token_size_2 = gr.TextArea(
+                            label="ZH char/word",  # 中文字/词
+                            lines=1,
+                            elem_classes="statistics",
+                        )
+                        # stats_compress_rate_2 = gr.TextArea(
+                        #     label="Compress Rate",
+                        #     lines=1,
+                        #     elem_classes="statistics"
+                        # )
+                        stats_filtered_token_2 = gr.TextArea(
+                            label="filtered tokens",
+                            lines=1,
+                            elem_classes="statistics",
+                            visible=False
+                        )
+                        stats_overlap_token_size_2 = gr.TextArea(
+                            label="Overlap Tokens",
+                            lines=1,
+                            elem_classes="statistics"
+                        )
+    # TODO: 图 表 压缩率
+    with gr.Row():
+        # dynamic change label
+        with gr.Column():
+            output_text_1 = gr.Highlightedtext(
+                show_legend=True,
+                elem_classes="space-show"
+            )
+        with gr.Column():
+            output_text_2 = gr.Highlightedtext(
+                show_legend=True,
+                elem_classes="space-show"
+            )
+    with gr.Row():
+        output_table_1 = gr.Dataframe()
+        output_table_2 = gr.Dataframe()
+    # setting
+    # compress_rate_unit.change(compress_rate_unit_change, [compress_rate_unit],
+    #                             [stats_compress_rate_1, stats_compress_rate_2])
+    tokenizer_name_1.change(tokenize, [user_input, tokenizer_name_1],
+                            [output_text_1, output_table_1])
+    tokenizer_name_1.change(basic_count, [tokenizer_name_1], [stats_vocab_size_1, stats_zh_token_size_1])
+    tokenizer_name_1.change(get_overlap_token_size, [tokenizer_name_1, tokenizer_name_2],
+                            [stats_overlap_token_size_1, stats_overlap_token_size_2])
+    # tokenizer_type_1.change(get_compress_rate, [tokenizer_type_1, compress_rate_corpus, compress_rate_unit],
+    #                         [stats_compress_rate_1])
+    # TODO: every=3
+    user_input.change(tokenize_pair,
+                      [user_input, tokenizer_name_1, tokenizer_name_2],
+                      [output_text_1, output_table_1, output_text_2, output_table_2])  # , pass_request=1
+    tokenizer_name_2.change(tokenize, [user_input, tokenizer_name_2],
+                            [output_text_2, output_table_2])
+    tokenizer_name_2.change(basic_count, [tokenizer_name_2], [stats_vocab_size_2, stats_zh_token_size_2])
+    tokenizer_name_2.change(get_overlap_token_size, [tokenizer_name_1, tokenizer_name_2],
+                            [stats_overlap_token_size_1, stats_overlap_token_size_2])
+    # tokenizer_type_2.change(get_compress_rate,
+    #                         [tokenizer_type_2, compress_rate_corpus, compress_rate_unit],
+    #                         [stats_compress_rate_2])
+    #
+    # compress_rate_unit.change(get_compress_rate,
+    #                           [tokenizer_type_1, compress_rate_corpus, compress_rate_unit],
+    #                           [stats_compress_rate_1])
+    # compress_rate_unit.change(get_compress_rate,
+    #                           [tokenizer_type_2, compress_rate_corpus, compress_rate_unit],
+    #                           [stats_compress_rate_2])
+    # compress_rate_corpus.change(get_compress_rate,
+    #                             [tokenizer_type_1, compress_rate_corpus, compress_rate_unit],
+    #                             [stats_compress_rate_1])
+    # compress_rate_corpus.change(get_compress_rate,
+    #                             [tokenizer_type_2, compress_rate_corpus, compress_rate_unit],
+    #                             [stats_compress_rate_2])
+    dropdown_examples.change(
+        example_fn,
+        dropdown_examples,
+        [user_input, tokenizer_name_1, tokenizer_name_2]
+    )
+    demo.load(js=open("js/onload.js", "r", encoding="utf-8").read())
+    demo.load(
+        fn=on_load,
+        inputs=[user_input],  # 这里只需要传个空object即可。
+        outputs=[user_input, tokenizer_name_1, tokenizer_name_2],
+        js=get_window_url_params
+    )
+if __name__ == "__main__":
+    # demo.queue(max_size=20).launch()
+    demo.launch()
+    # demo.launch(share=True)

css/style.css CHANGED Viewed

@@ -8,6 +8,28 @@
 	white-space: pre-wrap;
 }
 /* 隐藏legend */
 .category-legend {
 	display: none !important;
@@ -33,4 +55,5 @@
 .example-style {
 	max-width: 150px;
 	align-self: self-end;
-}

 	white-space: pre-wrap;
 }
+/* white button */
+.button-as-text {
+	background: #fff;
+    border-color: #fff;
+}
+.textbox-as-text {
+    border-style: hidden;
+    background: #fff;
+    border-color: #fff;
+}
+.h2-font {
+    font-size: 30px;
+}
+.no-border {
+    border: 0px none;
+}
 /* 隐藏legend */
 .category-legend {
 	display: none !important;
 .example-style {
 	max-width: 150px;
 	align-self: self-end;
+}

examples.py CHANGED Viewed

@@ -24,7 +24,7 @@ examples = {
         # ！？｡＂＃＄％＆＇（）＊＋，－／：；＜＝＞＠［＼］＾＿｀｛｜｝～｟｠｢｣､、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏.
         ["punctuation: ,.:/?+=\"，。！？；【】〔〕〖〗", "gemma_7b", "llama"],  # llama词典有点小
         ["symbol: 🦙❤❥웃유♋☮✊☏☢☚✔☑♚▢♪✈✞÷↑↓▤▥⊙■□▣▽¿─│♥❣▬▫☿Ⓐ ✋✉☣☤", "baichuan", "llama"],
-        # ["special: [PAD] [UNK] [CLS] [SEP] [MASK] <|endoftext|>", "", ""],
     ],
     "zh": [
         ["空格测试：  2个空格        8个空格", "llama", "chatglm2_6b"],  # chatglm 有blank_n,

         # ！？｡＂＃＄％＆＇（）＊＋，－／：；＜＝＞＠［＼］＾＿｀｛｜｝～｟｠｢｣､、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏.
         ["punctuation: ,.:/?+=\"，。！？；【】〔〕〖〗", "gemma_7b", "llama"],  # llama词典有点小
         ["symbol: 🦙❤❥웃유♋☮✊☏☢☚✔☑♚▢♪✈✞÷↑↓▤▥⊙■□▣▽¿─│♥❣▬▫☿Ⓐ ✋✉☣☤", "baichuan", "llama"],
+        # ["special: [PAD] [UNK] [CLS] [SEP] [MASK] <|system|> <|user|> <|assistant|> <|endoftext|>", "", ""],
     ],
     "zh": [
         ["空格测试：  2个空格        8个空格", "llama", "chatglm2_6b"],  # chatglm 有blank_n,

patcher/gr_interface.py ADDED Viewed

	@@ -0,0 +1,59 @@

+"""
+原生 TabbedInterface 的 title采用markdown，不能实现居中，因此这里做了调整。
+"""
+from gradio import Blocks, Interface, Theme, Tabs, Tab, HTML
+class TabbedInterface(Blocks):
+    """
+    A TabbedInterface is created by providing a list of Interfaces or Blocks, each of which gets
+    rendered in a separate tab. Only the components from the Interface/Blocks will be rendered in the tab.
+    Certain high-level attributes of the Blocks (e.g. custom `css`, `js`, and `head` attributes) will not be loaded.
+    Demos: tabbed_interface_lite
+    """
+    def __init__(
+        self,
+        interface_list: list[Interface],
+        tab_names: list[str] | None = None,
+        title: str | None = None,
+        theme: Theme | str | None = None,
+        analytics_enabled: bool | None = None,
+        css: str | None = None,
+        js: str | None = None,
+        head: str | None = None,
+    ):
+        """
+        Parameters:
+            interface_list: A list of Interfaces (or Blocks) to be rendered in the tabs.
+            tab_names: A list of tab names. If None, the tab names will be "Tab 0", "Tab 1", etc.
+            title: The page title passed to Blocks for display in the browser window; when non-empty it is also rendered as a centered <h1> heading above the tabs.
+            theme: A Theme object or a string representing a theme. If a string, will look for a built-in theme with that name (e.g. "soft" or "default"), or will attempt to load a theme from the Hugging Face Hub (e.g. "gradio/monochrome"). If None, will use the Default theme.
+            analytics_enabled: Whether to allow basic telemetry. If None, will use GRADIO_ANALYTICS_ENABLED environment variable or default to True.
+            css: Custom css as a string or path to a css file. This css will be included in the demo webpage.
+            js: Custom js or path to js file to run when demo is first loaded. This javascript will be included in the demo webpage.
+            head: Custom html to insert into the head of the demo webpage. This can be used to add custom meta tags, scripts, stylesheets, etc. to the page.
+        Returns:
+            a Gradio Tabbed Interface for the given interfaces
+        """
+        super().__init__(
+            title=title or "Gradio",
+            theme=theme,
+            analytics_enabled=analytics_enabled,
+            mode="tabbed_interface",
+            css=css,
+            js=js,
+            head=head,
+        )
+        if tab_names is None:
+            tab_names = [f"Tab {i}" for i in range(len(interface_list))]
+        with self:
+            if title:
+                HTML(
+                    f"<h1 style='text-align: center; margin-bottom: 1rem'>{title}</h1>"
+                )
+            with Tabs():
+                for interface, tab_name in zip(interface_list, tab_names):
+                    with Tab(label=tab_name):
+                        interface.render()

tokenizer/sptokenizer_patch.py → patcher/sptokenizer_patch_deprecated.py RENAMED Viewed

@@ -1,6 +1,8 @@
 """
 ## usage
@@ -8,11 +10,15 @@
 ## 风险评估
-- 会干扰 sentencepiece.SentencePieceProcessor的正常使用吗？
 """
-import sentencepiece
 @property
@@ -32,15 +38,18 @@ def _tokenize(self, text):
     """Returns a tokenized string."""
     return self.encode(text, out_type=str)
 def _convert_token_to_id(self, token):
     """Converts a token (str) in an id using the vocab."""
     return self.piece_to_id(token)
 def _convert_id_to_token(self, index):
     """Converts an index (integer) in a token (str) using the vocab."""
     token = self.IdToPiece(index)
     return token
 def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
     """ copy from transformers.PreTrainedTokenizer
     Converts a single index or a sequence of indices in a token or a sequence of tokens, using the vocabulary and
@@ -87,11 +96,10 @@ def decode(self, *args, **kwargs):
     return self.Decode(*args, **kwargs)
-sentencepiece.SentencePieceProcessor.vocab_size = vocab_size
 sentencepiece.SentencePieceProcessor.get_vocab = get_vocab
 sentencepiece.SentencePieceProcessor._convert_id_to_token = _convert_id_to_token
 sentencepiece.SentencePieceProcessor.convert_ids_to_tokens = convert_ids_to_tokens
 # sentencepiece.SentencePieceProcessor.tokenize = _tokenize
 sentencepiece.SentencePieceProcessor.encode = encode
 sentencepiece.SentencePieceProcessor.decode = decode

 """
+## adapt to transformer tokenizer
+https://github.com/huggingface/transformers/blob/v4.40.1/src/transformers/tokenization_utils.py#L379
 ## usage
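 ## Risk assessment
+- May interfere with normal use of sentencepiece.SentencePieceProcessor; e.g. .vocab_size was originally a method, but after the patch it is a property
+## TODO
+Stop patching and use a wrapper instead, since common tokenizers usually just wrap sentencepiece.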
 """
+import sentencepiece
 @property
     """Returns a tokenized string."""
     return self.encode(text, out_type=str)
 def _convert_token_to_id(self, token):
     """Converts a token (str) in an id using the vocab."""
     return self.piece_to_id(token)
 def _convert_id_to_token(self, index):
     """Converts an index (integer) in a token (str) using the vocab."""
     token = self.IdToPiece(index)
     return token
 def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
     """ copy from transformers.PreTrainedTokenizer
     Converts a single index or a sequence of indices in a token or a sequence of tokens, using the vocabulary and
     return self.Decode(*args, **kwargs)
+sentencepiece.SentencePieceProcessor.vocab_size = vocab_size  # NOTE: vocab_size was originally a method; this patch replaces it with a property
 sentencepiece.SentencePieceProcessor.get_vocab = get_vocab
 sentencepiece.SentencePieceProcessor._convert_id_to_token = _convert_id_to_token
 sentencepiece.SentencePieceProcessor.convert_ids_to_tokens = convert_ids_to_tokens
 # sentencepiece.SentencePieceProcessor.tokenize = _tokenize
 sentencepiece.SentencePieceProcessor.encode = encode
 sentencepiece.SentencePieceProcessor.decode = decode

patcher/sptokenizer_wrapper.py ADDED Viewed

	@@ -0,0 +1,61 @@

+""" Wrap sentencepiece.SentencePieceProcessor so that it conforms to the tokenizer interface used in transformers
+## reference
+## usage
+- grok
+"""
+import sentencepiece as spm
+from transformers import PreTrainedTokenizer
+class SPTokenizerWrapper(PreTrainedTokenizer):
+    """
+    ## impl in PreTrainedTokenizer
+    - convert_ids_to_tokens
+    """
+    def __init__(self, vocab_file):
+        self.vocab_file = vocab_file
+        self.sp_model = spm.SentencePieceProcessor(self.vocab_file)
+        super().__init__()
+    @property
+    def vocab_size(self):
+        """Returns vocab size"""
+        return self.sp_model.get_piece_size()
+    def get_vocab(self):
+        """Returns vocab as a dict"""
+        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
+        return vocab
+    def _convert_token_to_id(self, token):
+        """Converts a token (str) in an id using the vocab."""
+        return self.sp_model.piece_to_id(token)
+    def _convert_id_to_token(self, index):
+        """Converts an index (integer) in a token (str) using the vocab."""
+        token = self.sp_model.IdToPiece(index)
+        return token
+    # def convert_ids_to_tokens(self, ids, skip_special_tokens=False):  # impl in PreTrainedTokenizer
+    def encode(self, *args, **kwargs):
+        kwargs.pop("add_special_tokens", None)
+        kwargs.pop("allowed_special", None)
+        return self.sp_model.Encode(*args, **kwargs)
+    def decode(self, *args, **kwargs):
+        kwargs.pop("skip_special_tokens", None)
+        return self.sp_model.Decode(*args, **kwargs)
+# PreTrainedTokenizer.convert_ids_to_tokens

{tokenizer → patcher}/tiktoken_patch.py RENAMED Viewed

@@ -83,6 +83,10 @@ def encode(self, *args, **kwargs):
     return self._encode(*args, **kwargs)
 # tiktoken patch
 Encoding._encode = Encoding.encode
 Encoding.encode = encode
@@ -90,3 +94,4 @@ Encoding.decode = decode
 Encoding.convert_ids_to_tokens = convert_ids_to_tokens
 Encoding.get_vocab = get_vocab
 Encoding.vocab_size = vocab_size

     return self._encode(*args, **kwargs)
+def __len__(self):
+    return self.n_vocab
 # tiktoken patch
 Encoding._encode = Encoding.encode
 Encoding.encode = encode
 Encoding.convert_ids_to_tokens = convert_ids_to_tokens
 Encoding.get_vocab = get_vocab
 Encoding.vocab_size = vocab_size
+Encoding.__len__ = __len__

stats/compress_rate.json ADDED Viewed

	@@ -0,0 +1,1868 @@

+{
+  "amber.cc100-en": {
+    "vocab_size": 32000,
+    "n_bytes": 1124813,
+    "n_tokens": 294627,
+    "n_chars": 1121360
+  },
+  "aya_101.cc100-en": {
+    "vocab_size": 250100,
+    "n_bytes": 1124813,
+    "n_tokens": 317881,
+    "n_chars": 1121360
+  },
+  "baichuan.cc100-en": {
+    "vocab_size": 64000,
+    "n_bytes": 1124813,
+    "n_tokens": 280108,
+    "n_chars": 1121360
+  },
+  "baichuan2.cc100-en": {
+    "vocab_size": 125696,
+    "n_bytes": 1124813,
+    "n_tokens": 269011,
+    "n_chars": 1121360
+  },
+  "bert_base_cased.cc100-en": {
+    "vocab_size": 28996,
+    "n_bytes": 1124813,
+    "n_tokens": 288022,
+    "n_chars": 1121360
+  },
+  "bert_base_chinese.cc100-en": {
+    "vocab_size": 21128,
+    "n_bytes": 1124813,
+    "n_tokens": 377068,
+    "n_chars": 1121360
+  },
+  "bert_base_uncased.cc100-en": {
+    "vocab_size": 30522,
+    "n_bytes": 1124813,
+    "n_tokens": 280575,
+    "n_chars": 1121360
+  },
+  "bloom.cc100-en": {
+    "vocab_size": 250680,
+    "n_bytes": 1124813,
+    "n_tokens": 257405,
+    "n_chars": 1121360
+  },
+  "byt5_small.cc100-en": {
+    "vocab_size": 384,
+    "n_bytes": 1124813,
+    "n_tokens": 1134813,
+    "n_chars": 1121360
+  },
+  "character_glm_6b.cc100-en": {
+    "vocab_size": 64789,
+    "n_bytes": 1124813,
+    "n_tokens": 289347,
+    "n_chars": 1121360
+  },
+  "chatglm2_6b.cc100-en": {
+    "vocab_size": 64787,
+    "n_bytes": 1124813,
+    "n_tokens": 289329,
+    "n_chars": 1121360
+  },
+  "chatglm3_6b.cc100-en": {
+    "vocab_size": 64796,
+    "n_bytes": 1124813,
+    "n_tokens": 289347,
+    "n_chars": 1121360
+  },
+  "chatglm_6b.cc100-en": {
+    "vocab_size": 150344,
+    "n_bytes": 1124813,
+    "n_tokens": 284761,
+    "n_chars": 1121360
+  },
+  "chatyuan_large_v2.cc100-en": {
+    "vocab_size": 32128,
+    "n_bytes": 1124813,
+    "n_tokens": 536033,
+    "n_chars": 1121360
+  },
+  "chinese_llama.cc100-en": {
+    "vocab_size": 49953,
+    "n_bytes": 1124813,
+    "n_tokens": 291514,
+    "n_chars": 1121360
+  },
+  "chinese_llama2.cc100-en": {
+    "vocab_size": 55296,
+    "n_bytes": 1124813,
+    "n_tokens": 294627,
+    "n_chars": 1121360
+  },
+  "code_davinci_002.cc100-en": {
+    "vocab_size": 50281,
+    "n_bytes": 1124813,
+    "n_tokens": 258403,
+    "n_chars": 1121360
+  },
+  "crystal_coder.cc100-en": {
+    "vocab_size": 32022,
+    "n_bytes": 1124813,
+    "n_tokens": 284627,
+    "n_chars": 1121360
+  },
+  "dbrx_instruct.cc100-en": {
+    "vocab_size": 100280,
+    "n_bytes": 1124813,
+    "n_tokens": 254985,
+    "n_chars": 1121360
+  },
+  "deepseek_coder_33b_instruct.cc100-en": {
+    "vocab_size": 32022,
+    "n_bytes": 1124813,
+    "n_tokens": 287408,
+    "n_chars": 1121360
+  },
+  "deepseek_llm_7b_base.cc100-en": {
+    "vocab_size": 100015,
+    "n_bytes": 1124813,
+    "n_tokens": 272324,
+    "n_chars": 1121360
+  },
+  "falcon_180b.cc100-en": {
+    "vocab_size": 65024,
+    "n_bytes": 1124813,
+    "n_tokens": 262509,
+    "n_chars": 1121360
+  },
+  "falcon_7b.cc100-en": {
+    "vocab_size": 65024,
+    "n_bytes": 1124813,
+    "n_tokens": 262509,
+    "n_chars": 1121360
+  },
+  "fastchat_t5_3b.cc100-en": {
+    "vocab_size": 32110,
+    "n_bytes": 1124813,
+    "n_tokens": 484941,
+    "n_chars": 1121360
+  },
+  "flan_t5_base.cc100-en": {
+    "vocab_size": 32100,
+    "n_bytes": 1124813,
+    "n_tokens": 290104,
+    "n_chars": 1121360
+  },
+  "gemma_7b.cc100-en": {
+    "vocab_size": 256000,
+    "n_bytes": 1124813,
+    "n_tokens": 268010,
+    "n_chars": 1121360
+  },
+  "gpt2.cc100-en": {
+    "vocab_size": 50257,
+    "n_bytes": 1124813,
+    "n_tokens": 258428,
+    "n_chars": 1121360
+  },
+  "gpt2_chinese.cc100-en": {
+    "vocab_size": 21128,
+    "n_bytes": 1124813,
+    "n_tokens": 392641,
+    "n_chars": 1121360
+  },
+  "gpt_35_turbo.cc100-en": {
+    "vocab_size": 100277,
+    "n_bytes": 1124813,
+    "n_tokens": 254985,
+    "n_chars": 1121360
+  },
+  "gpt_4.cc100-en": {
+    "vocab_size": 100277,
+    "n_bytes": 1124813,
+    "n_tokens": 254985,
+    "n_chars": 1121360
+  },
+  "gpt_nexo_20b.cc100-en": {
+    "vocab_size": 50277,
+    "n_bytes": 1124813,
+    "n_tokens": 259357,
+    "n_chars": 1121360
+  },
+  "grok_1.cc100-en": {
+    "vocab_size": 131072,
+    "n_bytes": 1124813,
+    "n_tokens": 258048,
+    "n_chars": 1121360
+  },
+  "internlm2_chat_7b.cc100-en": {
+    "vocab_size": 92544,
+    "n_bytes": 1124813,
+    "n_tokens": 271583,
+    "n_chars": 1121360
+  },
+  "internlm2_math_7b.cc100-en": {
+    "vocab_size": 92544,
+    "n_bytes": 1124813,
+    "n_tokens": 271583,
+    "n_chars": 1121360
+  },
+  "internlm_chat_7b.cc100-en": {
+    "vocab_size": 103168,
+    "n_bytes": 1124813,
+    "n_tokens": 271293,
+    "n_chars": 1121360
+  },
+  "internlm_xcomposer_7b.cc100-en": {
+    "vocab_size": 103168,
+    "n_bytes": 1124813,
+    "n_tokens": 271293,
+    "n_chars": 1121360
+  },
+  "jamba_v0_1.cc100-en": {
+    "vocab_size": 65536,
+    "n_bytes": 1124813,
+    "n_tokens": 274242,
+    "n_chars": 1121360
+  },
+  "kplug.cc100-en": {
+    "vocab_size": 10261,
+    "n_bytes": 1124813,
+    "n_tokens": 393564,
+    "n_chars": 1121360
+  },
+  "llama.cc100-en": {
+    "vocab_size": 32000,
+    "n_bytes": 1124813,
+    "n_tokens": 294627,
+    "n_chars": 1121360
+  },
+  "llama2.cc100-en": {
+    "vocab_size": 32001,
+    "n_bytes": 1124813,
+    "n_tokens": 294627,
+    "n_chars": 1121360
+  },
+  "llama3.cc100-en": {
+    "vocab_size": 128256,
+    "n_bytes": 1124813,
+    "n_tokens": 254944,
+    "n_chars": 1121360
+  },
+  "mistral_7b.cc100-en": {
+    "vocab_size": 32000,
+    "n_bytes": 1124813,
+    "n_tokens": 285801,
+    "n_chars": 1121360
+  },
+  "mixtral_8_7b.cc100-en": {
+    "vocab_size": 32000,
+    "n_bytes": 1124813,
+    "n_tokens": 285801,
+    "n_chars": 1121360
+  },
+  "mobilebert_uncased.cc100-en": {
+    "vocab_size": 30522,
+    "n_bytes": 1124813,
+    "n_tokens": 280575,
+    "n_chars": 1121360
+  },
+  "moss.cc100-en": {
+    "vocab_size": 106072,
+    "n_bytes": 1124813,
+    "n_tokens": 257070,
+    "n_chars": 1121360
+  },
+  "mt5_large.cc100-en": {
+    "vocab_size": 250100,
+    "n_bytes": 1124813,
+    "n_tokens": 317881,
+    "n_chars": 1121360
+  },
+  "olmo_7b.cc100-en": {
+    "vocab_size": 50280,
+    "n_bytes": 1124813,
+    "n_tokens": 259357,
+    "n_chars": 1121360
+  },
+  "orion_14b_chat.cc100-en": {
+    "vocab_size": 84608,
+    "n_bytes": 1124813,
+    "n_tokens": 265948,
+    "n_chars": 1121360
+  },
+  "phi_1.cc100-en": {
+    "vocab_size": 50295,
+    "n_bytes": 1124813,
+    "n_tokens": 258409,
+    "n_chars": 1121360
+  },
+  "phi_2.cc100-en": {
+    "vocab_size": 50295,
+    "n_bytes": 1124813,
+    "n_tokens": 258409,
+    "n_chars": 1121360
+  },
+  "phi_3_mini.cc100-en": {
+    "vocab_size": 32011,
+    "n_bytes": 1124813,
+    "n_tokens": 294627,
+    "n_chars": 1121360
+  },
+  "pko_t5_large.cc100-en": {
+    "vocab_size": 50358,
+    "n_bytes": 1124813,
+    "n_tokens": 658985,
+    "n_chars": 1121360
+  },
+  "prompt_clue.cc100-en": {
+    "vocab_size": 32128,
+    "n_bytes": 1124813,
+    "n_tokens": 536033,
+    "n_chars": 1121360
+  },
+  "qwen1_5_14b_chat.cc100-en": {
+    "vocab_size": 151646,
+    "n_bytes": 1124813,
+    "n_tokens": 257983,
+    "n_chars": 1121360
+  },
+  "qwen_1_8b_chat.cc100-en": {
+    "vocab_size": 151851,
+    "n_bytes": 1124813,
+    "n_tokens": 257983,
+    "n_chars": 1121360
+  },
+  "qwen_72b_chat.cc100-en": {
+    "vocab_size": 151851,
+    "n_bytes": 1124813,
+    "n_tokens": 257983,
+    "n_chars": 1121360
+  },
+  "qwen_7b_chat.cc100-en": {
+    "vocab_size": 151851,
+    "n_bytes": 1124813,
+    "n_tokens": 257983,
+    "n_chars": 1121360
+  },
+  "roberta_chinese_clue.cc100-en": {
+    "vocab_size": 8021,
+    "n_bytes": 1124813,
+    "n_tokens": 583058,
+    "n_chars": 1121360
+  },
+  "skywork_13b_base.cc100-en": {
+    "vocab_size": 65519,
+    "n_bytes": 1124813,
+    "n_tokens": 294617,
+    "n_chars": 1121360
+  },
+  "skywork_13b_math.cc100-en": {
+    "vocab_size": 65519,
+    "n_bytes": 1124813,
+    "n_tokens": 294617,
+    "n_chars": 1121360
+  },
+  "solar_10_7b.cc100-en": {
+    "vocab_size": 32000,
+    "n_bytes": 1124813,
+    "n_tokens": 285801,
+    "n_chars": 1121360
+  },
+  "starchat_alpha.cc100-en": {
+    "vocab_size": 49156,
+    "n_bytes": 1124813,
+    "n_tokens": 288965,
+    "n_chars": 1121360
+  },
+  "switch_c_2048.cc100-en": {
+    "vocab_size": 32100,
+    "n_bytes": 1124813,
+    "n_tokens": 290104,
+    "n_chars": 1121360
+  },
+  "t5_base.cc100-en": {
+    "vocab_size": 32100,
+    "n_bytes": 1124813,
+    "n_tokens": 290104,
+    "n_chars": 1121360
+  },
+  "t5_large.cc100-en": {
+    "vocab_size": 32100,
+    "n_bytes": 1124813,
+    "n_tokens": 290104,
+    "n_chars": 1121360
+  },
+  "t5_small.cc100-en": {
+    "vocab_size": 32100,
+    "n_bytes": 1124813,
+    "n_tokens": 290104,
+    "n_chars": 1121360
+  },
+  "text_davinci_003.cc100-en": {
+    "vocab_size": 50281,
+    "n_bytes": 1124813,
+    "n_tokens": 258403,
+    "n_chars": 1121360
+  },
+  "tigerbot_13b_chat_v2.cc100-en": {
+    "vocab_size": 60515,
+    "n_bytes": 1124813,
+    "n_tokens": 285652,
+    "n_chars": 1121360
+  },
+  "tigerbot_70b_chat_v4_4k.cc100-en": {
+    "vocab_size": 65110,
+    "n_bytes": 1124813,
+    "n_tokens": 286946,
+    "n_chars": 1121360
+  },
+  "wizardcoder_15b_v1.cc100-en": {
+    "vocab_size": 49153,
+    "n_bytes": 1124813,
+    "n_tokens": 288965,
+    "n_chars": 1121360
+  },
+  "wizardcoder_python_7b_v1.cc100-en": {
+    "vocab_size": 32001,
+    "n_bytes": 1124813,
+    "n_tokens": 294627,
+    "n_chars": 1121360
+  },
+  "wizardlm_7b_v1.cc100-en": {
+    "vocab_size": 32001,
+    "n_bytes": 1124813,
+    "n_tokens": 294627,
+    "n_chars": 1121360
+  },
+  "wizardmath_70b_v1.cc100-en": {
+    "vocab_size": 32002,
+    "n_bytes": 1124813,
+    "n_tokens": 294627,
+    "n_chars": 1121360
+  },
+  "xlm_roberta.cc100-en": {
+    "vocab_size": 250002,
+    "n_bytes": 1124813,
+    "n_tokens": 300026,
+    "n_chars": 1121360
+  },
+  "yi_34b.cc100-en": {
+    "vocab_size": 64000,
+    "n_bytes": 1124813,
+    "n_tokens": 270400,
+    "n_chars": 1121360
+  },
+  "yi_6b.cc100-en": {
+    "vocab_size": 64000,
+    "n_bytes": 1124813,
+    "n_tokens": 270400,
+    "n_chars": 1121360
+  },
+  "yi_vl34b.cc100-en": {
+    "vocab_size": 64000,
+    "n_bytes": 1124813,
+    "n_tokens": 269738,
+    "n_chars": 1121360
+  },
+  "zephyr_7b_beta.cc100-en": {
+    "vocab_size": 32000,
+    "n_bytes": 1124813,
+    "n_tokens": 285801,
+    "n_chars": 1121360
+  },
+  "amber.cc100-zh-Hans": {
+    "vocab_size": 32000,
+    "n_bytes": 2633047,
+    "n_tokens": 1330093,
+    "n_chars": 927311
+  },
+  "aya_101.cc100-zh-Hans": {
+    "vocab_size": 250100,
+    "n_bytes": 2633047,
+    "n_tokens": 631182,
+    "n_chars": 927311
+  },
+  "baichuan.cc100-zh-Hans": {
+    "vocab_size": 64000,
+    "n_bytes": 2633047,
+    "n_tokens": 626117,
+    "n_chars": 927311
+  },
+  "baichuan2.cc100-zh-Hans": {
+    "vocab_size": 125696,
+    "n_bytes": 2633047,
+    "n_tokens": 541464,
+    "n_chars": 927311
+  },
+  "bert_base_cased.cc100-zh-Hans": {
+    "vocab_size": 28996,
+    "n_bytes": 2633047,
+    "n_tokens": 899709,
+    "n_chars": 927311
+  },
+  "bert_base_chinese.cc100-zh-Hans": {
+    "vocab_size": 21128,
+    "n_bytes": 2633047,
+    "n_tokens": 896599,
+    "n_chars": 927311
+  },
+  "bert_base_uncased.cc100-zh-Hans": {
+    "vocab_size": 30522,
+    "n_bytes": 2633047,
+    "n_tokens": 898554,
+    "n_chars": 927311
+  },
+  "bloom.cc100-zh-Hans": {
+    "vocab_size": 250680,
+    "n_bytes": 2633047,
+    "n_tokens": 573008,
+    "n_chars": 927311
+  },
+  "byt5_small.cc100-zh-Hans": {
+    "vocab_size": 384,
+    "n_bytes": 2633047,
+    "n_tokens": 2643047,
+    "n_chars": 927311
+  },
+  "character_glm_6b.cc100-zh-Hans": {
+    "vocab_size": 64789,
+    "n_bytes": 2633047,
+    "n_tokens": 583646,
+    "n_chars": 927311
+  },
+  "chatglm2_6b.cc100-zh-Hans": {
+    "vocab_size": 64787,
+    "n_bytes": 2633047,
+    "n_tokens": 583646,
+    "n_chars": 927311
+  },
+  "chatglm3_6b.cc100-zh-Hans": {
+    "vocab_size": 64796,
+    "n_bytes": 2633047,
+    "n_tokens": 583646,
+    "n_chars": 927311
+  },
+  "chatglm_6b.cc100-zh-Hans": {
+    "vocab_size": 150344,
+    "n_bytes": 2633047,
+    "n_tokens": 527384,
+    "n_chars": 927311
+  },
+  "chatyuan_large_v2.cc100-zh-Hans": {
+    "vocab_size": 32128,
+    "n_bytes": 2633047,
+    "n_tokens": 564905,
+    "n_chars": 927311
+  },
+  "chinese_llama.cc100-zh-Hans": {
+    "vocab_size": 49953,
+    "n_bytes": 2633047,
+    "n_tokens": 623219,
+    "n_chars": 927311
+  },
+  "chinese_llama2.cc100-zh-Hans": {
+    "vocab_size": 55296,
+    "n_bytes": 2633047,
+    "n_tokens": 625766,
+    "n_chars": 927311
+  },
+  "code_davinci_002.cc100-zh-Hans": {
+    "vocab_size": 50281,
+    "n_bytes": 2633047,
+    "n_tokens": 1876809,
+    "n_chars": 927311
+  },
+  "crystal_coder.cc100-zh-Hans": {
+    "vocab_size": 32022,
+    "n_bytes": 2633047,
+    "n_tokens": 1320093,
+    "n_chars": 927311
+  },
+  "dbrx_instruct.cc100-zh-Hans": {
+    "vocab_size": 100280,
+    "n_bytes": 2633047,
+    "n_tokens": 1084939,
+    "n_chars": 927311
+  },
+  "deepseek_coder_33b_instruct.cc100-zh-Hans": {
+    "vocab_size": 32022,
+    "n_bytes": 2633047,
+    "n_tokens": 720577,
+    "n_chars": 927311
+  },
+  "deepseek_llm_7b_base.cc100-zh-Hans": {
+    "vocab_size": 100015,
+    "n_bytes": 2633047,
+    "n_tokens": 605081,
+    "n_chars": 927311
+  },
+  "falcon_180b.cc100-zh-Hans": {
+    "vocab_size": 65024,
+    "n_bytes": 2633047,
+    "n_tokens": 1124681,
+    "n_chars": 927311
+  },
+  "falcon_7b.cc100-zh-Hans": {
+    "vocab_size": 65024,
+    "n_bytes": 2633047,
+    "n_tokens": 1124681,
+    "n_chars": 927311
+  },
+  "fastchat_t5_3b.cc100-zh-Hans": {
+    "vocab_size": 32110,
+    "n_bytes": 2633047,
+    "n_tokens": 178974,
+    "n_chars": 927311
+  },
+  "flan_t5_base.cc100-zh-Hans": {
+    "vocab_size": 32100,
+    "n_bytes": 2633047,
+    "n_tokens": 173520,
+    "n_chars": 927311
+  },
+  "gemma_7b.cc100-zh-Hans": {
+    "vocab_size": 256000,
+    "n_bytes": 2633047,
+    "n_tokens": 641795,
+    "n_chars": 927311
+  },
+  "gpt2.cc100-zh-Hans": {
+    "vocab_size": 50257,
+    "n_bytes": 2633047,
+    "n_tokens": 1876809,
+    "n_chars": 927311
+  },
+  "gpt2_chinese.cc100-zh-Hans": {
+    "vocab_size": 21128,
+    "n_bytes": 2633047,
+    "n_tokens": 899506,
+    "n_chars": 927311
+  },
+  "gpt_35_turbo.cc100-zh-Hans": {
+    "vocab_size": 100277,
+    "n_bytes": 2633047,
+    "n_tokens": 1084939,
+    "n_chars": 927311
+  },
+  "gpt_4.cc100-zh-Hans": {
+    "vocab_size": 100277,
+    "n_bytes": 2633047,
+    "n_tokens": 1084939,
+    "n_chars": 927311
+  },
+  "gpt_nexo_20b.cc100-zh-Hans": {
+    "vocab_size": 50277,
+    "n_bytes": 2633047,
+    "n_tokens": 1220529,
+    "n_chars": 927311
+  },
+  "grok_1.cc100-zh-Hans": {
+    "vocab_size": 131072,
+    "n_bytes": 2633047,
+    "n_tokens": 1414508,
+    "n_chars": 927311
+  },
+  "internlm2_chat_7b.cc100-zh-Hans": {
+    "vocab_size": 92544,
+    "n_bytes": 2633047,
+    "n_tokens": 579976,
+    "n_chars": 927311
+  },
+  "internlm2_math_7b.cc100-zh-Hans": {
+    "vocab_size": 92544,
+    "n_bytes": 2633047,
+    "n_tokens": 579976,
+    "n_chars": 927311
+  },
+  "internlm_chat_7b.cc100-zh-Hans": {
+    "vocab_size": 103168,
+    "n_bytes": 2633047,
+    "n_tokens": 579109,
+    "n_chars": 927311
+  },
+  "internlm_xcomposer_7b.cc100-zh-Hans": {
+    "vocab_size": 103168,
+    "n_bytes": 2633047,
+    "n_tokens": 579109,
+    "n_chars": 927311
+  },
+  "jamba_v0_1.cc100-zh-Hans": {
+    "vocab_size": 65536,
+    "n_bytes": 2633047,
+    "n_tokens": 1067054,
+    "n_chars": 927311
+  },
+  "kplug.cc100-zh-Hans": {
+    "vocab_size": 10261,
+    "n_bytes": 2633047,
+    "n_tokens": 902451,
+    "n_chars": 927311
+  },
+  "llama.cc100-zh-Hans": {
+    "vocab_size": 32000,
+    "n_bytes": 2633047,
+    "n_tokens": 1330093,
+    "n_chars": 927311
+  },
+  "llama2.cc100-zh-Hans": {
+    "vocab_size": 32001,
+    "n_bytes": 2633047,
+    "n_tokens": 1330093,
+    "n_chars": 927311
+  },
+  "llama3.cc100-zh-Hans": {
+    "vocab_size": 128256,
+    "n_bytes": 2633047,
+    "n_tokens": 747405,
+    "n_chars": 927311
+  },
+  "mistral_7b.cc100-zh-Hans": {
+    "vocab_size": 32000,
+    "n_bytes": 2633047,
+    "n_tokens": 1041023,
+    "n_chars": 927311
+  },
+  "mixtral_8_7b.cc100-zh-Hans": {
+    "vocab_size": 32000,
+    "n_bytes": 2633047,
+    "n_tokens": 1041023,
+    "n_chars": 927311
+  },
+  "mobilebert_uncased.cc100-zh-Hans": {
+    "vocab_size": 30522,
+    "n_bytes": 2633047,
+    "n_tokens": 898554,
+    "n_chars": 927311
+  },
+  "moss.cc100-zh-Hans": {
+    "vocab_size": 106072,
+    "n_bytes": 2633047,
+    "n_tokens": 557455,
+    "n_chars": 927311
+  },
+  "mt5_large.cc100-zh-Hans": {
+    "vocab_size": 250100,
+    "n_bytes": 2633047,
+    "n_tokens": 631182,
+    "n_chars": 927311
+  },
+  "olmo_7b.cc100-zh-Hans": {
+    "vocab_size": 50280,
+    "n_bytes": 2633047,
+    "n_tokens": 1220529,
+    "n_chars": 927311
+  },
+  "orion_14b_chat.cc100-zh-Hans": {
+    "vocab_size": 84608,
+    "n_bytes": 2633047,
+    "n_tokens": 529926,
+    "n_chars": 927311
+  },
+  "phi_1.cc100-zh-Hans": {
+    "vocab_size": 50295,
+    "n_bytes": 2633047,
+    "n_tokens": 1876809,
+    "n_chars": 927311
+  },
+  "phi_2.cc100-zh-Hans": {
+    "vocab_size": 50295,
+    "n_bytes": 2633047,
+    "n_tokens": 1876809,
+    "n_chars": 927311
+  },
+  "phi_3_mini.cc100-zh-Hans": {
+    "vocab_size": 32011,
+    "n_bytes": 2633047,
+    "n_tokens": 1330093,
+    "n_chars": 927311
+  },
+  "pko_t5_large.cc100-zh-Hans": {
+    "vocab_size": 50358,
+    "n_bytes": 2633047,
+    "n_tokens": 2533519,
+    "n_chars": 927311
+  },
+  "prompt_clue.cc100-zh-Hans": {
+    "vocab_size": 32128,
+    "n_bytes": 2633047,
+    "n_tokens": 564905,
+    "n_chars": 927311
+  },
+  "qwen1_5_14b_chat.cc100-zh-Hans": {
+    "vocab_size": 151646,
+    "n_bytes": 2633047,
+    "n_tokens": 589211,
+    "n_chars": 927311
+  },
+  "qwen_1_8b_chat.cc100-zh-Hans": {
+    "vocab_size": 151851,
+    "n_bytes": 2633047,
+    "n_tokens": 589211,
+    "n_chars": 927311
+  },
+  "qwen_72b_chat.cc100-zh-Hans": {
+    "vocab_size": 151851,
+    "n_bytes": 2633047,
+    "n_tokens": 589211,
+    "n_chars": 927311
+  },
+  "qwen_7b_chat.cc100-zh-Hans": {
+    "vocab_size": 151851,
+    "n_bytes": 2633047,
+    "n_tokens": 589211,
+    "n_chars": 927311
+  },
+  "roberta_chinese_clue.cc100-zh-Hans": {
+    "vocab_size": 8021,
+    "n_bytes": 2633047,
+    "n_tokens": 907144,
+    "n_chars": 927311
+  },
+  "skywork_13b_base.cc100-zh-Hans": {
+    "vocab_size": 65519,
+    "n_bytes": 2633047,
+    "n_tokens": 663923,
+    "n_chars": 927311
+  },
+  "skywork_13b_math.cc100-zh-Hans": {
+    "vocab_size": 65519,
+    "n_bytes": 2633047,
+    "n_tokens": 663923,
+    "n_chars": 927311
+  },
+  "solar_10_7b.cc100-zh-Hans": {
+    "vocab_size": 32000,
+    "n_bytes": 2633047,
+    "n_tokens": 1041023,
+    "n_chars": 927311
+  },
+  "starchat_alpha.cc100-zh-Hans": {
+    "vocab_size": 49156,
+    "n_bytes": 2633047,
+    "n_tokens": 882018,
+    "n_chars": 927311
+  },
+  "switch_c_2048.cc100-zh-Hans": {
+    "vocab_size": 32100,
+    "n_bytes": 2633047,
+    "n_tokens": 173519,
+    "n_chars": 927311
+  },
+  "t5_base.cc100-zh-Hans": {
+    "vocab_size": 32100,
+    "n_bytes": 2633047,
+    "n_tokens": 173519,
+    "n_chars": 927311
+  },
+  "t5_large.cc100-zh-Hans": {
+    "vocab_size": 32100,
+    "n_bytes": 2633047,
+    "n_tokens": 173519,
+    "n_chars": 927311
+  },
+  "t5_small.cc100-zh-Hans": {
+    "vocab_size": 32100,
+    "n_bytes": 2633047,
+    "n_tokens": 173519,
+    "n_chars": 927311
+  },
+  "text_davinci_003.cc100-zh-Hans": {
+    "vocab_size": 50281,
+    "n_bytes": 2633047,
+    "n_tokens": 1876809,
+    "n_chars": 927311
+  },
+  "tigerbot_13b_chat_v2.cc100-zh-Hans": {
+    "vocab_size": 60515,
+    "n_bytes": 2633047,
+    "n_tokens": 577385,
+    "n_chars": 927311
+  },
+  "tigerbot_70b_chat_v4_4k.cc100-zh-Hans": {
+    "vocab_size": 65110,
+    "n_bytes": 2633047,
+    "n_tokens": 577211,
+    "n_chars": 927311
+  },
+  "wizardcoder_15b_v1.cc100-zh-Hans": {
+    "vocab_size": 49153,
+    "n_bytes": 2633047,
+    "n_tokens": 882018,
+    "n_chars": 927311
+  },
+  "wizardcoder_python_7b_v1.cc100-zh-Hans": {
+    "vocab_size": 32001,
+    "n_bytes": 2633047,
+    "n_tokens": 1330093,
+    "n_chars": 927311
+  },
+  "wizardlm_7b_v1.cc100-zh-Hans": {
+    "vocab_size": 32001,
+    "n_bytes": 2633047,
+    "n_tokens": 1330093,
+    "n_chars": 927311
+  },
+  "wizardmath_70b_v1.cc100-zh-Hans": {
+    "vocab_size": 32002,
+    "n_bytes": 2633047,
+    "n_tokens": 1330093,
+    "n_chars": 927311
+  },
+  "xlm_roberta.cc100-zh-Hans": {
+    "vocab_size": 250002,
+    "n_bytes": 2633047,
+    "n_tokens": 619844,
+    "n_chars": 927311
+  },
+  "yi_34b.cc100-zh-Hans": {
+    "vocab_size": 64000,
+    "n_bytes": 2633047,
+    "n_tokens": 588729,
+    "n_chars": 927311
+  },
+  "yi_6b.cc100-zh-Hans": {
+    "vocab_size": 64000,
+    "n_bytes": 2633047,
+    "n_tokens": 588729,
+    "n_chars": 927311
+  },
+  "yi_vl34b.cc100-zh-Hans": {
+    "vocab_size": 64000,
+    "n_bytes": 2633047,
+    "n_tokens": 596166,
+    "n_chars": 927311
+  },
+  "zephyr_7b_beta.cc100-zh-Hans": {
+    "vocab_size": 32000,
+    "n_bytes": 2633047,
+    "n_tokens": 1041023,
+    "n_chars": 927311
+  },
+  "amber.cc100-es": {
+    "vocab_size": 32000,
+    "n_bytes": 1664455,
+    "n_tokens": 492235,
+    "n_chars": 1630297
+  },
+  "aya_101.cc100-es": {
+    "vocab_size": 250100,
+    "n_bytes": 1664455,
+    "n_tokens": 472231,
+    "n_chars": 1630297
+  },
+  "baichuan.cc100-es": {
+    "vocab_size": 64000,
+    "n_bytes": 1664455,
+    "n_tokens": 585804,
+    "n_chars": 1630297
+  },
+  "baichuan2.cc100-es": {
+    "vocab_size": 125696,
+    "n_bytes": 1664455,
+    "n_tokens": 551326,
+    "n_chars": 1630297
+  },
+  "bert_base_cased.cc100-es": {
+    "vocab_size": 28996,
+    "n_bytes": 1664455,
+    "n_tokens": 630231,
+    "n_chars": 1630297
+  },
+  "bert_base_chinese.cc100-es": {
+    "vocab_size": 21128,
+    "n_bytes": 1664455,
+    "n_tokens": 609419,
+    "n_chars": 1630297
+  },
+  "bert_base_uncased.cc100-es": {
+    "vocab_size": 30522,
+    "n_bytes": 1664455,
+    "n_tokens": 558042,
+    "n_chars": 1630297
+  },
+  "bloom.cc100-es": {
+    "vocab_size": 250680,
+    "n_bytes": 1664455,
+    "n_tokens": 350793,
+    "n_chars": 1630297
+  },
+  "byt5_small.cc100-es": {
+    "vocab_size": 384,
+    "n_bytes": 1664455,
+    "n_tokens": 1674455,
+    "n_chars": 1630297
+  },
+  "character_glm_6b.cc100-es": {
+    "vocab_size": 64789,
+    "n_bytes": 1664455,
+    "n_tokens": 566501,
+    "n_chars": 1630297
+  },
+  "chatglm2_6b.cc100-es": {
+    "vocab_size": 64787,
+    "n_bytes": 1664455,
+    "n_tokens": 566476,
+    "n_chars": 1630297
+  },
+  "chatglm3_6b.cc100-es": {
+    "vocab_size": 64796,
+    "n_bytes": 1664455,
+    "n_tokens": 566501,
+    "n_chars": 1630297
+  },
+  "chatglm_6b.cc100-es": {
+    "vocab_size": 150344,
+    "n_bytes": 1664455,
+    "n_tokens": 514848,
+    "n_chars": 1630297
+  },
+  "chatyuan_large_v2.cc100-es": {
+    "vocab_size": 32128,
+    "n_bytes": 1664455,
+    "n_tokens": 889530,
+    "n_chars": 1630297
+  },
+  "chinese_llama.cc100-es": {
+    "vocab_size": 49953,
+    "n_bytes": 1664455,
+    "n_tokens": 486672,
+    "n_chars": 1630297
+  },
+  "chinese_llama2.cc100-es": {
+    "vocab_size": 55296,
+    "n_bytes": 1664455,
+    "n_tokens": 492235,
+    "n_chars": 1630297
+  },
+  "code_davinci_002.cc100-es": {
+    "vocab_size": 50281,
+    "n_bytes": 1664455,
+    "n_tokens": 569853,
+    "n_chars": 1630297
+  },
+  "crystal_coder.cc100-es": {
+    "vocab_size": 32022,
+    "n_bytes": 1664455,
+    "n_tokens": 482235,
+    "n_chars": 1630297
+  },
+  "dbrx_instruct.cc100-es": {
+    "vocab_size": 100280,
+    "n_bytes": 1664455,
+    "n_tokens": 433875,
+    "n_chars": 1630297
+  },
+  "deepseek_coder_33b_instruct.cc100-es": {
+    "vocab_size": 32022,
+    "n_bytes": 1664455,
+    "n_tokens": 523884,
+    "n_chars": 1630297
+  },
+  "deepseek_llm_7b_base.cc100-es": {
+    "vocab_size": 100015,
+    "n_bytes": 1664455,
+    "n_tokens": 480877,
+    "n_chars": 1630297
+  },
+  "falcon_180b.cc100-es": {
+    "vocab_size": 65024,
+    "n_bytes": 1664455,
+    "n_tokens": 442138,
+    "n_chars": 1630297
+  },
+  "falcon_7b.cc100-es": {
+    "vocab_size": 65024,
+    "n_bytes": 1664455,
+    "n_tokens": 442138,
+    "n_chars": 1630297
+  },
+  "fastchat_t5_3b.cc100-es": {
+    "vocab_size": 32110,
+    "n_bytes": 1664455,
+    "n_tokens": 970105,
+    "n_chars": 1630297
+  },
+  "flan_t5_base.cc100-es": {
+    "vocab_size": 32100,
+    "n_bytes": 1664455,
+    "n_tokens": 706405,
+    "n_chars": 1630297
+  },
+  "gemma_7b.cc100-es": {
+    "vocab_size": 256000,
+    "n_bytes": 1664455,
+    "n_tokens": 371321,
+    "n_chars": 1630297
+  },
+  "gpt2.cc100-es": {
+    "vocab_size": 50257,
+    "n_bytes": 1664455,
+    "n_tokens": 569853,
+    "n_chars": 1630297
+  },
+  "gpt2_chinese.cc100-es": {
+    "vocab_size": 21128,
+    "n_bytes": 1664455,
+    "n_tokens": 703390,
+    "n_chars": 1630297
+  },
+  "gpt_35_turbo.cc100-es": {
+    "vocab_size": 100277,
+    "n_bytes": 1664455,
+    "n_tokens": 433875,
+    "n_chars": 1630297
+  },
+  "gpt_4.cc100-es": {
+    "vocab_size": 100277,
+    "n_bytes": 1664455,
+    "n_tokens": 433875,
+    "n_chars": 1630297
+  },
+  "gpt_nexo_20b.cc100-es": {
+    "vocab_size": 50277,
+    "n_bytes": 1664455,
+    "n_tokens": 494577,
+    "n_chars": 1630297
+  },
+  "grok_1.cc100-es": {
+    "vocab_size": 131072,
+    "n_bytes": 1664455,
+    "n_tokens": 449392,
+    "n_chars": 1630297
+  },
+  "internlm2_chat_7b.cc100-es": {
+    "vocab_size": 92544,
+    "n_bytes": 1664455,
+    "n_tokens": 518871,
+    "n_chars": 1630297
+  },
+  "internlm2_math_7b.cc100-es": {
+    "vocab_size": 92544,
+    "n_bytes": 1664455,
+    "n_tokens": 518871,
+    "n_chars": 1630297
+  },
+  "internlm_chat_7b.cc100-es": {
+    "vocab_size": 103168,
+    "n_bytes": 1664455,
+    "n_tokens": 516572,
+    "n_chars": 1630297
+  },
+  "internlm_xcomposer_7b.cc100-es": {
+    "vocab_size": 103168,
+    "n_bytes": 1664455,
+    "n_tokens": 516572,
+    "n_chars": 1630297
+  },
+  "jamba_v0_1.cc100-es": {
+    "vocab_size": 65536,
+    "n_bytes": 1664455,
+    "n_tokens": 420883,
+    "n_chars": 1630297
+  },
+  "kplug.cc100-es": {
+    "vocab_size": 10261,
+    "n_bytes": 1664455,
+    "n_tokens": 704804,
+    "n_chars": 1630297
+  },
+  "llama.cc100-es": {
+    "vocab_size": 32000,
+    "n_bytes": 1664455,
+    "n_tokens": 492235,
+    "n_chars": 1630297
+  },
+  "llama2.cc100-es": {
+    "vocab_size": 32001,
+    "n_bytes": 1664455,
+    "n_tokens": 492235,
+    "n_chars": 1630297
+  },
+  "llama3.cc100-es": {
+    "vocab_size": 128256,
+    "n_bytes": 1664455,
+    "n_tokens": 433289,
+    "n_chars": 1630297
+  },
+  "mistral_7b.cc100-es": {
+    "vocab_size": 32000,
+    "n_bytes": 1664455,
+    "n_tokens": 513915,
+    "n_chars": 1630297
+  },
+  "mixtral_8_7b.cc100-es": {
+    "vocab_size": 32000,
+    "n_bytes": 1664455,
+    "n_tokens": 513915,
+    "n_chars": 1630297
+  },
+  "mobilebert_uncased.cc100-es": {
+    "vocab_size": 30522,
+    "n_bytes": 1664455,
+    "n_tokens": 558042,
+    "n_chars": 1630297
+  },
+  "moss.cc100-es": {
+    "vocab_size": 106072,
+    "n_bytes": 1664455,
+    "n_tokens": 568539,
+    "n_chars": 1630297
+  },
+  "mt5_large.cc100-es": {
+    "vocab_size": 250100,
+    "n_bytes": 1664455,
+    "n_tokens": 472231,
+    "n_chars": 1630297
+  },
+  "olmo_7b.cc100-es": {
+    "vocab_size": 50280,
+    "n_bytes": 1664455,
+    "n_tokens": 494577,
+    "n_chars": 1630297
+  },
+  "orion_14b_chat.cc100-es": {
+    "vocab_size": 84608,
+    "n_bytes": 1664455,
+    "n_tokens": 628571,
+    "n_chars": 1630297
+  },
+  "phi_1.cc100-es": {
+    "vocab_size": 50295,
+    "n_bytes": 1664455,
+    "n_tokens": 569853,
+    "n_chars": 1630297
+  },
+  "phi_2.cc100-es": {
+    "vocab_size": 50295,
+    "n_bytes": 1664455,
+    "n_tokens": 569853,
+    "n_chars": 1630297
+  },
+  "phi_3_mini.cc100-es": {
+    "vocab_size": 32011,
+    "n_bytes": 1664455,
+    "n_tokens": 492235,
+    "n_chars": 1630297
+  },
+  "pko_t5_large.cc100-es": {
+    "vocab_size": 50358,
+    "n_bytes": 1664455,
+    "n_tokens": 1134056,
+    "n_chars": 1630297
+  },
+  "prompt_clue.cc100-es": {
+    "vocab_size": 32128,
+    "n_bytes": 1664455,
+    "n_tokens": 889530,
+    "n_chars": 1630297
+  },
+  "qwen1_5_14b_chat.cc100-es": {
+    "vocab_size": 151646,
+    "n_bytes": 1664455,
+    "n_tokens": 434264,
+    "n_chars": 1630297
+  },
+  "qwen_1_8b_chat.cc100-es": {
+    "vocab_size": 151851,
+    "n_bytes": 1664455,
+    "n_tokens": 434264,
+    "n_chars": 1630297
+  },
+  "qwen_72b_chat.cc100-es": {
+    "vocab_size": 151851,
+    "n_bytes": 1664455,
+    "n_tokens": 434264,
+    "n_chars": 1630297
+  },
+  "qwen_7b_chat.cc100-es": {
+    "vocab_size": 151851,
+    "n_bytes": 1664455,
+    "n_tokens": 434264,
+    "n_chars": 1630297
+  },
+  "roberta_chinese_clue.cc100-es": {
+    "vocab_size": 8021,
+    "n_bytes": 1664455,
+    "n_tokens": 866564,
+    "n_chars": 1630297
+  },
+  "skywork_13b_base.cc100-es": {
+    "vocab_size": 65519,
+    "n_bytes": 1664455,
+    "n_tokens": 492211,
+    "n_chars": 1630297
+  },
+  "skywork_13b_math.cc100-es": {
+    "vocab_size": 65519,
+    "n_bytes": 1664455,
+    "n_tokens": 492211,
+    "n_chars": 1630297
+  },
+  "solar_10_7b.cc100-es": {
+    "vocab_size": 32000,
+    "n_bytes": 1664455,
+    "n_tokens": 513915,
+    "n_chars": 1630297
+  },
+  "starchat_alpha.cc100-es": {
+    "vocab_size": 49156,
+    "n_bytes": 1664455,
+    "n_tokens": 530592,
+    "n_chars": 1630297
+  },
+  "switch_c_2048.cc100-es": {
+    "vocab_size": 32100,
+    "n_bytes": 1664455,
+    "n_tokens": 706400,
+    "n_chars": 1630297
+  },
+  "t5_base.cc100-es": {
+    "vocab_size": 32100,
+    "n_bytes": 1664455,
+    "n_tokens": 706400,
+    "n_chars": 1630297
+  },
+  "t5_large.cc100-es": {
+    "vocab_size": 32100,
+    "n_bytes": 1664455,
+    "n_tokens": 706400,
+    "n_chars": 1630297
+  },
+  "t5_small.cc100-es": {
+    "vocab_size": 32100,
+    "n_bytes": 1664455,
+    "n_tokens": 706400,
+    "n_chars": 1630297
+  },
+  "text_davinci_003.cc100-es": {
+    "vocab_size": 50281,
+    "n_bytes": 1664455,
+    "n_tokens": 569853,
+    "n_chars": 1630297
+  },
+  "tigerbot_13b_chat_v2.cc100-es": {
+    "vocab_size": 60515,
+    "n_bytes": 1664455,
+    "n_tokens": 482553,
+    "n_chars": 1630297
+  },
+  "tigerbot_70b_chat_v4_4k.cc100-es": {
+    "vocab_size": 65110,
+    "n_bytes": 1664455,
+    "n_tokens": 484099,
+    "n_chars": 1630297
+  },
+  "wizardcoder_15b_v1.cc100-es": {
+    "vocab_size": 49153,
+    "n_bytes": 1664455,
+    "n_tokens": 530592,
+    "n_chars": 1630297
+  },
+  "wizardcoder_python_7b_v1.cc100-es": {
+    "vocab_size": 32001,
+    "n_bytes": 1664455,
+    "n_tokens": 492235,
+    "n_chars": 1630297
+  },
+  "wizardlm_7b_v1.cc100-es": {
+    "vocab_size": 32001,
+    "n_bytes": 1664455,
+    "n_tokens": 492235,
+    "n_chars": 1630297
+  },
+  "wizardmath_70b_v1.cc100-es": {
+    "vocab_size": 32002,
+    "n_bytes": 1664455,
+    "n_tokens": 492235,
+    "n_chars": 1630297
+  },
+  "xlm_roberta.cc100-es": {
+    "vocab_size": 250002,
+    "n_bytes": 1664455,
+    "n_tokens": 399850,
+    "n_chars": 1630297
+  },
+  "yi_34b.cc100-es": {
+    "vocab_size": 64000,
+    "n_bytes": 1664455,
+    "n_tokens": 577018,
+    "n_chars": 1630297
+  },
+  "yi_6b.cc100-es": {
+    "vocab_size": 64000,
+    "n_bytes": 1664455,
+    "n_tokens": 577018,
+    "n_chars": 1630297
+  },
+  "yi_vl34b.cc100-es": {
+    "vocab_size": 64000,
+    "n_bytes": 1664455,
+    "n_tokens": 576794,
+    "n_chars": 1630297
+  },
+  "zephyr_7b_beta.cc100-es": {
+    "vocab_size": 32000,
+    "n_bytes": 1664455,
+    "n_tokens": 513915,
+    "n_chars": 1630297
+  },
+  "aya_101.cc100-fr": {
+    "vocab_size": 250100,
+    "n_bytes": 1540504,
+    "n_tokens": 470944,
+    "n_chars": 1484970
+  },
+  "baichuan.cc100-fr": {
+    "vocab_size": 64000,
+    "n_bytes": 1540504,
+    "n_tokens": 540430,
+    "n_chars": 1484970
+  },
+  "baichuan2.cc100-fr": {
+    "vocab_size": 125696,
+    "n_bytes": 1540504,
+    "n_tokens": 512313,
+    "n_chars": 1484970
+  },
+  "bert_base_cased.cc100-fr": {
+    "vocab_size": 28996,
+    "n_bytes": 1540504,
+    "n_tokens": 583210,
+    "n_chars": 1484970
+  },
+  "bert_base_chinese.cc100-fr": {
+    "vocab_size": 21128,
+    "n_bytes": 1540504,
+    "n_tokens": 553134,
+    "n_chars": 1484970
+  },
+  "bert_base_uncased.cc100-fr": {
+    "vocab_size": 30522,
+    "n_bytes": 1540504,
+    "n_tokens": 504075,
+    "n_chars": 1484970
+  },
+  "bloom.cc100-fr": {
+    "vocab_size": 250680,
+    "n_bytes": 1540504,
+    "n_tokens": 321639,
+    "n_chars": 1484970
+  },
+  "byt5_small.cc100-fr": {
+    "vocab_size": 384,
+    "n_bytes": 1540504,
+    "n_tokens": 1550504,
+    "n_chars": 1484970
+  },
+  "character_glm_6b.cc100-fr": {
+    "vocab_size": 64789,
+    "n_bytes": 1540504,
+    "n_tokens": 515052,
+    "n_chars": 1484970
+  },
+  "chatglm2_6b.cc100-fr": {
+    "vocab_size": 64787,
+    "n_bytes": 1540504,
+    "n_tokens": 515028,
+    "n_chars": 1484970
+  },
+  "chatglm3_6b.cc100-fr": {
+    "vocab_size": 64796,
+    "n_bytes": 1540504,
+    "n_tokens": 515052,
+    "n_chars": 1484970
+  },
+  "chatglm_6b.cc100-fr": {
+    "vocab_size": 150344,
+    "n_bytes": 1540504,
+    "n_tokens": 499261,
+    "n_chars": 1484970
+  },
+  "chatyuan_large_v2.cc100-fr": {
+    "vocab_size": 32128,
+    "n_bytes": 1540504,
+    "n_tokens": 822012,
+    "n_chars": 1484970
+  },
+  "chinese_llama.cc100-fr": {
+    "vocab_size": 49953,
+    "n_bytes": 1540504,
+    "n_tokens": 450352,
+    "n_chars": 1484970
+  },
+  "chinese_llama2.cc100-fr": {
+    "vocab_size": 55296,
+    "n_bytes": 1540504,
+    "n_tokens": 457243,
+    "n_chars": 1484970
+  },
+  "code_davinci_002.cc100-fr": {
+    "vocab_size": 50281,
+    "n_bytes": 1540504,
+    "n_tokens": 521776,
+    "n_chars": 1484970
+  },
+  "crystal_coder.cc100-fr": {
+    "vocab_size": 32022,
+    "n_bytes": 1540504,
+    "n_tokens": 447243,
+    "n_chars": 1484970
+  },
+  "dbrx_instruct.cc100-fr": {
+    "vocab_size": 100280,
+    "n_bytes": 1540504,
+    "n_tokens": 412685,
+    "n_chars": 1484970
+  },
+  "deepseek_coder_33b_instruct.cc100-fr": {
+    "vocab_size": 32022,
+    "n_bytes": 1540504,
+    "n_tokens": 537538,
+    "n_chars": 1484970
+  },
+  "deepseek_llm_7b_base.cc100-fr": {
+    "vocab_size": 100015,
+    "n_bytes": 1540504,
+    "n_tokens": 507693,
+    "n_chars": 1484970
+  },
+  "falcon_180b.cc100-fr": {
+    "vocab_size": 65024,
+    "n_bytes": 1540504,
+    "n_tokens": 407853,
+    "n_chars": 1484970
+  },
+  "falcon_7b.cc100-fr": {
+    "vocab_size": 65024,
+    "n_bytes": 1540504,
+    "n_tokens": 407853,
+    "n_chars": 1484970
+  },
+  "fastchat_t5_3b.cc100-fr": {
+    "vocab_size": 32110,
+    "n_bytes": 1540504,
+    "n_tokens": 717675,
+    "n_chars": 1484970
+  },
+  "flan_t5_base.cc100-fr": {
+    "vocab_size": 32100,
+    "n_bytes": 1540504,
+    "n_tokens": 476135,
+    "n_chars": 1484970
+  },
+  "gemma_7b.cc100-fr": {
+    "vocab_size": 256000,
+    "n_bytes": 1540504,
+    "n_tokens": 374551,
+    "n_chars": 1484970
+  },
+  "gpt2.cc100-fr": {
+    "vocab_size": 50257,
+    "n_bytes": 1540504,
+    "n_tokens": 521776,
+    "n_chars": 1484970
+  },
+  "gpt2_chinese.cc100-fr": {
+    "vocab_size": 21128,
+    "n_bytes": 1540504,
+    "n_tokens": 636442,
+    "n_chars": 1484970
+  },
+  "gpt_35_turbo.cc100-fr": {
+    "vocab_size": 100277,
+    "n_bytes": 1540504,
+    "n_tokens": 412685,
+    "n_chars": 1484970
+  },
+  "gpt_4.cc100-fr": {
+    "vocab_size": 100277,
+    "n_bytes": 1540504,
+    "n_tokens": 412685,
+    "n_chars": 1484970
+  },
+  "gpt_nexo_20b.cc100-fr": {
+    "vocab_size": 50277,
+    "n_bytes": 1540504,
+    "n_tokens": 458961,
+    "n_chars": 1484970
+  },
+  "grok_1.cc100-fr": {
+    "vocab_size": 131072,
+    "n_bytes": 1540504,
+    "n_tokens": 428298,
+    "n_chars": 1484970
+  },
+  "internlm2_chat_7b.cc100-fr": {
+    "vocab_size": 92544,
+    "n_bytes": 1540504,
+    "n_tokens": 496629,
+    "n_chars": 1484970
+  },
+  "internlm2_math_7b.cc100-fr": {
+    "vocab_size": 92544,
+    "n_bytes": 1540504,
+    "n_tokens": 496629,
+    "n_chars": 1484970
+  },
+  "internlm_chat_7b.cc100-fr": {
+    "vocab_size": 103168,
+    "n_bytes": 1540504,
+    "n_tokens": 495045,
+    "n_chars": 1484970
+  },
+  "internlm_xcomposer_7b.cc100-fr": {
+    "vocab_size": 103168,
+    "n_bytes": 1540504,
+    "n_tokens": 495045,
+    "n_chars": 1484970
+  },
+  "jamba_v0_1.cc100-fr": {
+    "vocab_size": 65536,
+    "n_bytes": 1540504,
+    "n_tokens": 412899,
+    "n_chars": 1484970
+  },
+  "kplug.cc100-fr": {
+    "vocab_size": 10261,
+    "n_bytes": 1540504,
+    "n_tokens": 638107,
+    "n_chars": 1484970
+  },
+  "llama.cc100-fr": {
+    "vocab_size": 32000,
+    "n_bytes": 1540504,
+    "n_tokens": 457243,
+    "n_chars": 1484970
+  },
+  "llama2.cc100-fr": {
+    "vocab_size": 32001,
+    "n_bytes": 1540504,
+    "n_tokens": 457243,
+    "n_chars": 1484970
+  },
+  "llama3.cc100-fr": {
+    "vocab_size": 128256,
+    "n_bytes": 1540504,
+    "n_tokens": 412146,
+    "n_chars": 1484970
+  },
+  "mistral_7b.cc100-fr": {
+    "vocab_size": 32000,
+    "n_bytes": 1540504,
+    "n_tokens": 476666,
+    "n_chars": 1484970
+  },
+  "mixtral_8_7b.cc100-fr": {
+    "vocab_size": 32000,
+    "n_bytes": 1540504,
+    "n_tokens": 476666,
+    "n_chars": 1484970
+  },
+  "mobilebert_uncased.cc100-fr": {
+    "vocab_size": 30522,
+    "n_bytes": 1540504,
+    "n_tokens": 504075,
+    "n_chars": 1484970
+  },
+  "moss.cc100-fr": {
+    "vocab_size": 106072,
+    "n_bytes": 1540504,
+    "n_tokens": 515669,
+    "n_chars": 1484970
+  },
+  "mt5_large.cc100-fr": {
+    "vocab_size": 250100,
+    "n_bytes": 1540504,
+    "n_tokens": 470944,
+    "n_chars": 1484970
+  },
+  "olmo_7b.cc100-fr": {
+    "vocab_size": 50280,
+    "n_bytes": 1540504,
+    "n_tokens": 458961,
+    "n_chars": 1484970
+  },
+  "orion_14b_chat.cc100-fr": {
+    "vocab_size": 84608,
+    "n_bytes": 1540504,
+    "n_tokens": 564107,
+    "n_chars": 1484970
+  },
+  "phi_1.cc100-fr": {
+    "vocab_size": 50295,
+    "n_bytes": 1540504,
+    "n_tokens": 521776,
+    "n_chars": 1484970
+  },
+  "phi_2.cc100-fr": {
+    "vocab_size": 50295,
+    "n_bytes": 1540504,
+    "n_tokens": 521776,
+    "n_chars": 1484970
+  },
+  "phi_3_mini.cc100-fr": {
+    "vocab_size": 32011,
+    "n_bytes": 1540504,
+    "n_tokens": 457243,
+    "n_chars": 1484970
+  },
+  "pko_t5_large.cc100-fr": {
+    "vocab_size": 50358,
+    "n_bytes": 1540504,
+    "n_tokens": 1044665,
+    "n_chars": 1484970
+  },
+  "prompt_clue.cc100-fr": {
+    "vocab_size": 32128,
+    "n_bytes": 1540504,
+    "n_tokens": 822012,
+    "n_chars": 1484970
+  },
+  "qwen1_5_14b_chat.cc100-fr": {
+    "vocab_size": 151646,
+    "n_bytes": 1540504,
+    "n_tokens": 413637,
+    "n_chars": 1484970
+  },
+  "qwen_1_8b_chat.cc100-fr": {
+    "vocab_size": 151851,
+    "n_bytes": 1540504,
+    "n_tokens": 413637,
+    "n_chars": 1484970
+  },
+  "qwen_72b_chat.cc100-fr": {
+    "vocab_size": 151851,
+    "n_bytes": 1540504,
+    "n_tokens": 413637,
+    "n_chars": 1484970
+  },
+  "qwen_7b_chat.cc100-fr": {
+    "vocab_size": 151851,
+    "n_bytes": 1540504,
+    "n_tokens": 413637,
+    "n_chars": 1484970
+  },
+  "roberta_chinese_clue.cc100-fr": {
+    "vocab_size": 8021,
+    "n_bytes": 1540504,
+    "n_tokens": 787363,
+    "n_chars": 1484970
+  },
+  "skywork_13b_base.cc100-fr": {
+    "vocab_size": 65519,
+    "n_bytes": 1540504,
+    "n_tokens": 457233,
+    "n_chars": 1484970
+  },
+  "skywork_13b_math.cc100-fr": {
+    "vocab_size": 65519,
+    "n_bytes": 1540504,
+    "n_tokens": 457233,
+    "n_chars": 1484970
+  },
+  "solar_10_7b.cc100-fr": {
+    "vocab_size": 32000,
+    "n_bytes": 1540504,
+    "n_tokens": 476666,
+    "n_chars": 1484970
+  },
+  "starchat_alpha.cc100-fr": {
+    "vocab_size": 49156,
+    "n_bytes": 1540504,
+    "n_tokens": 509958,
+    "n_chars": 1484970
+  },
+  "switch_c_2048.cc100-fr": {
+    "vocab_size": 32100,
+    "n_bytes": 1540504,
+    "n_tokens": 476133,
+    "n_chars": 1484970
+  },
+  "t5_base.cc100-fr": {
+    "vocab_size": 32100,
+    "n_bytes": 1540504,
+    "n_tokens": 476133,
+    "n_chars": 1484970
+  },
+  "t5_large.cc100-fr": {
+    "vocab_size": 32100,
+    "n_bytes": 1540504,
+    "n_tokens": 476133,
+    "n_chars": 1484970
+  },
+  "t5_small.cc100-fr": {
+    "vocab_size": 32100,
+    "n_bytes": 1540504,
+    "n_tokens": 476133,
+    "n_chars": 1484970
+  },
+  "text_davinci_003.cc100-fr": {
+    "vocab_size": 50281,
+    "n_bytes": 1540504,
+    "n_tokens": 521776,
+    "n_chars": 1484970
+  },
+  "tigerbot_13b_chat_v2.cc100-fr": {
+    "vocab_size": 60515,
+    "n_bytes": 1540504,
+    "n_tokens": 447372,
+    "n_chars": 1484970
+  },
+  "tigerbot_70b_chat_v4_4k.cc100-fr": {
+    "vocab_size": 65110,
+    "n_bytes": 1540504,
+    "n_tokens": 448567,
+    "n_chars": 1484970
+  },
+  "wizardcoder_15b_v1.cc100-fr": {
+    "vocab_size": 49153,
+    "n_bytes": 1540504,
+    "n_tokens": 509958,
+    "n_chars": 1484970
+  },
+  "wizardcoder_python_7b_v1.cc100-fr": {
+    "vocab_size": 32001,
+    "n_bytes": 1540504,
+    "n_tokens": 457243,
+    "n_chars": 1484970
+  },
+  "wizardlm_7b_v1.cc100-fr": {
+    "vocab_size": 32001,
+    "n_bytes": 1540504,
+    "n_tokens": 457243,
+    "n_chars": 1484970
+  },
+  "wizardmath_70b_v1.cc100-fr": {
+    "vocab_size": 32002,
+    "n_bytes": 1540504,
+    "n_tokens": 457243,
+    "n_chars": 1484970
+  },
+  "xlm_roberta.cc100-fr": {
+    "vocab_size": 250002,
+    "n_bytes": 1540504,
+    "n_tokens": 405041,
+    "n_chars": 1484970
+  },
+  "yi_34b.cc100-fr": {
+    "vocab_size": 64000,
+    "n_bytes": 1540504,
+    "n_tokens": 533106,
+    "n_chars": 1484970
+  },
+  "yi_6b.cc100-fr": {
+    "vocab_size": 64000,
+    "n_bytes": 1540504,
+    "n_tokens": 533106,
+    "n_chars": 1484970
+  },
+  "yi_vl34b.cc100-fr": {
+    "vocab_size": 64000,
+    "n_bytes": 1540504,
+    "n_tokens": 532288,
+    "n_chars": 1484970
+  },
+  "zephyr_7b_beta.cc100-fr": {
+    "vocab_size": 32000,
+    "n_bytes": 1540504,
+    "n_tokens": 476666,
+    "n_chars": 1484970
+  }
+}

stats/compress_rate/amber.en.json DELETED Viewed

	@@ -1 +0,0 @@
1	- {"vocab_size": 32000, "n_bytes": 1124813, "n_tokens": 294627, "n_chars": 1121360}

stats/compress_rate/amber.zh-Hans.json DELETED Viewed

	@@ -1 +0,0 @@
1	- {"vocab_size": 32000, "n_bytes": 2633047, "n_tokens": 1330093, "n_chars": 927311}

stats/compress_rate/aya_101.en.json DELETED Viewed

	@@ -1 +0,0 @@
1	- {"vocab_size": 250100, "n_bytes": 1124813, "n_tokens": 317881, "n_chars": 1121360}

stats/compress_rate/aya_101.zh-Hans.json DELETED Viewed

	@@ -1 +0,0 @@
1	- {"vocab_size": 250100, "n_bytes": 2633047, "n_tokens": 631182, "n_chars": 927311}

stats/compress_rate/baichuan.en.json DELETED Viewed

	@@ -1 +0,0 @@
1	- {"vocab_size": 64000, "n_bytes": 1124813, "n_tokens": 280108, "n_chars": 1121360}

stats/compress_rate/baichuan.zh-Hans.json DELETED Viewed

	@@ -1 +0,0 @@
1	- {"vocab_size": 64000, "n_bytes": 2633047, "n_tokens": 626117, "n_chars": 927311}

stats/compress_rate/baichuan2.en.json DELETED Viewed

	@@ -1 +0,0 @@
1	- {"vocab_size": 125696, "n_bytes": 1124813, "n_tokens": 269011, "n_chars": 1121360}

stats/compress_rate/baichuan2.zh-Hans.json DELETED Viewed

	@@ -1 +0,0 @@
1	- {"vocab_size": 125696, "n_bytes": 2633047, "n_tokens": 541464, "n_chars": 927311}

stats/compress_rate/bert_base_cased.en.json DELETED Viewed

	@@ -1 +0,0 @@
1	- {"vocab_size": 28996, "n_bytes": 1124813, "n_tokens": 288022, "n_chars": 1121360}

stats/compress_rate/bert_base_cased.zh-Hans.json DELETED Viewed

	@@ -1 +0,0 @@
1	- {"vocab_size": 28996, "n_bytes": 2633047, "n_tokens": 899709, "n_chars": 927311}

stats/compress_rate/bert_base_chinese.en.json DELETED Viewed

	@@ -1 +0,0 @@
1	- {"vocab_size": 21128, "n_bytes": 1124813, "n_tokens": 377068, "n_chars": 1121360}

stats/compress_rate/bert_base_chinese.zh-Hans.json DELETED Viewed

	@@ -1 +0,0 @@
1	- {"vocab_size": 21128, "n_bytes": 2633047, "n_tokens": 896599, "n_chars": 927311}

stats/compress_rate/bert_base_uncased.en.json DELETED Viewed

	@@ -1 +0,0 @@
1	- {"vocab_size": 30522, "n_bytes": 1124813, "n_tokens": 280575, "n_chars": 1121360}

stats/compress_rate/bert_base_uncased.zh-Hans.json DELETED Viewed

	@@ -1 +0,0 @@
1	- {"vocab_size": 30522, "n_bytes": 2633047, "n_tokens": 898554, "n_chars": 927311}

stats/compress_rate/bloom.en.json DELETED Viewed

	@@ -1 +0,0 @@
1	- {"vocab_size": 250680, "n_bytes": 1124813, "n_tokens": 257405, "n_chars": 1121360}

stats/compress_rate/bloom.zh-Hans.json DELETED Viewed

	@@ -1 +0,0 @@
1	- {"vocab_size": 250680, "n_bytes": 2633047, "n_tokens": 573008, "n_chars": 927311}

stats/compress_rate/byt5_small.en.json DELETED Viewed

	@@ -1 +0,0 @@
1	- {"vocab_size": 256, "n_bytes": 1124813, "n_tokens": 1134813, "n_chars": 1121360}

stats/compress_rate/byt5_small.zh-Hans.json DELETED Viewed

	@@ -1 +0,0 @@
1	- {"vocab_size": 256, "n_bytes": 2633047, "n_tokens": 2643047, "n_chars": 927311}

stats/compress_rate/character_glm_6b.en.json DELETED Viewed

	@@ -1 +0,0 @@
1	- {"vocab_size": 64794, "n_bytes": 1124813, "n_tokens": 289347, "n_chars": 1121360}

stats/compress_rate/character_glm_6b.zh-Hans.json DELETED Viewed

	@@ -1 +0,0 @@
1	- {"vocab_size": 64794, "n_bytes": 2633047, "n_tokens": 583646, "n_chars": 927311}

stats/compress_rate/chatglm2_6b.en.json DELETED Viewed

	@@ -1 +0,0 @@
1	- {"vocab_size": 64794, "n_bytes": 1124813, "n_tokens": 289329, "n_chars": 1121360}

stats/compress_rate/chatglm2_6b.zh-Hans.json DELETED Viewed

	@@ -1 +0,0 @@
1	- {"vocab_size": 64794, "n_bytes": 2633047, "n_tokens": 583646, "n_chars": 927311}

stats/compress_rate/chatglm3_6b.en.json DELETED Viewed

	@@ -1 +0,0 @@
1	- {"vocab_size": 64798, "n_bytes": 1124813, "n_tokens": 289347, "n_chars": 1121360}

stats/compress_rate/chatglm3_6b.zh-Hans.json DELETED Viewed

	@@ -1 +0,0 @@
1	- {"vocab_size": 64798, "n_bytes": 2633047, "n_tokens": 583646, "n_chars": 927311}

stats/compress_rate/chatglm_6b.en.json DELETED Viewed

	@@ -1 +0,0 @@
1	- {"vocab_size": 150344, "n_bytes": 1124813, "n_tokens": 284761, "n_chars": 1121360}

stats/compress_rate/chatglm_6b.zh-Hans.json DELETED Viewed

	@@ -1 +0,0 @@
1	- {"vocab_size": 150344, "n_bytes": 2633047, "n_tokens": 527384, "n_chars": 927311}

stats/compress_rate/chatyuan_large_v2.en.json DELETED Viewed

	@@ -1 +0,0 @@
1	- {"vocab_size": 32128, "n_bytes": 1124813, "n_tokens": 536033, "n_chars": 1121360}

stats/compress_rate/chatyuan_large_v2.zh-Hans.json DELETED Viewed

	@@ -1 +0,0 @@
1	- {"vocab_size": 32128, "n_bytes": 2633047, "n_tokens": 564905, "n_chars": 927311}

stats/compress_rate/chinese_llama.en.json DELETED Viewed

	@@ -1 +0,0 @@
1	- {"vocab_size": 49953, "n_bytes": 1124813, "n_tokens": 291514, "n_chars": 1121360}

stats/compress_rate/chinese_llama.zh-Hans.json DELETED Viewed

	@@ -1 +0,0 @@
1	- {"vocab_size": 49953, "n_bytes": 2633047, "n_tokens": 623219, "n_chars": 927311}

stats/compress_rate/chinese_llama2.en.json DELETED Viewed

	@@ -1 +0,0 @@
1	- {"vocab_size": 55296, "n_bytes": 1124813, "n_tokens": 294627, "n_chars": 1121360}

stats/compress_rate/chinese_llama2.zh-Hans.json DELETED Viewed

	@@ -1 +0,0 @@
1	- {"vocab_size": 55296, "n_bytes": 2633047, "n_tokens": 625766, "n_chars": 927311}

stats/compress_rate/code_davinci_002.en.json DELETED Viewed

	@@ -1 +0,0 @@
1	- {"vocab_size": 50281, "n_bytes": 1124813, "n_tokens": 258403, "n_chars": 1121360}

stats/compress_rate/code_davinci_002.zh-Hans.json DELETED Viewed

	@@ -1 +0,0 @@
1	- {"vocab_size": 50281, "n_bytes": 2633047, "n_tokens": 1876809, "n_chars": 927311}

stats/compress_rate/crystal_coder.en.json DELETED Viewed

	@@ -1 +0,0 @@
1	- {"vocab_size": 32000, "n_bytes": 1124813, "n_tokens": 284627, "n_chars": 1121360}

stats/compress_rate/crystal_coder.zh-Hans.json DELETED Viewed

	@@ -1 +0,0 @@
1	- {"vocab_size": 32000, "n_bytes": 2633047, "n_tokens": 1320093, "n_chars": 927311}

stats/compress_rate/dbrx_instruct.en.json DELETED Viewed

	@@ -1 +0,0 @@
1	- {"vocab_size": 100277, "n_bytes": 1124813, "n_tokens": 254985, "n_chars": 1121360}

stats/compress_rate/dbrx_instruct.zh-Hans.json DELETED Viewed

	@@ -1 +0,0 @@
1	- {"vocab_size": 100277, "n_bytes": 2633047, "n_tokens": 1084939, "n_chars": 927311}

stats/compress_rate/deepseek_coder_33b_instruct.en.json DELETED Viewed

	@@ -1 +0,0 @@
1	- {"vocab_size": 32000, "n_bytes": 1124813, "n_tokens": 287408, "n_chars": 1121360}

stats/compress_rate/deepseek_coder_33b_instruct.zh-Hans.json DELETED Viewed

	@@ -1 +0,0 @@
1	- {"vocab_size": 32000, "n_bytes": 2633047, "n_tokens": 720577, "n_chars": 927311}