File size: 4,909 Bytes
2bd606a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
97354e0
2bd606a
 
 
 
 
 
f1b4ae2
2bd606a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
"""
TODO:
- 统计 tokenizer_impl
- 统计 OOV
- 统计 reversal
- 增加 math,code
"""

import gradio as gr
from compression_util import get_compression_leaderboard, common_corpuses


with gr.Blocks() as demo:
    # Compression-rate leaderboard page: a settings panel (corpus subsets and
    # measure), a search box, and a results table that is recomputed by
    # `get_compression_leaderboard` whenever one of those inputs changes.

    # TODO: "Convertor" accordion (not wired up yet): a file-size box with an
    # MB/GB/TB unit dropdown, "≈", and a token-count box with a
    # million/billion/trillion dropdown.

    gr.Markdown("## 🛠️ Setting")  # ⚙
    with gr.Accordion("Please select the corpus and measure of compression rate.", open=True):
        with gr.Row():
            # Left column: the two controls that parameterize the leaderboard.
            with gr.Column():
                compress_rate_corpus = gr.Dropdown(
                    common_corpuses,  # , "code"
                    value=["cc100/en", "cc100/zh-Hans", "cc100/fr", "cc100/es"],
                    label="corpus",
                    multiselect=True
                )

                # unit of file_size: gigabyte, terabyte
                # unit of token_num: million, billion, trillion
                compress_rate_unit = gr.Radio(
                    ["b_tokens/g_bytes", "t_tokens/t_bytes"],
                    value="b_tokens/g_bytes",
                    label="measure",  # evaluation metric
                )

            # Right side of the row: static help text describing the controls
            # and the leaderboard columns.
            gr.Markdown(
                "- `corpus`: tokenization is performed on the selected subsets of [cc100](https://huggingface.co/datasets/cc100) corpus.\n"
                "- `b_tokens/g_bytes` measures how many billion tokens per gigabytes corpus.\n"
                "- `t_tokens/t_bytes` measures how many trillion tokens per terabytes corpus.\n"
                # "- `g_bytes/b_tokens` measures how many gigabytes corpus per billion tokens.\n"
                # "- `t_bytes/t_tokens` measures how many terabytes corpus per trillion tokens.\n"
                "- `char/token` measures how many chars per token on the tokenized corpus.\n"
                "- `oov_ratio`: out-of-vocabulary ratio on the selected corpus, 👉 get [oov charset](https://huggingface.co/spaces/eson/tokenizer-arena/raw/main/stats/compression_rate.json)\n\n"
                "You can reproduce this procedure with [compression_util.py](https://huggingface.co/spaces/eson/tokenizer-arena/blob/main/compression_util.py)."
            )

    gr.Markdown("## 🏆 Compression Rate Leaderboard")
    search_bar = gr.Textbox(
        placeholder="🔍 Search by tokenizer or organization (e.g., 'llama', 'openai') and press ENTER...",
        show_label=False,
        elem_id="search-bar",
    )
    compress_rate_table = gr.Dataframe(datatype="html")

    # Event wiring. Every trigger refreshes the table through the same
    # callback with the same (corpus, unit, search text) arguments, so the
    # inputs are declared once. The initial page load previously omitted the
    # search box; it now goes through the same call path as the other events.
    leaderboard_inputs = [compress_rate_corpus, compress_rate_unit, search_bar]
    refresh_triggers = (
        compress_rate_corpus.change,  # corpus selection edited
        compress_rate_unit.change,    # measure switched
        search_bar.submit,            # search text submitted
        demo.load,                    # page first rendered
    )
    for register in refresh_triggers:
        register(
            get_compression_leaderboard,
            inputs=leaderboard_inputs,
            outputs=compress_rate_table,
        )

# Start the Gradio server only when executed as a script, not on import.
if __name__ == "__main__":
    demo.launch()