""" TODO: - 统计 tokenizer_impl - 统计 OOV - 统计 reversal - 增加 math,code """ import gradio as gr from compression_util import get_compression_leaderboard, common_corpuses with gr.Blocks() as demo: # gr.Markdown("## Convertor") # with gr.Accordion("Convertor", open=False): # gr.Markdown("Tokenize {} corpus") # with gr.Row(elem_classes="no-border"): # gr.Button("File Size", min_width=50) # file_size = gr.Textbox( # show_label=False, # min_width=50, # # elem_classes="textbox-as-text" # ) # gr.Dropdown( # choices=['MB', 'GB', 'TB'], # show_label=False, # min_width=15, # # elem_classes="textbox-as-text" # ) # # gr.Markdown('

') # # gr.HTML('

    #         gr.Button(
    #             "≈",
    #             min_width=10,
    #             elem_classes="button-white h2-font"
    #         )
    #         gr.Button(
    #             "Tokens",
    #             min_width=50
    #         )
    #         gr.Textbox(
    #             show_label=False,
    #             min_width=50
    #         )
    #         gr.Dropdown(
    #             ['million', 'billion', 'trillion'],
    #             show_label=False,
    #             min_width=15,
    #             elem_classes="button-white"
    #         )

    gr.Markdown("## 🛠️ Setting")  # ⚙
    with gr.Accordion("Please select the corpus and the measure of compression rate.", open=True):
        # corpus file size 💽, token count 🧮
        with gr.Row():
            with gr.Column():
                compress_rate_corpus = gr.Dropdown(
                    common_corpuses,  # TODO: add "code"
                    value=["cc100/en", "cc100/zh-Hans", "cc100/fr", "cc100/es"],
                    label="corpus",
                    multiselect=True,
                )
                # file-size units: gigabyte, terabyte; token-count units: million, billion, trillion
                compress_rate_unit = gr.Radio(
                    ["b_tokens/g_bytes", "t_tokens/t_bytes"],
                    value="b_tokens/g_bytes",
                    label="measure",  # i.e. the evaluation metric
                )

        gr.Markdown(
            "- `corpus`: tokenization is performed on the selected subsets of the [cc100](https://huggingface.co/datasets/cc100) corpus.\n"
            "- `b_tokens/g_bytes` measures how many billion tokens are produced per gigabyte of corpus.\n"
            "- `t_tokens/t_bytes` measures how many trillion tokens are produced per terabyte of corpus.\n"
            # "- `g_bytes/b_tokens` measures how many gigabytes of corpus per billion tokens.\n"
            # "- `t_bytes/t_tokens` measures how many terabytes of corpus per trillion tokens.\n"
            "- `char/token` measures the average number of characters per token on the tokenized corpus.\n"
            "- `oov_ratio`: the out-of-vocabulary ratio on the selected corpus. 👉 see the [oov charset](https://huggingface.co/spaces/eson/tokenizer-arena/blob/main/stats/compression_rate.json)\n\n"
            "You can reproduce this procedure with [compression_util.py](https://huggingface.co/spaces/eson/tokenizer-arena/blob/main/compression_util.py)."
        )

    gr.Markdown("## 🏆 Compression Rate Leaderboard")
    search_bar = gr.Textbox(
        placeholder="🔍 Search by tokenizer or organization (e.g., 'llama', 'openai') and press ENTER...",
        show_label=False,
        elem_id="search-bar",
    )
    compress_rate_table = gr.Dataframe(datatype="html")

    # Refresh the leaderboard whenever the corpus selection, the measure,
    # or the search query changes.
    compress_rate_corpus.change(
        get_compression_leaderboard,
        inputs=[compress_rate_corpus, compress_rate_unit, search_bar],
        outputs=compress_rate_table,
    )
    compress_rate_unit.change(
        get_compression_leaderboard,
        inputs=[compress_rate_corpus, compress_rate_unit, search_bar],
        outputs=compress_rate_table,
    )
    # file_size.change(
    #     get_all_compress_rate,
    #     outputs=compress_rate_table
    # )
    search_bar.submit(
        get_compression_leaderboard,
        inputs=[compress_rate_corpus, compress_rate_unit, search_bar],
        outputs=compress_rate_table,
    )

    # Populate the leaderboard on initial page load.
    demo.load(
        get_compression_leaderboard,
        inputs=[compress_rate_corpus, compress_rate_unit],
        outputs=compress_rate_table,
    )

if __name__ == "__main__":
    demo.launch()
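
# ---------------------------------------------------------------------------
# Worked example of the "measure" definitions rendered above. This is a
# minimal sketch only: the leaderboard's real numbers come from
# compression_util.get_compression_leaderboard, whose internals are not shown
# here, and `example_compression_measures` is a hypothetical helper.
def example_compression_measures(n_tokens: int, n_bytes: int, n_chars: int) -> dict:
    """Compute the leaderboard measures for one tokenized corpus.

    `b_tokens/g_bytes` and `t_tokens/t_bytes` are the same ratio
    (tokens per byte) expressed in different units, so the two options
    rank tokenizers identically.
    """
    return {
        # billion tokens per gigabyte of corpus
        "b_tokens/g_bytes": (n_tokens / 1e9) / (n_bytes / 1e9),
        # trillion tokens per terabyte of corpus
        "t_tokens/t_bytes": (n_tokens / 1e12) / (n_bytes / 1e12),
        # average characters per token
        "char/token": n_chars / n_tokens,
    }

# Illustrative figures (not real measurements): a 5 GB corpus of 4.8 billion
# chars tokenized into 1.4 billion tokens gives 0.28 b_tokens/g_bytes and
# ~3.43 char/token.
# print(example_compression_measures(1_400_000_000, 5_000_000_000, 4_800_000_000))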