File size: 4,750 Bytes
1b7fc74 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 |
import gradio as gr
from utils.compression_util import get_compression_leaderboard
from utils.compression_util import common_corpuses
with gr.Blocks() as demo:
    # NOTE: a "Convertor" accordion (file size in MB/GB/TB ≈ number of tokens
    # in million/billion/trillion) was prototyped at this spot and is
    # currently disabled; restore it from version control if it is needed.

    gr.Markdown("## 🛠️ Setting")  # ⚙
    with gr.Accordion("Please select corpus and measure of compression rate ...", open=True):
        with gr.Row():
            # Corpora the leaderboard is computed on; several can be picked at once.
            compress_rate_corpus = gr.Dropdown(
                common_corpuses,  # , "code"
                value=["cc100-en", "cc100-zh-Hans"],
                label="corpus",
                multiselect=True,
            )
            # Unit in which the rate is reported:
            #   file size -> gigabyte / terabyte
            #   token count -> billion / trillion
            compress_rate_unit = gr.Radio(
                ["b_tokens/g_bytes", "t_tokens/t_bytes"],
                value="b_tokens/g_bytes",
                label="measure",
            )

        # User-facing explanation of the measures.
        gr.Markdown(
            "- `b_tokens/g_bytes` measures how many billion tokens per gigabytes corpus. \n"
            "- `t_tokens/t_bytes` measures how many trillion tokens per terabytes corpus. \n"
            "- `n_chars/n_tokens` measures how many chars per token in the tokenized corpus. \n\n"
            "All the above measures are depend on corpus. You can reproduce this "
            "procedure at [github](https://github.com/xu-song/tokenizer-arena/)."
        )

    gr.Markdown("## 🏆 Compression Rate Leaderboard")
    search_bar = gr.Textbox(
        placeholder="🔍 Search tokenizers(e.g., 'llama') and press ENTER...",
        show_label=False,
        elem_id="search-bar",
    )
    compress_rate_table = gr.Dataframe()

    # --- event wiring ---
    # Every user-triggered refresh forwards the current search text as the
    # third argument, exactly as the search box's own submit event does.
    # Previously the two `change` events passed only corpus and measure, so
    # changing either one rebuilt the table without the tokenizer filter while
    # the search box still displayed it.
    filtered_inputs = [compress_rate_corpus, compress_rate_unit, search_bar]
    for refresh_event in (
        compress_rate_corpus.change,
        compress_rate_unit.change,
        search_bar.submit,
    ):
        refresh_event(
            get_compression_leaderboard,
            inputs=filtered_inputs,
            outputs=compress_rate_table,
        )
    # NOTE: a `file_size.change` hook belonged to the disabled Convertor
    # prototype and is intentionally not wired.

    # Initial fill when the page loads. The search box is created without an
    # initial value, so only corpus and measure are passed here (unchanged).
    demo.load(
        get_compression_leaderboard,
        inputs=[compress_rate_corpus, compress_rate_unit],
        outputs=compress_rate_table,
    )

if __name__ == "__main__":
    demo.launch()
|