# dutch-tokenizer-arena / app_compression.py
# Uploaded by: yhavinga
# Commit message: Add Llama tokenizer creation for Dutch, English, Code, Markdown and TeX.
# Commit: c78da21
import gradio as gr
from utils.compression_util import get_compression_leaderboard
from utils.compression_util import common_corpuses
# Gradio page for the tokenizer compression-rate leaderboard.
#
# Layout: a "Setting" section (corpus multiselect + measure radio), followed by
# a search box and the leaderboard table. Every user-triggered event recomputes
# the table by calling `get_compression_leaderboard`.
#
# NOTE: a commented-out "Convertor" prototype (file size <-> token count) used
# to live at the top of this block; it was dead code and has been removed.
with gr.Blocks() as demo:
    gr.Markdown("## 🛠️ Setting")  # ⚙
    with gr.Accordion("Please select corpus and measure of compression rate ...", open=True):
        with gr.Row():
            compress_rate_corpus = gr.Dropdown(
                common_corpuses,
                value=["cc100-nl", "cc100-en"],
                label="corpus",
                multiselect=True,
            )
            # unit of file_size: gigabyte, terabyte
            # unit of token_num: million, billion, trillion
            compress_rate_unit = gr.Radio(
                ["b_tokens/g_bytes", "t_tokens/t_bytes"],
                value="b_tokens/g_bytes",
                label="measure",
            )

        # NOTE(review): placed inside the accordion but below the row of
        # controls; the original indentation was not available -- confirm.
        gr.Markdown(
            "- `b_tokens/g_bytes` measures how many billion tokens per gigabytes corpus. \n"
            "- `t_tokens/t_bytes` measures how many trillion tokens per terabytes corpus. \n"
            "- `n_chars/n_tokens` measures how many chars per token in the tokenized corpus. \n"
        )

    gr.Markdown("## 🏆 Compression Rate Leaderboard")
    search_bar = gr.Textbox(
        placeholder="🔍 Search tokenizers(e.g., 'llama') and press ENTER...",
        show_label=False,
        elem_id="search-bar",
    )
    compress_rate_table = gr.Dataframe()

    # Event wiring.
    # Every user-triggered refresh passes the search text as well, so that
    # changing the corpus or the measure keeps the active tokenizer filter
    # instead of silently resetting the table to the unfiltered leaderboard
    # while the search box still shows the old query.
    leaderboard_inputs = [compress_rate_corpus, compress_rate_unit, search_bar]
    for register_refresh in (
        compress_rate_corpus.change,
        compress_rate_unit.change,
        search_bar.submit,
    ):
        register_refresh(
            get_compression_leaderboard,
            inputs=leaderboard_inputs,
            outputs=compress_rate_table,
        )

    # Initial population on page load: the search box is still empty here, so
    # the original two-input call is kept unchanged.
    demo.load(
        get_compression_leaderboard,
        inputs=[compress_rate_corpus, compress_rate_unit],
        outputs=compress_rate_table,
    )

if __name__ == "__main__":
    demo.launch()