# Source: tokenizer-arena / character_app.py
# File-view metadata: commit "update" (7c73423) by eson, 2.65 kB.
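"""
Gradio page that reports character statistics for tokenizers.

The user selects character types (digit, space, Chinese, Korean, Japanese)
and may search by tokenizer or organization; the table contents come from
``character_util.get_character_table``.
"""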
import gradio as gr
from character_util import get_character_table, default_columns
# Character categories offered in the "character type" checkbox group.
# Each entry is a (label, abbreviation) pair; the abbreviation is what the
# rest of this module uses as the column key.
# Two further pairs, ("byte", "byte") and ("oov", "oov"), are left disabled.
all_columns = [
    ("digit", "digit"),
    ("space", "space"),
    ("lang-chinese", "zh"),
    ("lang-korea", "ko"),
    ("lang-japanese", "ja"),
]

# Abbreviation -> short name, where the short name is the part of the label
# after its last "-" (the whole label when it contains no "-").
abbr2name = {abbr: label.rsplit("-", 1)[-1] for label, abbr in all_columns}
def get_column_info(columns):
    """Build the markdown legend explaining the per-column statistics.

    For every abbreviation in ``columns`` two bullet lines are emitted: one
    describing ``num(<abbr>)`` and one describing ``len(<abbr>)``.

    Args:
        columns: iterable of column abbreviations; each one must be a key of
            the module-level ``abbr2name`` mapping.

    Returns:
        The bullets concatenated into a single markdown string, or an empty
        string when ``columns`` yields nothing.

    Raises:
        KeyError: if an abbreviation is missing from ``abbr2name``.
    """
    bullets = []
    for abbr in columns:
        # Look the readable name up once; both bullets for this column use it.
        name = abbr2name[abbr]
        bullets.append(
            f"- `num({abbr})`: num of tokens containing {name} characters\n"
        )
        bullets.append(
            f"- `len({abbr})`: `min,median,max` length of tokens containing {name} characters\n"
        )
    return "".join(bullets)
# Page layout: a settings accordion on top, the statistics table underneath.
# NOTE(review): the nesting of the Row/Column containers below is reconstructed
# (checkbox group and hint in the column, legend next to it) -- confirm against
# the intended layout.
with gr.Blocks() as demo:
    gr.Markdown("## 🛠️ Setting")
    with gr.Accordion("Please select the type of character you want to count.", open=True):
        with gr.Row():
            with gr.Column():
                # Choices are the (label, abbreviation) pairs of all_columns.
                columns = gr.Checkboxgroup(
                    all_columns,
                    value=default_columns,
                    label="character type",
                )
                gr.Markdown(
                    "To count other types of characters, you can modify [lang_util.py]"
                    "(https://huggingface.co/spaces/eson/tokenizer-arena/blob/main/utils/lang_util.py). "
                )
            # Legend for the currently selected columns; starts with the defaults.
            column_info = gr.Markdown(get_column_info(default_columns))

    gr.Markdown("## 📊 Character Statistics")
    search_bar = gr.Textbox(
        placeholder="🔍 Search by tokenizer or organization (e.g., 'llama', 'openai') and press ENTER...",
        show_label=False,
        elem_id="search-bar",
    )
    compress_rate_table = gr.Dataframe(datatype="html", wrap=True)

    # The table is rebuilt when a search is submitted, when the column
    # selection changes, and once when the page loads.
    search_bar.submit(
        get_character_table, inputs=[search_bar, columns], outputs=compress_rate_table
    )
    columns.change(
        get_character_table, inputs=[search_bar, columns], outputs=compress_rate_table
    )
    # A selection change also refreshes the legend.
    columns.change(get_column_info, inputs=[columns], outputs=column_info)
    demo.load(
        get_character_table, inputs=[search_bar, columns], outputs=compress_rate_table
    )

# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()