Spaces:
Running
Running
File size: 2,735 Bytes
7c73423 6ef6bf4 7c73423 6ef6bf4 7c73423 6ef6bf4 7c73423 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 |
"""
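Gradio UI for character statistics: number and length of tokens that contain
digits, spaces, or Chinese/Korean/Japanese characters.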
"""
import gradio as gr
from character_util import get_character_table, default_columns
# Selectable character types as (checkbox label, abbreviation) pairs.
# The abbreviation (second item) is the key used in `abbr2name` below.
all_columns = [
    ("digit", "digit"),
    ("space", "space"),
    ("lang-chinese", "zh"),
    ("lang-korea", "ko"),
    ("lang-japanese", "ja"),
    # ("byte", "byte"),
    # ("oov", "oov")
]
# columns = ["lang-zh", "lang-korea", "lang-japanese", "number", "space", "bytes", "oov"]

# Abbreviation -> readable name: the label text after its last "-" (the whole
# label when it has no "-"), e.g. "lang-chinese" -> "chinese".
abbr2name = {abbr: label.rpartition("-")[2] for label, abbr in all_columns}
def get_column_info(columns):
    """Build the Markdown legend describing the selected statistic columns.

    Args:
        columns: Iterable of character-type abbreviations; each one must be
            a key of ``abbr2name``.

    Returns:
        A Markdown string with two bullet lines per abbreviation (one for
        ``num(...)``, one for ``len(...)``), in input order. Empty string
        when ``columns`` is empty.

    Raises:
        KeyError: If an abbreviation is not present in ``abbr2name``.
    """
    bullets = []
    for abbr in columns:
        readable = abbr2name[abbr]
        bullets.append(
            f"- `num({abbr})`: num of tokens containing {readable} characters\n"
        )
        bullets.append(
            f"- `len({abbr})`: `min,median,max` length of tokens containing {readable} characters\n"
        )
    return "".join(bullets)
# Page layout and event wiring for the character-statistics demo.
# NOTE(review): the nesting below was reconstructed from a copy of this file
# whose indentation had been lost -- confirm that `column_info` is meant to sit
# inside the row, next to the checkbox column.
with gr.Blocks() as demo:
    gr.Markdown("## 🛠️ Setting")  # ⚙
    with gr.Accordion("Please select the type of character you want to count.", open=True):
        # file size 💽 🖴, tokens 🧮
        with gr.Row():
            with gr.Column():
                # Use the canonical class name `CheckboxGroup` rather than
                # the lowercase-g alias `Checkboxgroup`.
                columns = gr.CheckboxGroup(
                    all_columns,
                    value=default_columns,
                    label="character type",
                )
                gr.Markdown(
                    "To count other types of characters, you can modify [lang_util.py]"
                    "(https://huggingface.co/spaces/eson/tokenizer-arena/blob/main/utils/lang_util.py). "
                )
            # Legend for the selected columns; starts out describing the defaults.
            column_info = gr.Markdown(
                get_column_info(default_columns)
            )

    gr.Markdown("## 📊 Character Statistics")
    search_bar = gr.Textbox(
        placeholder="🔍 Search by tokenizer or organization (e.g., 'llama', 'openai') and press ENTER...",
        show_label=False,
        elem_id="search-bar",
    )
    compress_rate_table = gr.Dataframe(datatype="html", wrap=True)

    # ENTER in the search box rebuilds the table from the current query and
    # column selection. This is the only listener not marked show_api=False.
    search_bar.submit(
        get_character_table,
        inputs=[search_bar, columns],
        outputs=compress_rate_table,
    )

    # Changing the selected character types refreshes the table...
    columns.change(
        get_character_table,
        inputs=[search_bar, columns],
        outputs=compress_rate_table,
        show_api=False,
    )
    # ...and the legend that explains the selected columns.
    columns.change(
        get_column_info,
        inputs=[columns],
        outputs=column_info,
        show_api=False,
    )

    # Fill the table once when the page is first loaded.
    demo.load(
        get_character_table,
        inputs=[search_bar, columns],
        outputs=compress_rate_table,
        show_api=False,
    )

if __name__ == "__main__":
    demo.launch()
|