"""Gradio page that displays character statistics for tokenizers.

The user picks which character types to count; the statistics table is
produced by ``character_util.get_character_table`` and re-rendered whenever
the search text is submitted or the selection changes.
"""

import gradio as gr

from character_util import get_character_table

# (display label, abbreviation) pairs offered in the checkbox group.
all_columns = [
    ("digit", "digit"),
    ("space", "space"),
    ("lang-chinese", 'zh'),
    ("lang-korea", 'ko'),
    ("lang-japanese", 'ja'),
    # Currently disabled character types:
    # ("byte", "byte"),
    # ("oov", "oov")
]
default_columns = ["digit", "zh"]

# columns = ["lang-zh", "lang-korea", "lang-japanese", "number", "space", "bytes", "oov"]

# Abbreviation -> readable name; the name is the part of the label after the
# last '-' (e.g. "lang-chinese" -> "chinese", "digit" -> "digit").
abbr2name = {abbr: label.split('-')[-1] for label, abbr in all_columns}


def get_column_info(columns):
    """Return a markdown legend describing the selected statistic columns.

    Args:
        columns: Iterable of column abbreviations; each must be a key of
            ``abbr2name``.

    Returns:
        A markdown string with two bullet lines (``num(...)`` and
        ``len(...)``) per selected column, or an empty string when nothing
        is selected.

    Raises:
        KeyError: If an abbreviation is not present in ``abbr2name``.
    """
    # Trace the current selection on stdout.
    print(columns)
    # Build the legend in one pass instead of repeated string concatenation.
    return "".join(
        f"- `num({column})`: num of tokens containing {abbr2name[column]} characters\n"
        f"- `len({column})`: `min,median,max` length of tokens containing {abbr2name[column]} characters\n"
        for column in columns
    )


with gr.Blocks() as demo:
    gr.Markdown("## 🛠️ Setting")  # ⚙
    with gr.Accordion("Please select the type of character you want to count.", open=True):
        # file size 💽 🖴, tokens 🧮
        with gr.Row():
            with gr.Column():
                columns = gr.CheckboxGroup(
                    all_columns,
                    value=default_columns,
                    label="character type",
                    # info=""
                )
                gr.Markdown(
                    "To count other types of characters, you can modify [character_util.py]"
                    "(https://huggingface.co/spaces/eson/tokenizer-arena/blob/main/character_util.py). "
                )
            # NOTE(review): the legend is placed beside the selector column in
            # the same row; confirm this matches the intended layout.
            column_info = gr.Markdown(
                get_column_info(default_columns)
            )

    gr.Markdown("## 📊 Character Statistics")
    search_bar = gr.Textbox(
        placeholder="🔍 Search by tokenizer or organization (e.g., 'llama', 'openai') and press ENTER...",
        show_label=False,
        elem_id="search-bar",
    )
    compress_rate_table = gr.Dataframe(datatype="html", wrap=True)

    # Every trigger that refreshes the table passes the same inputs/outputs.
    table_inputs = [search_bar, columns]

    search_bar.submit(
        get_character_table,
        inputs=table_inputs,
        outputs=compress_rate_table,
    )
    columns.change(
        get_character_table,
        inputs=table_inputs,
        outputs=compress_rate_table,
    )
    # Keep the legend in sync with the selected character types.
    columns.change(
        get_column_info,
        inputs=[columns],
        outputs=column_info,
    )

    # Populate the table when the page first loads.
    demo.load(
        get_character_table,
        inputs=table_inputs,
        outputs=compress_rate_table,
    )

if __name__ == "__main__":
    demo.launch()