File size: 2,592 Bytes
2bd606a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
import gradio as gr
from character_util import get_character_table

# (display label, abbreviation) pairs for every character type the table can
# count; the commented-out entries are candidates that are not implemented yet.
all_columns = [
    ("digit", "digit"),
    ("space", "space"),
    ("lang-chinese", 'zh'),
    ("lang-korea", 'ko'),
    ("lang-japanese", 'ja'),
    # ("byte", "byte"),
    # ("oov", "oov")
]

# Character types pre-selected when the page first loads.
default_columns = ["digit", "zh"]

# columns = ["lang-zh", "lang-korea", "lang-japanese", "number", "space", "bytes", "oov"]

# Map abbreviation -> short display name (the part after "lang-", if any).
abbr2name = {}
for label, abbr in all_columns:
    abbr2name[abbr] = label.split('-')[-1]


def get_column_info(columns):
    """Build a markdown legend for the selected character-type columns.

    Args:
        columns: abbreviations chosen in the UI (e.g. ``["digit", "zh"]``);
            each must be a key of the module-level ``abbr2name`` mapping
            (raises ``KeyError`` otherwise).

    Returns:
        A markdown bullet list explaining the ``num(...)`` and ``len(...)``
        table columns for each selected type; empty string for no columns.
    """
    # NOTE(review): dropped the leftover debug `print(columns)` and replaced
    # quadratic `+=` string building with a single join.
    lines = []
    for column in columns:
        name = abbr2name[column]
        lines.append(f"- `num({column})`: num of tokens containing {name} characters")
        lines.append(f"- `len({column})`: `min,median,max` length of tokens containing {name} characters")
    return "\n".join(lines) + "\n" if lines else ""


with gr.Blocks() as demo:
    # ---- Settings section -------------------------------------------------
    gr.Markdown("## ๐Ÿ› ๏ธ Setting")  # โš™
    with gr.Accordion("Please select the type of character you want to count.", open=True):
        # file size ๐Ÿ’ฝ ๐Ÿ–ด, tokens ๐Ÿงฎ
        with gr.Row():
            with gr.Column():
                # NOTE(review): `gr.Checkboxgroup` is the deprecated lowercase
                # alias (removed in gradio 4); use the canonical CheckboxGroup.
                columns = gr.CheckboxGroup(
                    all_columns,
                    value=default_columns,
                    label="character type",
                    # info=""
                )
                gr.Markdown(
                    "To count other types of characters, you can modify [character_util.py]"
                    "(https://huggingface.co/spaces/eson/tokenizer-arena/blob/main/character_util.py). "
                )
            # Legend explaining the num(...)/len(...) table columns; kept in
            # sync with the checkbox selection by the change handler below.
            column_info = gr.Markdown(
                get_column_info(default_columns)
            )

    # ---- Statistics section -----------------------------------------------
    # NOTE(review): dedented out of the Accordion — the "Character Statistics"
    # heading marks a separate top-level section, so these components belong
    # directly under the Blocks, not nested inside the settings accordion.
    gr.Markdown("## ๐Ÿ“Š Character Statistics")
    search_bar = gr.Textbox(
        placeholder="๐Ÿ” Search by tokenizer or organization (e.g., 'llama', 'openai') and press ENTER...",
        show_label=False,
        elem_id="search-bar",
    )
    compress_rate_table = gr.Dataframe(datatype="html", wrap=True)

    # Re-render the statistics table on search submit or column change.
    search_bar.submit(
        get_character_table,
        inputs=[search_bar, columns],
        outputs=compress_rate_table
    )
    columns.change(
        get_character_table,
        inputs=[search_bar, columns],
        outputs=compress_rate_table
    )
    # Keep the legend in sync with the selected character types.
    columns.change(
        get_column_info,
        inputs=[columns],
        outputs=column_info
    )

    # Populate the table once when the page loads.
    demo.load(
        get_character_table,
        inputs=[search_bar, columns],
        outputs=compress_rate_table
    )

# NOTE(review): the entry guard was indented inside the `with gr.Blocks()`
# context; launch after the Blocks context has closed, at module top level.
if __name__ == "__main__":
    demo.launch()