# coding=utf-8 # author: xusong # time: 2022/8/23 16:06 """ ## TODO: - i18 国际化 https://blog.csdn.net/qq_26212731/article/details/78457198 request.header中也有language - iter_vocab 的 warmup - 开关 - add_special_token 开关 - theme 开关 light/dark - token_id/tokens/bytes 开关 - 中文字词统计,是否要包括 _ G 等字符 - 评测 - OOV评测 - 通过 javascript 添加 hover_text - 英文 utf-8编码 - 词典支持下载,借用image下载的标签, - baichuan的单字数量怎么两万多个? - qwen: ValueError: Unclosed image token - 路径修改为全path meta-llama/Llama-2-13b-hf plots table ## related demo - [](http://text-processing.com/demo/tokenize/) - [gpt-tokenizer](https://gpt-tokenizer.dev/) - [llama-tokenizer-js](https://belladoreai.github.io/llama-tokenizer-js/example-demo/build/) - [](https://huggingface.co/spaces/Xenova/the-tokenizer-playground) ## 可视化 [ The, 2, QUICK, Brown, Foxes, jumped, over, the, lazy, dog's, bone ] """ import gradio as gr from vocab import all_tokenizers from util import * from examples import example_fn, example_types get_window_url_params = """ function(url_params) { const params = new URLSearchParams(window.location.search); url_params = JSON.stringify(Object.fromEntries(params)); return url_params; } """ with gr.Blocks(css="css/style.css", title="Tokenizer Arena") as demo: gr.HTML("""

Tokenizer Arena ⚔️

""") # links: https://www.coderstool.com/utf8-encoding-decoding # 功能:输入文本,进行分词 # 分词器:常见的分词器有集中, # 背景:方便分词、看词粒度、对比 with gr.Row(): gr.Markdown("## Input Text") dropdown_examples = gr.Dropdown( example_types, type="index", show_label=False, container=False, scale=0, elem_classes="example-style" ) user_input = gr.Textbox( # value=default_user_input, label="Input Text", lines=5, show_label=False, ) gr.Markdown("## Tokenization") # compress rate setting with gr.Accordion("Compress Rate Setting", open=True): gr.Markdown("Please select corpus and unit of compress rate, get more details at [github](https://github.com/xu-song/tokenizer-arena/). ") with gr.Row(): compress_rate_corpus = gr.CheckboxGroup( ["cc100-en", "cc100-zh-Hans", "cc100-es", "code"], value=["cc100-en", "cc100-zh-Hans"], label="corpus", # info="" ) compress_rate_unit = gr.Radio( ["b_tokens/g_bytes", "g_bytes/b_tokens", "t_tokens/t_bytes", "t_bytes/t_tokens"], value="b_tokens/g_bytes", label="unit", ) # TODO: Token Setting # with gr.Accordion("Token Filter Setting", open=False): # gr.Markdown( # "Get total number of tokens which contain the following character)") # gr.Radio( # ["zh-Hans", "", "number", "space"], # value="zh", # ) with gr.Row(): with gr.Column(scale=6): with gr.Group(): tokenizer_type_1 = gr.Dropdown( all_tokenizers, label="Tokenizer 1", ) with gr.Group(): """
69
Characters
""" with gr.Row(): stats_vocab_size_1 = gr.TextArea( label="Vocab Size", lines=1, elem_classes="statistics" ) stats_zh_token_size_1 = gr.TextArea( label="ZH char/word", lines=1, elem_classes="statistics", visible=False ) stats_compress_rate_1 = gr.TextArea( label="Compress Rate", lines=1, elem_classes="statistics" ) stats_overlap_token_size_1 = gr.TextArea( # value=default_stats_overlap_token_size, label="Overlap Tokens", lines=1, elem_classes="statistics" ) # stats_3 = gr.TextArea( # label="Compress Rate", # lines=1, # elem_classes="statistics" # ) # https://www.onlinewebfonts.com/icon/418591 gr.Image("images/VS.svg", scale=1, show_label=False, show_download_button=False, container=False, show_share_button=False) with gr.Column(scale=6): with gr.Group(): tokenizer_type_2 = gr.Dropdown( all_tokenizers, label="Tokenizer 2", ) with gr.Group(): with gr.Row(): stats_vocab_size_2 = gr.TextArea( label="VocabSize", lines=1, elem_classes="statistics" ) stats_zh_token_size_2 = gr.TextArea( label="ZH char/word", # 中文字/词 lines=1, elem_classes="statistics", visible=False ) stats_compress_rate_2 = gr.TextArea( label="Compress Rate", lines=1, elem_classes="statistics" ) stats_filtered_token_2 = gr.TextArea( label="filtered tokens", lines=1, elem_classes="statistics", visible=False ) stats_overlap_token_size_2 = gr.TextArea( label="Overlap Tokens", lines=1, elem_classes="statistics" ) # TODO: 图 表 压缩率 with gr.Row(): # dynamic change label with gr.Column(): output_text_1 = gr.Highlightedtext( show_legend=True, elem_classes="space-show" ) with gr.Column(): output_text_2 = gr.Highlightedtext( show_legend=True, elem_classes="space-show" ) with gr.Row(): output_table_1 = gr.Dataframe() output_table_2 = gr.Dataframe() # setting # compress_rate_unit.change(compress_rate_unit_change, [compress_rate_unit], # [stats_compress_rate_1, stats_compress_rate_2]) tokenizer_type_1.change(tokenize, [user_input, tokenizer_type_1], [output_text_1, output_table_1]) tokenizer_type_1.change(basic_count, [tokenizer_type_1], [stats_vocab_size_1, stats_zh_token_size_1]) tokenizer_type_1.change(get_overlap_token_size, [tokenizer_type_1, tokenizer_type_2], [stats_overlap_token_size_1, stats_overlap_token_size_2]) tokenizer_type_1.change(get_compress_rate, [tokenizer_type_1, compress_rate_corpus, compress_rate_unit], [stats_compress_rate_1]) # TODO: every=3 user_input.change(tokenize_pair, [user_input, tokenizer_type_1, tokenizer_type_2], [output_text_1, output_table_1, output_text_2, output_table_2]) # , pass_request=1 tokenizer_type_2.change(tokenize, [user_input, tokenizer_type_2], [output_text_2, output_table_2]) tokenizer_type_2.change(basic_count, [tokenizer_type_2], [stats_vocab_size_2, stats_zh_token_size_2]) tokenizer_type_2.change(get_overlap_token_size, [tokenizer_type_1, tokenizer_type_2], [stats_overlap_token_size_1, stats_overlap_token_size_2]) tokenizer_type_2.change(get_compress_rate, [tokenizer_type_2, compress_rate_corpus, compress_rate_unit], [stats_compress_rate_2]) compress_rate_unit.change(get_compress_rate, [tokenizer_type_1, compress_rate_corpus, compress_rate_unit], [stats_compress_rate_1]) compress_rate_unit.change(get_compress_rate, [tokenizer_type_2, compress_rate_corpus, compress_rate_unit], [stats_compress_rate_2]) dropdown_examples.change( example_fn, dropdown_examples, [user_input, tokenizer_type_1, tokenizer_type_2] ) demo.load(js=open("js/onload.js", "r", encoding="utf-8").read()) demo.load( fn=on_load, inputs=[user_input], # 这里只需要传个空object即可。 outputs=[user_input, tokenizer_type_1, tokenizer_type_2], js=get_window_url_params ) if __name__ == "__main__": # demo.queue(max_size=20).launch() demo.launch() # demo.launch(share=True)