# coding=utf-8
# author: xusong
# time: 2022/8/23 16:06

"""Gradio front end of the Tokenizer Arena: compare two tokenizers side by side.

## TODO:
- i18n (internationalization)
  https://blog.csdn.net/qq_26212731/article/details/78457198
  request.header also carries a language field
- warmup for iter_vocab
- toggles
  - add_special_token toggle
  - theme toggle light/dark
  - token_id/tokens/bytes toggle
- Chinese char/word statistics: should characters such as _ G be included?
- evaluation
  - OOV evaluation
- add hover_text via JavaScript
- UTF-8 encoding for English text
- support downloading the vocabulary, reusing the image download tag
- why does baichuan have more than twenty thousand single characters?
- qwen: ValueError: Unclosed image token
- change paths to the full path, e.g. meta-llama/Llama-2-13b-hf

plots

table

## related demo
- [](http://text-processing.com/demo/tokenize/)
- [gpt-tokenizer](https://gpt-tokenizer.dev/)
- [llama-tokenizer-js](https://belladoreai.github.io/llama-tokenizer-js/example-demo/build/)
- [](https://huggingface.co/spaces/Xenova/the-tokenizer-playground)

## Visualization

[ The, 2, QUICK, Brown, Foxes, jumped, over, the, lazy, dog's, bone ]
"""

import gradio as gr
from vocab import all_tokenizers
# NOTE(review): tokenize, tokenize_pair, basic_count, get_overlap_token_size
# and on_load are not defined in this file; presumably they come from this
# star import -- confirm before replacing it with explicit names.
from util import *
from examples import example_fn

# JavaScript run in the browser on page load: serialises the query-string
# parameters of the current URL to a JSON string, which is then handed to the
# Python-side load handler as its input value.
get_window_url_params = """
    function(url_params) {
        const params = new URLSearchParams(window.location.search);
        url_params = JSON.stringify(Object.fromEntries(params));
        return url_params;
    }
    """

with gr.Blocks(css="css/style.css", title="Tokenizer Arena") as demo:
    # NOTE(review): the title is plain text here; if HTML markup (e.g. a
    # heading tag) was intended around it, restore it -- TODO confirm.
    gr.HTML("""

Tokenizer Arena ⚔️

""")
    # links: https://www.coderstool.com/utf8-encoding-decoding
    # Feature: enter text and tokenize it.
    # Tokenizers: several common tokenizers are available.
    # Background: makes it easy to tokenize, inspect token granularity
    # and compare tokenizers.

    # ---- Input area: heading + example picker, then the text box ----
    with gr.Row():
        gr.Markdown("## Input Text")
        dropdown_examples = gr.Dropdown(
            # Earlier Chinese labels: space test, punctuation test,
            # symbol test, digit test.
            ["spaces", "punctuations", "symbols", "digits"],
            value="Examples",
            type="index",  # the handler receives the selected index, not the label
            show_label=False,
            container=False,
            scale=0,
            elem_classes="example-style"
        )

    user_input = gr.Textbox(
        # value=default_user_input,
        label="Input Text",
        lines=5,
        show_label=False,
    )
    gr.Markdown("## Tokenization")

    # ---- Tokenizer selection and per-tokenizer statistics ----
    with gr.Row():
        with gr.Column(scale=6):
            with gr.Group():
                tokenizer_type_1 = gr.Dropdown(
                    all_tokenizers,
                    label="Tokenizer 1",
                )
            with gr.Group():
                # Leftover mock-up text (previously a no-op string
                # statement): 69 / Characters
                with gr.Row():
                    stats_vocab_size_1 = gr.TextArea(
                        label="VocabSize",
                        lines=1,
                        elem_classes="statistics"
                    )
                    stats_zh_token_size_1 = gr.TextArea(
                        label="ZH char/word",
                        lines=1,
                        elem_classes="statistics"
                    )
                    stats_overlap_token_size_1 = gr.TextArea(
                        # value=default_stats_overlap_token_size,
                        label="Overlap Tokens",
                        lines=1,
                        elem_classes="statistics"
                    )
                    # stats_3 = gr.TextArea(
                    #     label="Compress Rate",
                    #     lines=1,
                    #     elem_classes="statistics"
                    # )

        # https://www.onlinewebfonts.com/icon/418591
        gr.Image(
            "images/VS.svg",
            scale=1,
            show_label=False,
            show_download_button=False,
            container=False,
            show_share_button=False,
        )

        with gr.Column(scale=6):
            with gr.Group():
                tokenizer_type_2 = gr.Dropdown(
                    all_tokenizers,
                    label="Tokenizer 2",
                )
            with gr.Group():
                with gr.Row():
                    stats_vocab_size_2 = gr.TextArea(
                        label="VocabSize",
                        lines=1,
                        elem_classes="statistics"
                    )
                    stats_zh_token_size_2 = gr.TextArea(
                        label="ZH char/word",  # Chinese chars/words
                        lines=1,
                        elem_classes="statistics"
                    )
                    # stats_6 = gr.TextArea(
                    #     label="Compress Rate",
                    #     lines=1,
                    #     elem_classes="statistics"
                    # )
                    stats_overlap_token_size_2 = gr.TextArea(
                        label="Overlap Tokens",
                        lines=1,
                        elem_classes="statistics"
                    )

    # TODO: charts, tables, compression rate
    # ---- Tokenization results: highlighted text and token tables ----
    with gr.Row():
        with gr.Column():
            output_text_1 = gr.Highlightedtext(
                show_legend=True,
                elem_classes="space-show"
            )
        with gr.Column():
            output_text_2 = gr.Highlightedtext(
                show_legend=True,
                elem_classes="space-show"
            )

    with gr.Row():
        output_table_1 = gr.Dataframe()
        output_table_2 = gr.Dataframe()

    # ---- Event wiring ----
    # Changing tokenizer 1 refreshes its own output and statistics, plus the
    # overlap statistics shown on both sides.
    tokenizer_type_1.change(
        tokenize,
        [user_input, tokenizer_type_1],
        [output_text_1, output_table_1],
    )
    tokenizer_type_1.change(
        basic_count,
        [tokenizer_type_1],
        [stats_vocab_size_1, stats_zh_token_size_1],
    )
    tokenizer_type_1.change(
        get_overlap_token_size,
        [tokenizer_type_1, tokenizer_type_2],
        [stats_overlap_token_size_1, stats_overlap_token_size_2],
    )

    # Editing the text re-tokenizes with both tokenizers at once.
    user_input.change(
        tokenize_pair,
        [user_input, tokenizer_type_1, tokenizer_type_2],
        [output_text_1, output_table_1, output_text_2, output_table_2],
    )  # , pass_request=1

    # Same wiring as above for tokenizer 2.
    tokenizer_type_2.change(
        tokenize,
        [user_input, tokenizer_type_2],
        [output_text_2, output_table_2],
    )
    tokenizer_type_2.change(
        basic_count,
        [tokenizer_type_2],
        [stats_vocab_size_2, stats_zh_token_size_2],
    )
    tokenizer_type_2.change(
        get_overlap_token_size,
        [tokenizer_type_1, tokenizer_type_2],
        [stats_overlap_token_size_1, stats_overlap_token_size_2],
    )

    # Picking an example fills the text box and both tokenizer dropdowns.
    dropdown_examples.change(
        example_fn,
        dropdown_examples,
        [user_input, tokenizer_type_1, tokenizer_type_2]
    )

    # Read the on-load script through a context manager so the file handle
    # is closed deterministically instead of being left to the garbage
    # collector.
    with open("js/onload.js", "r", encoding="utf-8") as onload_file:
        onload_js = onload_file.read()
    demo.load(_js=onload_js)

    demo.load(
        fn=on_load,
        inputs=[user_input],  # any placeholder (empty) object is enough here
        outputs=[user_input, tokenizer_type_1, tokenizer_type_2],
        _js=get_window_url_params
    )

if __name__ == "__main__":
    # demo.queue(max_size=20).launch()
    demo.launch()