eson committed on
Commit 9495a4f
1 Parent(s): 0ce6477
app.py CHANGED
@@ -4,21 +4,19 @@
 
 """
 ## TODO:
-- fetch parameters via HTTP GET (high priority)
 - i18n internationalization https://blog.csdn.net/qq_26212731/article/details/78457198 (request.header also carries the language)
 - warmup for iter_vocab
-- add_special_token toggle
-- theme toggle: light/dark
-- token_id/tokens/bytes toggle
+- toggles
+  - add_special_token toggle
+  - theme toggle: light/dark
+  - token_id/tokens/bytes toggle
+- Chinese char/word statistics: should characters such as _ and G be included?
+- evaluation
+  - OOV evaluation
 - add hover_text via javascript
-- add caching to the functions to avoid repeated calls
 - English utf-8 encoding
-- support downloading the vocabulary
-- Chinese char/word statistics: should characters such as _ and G be included?
+- support downloading the vocabulary, reusing the image download-button markup
 - why does baichuan have more than 20,000 single-character tokens?
-- OOV
-- feedback placement
-- the overlap tokens of gpt4 / gpt3.5 look wrong
 - qwen: ValueError: Unclosed image token
 
 plots
@@ -39,57 +37,16 @@ table
 import gradio as gr
 from vocab import all_tokenizers
 from util import *
+from examples import example_fn
 
 
-# llama  chatglm_6b  gpt_nexo_20b  baichuan  baichuan_7b
-examples_zh = [
-    ["空格测试: 2个空格 8个空格", "llama", "chatglm_6b"],  # chatglm has blank_n tokens
-    ["标点测试:,。!?;", "baichuan_7b", "llama"],
-    ["符号测试:🦙❤❥웃유♋☮✊☏☢☚✔☑♚▢♪✈✞÷↑↓▤▥⊙■□▣▽¿─│♥❣▬▫☿Ⓐ ✋✉☣☤", "baichuan_7b", "llama"],
-    ["数字测试:(10086 + 98) = 100184", "baichuan_7b", "llama"],
-    ["中文简体:宽带,繁体:樂來", "baichuan_7b", "llama"],
-]
-
-examples = [
-    ["spaces: 2spaces 8spaces", "llama", "chatglm_6b"],  # chatglm has blank_n tokens
-    ["punctuations: ,./?\",。!?;", "baichuan_7b", "llama"],
-    ["symbols: 🦙❤❥웃유♋☮✊☏☢☚✔☑♚▢♪✈✞÷↑↓▤▥⊙■□▣▽¿─│♥❣▬▫☿Ⓐ ✋✉☣☤", "baichuan_7b", "llama"],
-    ["digits: (10086 + 98) = 100184", "baichuan_7b", "llama"],
-]
-
-
-# jieba.enable_parallel()  # parallel mode is not available under flask
-
-
-def example_fn(example_idx):
-    return examples[example_idx]
-
-
-"""Replace this text in the input field to see how tokenization works
-"""
-
-default_user_input = """Replace this text in the input field to see how tokenization works
-华为发布Mate60手机
-ラグビーワールドカップ2023フランス"""
-default_tokenizer_type_1 = "llama"
-default_tokenizer_type_2 = "internlm_chat_7b"
-default_stats_vocab_size_1, default_stats_zh_token_size_1 = basic_count(default_tokenizer_type_1)
-default_stats_vocab_size_2, default_stats_zh_token_size_2 = basic_count(default_tokenizer_type_2)
-default_stats_overlap_token_size = get_overlap_token_size(default_tokenizer_type_1, default_tokenizer_type_2)[0]
-default_output_text_1, default_output_table_1, default_output_len_1 = tokenize(default_user_input, default_tokenizer_type_1, update=False)
-default_output_text_2, default_output_table_2, default_output_len_2 = tokenize(default_user_input, default_tokenizer_type_2, update=False)
-
-with gr.Blocks(css="style.css") as demo:
+with gr.Blocks(css="css/style.css", title="Tokenizer Arena") as demo:
     gr.HTML("""<h1 align="center">Tokenizer Arena ⚔️</h1>""")
     # links: https://www.coderstool.com/utf8-encoding-decoding
     # feature: enter text and tokenize it
     # tokenizers: several common tokenizers are bundled
     # background: convenient tokenization, inspecting token granularity, comparing tokenizers
-    #
-    # Byte: represents the tokenization
 
     with gr.Row():
         gr.Markdown("## Input Text")
@@ -103,26 +60,18 @@ with gr.Blocks(css="style.css") as demo:
             scale=0,
             elem_classes="example-style"
         )
-
     user_input = gr.Textbox(
-        value=default_user_input,
+        # value=default_user_input,
         label="Input Text",
        lines=5,
         show_label=False,
-    )  # placeholder="Enter sentence here..."
-    # gr.Examples(
-    #     examples,
-    #     None,
-    # )
-
+    )
     gr.Markdown("## Tokenization")
-
     with gr.Row():
         with gr.Column(scale=6):
             with gr.Group():
                 tokenizer_type_1 = gr.Dropdown(
                     all_tokenizers,
-                    value=default_tokenizer_type_1,
                     label="Tokenizer 1",
                 )
             with gr.Group():
@@ -131,19 +80,17 @@ with gr.Blocks(css="style.css") as demo:
                 """
                 with gr.Row():
                     stats_vocab_size_1 = gr.TextArea(
-                        value=default_stats_vocab_size_1,
                         label="VocabSize",
                         lines=1,
                         elem_classes="statistics"
                     )
                    stats_zh_token_size_1 = gr.TextArea(
-                        value=default_stats_zh_token_size_1,
                         label="ZH char/word",
                         lines=1,
                         elem_classes="statistics"
                    )
                     stats_overlap_token_size_1 = gr.TextArea(
-                        value=default_stats_overlap_token_size,
+                        # value=default_stats_overlap_token_size,
                         label="Overlap Tokens",
                         lines=1,
                         elem_classes="statistics"
@@ -161,19 +108,16 @@ with gr.Blocks(css="style.css") as demo:
            with gr.Group():
                 tokenizer_type_2 = gr.Dropdown(
                     all_tokenizers,
-                    value=default_tokenizer_type_2,
                     label="Tokenizer 2",
                 )
             with gr.Group():
                 with gr.Row():
                     stats_vocab_size_2 = gr.TextArea(
-                        value=default_stats_vocab_size_2,
                         label="VocabSize",
                         lines=1,
                         elem_classes="statistics"
                     )
                     stats_zh_token_size_2 = gr.TextArea(
-                        value=default_stats_zh_token_size_2,
                         label="ZH char/word",  # Chinese chars / words
                         lines=1,
                         elem_classes="statistics"
@@ -184,7 +128,6 @@ with gr.Blocks(css="style.css") as demo:
                     #     elem_classes="statistics"
                     # )
                     stats_overlap_token_size_2 = gr.TextArea(
-                        value=default_stats_overlap_token_size,
                         label="Overlap Tokens",
                         lines=1,
                         elem_classes="statistics"
@@ -194,42 +137,28 @@ with gr.Blocks(css="style.css") as demo:
    with gr.Row():
         with gr.Column():
             output_text_1 = gr.Highlightedtext(
-                value=default_output_text_1,
-                label=f"Tokens: {default_output_len_1}",
                 show_legend=True,
                 elem_classes="space-show"
             )
         with gr.Column():
             output_text_2 = gr.Highlightedtext(
-                value=default_output_text_2,
-                label=f"Tokens: {default_output_len_2}",
                 show_legend=True,
                 elem_classes="space-show"
             )
 
     with gr.Row():
-        output_table_1 = gr.Dataframe(
-            value=default_output_table_1,
-            headers=["TokenID", "Byte", "Text"],
-            datatype=["str", "str", "str"],
-            # elem_classes="space-show",  # this css has no effect on the whole Dataframe, so cell-wrap is patched directly
-        )
-        output_table_2 = gr.Dataframe(
-            value=default_output_table_2,
-            headers=["TokenID", "Token", "Text"],
-            datatype=["str", "str", "str"],
-        )
+        output_table_1 = gr.Dataframe()
+        output_table_2 = gr.Dataframe()
 
     tokenizer_type_1.change(tokenize, [user_input, tokenizer_type_1],
                             [output_text_1, output_table_1])
-    # the two calls below could probably be merged
     tokenizer_type_1.change(basic_count, [tokenizer_type_1], [stats_vocab_size_1, stats_zh_token_size_1])
     tokenizer_type_1.change(get_overlap_token_size, [tokenizer_type_1, tokenizer_type_2],
                             [stats_overlap_token_size_1, stats_overlap_token_size_2])
 
     user_input.change(tokenize_pair,
                       [user_input, tokenizer_type_1, tokenizer_type_2],
                       [output_text_1, output_table_1, output_text_2, output_table_2])  # , pass_request=1
 
     tokenizer_type_2.change(tokenize, [user_input, tokenizer_type_2],
                             [output_text_2, output_table_2])
@@ -243,9 +172,21 @@ with gr.Blocks(css="style.css") as demo:
         [user_input, tokenizer_type_1, tokenizer_type_2]
     )
 
-    # start-up initialization
-    # user_input.update(user_input.value + "___")
+    demo.load(_js=open("js/onload.js", "r", encoding="utf-8").read())
+    demo.load(
+        fn=on_load,
+        inputs=None,
+        outputs=[user_input, tokenizer_type_1, tokenizer_type_2],
+    )
 
 if __name__ == "__main__":
-    demo.queue(max_size=20).launch()
-    # demo.launch()
+    print("http://127.0.0.1:7860/?tokenizer1=llama&tokenizer2=chinese_llama2&text=fdsjlk")  # llama vs chinese_llama2
+    print(
+        "http://127.0.0.1:7860/?tokenizer1=chinese_llama&tokenizer2=chinese_llama2&text=fdsjlk")  # chinese_llama vs chinese_llama2
+    print("http://127.0.0.1:7860/?tokenizer1=baichuan&tokenizer2=baichuan2&text=sss")  # baichuan 1 vs 2
+    print("http://127.0.0.1:7860/?tokenizer1=bert&tokenizer2=clue&text=sss")  # bert vs clue
+    print("http://127.0.0.1:7860/?tokenizer1=clue&tokenizer2=kplug&text=sss")  # clue vs kplug
+    print("http://127.0.0.1:7860/?tokenizer1=baichuan&tokenizer2=baichuan2&text=sss")
+    # demo.queue(max_size=20).launch()
+    demo.launch()
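
The sample URLs printed in `__main__` exercise the new deep-link support. A minimal standard-library sketch of how such a query string resolves to the two tokenizer names and the input text (the same `urlparse`/`parse_qs` combination that `on_load` in util.py uses below):

```python
from urllib.parse import urlparse, parse_qs

url = "http://127.0.0.1:7860/?tokenizer1=llama&tokenizer2=chinese_llama2&text=fdsjlk"
# parse_qs maps each key to a list of values; keep the first value per key
params = {k: v[0] for k, v in parse_qs(urlparse(url).query).items() if v}
print(params)  # {'tokenizer1': 'llama', 'tokenizer2': 'chinese_llama2', 'text': 'fdsjlk'}
```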
style.css → css/style.css RENAMED
File without changes
evaluation.md ADDED
@@ -0,0 +1,5 @@
+
+
+## coverage
+
+rare characters falling back to utf-8 bytes
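
A concrete illustration of that fallback case (using the same `b'\xe4\xb8\xad'` example as `test_coding` in util.py): a character without a dedicated vocabulary entry is emitted as its raw UTF-8 bytes, which is why byte-level tokens render as `<0xE4>`-style strings in the token table.

```python
# Sketch: how a byte-fallback tokenizer surfaces a character with no dedicated token.
text = "中"
for b in text.encode("utf-8"):
    print(f"<0x{b:02X}>")
# <0xE4>
# <0xB8>
# <0xAD>
```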
examples.py ADDED
@@ -0,0 +1,22 @@
+examples = {
+    "en": [
+        ["spaces: 2spaces    8spaces\t1tab\t\t2tab\n1newline", "llama", "chatglm_6b"],  # chatglm has blank_n tokens
+        # !?。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏.
+        ["punctuations: ,.:/?+=\",。!?;【】〔〕〖〗", "baichuan", "llama"],
+        ["symbols: 🦙❤❥웃유♋☮✊☏☢☚✔☑♚▢♪✈✞÷↑↓▤▥⊙■□▣▽¿─│♥❣▬▫☿Ⓐ ✋✉☣☤", "baichuan", "llama"],
+        ["digits: (10086 + 98) = 100184", "baichuan", "llama"]
+    ],
+    "zh": [
+        ["空格测试: 2个空格 8个空格", "llama", "chatglm_6b"],  # chatglm has blank_n tokens
+        ["标点测试:,。!?;", "baichuan_7b", "llama"],
+        ["符号测试:🦙❤❥웃유♋☮✊☏☢☚✔☑♚▢♪✈✞÷↑↓▤▥⊙■□▣▽¿─│♥❣▬▫☿Ⓐ ✋✉☣☤", "baichuan_7b", "llama"],
+        ["数字测试:(10086 + 98) = 100184", "baichuan_7b", "llama"],
+        ["中文简体:宽带,繁体:樂來", "baichuan_7b", "llama"],
+    ]
+}
+
+
+def example_fn(example_idx):
+    return examples["en"][example_idx]
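
Note that `example_fn` indexes only the `"en"` examples; the `"zh"` list appears unused in this commit, presumably pending the i18n item on the TODO list.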
images/README.md CHANGED
@@ -0,0 +1,5 @@
+
+## info
+
+https://huggingface.co/bert-base-uncased
+
images/download_button.html ADDED
@@ -0,0 +1 @@
+ <div class="icon-buttons svelte-1btp92j"><a href="data:image/svg+xml;base64,PD94bWwgdmVyc2lvbj0iMS4wIiBlbmNvZGluZz0idXRmLTgiPz4NCjwhLS0gU3ZnIFZlY3RvciBJY29ucyA6IGh0dHA6Ly93d3cub25saW5ld2ViZm9udHMuY29tL2ljb24gLS0+DQo8IURPQ1RZUEUgc3ZnIFBVQkxJQyAiLS8vVzNDLy9EVEQgU1ZHIDEuMS8vRU4iICJodHRwOi8vd3d3LnczLm9yZy9HcmFwaGljcy9TVkcvMS4xL0RURC9zdmcxMS5kdGQiPg0KPHN2ZyB2ZXJzaW9uPSIxLjEiIHhtbG5zPSJodHRwOi8vd3d3LnczLm9yZy8yMDAwL3N2ZyIgeG1sbnM6eGxpbms9Imh0dHA6Ly93d3cudzMub3JnLzE5OTkveGxpbmsiIHg9IjBweCIgeT0iMHB4IiB2aWV3Qm94PSIwIDAgMjU2IDI1NiIgZW5hYmxlLWJhY2tncm91bmQ9Im5ldyAwIDAgMjU2IDI1NiIgeG1sOnNwYWNlPSJwcmVzZXJ2ZSI+DQo8bWV0YWRhdGE+IFN2ZyBWZWN0b3IgSWNvbnMgOiBodHRwOi8vd3d3Lm9ubGluZXdlYmZvbnRzLmNvbS9pY29uIDwvbWV0YWRhdGE+DQo8Zz48Zz48cGF0aCBmaWxsPSIjMDAwMDAwIiBkPSJNMTAwLDIyNC4zYy0wLjIsMS41LTEuMywxLjktMi41LDIuMWMtNS4yLDAuOS05LjQsMy45LTEzLjksNi41Yy0yLjgsMS42LTUuOSwyLjgtOC45LDQuMmMtMC40LDAuMi0xLjEsMC4zLTEuMywwLjFjLTEuNS0yLTMuMi0wLjQtNC41LDAuMWMtMi42LDEtNC45LDIuNy03LjMsNGMtMS4zLDAuNy0yLjYsMS41LTQsMi4xYy0xLjUsMC42LTMuMSwwLjgtNC4zLDIuMmMtMC40LDAuNC0xLjMsMC41LTEuOSwwLjVjLTAuNSwwLTAuOS0wLjUtMS40LTAuN2MwLjItMC40LDAuNC0wLjgsMC44LTFjMS4zLTAuOCwyLjctMS41LDQtMi4yYzItMS4xLDQtMi4zLDYuMS0zLjRjMC4zLTAuMiwwLjYtMC40LDAuOS0wLjVjNC45LTEuMiw5LTQsMTMuMy02LjNjMS42LTAuOSwzLTIuMSw0LjYtMy4yYy0xLjMtMS43LTIuNi0xLjItMy42LTAuNmMtNCwyLTcuOSw0LjEtMTEuOCw2LjJjLTAuNiwwLjMtMS4xLDAuOS0xLjgsMS4xYy0wLjYsMC4yLTEuNiwwLjEtMS44LTAuMmMtMC4zLTAuNS0wLjMtMS41LDAtMS45YzEuOS0yLjUsMy42LTUuMiw3LjItNS42YzEuMS0wLjEsMi4zLTAuNCwxLjQtMmMtMC40LTAuNy0wLjYtMS4zLDAuMS0xLjhjMC44LTAuNiwxLjctMS40LDIuOC0wLjdjMS4xLDAuOCwyLjQsMS4xLDMuNiwwLjVjMS4yLTAuNSwyLjQtMS4xLDEuNS0yLjhjLTAuNi0xLjEsMC4yLTEuNywxLjEtMi4yYzEuOS0wLjksMy44LTEuOSw1LjctMi43YzAuNi0wLjMsMS43LTAuNiwyLTAuM2MxLjksMi4xLDMuMSwwLjIsNC40LTAuN2MwLjUtMC40LDEuMi0wLjYsMS44LTAuOGMxLjIsMi0xLjMsMi0xLjUsMy40YzAuNSwwLjEsMSwwLjQsMS40LDAuM2MyLjctMS4xLDUuMy0yLjQsNy45LTMuNmMwLjItMC4xLDAuMi0wLjcsMC40LTEuMWMtMC4zLTAuMS0wLjctMC4zLTAuOS0wLjJjLTAuOSwwLjQtMS44LDAuOS0zLjEsMWMwLjctMC44LDEuNC0xLjcsMi4zLTIuM2MzLjEtMS45LDYuMy0zLjYsOS41LTUuM2MwLjYtMC4zLDEuNy0wLjYsMi4yLTAuM2MxLjMsMC44LDIuNSwwLjUsMy43LDBjMi40LTEsNC43LTIsNy0zLjFjMC40LTAuMiwwLjUtMC43LDAuOS0xLjJjLTIuMi0xLjUtMy45LDAuMi02LjEsMC43YzAuNC0wLjksMC40LTEuNywwLjgtMS45YzEuNS0wLjksMy0xLjgsNC42LTIuNWMxLjMtMC42LDIuNy0wLjYsMy43LTIuM2MtMC44LTAuMi0xLjQtMC41LTEuOS0wLjRjLTQuNCwwLjMtOC42LDAuOS0xMS42LDQuN2MtMS42LDEuOS00LjEsMi02LjQsMi4yYy0xLjMsMC4xLTEuNy0wLjgtMS40LTEuOWMwLjMtMSwwLjYtMi4yLDEuNC0yLjdjMi4zLTEuNCw0LjQtMy42LDcuNi0yLjdjMC42LDAuMiwxLjQsMC4xLDItMC4xYzMuMi0xLjEsNi40LTIuMyw5LjUtMy42YzEuMS0wLjQsMi43LTAuOSwxLTIuNmMtMC4xLTAuMSwwLjUtMS4yLDEtMS41YzMuOC0yLjYsNy42LTUuMSwxMS40LTcuNmM3LjItNC44LDE1LTguNSwyMi44LTEyLjRjNC44LTIuNCw4LjgtNS44LDEyLjktOS4xYzAuOS0wLjcsMS43LTEuNywyLjQtMi42YzEuNy0yLjMsMS40LTQuMS0xLjItNS40Yy0xLjYtMC44LTMuNS0xLjQtNS4zLTEuN2MtNi40LTEtMTIuOC0xLjctMTkuMS0yLjhjLTUuMS0wLjktOS4zLTMuNy0xMS42LTguM2MtMS4zLTIuNi0xLjgtNi0xLjMtOC44YzEuNC03LjcsNC42LTE0LjcsMTAuMi0yMC40YzUuNi01LjgsMTIuMS05LjksMTkuNy0xMi40YzQuNi0xLjUsOS41LTEsMTQuMiwwLjFjMywwLjcsNS45LDAuNyw4LjktMC42YzIuOS0xLjMsNS44LTIuNSw4LjMtNC41YzAuMi0wLjEsMC40LTAuMywwLjYtMC40YzEtMC4zLDEuOCwxLjcsMi45LDBjMC42LTAuOCwxLjQtMS41LDIuMS0yLjJjMCwwLDAuMiwwLDAuNiwwYzAsMC43LDAuMSwxLjUsMC4xLDIuMmMwLDEuNS0wLjcsMi45LDAuNSw0LjNjMC40LDAuNSwwLDEuNiwwLDIuNGMwLDMuMSwwLjIsNi4zLDAuMSw5LjRjLTAuMiwzLjQtMC43LDYuOC0yLjcsOS43Yy0yLjQsMy41LTUuMyw2LjUtOC44LDguOWMtMSwwLjYtMS44LDAuNy0yLjksMC40Yy0yLjYtMC43LTQuOS0yLTYuOC00LjFjLTIuOS0zLjItNi40LTUuOC0xMC4zLTcuOWMtMi45LTEuNS01LjUtMS4zLTguMiwwLjZjLTMuMywyLjQtNi42LDQuOC04LjcsOC41Yy0yLjEsMy44LTIuNiw3LjgtMC45LDExLjdjMC45LDIsMywzLDUuMSwzLjRjN
C40LDAuOCw4LjgsMS41LDEzLjIsMmMzLjgsMC40LDcuOCwwLjIsMTEuNSwxYzUuMiwxLjEsMTAsMy4yLDEyLjYsOC40YzEsMi4xLDIuMyw0LjEsMy4xLDYuM2MwLjksMi43LDEsNS40LTAuNSw4LjFjLTIuMyw0LTUuMiw3LjQtOC45LDEwLjNjLTYsNC42LTEyLjYsOC0xOS4xLDExLjhjLTIuNCwxLjQtNC4zLDMuMS01LjksNS4zYy0xLDEuNC0yLjMsMi43LTMuOCwzLjZjLTcuMyw0LjMtMTQuNiw4LjMtMjIsMTIuNWMtMi40LDEuMy01LDEuMi03LjUsMS4zYy0wLjQsMC0wLjktMC40LTEtMC43czAuMy0wLjksMC43LTEuMWMyLTEsMy45LTIsNi0yLjljMS4xLTAuNSwyLjItMC45LDIuMy0yLjhjLTEuMywwLjMtMi40LDAuNS0zLjQsMC45Yy00LjIsMS43LTguNCwzLjUtMTIuNyw1LjFjLTIuOCwxLTUuOCwxLjYtOC40LDMuNGMtMC41LDAuMy0xLjQsMC4yLTIsMC4xYy0xLjktMC41LTMuNCwwLTUsMC45Yy0yLjksMS41LTUuOCwzLTguOCw0LjRjLTQuNiwyLjItOS40LDQuMi0xMy43LDcuMWMtMS4yLDAuOC0yLjIsMS43LTMuMywyLjZjLTAuMiwwLjEtMC4yLDAuNy0wLjEsMC45YzAuMSwwLjIsMC42LDAuNiwwLjgsMC41YzEuNi0wLjUsMy4yLTAuOSw0LjYtMS41YzIuMS0xLDQuMS0yLjMsNi4yLTMuM2MyLTEsNC0yLjMsNi42LTEuNWMxLDAuMywyLjMtMC44LDMuNi0xLjFjMC41LTAuMSwxLjIsMC4yLDEuOCwwLjRjLTAuMiwwLjYtMC4zLDEuNS0wLjcsMS43Yy0xLjYsMC45LTMuMywxLjUtNS4xLDIuMkMxMDEuNywyMjMuOSwxMDAuOCwyMjQuMSwxMDAsMjI0LjNjLTAuMS0wLjEtMC4xLTAuNC0wLjItMC42Yy0wLjEtMC4zLTAuMy0wLjUtMC41LTAuN2MwLDAtMC4zLDAuMi0wLjUsMC4zYzAuMiwwLjIsMC4zLDAuNSwwLjYsMC43Qzk5LjUsMjI0LjIsOTkuOCwyMjQuMiwxMDAsMjI0LjN6Ii8+PHBhdGggZmlsbD0iIzAwMDAwMCIgZD0iTTE1MS45LDU5LjZjNC44LTYuNSw4LjQtMTMuNCwxMy42LTE5LjFjMC4zLDAuMiwwLjYsMC40LDAuOCwwLjVjLTEuOCwzLjUtMy41LDcuMS01LjMsMTAuNmMwLjIsMC4xLDAuNSwwLjMsMC43LDAuNGMwLjgtMS4xLDEuNy0yLjIsMi40LTMuNGMxLjYtMi43LDIuOS01LjYsNC42LTguM2M0LjUtNy4yLDguNy0xNC42LDE1LTIwLjZjMC40LTAuNCwwLjgtMC45LDEuMy0xLjJjMS41LTEuMSwyLjctMS4xLDQuMywwLjFjMC4yLDAuMiwwLjQsMC44LDAuMywxLjFjLTAuNCwwLjktMSwxLjktMS42LDIuN2MtMS43LDIuNS0zLjMsNS01LjEsNy41Yy0xLjksMi42LTIuNyw1LjQtMi43LDguNWMwLDEuOS0wLjgsMy4zLTIuOSw0Yy0wLjEtMC44LTAuMS0xLjQtMC4yLTIuNGMtMi42LDMuOC01LjEsNy4zLTcuNSwxMC45Yy0wLjEsMC4xLTAuNCwwLTEsMGMyLjMtNS40LDYuNi05LjcsNy43LTE1LjZjLTMsMS40LTQuMiw0LjMtNS45LDYuNmMtMS43LDIuMy0yLjksNS00LjIsNy42Yy0xLjQsMi43LTIuOCw1LjQtNCw4LjJjLTEuMSwyLjYtMy4yLDQuOS0yLjYsOC40YzMuMS0xLjIsMy41LTQuOSw2LjUtNi4yYzAsMC42LDAuMSwxLDAsMS4zYy0zLjIsNy4xLTYuMSwxNC40LTEwLjgsMjAuOGMtMi42LDMuNS00LjYsNy42LTYuOCwxMS40Yy0xLDEuNy0yLjIsMy4zLTMuMyw0LjljLTAuNCwwLjYtMC44LDEuMS0xLjEsMS44Yy0xLDIuNC0xLjcsNC45LTMsNy4xYy0xLjMsMi4yLTEuMSw0LjctMS44LDdjLTEuNiw1LjUtMy4zLDEwLjktNSwxNi40Yy0yLjMsNy43LTMuOCwxNS43LTUuMSwyMy42Yy0wLjYsMy45LTAuOCw3LjktMC44LDExLjljMCw0LjEtNC40LDcuMS04LjEsNS41Yy0wLjQtMC4yLTAuOC0wLjUtMS0wLjljLTEuOS0zLjUtNC44LTYuNC03LjctOWMtMi41LTIuMy0yLjgtNC44LTItNy42YzAuNi0yLjIsMC40LTQuMS0wLjEtNi4xYy0wLjgtMy4yLTEuNy02LjMtMi05LjZjLTAuMi0yLjEtMS42LTMuNS0yLjMtNS4zYy0xLjMtMy4xLTIuNS02LjMtMy42LTkuNGMtMS45LTUuMy0zLjctMTAuNi01LjYtMTUuOGMtMC45LTIuNi0yLjItNS4xLTIuOS03LjhjLTAuOC0zLjMtMi44LTUuNy01LjMtNy43Yy0yLjItMS43LTMuMi0zLjktMi45LTYuNGMwLjMtMi45LTAuMS02LDEuNy04LjdjMS4yLTEuNywxLjctMy44LDIuNi01LjhjMC41LTEuMywxLjMtMi4xLDIuNi0xLjJjMS4xLDAuOCwyLDAuNCwyLjktMC4yYzIuMy0xLjcsNC43LTAuNSw3LjEtMC4zYzIuNiwyLjYsNS44LDQuNSw2LjQsOC43YzAuOSw2LjUsMi4yLDEyLjksMy43LDE5LjJjMS4zLDUuNiwzLDExLjEsNC41LDE2LjZjMC4yLDAuOSwwLjMsMS45LDAuNSwyLjhjMC40LDEuNCwwLjcsMi45LDEuMiw0LjNjMC4yLDAuNiwwLjUsMS4xLDAuNywxLjZjMC40LDEtMC4xLDIuNywxLjUsMi43YzAuNiwwLDEuMi0xLjgsMS42LTIuOGMyLjUtNy4zLDQuOC0xNC42LDcuMy0yMS45YzIuMS02LjEsNC40LTEyLjEsNi40LTE4LjJjMS4yLTMuNSwxLjUtNy4zLDMuMy0xMC43YzEuNi0zLDMuMi01LjksNS04LjhjMC41LTAuOSwxLjQtMi4xLDIuOC0wLjhjMC4xLDAuMSwxLTAuMywxLjMtMC43YzAuNy0wLjksMS4zLTEuOSwyLTIuOUMxNTAuNCw1Ny4zLDE1MC42LDU3LjQsMTUxLjksNTkuNnogTTE4NC4yLDIzLjhjLTEuMiwwLjMtMiwwLjItMi4yLDAuNmMtMS42LDItMy4yLDQtNC43LDZjLTAuMSwwLjIsMCwwLjgsMC4yLDAuOWMwLjQsMC4xLDEsMC4yLDEuMiwwQzE4MC44LDI5LjMsMTgyLjgsMjcuMiwxODQuMiwyMy44eiIvPjxwYXRoIGZpbGw9
IiMwMDAwMDAiIGQ9Ik0xNzcsMTMuNmMtMS4zLDEuOC0xLjQsNC45LTQuMyw2LjJjLTAuNSwwLjItMS40LDAuMS0xLjktMC4xYy0wLjMtMC4xLTAuMy0xLTAuMi0xLjVjMC40LTEuOCwxLTMuNiwxLjUtNS40YzAuMS0wLjMsMC4yLTAuNywwLjMtMWMwLjgtMS41LDEuOC0yLDMuMS0xLjZDMTc2LjcsMTAuNSwxNzcuMSwxMS4zLDE3NywxMy42eiIvPjwvZz48L2c+DQo8L3N2Zz4=" download="image" target="_blank"><button aria-label="Download" title="Download" class="svelte-1030q2h"> <div class="svelte-1030q2h"><svg xmlns="http://www.w3.org/2000/svg" width="100%" height="100%" viewBox="0 0 32 32"><path fill="currentColor" d="M26 24v4H6v-4H4v4a2 2 0 0 0 2 2h20a2 2 0 0 0 2-2v-4zm0-10l-1.41-1.41L17 20.17V2h-2v18.17l-7.59-7.58L6 14l10 10l10-10z"></path></svg></div></button></a> </div>
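
The `href` above is an inline SVG icon encoded as a base64 `data:` URI with a `download` attribute, so the button triggers a client-side download without a server round trip; it appears to be kept as markup to reuse for the vocabulary-download TODO.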
js/onload.js ADDED
@@ -0,0 +1,12 @@
+function() {
+    // feedback
+    //$("footer a")["href"] = "https://github.com/xu-song/tokenizer-arena/issues"
+    //$("footer a").childNodes[0].textContent ="Send Feedback"
+
+    document.querySelectorAll("footer a")[0].childNodes[0].textContent = "Send Feedback";
+    document.querySelectorAll("footer a")[0].href = "https://github.com/xu-song/tokenizer-arena/issues";
+
+    // download button
+
+    // API
+}
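
This function body is read and passed to `demo.load(_js=...)` in app.py, so it runs client-side on page load and rewires the first footer link into a "Send Feedback" link pointing at the GitHub issue tracker.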
util.py CHANGED
@@ -5,13 +5,15 @@ import pandas as pd
 from vocab import load_tokener
 from utils.zh_util import iter_vocab
 from utils.log_util import logger
+from functools import lru_cache
+from urllib.parse import urlparse, parse_qs
 
 
-def tokenize(text, tokenizer_type, color_num=5, update=True):
+@lru_cache
+def tokenize(text, tokenizer_type, color_num=5):
     """
-    TODO: cache tokenizer
     """
-    logger.info("[param]:" + json.dumps({"text": text, "tokenizer_type": tokenizer_type}, ensure_ascii=False))
+    logger.info("param=" + json.dumps({"text": text, "tokenizer_type": tokenizer_type}, ensure_ascii=False))
     pos_tokens = []
     tokenizer = load_tokener(tokenizer_type)
     encoding = tokenizer.encode(text)
@@ -29,16 +31,16 @@ def tokenize(text, tokenizer_type, color_num=5, update=True):
             token_str = token.decode("utf-8")
         except:
             token_str = token.decode("utf-8", errors="ignore")
-            logger.info("[decode_error]: " + json.dumps(
+            logger.error("decode_error: " + json.dumps(
                 {"tokenizer_type": tokenizer_type, "token": str(token), "token_str": token_str},
                 ensure_ascii=False))
 
         token_bytes = token
-        json_dumps = json.dumps(token_str)
+        # json_dumps = json.dumps(token_str)
     elif isinstance(token, str):
         token_str = token
         token_bytes = bytes(token_str, "utf-8")
-        json_dumps = json.dumps(token_str)
+        # json_dumps = json.dumps(token_str)
     else:
         return
 
@@ -48,31 +50,23 @@ def tokenize(text, tokenizer_type, color_num=5, update=True):
             "Token": token_str,  # the utf-8-decoded string; why do some render as <0xE7>, and what does that mean? e.g. llama
             "Text": decode_text,  #
             # "Bytes": token_bytes,  # bytes get decoded back to strings in the gradio frontend, e.g. b'\xe4\xb8\xad' still renders as "中"; hence str(token_bytes)
-            "Bytes": str(token_bytes),
+            "UTF8 Bytes": str(token_bytes),
             # "Unicode": json_dumps  # shown directly if ascii, otherwise shown as a unicode escape
         }
     )
 
     table_df = pd.DataFrame(table)
-    logger.info(f"[Tokens {tokenizer_type}]: {table[:2]}")
+    logger.info(f"Tokens={table[:2]}")
     # print(table_df)
 
-    if update:
-        return gr.update(value=pos_tokens, label=f"Tokens: {len(encoding)}"), table_df
-    else:
-        return pos_tokens, table_df, len(encoding)
+    return gr.update(value=pos_tokens, label=f"Tokens: {len(encoding)}"), table_df
 
 
-def tokenize_pair(text, tokenizer_type_1, tokenizer_type_2, request: gr.Request):
-    if request:
-        client_ip = request.client.host
-        # local_ip = socket.gethostbyname(socket.gethostbyname(""))
-        headers = request.kwargs['headers']
-        if headers and 'x-forwarded-for' in headers:
-            x_forwarded_for = headers['x-forwarded-for']
-            client_ip = x_forwarded_for.split(' ')[0] if x_forwarded_for else ""
-        logger.info(f"[client_ip]: {client_ip}, {tokenizer_type_1}, {tokenizer_type_2}")
-
+@lru_cache
+def tokenize_pair(text, tokenizer_type_1, tokenizer_type_2):
+    """
+    input_text.change
+    """
     pos_tokens_1, table_df_1 = tokenize(text, tokenizer_type_1)
     pos_tokens_2, table_df_2 = tokenize(text, tokenizer_type_2)
     return pos_tokens_1, table_df_1, pos_tokens_2, table_df_2
@@ -84,21 +78,67 @@ def basic_count(tokenizer_type):
     return tokenizer.vocab_size, f'{stats["中文汉字数"]["中文单字"]}/{stats["中文汉字数"]["中文多字"]}'
 
 
+@lru_cache
 def get_overlap_token_size(tokenizer_type_1, tokenizer_type_2):
     tokenizer1 = load_tokener(tokenizer_type_1)
     tokenizer2 = load_tokener(tokenizer_type_2)
-    vocab1 = tokenizer1.get_vocab()
-    vocab2 = tokenizer2.get_vocab()
-    overlap_tokens = vocab1.keys() & vocab2.keys()
+
+    vocab_set_1 = tokenizer1.get_vocab().keys()
+    vocab_set_2 = tokenizer2.get_vocab().keys()
+
+    token1 = next(iter(vocab_set_1))
+    token2 = next(iter(vocab_set_2))
+    if type(token1) != type(token2):  # bytes vs str
+        if isinstance(token1, str):
+            vocab_set_1 = set([token.encode("utf-8") for token in vocab_set_1])
+        if isinstance(token2, str):
+            vocab_set_2 = set([token.encode("utf-8") for token in vocab_set_2])
+
+    overlap_tokens = vocab_set_1 & vocab_set_2
     overlap_token_size = len(overlap_tokens)
-    logger.info(f"[OverlapTokens of {tokenizer_type_1} {tokenizer_type_2}]: {list(overlap_tokens)[:10]}")
+    logger.info(
+        f"{overlap_token_size} OverlapTokens of {tokenizer_type_1} {tokenizer_type_2}: {list(overlap_tokens)[:10]}")
     return overlap_token_size, overlap_token_size
 
 
+default_user_input = """Replace this text in the input field to see how tokenization works
+华为发布Mate60手机
+ラグビーワールドカップ2023フランス"""
+default_tokenizer_type_1 = "llama"
+# default_tokenizer_type_2 = "internlm_chat_7b"
+default_tokenizer_type_2 = "gpt_35_turbo"
+
+
+def on_load(request: gr.Request):
+    """
+    onLoad
+    """
+    text = None
+    tokenizer_type_1 = None
+    tokenizer_type_2 = None
+    query_params = {}
+    if request:
+        client_ip = request.client.host
+        # local_ip = socket.gethostbyname(socket.gethostbyname(""))
+        # headers = request.kwargs['headers']
+        # if headers and 'x-forwarded-for' in headers:
+        #     x_forwarded_for = headers['x-forwarded-for']
+        #     client_ip = x_forwarded_for.split(' ')[0] if x_forwarded_for else ""
+        if "referer" in request.headers:
+            query_params = parse_qs(urlparse(request.headers["referer"]).query)
+            query_params = {k: v[0] for k, v in query_params.items() if len(v) > 0}
+        tokenizer_type_1 = query_params.get("tokenizer1", default_tokenizer_type_1)
+        tokenizer_type_2 = query_params.get("tokenizer2", default_tokenizer_type_2)
+        text = query_params.get("text", default_user_input)
+        logger.info(f"client_ip: {client_ip}; params: {query_params}")
+    return text, tokenizer_type_1, tokenizer_type_2
+
+
 def test_coding():
     bytes1 = b'\xe4\xb8\xad'
     print(bytes1)  # b'\xe4\xb8\xad'
 
 
 if __name__ == "__main__":
-    print(basic_count("internlm_chat_7b"))
+    print(get_overlap_token_size("gpt_35_turbo", "gpt_4"))
+    # print(basic_count("internlm_chat_7b"))
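
The type normalization inside `get_overlap_token_size` is needed because vocabularies key on different types: SentencePiece-style tokenizers expose `str` keys while tiktoken-style ones expose `bytes`. A minimal sketch with hypothetical toy vocabs:

```python
# Hypothetical toy vocabs illustrating the bytes-vs-str key mismatch.
vocab_str = {"hello": 0, "world": 1, "中": 2}    # str-keyed (SentencePiece-style)
vocab_bytes = {b"hello": 0, b"\xe4\xb8\xad": 1}  # bytes-keyed (tiktoken-style)

# A naive intersection finds nothing, because "hello" != b"hello".
print(len(vocab_str.keys() & vocab_bytes.keys()))  # 0

# Encoding the str keys to UTF-8 makes the two vocabularies comparable.
normalized = {t.encode("utf-8") for t in vocab_str}
print(len(normalized & vocab_bytes.keys()))  # 2
```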
utils/_vocab.zh.jsonl ADDED
@@ -0,0 +1,1189 @@
1
+ {"id": 529, "token": "’", "type": "中文标点"}
2
+ {"id": 753, "token": "’s", "type": "中文标点"}
3
+ {"id": 863, "token": "”", "type": "中文标点"}
4
+ {"id": 1054, "token": " “", "type": "中文标点"}
5
+ {"id": 1389, "token": " –", "type": "中文标点"}
6
+ {"id": 1431, "token": "’t", "type": "中文标点"}
7
+ {"id": 1811, "token": "。", "type": "中文标点"}
8
+ {"id": 1981, "token": "…", "type": "中文标点"}
9
+ {"id": 2001, "token": " —", "type": "中文标点"}
10
+ {"id": 2029, "token": ".”", "type": "中文标点"}
11
+ {"id": 2118, "token": "“", "type": "中文标点"}
12
+ {"id": 2345, "token": "—", "type": "中文标点"}
13
+ {"id": 2476, "token": ",”", "type": "中文标点"}
14
+ {"id": 2950, "token": ".”\n\n", "type": "中文标点"}
15
+ {"id": 3207, "token": "’re", "type": "中文标点"}
16
+ {"id": 3451, "token": " ‘", "type": "中文标点"}
17
+ {"id": 3490, "token": "。\n\n", "type": "中文标点"}
18
+ {"id": 3922, "token": ",", "type": "中文标点"}
19
+ {"id": 4070, "token": "’ve", "type": "中文标点"}
20
+ {"id": 4235, "token": "–", "type": "中文标点"}
21
+ {"id": 4344, "token": "’m", "type": "中文标点"}
22
+ {"id": 4696, "token": " …", "type": "中文标点"}
23
+ {"id": 4805, "token": "’ll", "type": "中文标点"}
24
+ {"id": 5232, "token": ":", "type": "中文标点"}
25
+ {"id": 5486, "token": "、", "type": "中文标点"}
26
+ {"id": 5551, "token": "…\n\n", "type": "中文标点"}
27
+ {"id": 6447, "token": "!", "type": "中文标点"}
28
+ {"id": 7070, "token": "’d", "type": "中文标点"}
29
+ {"id": 7663, "token": "”\n\n", "type": "中文标点"}
30
+ {"id": 7705, "token": ")", "type": "中文标点"}
31
+ {"id": 8107, "token": "年", "type": "中文单字"}
32
+ {"id": 8713, "token": "——", "type": "中文标点"}
33
+ {"id": 9039, "token": "数", "type": "中文单字"}
34
+ {"id": 9080, "token": "日", "type": "中文单字"}
35
+ {"id": 9174, "token": "。\n", "type": "中文标点"}
36
+ {"id": 9520, "token": "”,", "type": "中文标点"}
37
+ {"id": 9554, "token": "的", "type": "中文单字"}
38
+ {"id": 9787, "token": " ·", "type": "中文标点"}
39
+ {"id": 9953, "token": "月", "type": "中文单字"}
40
+ {"id": 10110, "token": "(", "type": "中文标点"}
41
+ {"id": 10378, "token": "“I", "type": "中文标点"}
42
+ {"id": 10416, "token": " […", "type": "中文标点"}
43
+ {"id": 10646, "token": "」", "type": "中文标点"}
44
+ {"id": 11144, "token": "【", "type": "中文标点"}
45
+ {"id": 11199, "token": "】", "type": "中文标点"}
46
+ {"id": 11453, "token": "”.", "type": "中文标点"}
47
+ {"id": 11571, "token": "?", "type": "中文标点"}
48
+ {"id": 11883, "token": "用", "type": "中文单字"}
49
+ {"id": 12291, "token": " …\n\n", "type": "中文标点"}
50
+ {"id": 12671, "token": "?”", "type": "中文标点"}
51
+ {"id": 12996, "token": " […]\n\n", "type": "中文标点"}
52
+ {"id": 13153, "token": "成", "type": "中文单字"}
53
+ {"id": 13177, "token": "「", "type": "中文标点"}
54
+ {"id": 13372, "token": "名", "type": "中文单字"}
55
+ {"id": 13646, "token": "时", "type": "中文单字"}
56
+ {"id": 14260, "token": "·", "type": "中文标点"}
57
+ {"id": 14305, "token": "“The", "type": "中文标点"}
58
+ {"id": 14336, "token": "‘", "type": "中文标点"}
59
+ {"id": 14382, "token": "……", "type": "中文标点"}
60
+ {"id": 14558, "token": "件", "type": "中文单字"}
61
+ {"id": 14639, "token": ".’", "type": "中文标点"}
62
+ {"id": 15085, "token": "“We", "type": "中文标点"}
63
+ {"id": 15120, "token": "一", "type": "中文单字"}
64
+ {"id": 15179, "token": " „", "type": "中文标点"}
65
+ {"id": 15225, "token": "请", "type": "中文单字"}
66
+ {"id": 15397, "token": "”.\n\n", "type": "中文标点"}
67
+ {"id": 16325, "token": "中", "type": "中文单字"}
68
+ {"id": 16423, "token": "据", "type": "中文单字"}
69
+ {"id": 16616, "token": "?”\n\n", "type": "中文标点"}
70
+ {"id": 16620, "token": "————", "type": "中文标点"}
71
+ {"id": 16882, "token": "码", "type": "中文单字"}
72
+ {"id": 16937, "token": "不", "type": "中文单字"}
73
+ {"id": 17039, "token": "新", "type": "中文单字"}
74
+ {"id": 17161, "token": "文", "type": "中文单字"}
75
+ {"id": 17223, "token": "—and", "type": "中文标点"}
76
+ {"id": 17297, "token": "下", "type": "中文单字"}
77
+ {"id": 17620, "token": "分", "type": "中文单字"}
78
+ {"id": 17701, "token": "入", "type": "中文单字"}
79
+ {"id": 17792, "token": "人", "type": "中文单字"}
80
+ {"id": 17818, "token": "“It", "type": "中文标点"}
81
+ {"id": 17860, "token": "功", "type": "中文单字"}
82
+ {"id": 17905, "token": "上", "type": "中文单字"}
83
+ {"id": 17982, "token": "户", "type": "中文单字"}
84
+ {"id": 18171, "token": "!\n\n", "type": "中文标点"}
85
+ {"id": 18184, "token": "为", "type": "中文单字"}
86
+ {"id": 18217, "token": " ’", "type": "中文标点"}
87
+ {"id": 18319, "token": "!”", "type": "中文标点"}
88
+ {"id": 18363, "token": "间", "type": "中文单字"}
89
+ {"id": 18476, "token": "号", "type": "中文单字"}
90
+ {"id": 18655, "token": "取", "type": "中文单字"}
91
+ {"id": 18904, "token": "回", "type": "中文单字"}
92
+ {"id": 19000, "token": "在", "type": "中文单字"}
93
+ {"id": 19047, "token": "页", "type": "中文单字"}
94
+ {"id": 19066, "token": "。\n\n\n\n", "type": "中文标点"}
95
+ {"id": 19113, "token": "字", "type": "中文单字"}
96
+ {"id": 19361, "token": "有", "type": "中文单字"}
97
+ {"id": 19483, "token": "个", "type": "中文单字"}
98
+ {"id": 19524, "token": " ”", "type": "中文标点"}
99
+ {"id": 19653, "token": "成功", "type": "中文多字"}
100
+ {"id": 19967, "token": "作", "type": "中文单字"}
101
+ {"id": 20145, "token": "】【", "type": "中文标点"}
102
+ {"id": 20182, "token": "’,", "type": "中文标点"}
103
+ {"id": 20379, "token": "示", "type": "中文单字"}
104
+ {"id": 20600, "token": "用户", "type": "中文多字"}
105
+ {"id": 20675, "token": "数据", "type": "中文多字"}
106
+ {"id": 20834, "token": "出", "type": "中文单字"}
107
+ {"id": 21043, "token": "是", "type": "中文单字"}
108
+ {"id": 21060, "token": "….", "type": "中文标点"}
109
+ {"id": 21082, "token": "时间", "type": "中文多字"}
110
+ {"id": 21388, "token": "失", "type": "中文单字"}
111
+ {"id": 21405, "token": "表", "type": "中文单字"}
112
+ {"id": 21418, "token": "除", "type": "中文单字"}
113
+ {"id": 21601, "token": "加", "type": "中文单字"}
114
+ {"id": 21809, "token": "败", "type": "中文单字"}
115
+ {"id": 21909, "token": "~", "type": "中文标点"}
116
+ {"id": 21990, "token": "生", "type": "中文单字"}
117
+ {"id": 22023, "token": "信", "type": "中文单字"}
118
+ {"id": 22117, "token": "’est", "type": "中文标点"}
119
+ {"id": 22238, "token": "类", "type": "中文单字"}
120
+ {"id": 22324, "token": "置", "type": "中文单字"}
121
+ {"id": 22416, "token": "—the", "type": "中文标点"}
122
+ {"id": 22649, "token": "理", "type": "中文单字"}
123
+ {"id": 22656, "token": "本", "type": "中文单字"}
124
+ {"id": 22820, "token": "失败", "type": "中文多字"}
125
+ {"id": 23018, "token": "息", "type": "中文单字"}
126
+ {"id": 23039, "token": "行", "type": "中文单字"}
127
+ {"id": 23187, "token": "定", "type": "中文单字"}
128
+ {"id": 23189, "token": ",’", "type": "中文标点"}
129
+ {"id": 23226, "token": "改", "type": "中文单字"}
130
+ {"id": 23249, "token": " ", "type": "中文标点"}
131
+ {"id": 23530, "token": "市", "type": "中文单字"}
132
+ {"id": 23538, "token": "期", "type": "中文单字"}
133
+ {"id": 23897, "token": "以", "type": "中文单字"}
134
+ {"id": 23951, "token": "修", "type": "中文单字"}
135
+ {"id": 23954, "token": ")\n", "type": "中文标点"}
136
+ {"id": 24186, "token": "元", "type": "中文单字"}
137
+ {"id": 24273, "token": "方", "type": "中文单字"}
138
+ {"id": 24535, "token": "’.", "type": "中文标点"}
139
+ {"id": 24580, "token": "录", "type": "中文单字"}
140
+ {"id": 24775, "token": "区", "type": "中文单字"}
141
+ {"id": 24946, "token": "单", "type": "中文单字"}
142
+ {"id": 25010, "token": "�除", "type": "中文多字"}
143
+ {"id": 25129, "token": "位", "type": "中文单字"}
144
+ {"id": 25287, "token": "型", "type": "中文单字"}
145
+ {"id": 25333, "token": "法", "type": "中文单字"}
146
+ {"id": 25336, "token": "县", "type": "中文单字"}
147
+ {"id": 25359, "token": "存", "type": "中文单字"}
148
+ {"id": 25446, "token": "品", "type": "中文单字"}
149
+ {"id": 25580, "token": "前", "type": "中文单字"}
150
+ {"id": 25666, "token": "称", "type": "中文单字"}
151
+ {"id": 25758, "token": "!”\n\n", "type": "中文标点"}
152
+ {"id": 26016, "token": ";", "type": "中文标点"}
153
+ {"id": 26062, "token": "�回", "type": "中文多字"}
154
+ {"id": 26123, "token": "》", "type": "中文标点"}
155
+ {"id": 26130, "token": "注", "type": "中文单字"}
156
+ {"id": 26239, "token": "修改", "type": "中文多字"}
157
+ {"id": 26592, "token": "值", "type": "中文单字"}
158
+ {"id": 26794, "token": "输", "type": "中文单字"}
159
+ {"id": 26892, "token": "建", "type": "中文单字"}
160
+ {"id": 27179, "token": " (“", "type": "中文标点"}
161
+ {"id": 27327, "token": "能", "type": "中文单字"}
162
+ {"id": 27384, "token": "大", "type": "中文单字"}
163
+ {"id": 27452, "token": "例", "type": "中文单字"}
164
+ {"id": 27479, "token": "度", "type": "中文单字"}
165
+ {"id": 27704, "token": "始", "type": "中文单字"}
166
+ {"id": 27948, "token": "?\n\n", "type": "中文标点"}
167
+ {"id": 27996, "token": "文件", "type": "中文多字"}
168
+ {"id": 28037, "token": "到", "type": "中文单字"}
169
+ {"id": 28038, "token": "《", "type": "中文标点"}
170
+ {"id": 28190, "token": "面", "type": "中文单字"}
171
+ {"id": 28359, "token": "�数", "type": "中文多字"}
172
+ {"id": 28466, "token": "载", "type": "中文单字"}
173
+ {"id": 28469, "token": "信息", "type": "中文多字"}
174
+ {"id": 28542, "token": "点", "type": "中文单字"}
175
+ {"id": 28587, "token": "��取", "type": "中文多字"}
176
+ {"id": 28624, "token": " […]", "type": "中文标点"}
177
+ {"id": 28741, "token": "密", "type": "中文单字"}
178
+ {"id": 28833, "token": "动", "type": "中文单字"}
179
+ {"id": 28873, "token": "果", "type": "中文单字"}
180
+ {"id": 28918, "token": "、\n\n", "type": "中文标点"}
181
+ {"id": 28966, "token": ")\n\n", "type": "中文标点"}
182
+ {"id": 29096, "token": "—a", "type": "中文标点"}
183
+ {"id": 29129, "token": "图", "type": "中文单字"}
184
+ {"id": 29172, "token": "提", "type": "中文单字"}
185
+ {"id": 29391, "token": "发", "type": "中文单字"}
186
+ {"id": 29411, "token": ":\n", "type": "中文标点"}
187
+ {"id": 29430, "token": "式", "type": "中文单字"}
188
+ {"id": 29472, "token": "—\n\n", "type": "中文标点"}
189
+ {"id": 29504, "token": "国", "type": "中文单字"}
190
+ {"id": 29681, "token": "」\n\n", "type": "中文标点"}
191
+ {"id": 29706, "token": "删除", "type": "中文多字"}
192
+ {"id": 29719, "token": "’un", "type": "中文标点"}
193
+ {"id": 29741, "token": "登", "type": "中文单字"}
194
+ {"id": 29826, "token": "错", "type": "中文单字"}
195
+ {"id": 30019, "token": "。。", "type": "中文标点"}
196
+ {"id": 30046, "token": "者", "type": "中文单字"}
197
+ {"id": 30051, "token": "认", "type": "中文单字"}
198
+ {"id": 30156, "token": "误", "type": "中文单字"}
199
+ {"id": 30177, "token": "接", "type": "中文单字"}
200
+ {"id": 30184, "token": "’\n\n", "type": "中文标点"}
201
+ {"id": 30356, "token": "关", "type": "中文单字"}
202
+ {"id": 30358, "token": "重", "type": "中文单字"}
203
+ {"id": 30537, "token": "第", "type": "中文单字"}
204
+ {"id": 30590, "token": "地", "type": "中文单字"}
205
+ {"id": 30624, "token": "如", "type": "中文单字"}
206
+ {"id": 30697, "token": "————————", "type": "中文标点"}
207
+ {"id": 30735, "token": "设", "type": "中文单字"}
208
+ {"id": 30832, "token": "目", "type": "中文单字"}
209
+ {"id": 30867, "token": "开", "type": "中文单字"}
210
+ {"id": 30926, "token": "事", "type": "中文单字"}
211
+ {"id": 31041, "token": "�数", "type": "中文多字"}
212
+ {"id": 31091, "token": "名称", "type": "中文多字"}
213
+ {"id": 31378, "token": "“This", "type": "中文标点"}
214
+ {"id": 31472, "token": " :", "type": "中文标点"}
215
+ {"id": 31540, "token": "可", "type": "中文单字"}
216
+ {"id": 31634, "token": "要", "type": "中文单字"}
217
+ {"id": 31640, "token": "代", "type": "中文单字"}
218
+ {"id": 31809, "token": "小", "type": "中文单字"}
219
+ {"id": 31867, "token": "选", "type": "中文单字"}
220
+ {"id": 31944, "token": "标", "type": "中文单字"}
221
+ {"id": 31958, "token": "明", "type": "中文单字"}
222
+ {"id": 31968, "token": "编", "type": "中文单字"}
223
+ {"id": 32018, "token": "求", "type": "中文单字"}
224
+ {"id": 32218, "token": "列", "type": "中文单字"}
225
+ {"id": 32239, "token": "网", "type": "中文单字"}
226
+ {"id": 32296, "token": "输入", "type": "中文多字"}
227
+ {"id": 32307, "token": "万", "type": "中文单字"}
228
+ {"id": 32335, "token": "最", "type": "中文单字"}
229
+ {"id": 32351, "token": "!!", "type": "中文标点"}
230
+ {"id": 32438, "token": "�建", "type": "中文多字"}
231
+ {"id": 32626, "token": "返回", "type": "中文多字"}
232
+ {"id": 32648, "token": "器", "type": "中文单字"}
233
+ {"id": 32938, "token": "所", "type": "中文单字"}
234
+ {"id": 32943, "token": "内", "type": "中文单字"}
235
+ {"id": 33005, "token": "类型", "type": "中文多字"}
236
+ {"id": 33014, "token": "体", "type": "中文单字"}
237
+ {"id": 33035, "token": "通", "type": "中文单字"}
238
+ {"id": 33052, "token": "务", "type": "中文单字"}
239
+ {"id": 33091, "token": "此", "type": "中文单字"}
240
+ {"id": 33122, "token": "商", "type": "中文单字"}
241
+ {"id": 33144, "token": "序", "type": "中文单字"}
242
+ {"id": 33200, "token": "错误", "type": "中文多字"}
243
+ {"id": 33208, "token": "化", "type": "中文单字"}
244
+ {"id": 33420, "token": "消", "type": "中文单字"}
245
+ {"id": 33476, "token": "否", "type": "中文单字"}
246
+ {"id": 33563, "token": "保", "type": "中文单字"}
247
+ {"id": 33611, "token": "”)", "type": "中文标点"}
248
+ {"id": 33655, "token": "使", "type": "中文单字"}
249
+ {"id": 33671, "token": "次", "type": "中文单字"}
250
+ {"id": 33672, "token": "“You", "type": "中文标点"}
251
+ {"id": 33748, "token": "机", "type": "中文单字"}
252
+ {"id": 33764, "token": "对", "type": "中文单字"}
253
+ {"id": 33765, "token": "参数", "type": "中文多字"}
254
+ {"id": 33777, "token": "’é", "type": "中文标点"}
255
+ {"id": 33857, "token": "量", "type": "中文单字"}
256
+ {"id": 33904, "token": "函数", "type": "中文多字"}
257
+ {"id": 33967, "token": "密码", "type": "中文多字"}
258
+ {"id": 33976, "token": "查", "type": "中文单字"}
259
+ {"id": 34045, "token": "。”", "type": "中文标点"}
260
+ {"id": 34048, "token": "部", "type": "中文单字"}
261
+ {"id": 34171, "token": "性", "type": "中文单字"}
262
+ {"id": 34208, "token": "和", "type": "中文单字"}
263
+ {"id": 34226, "token": "更", "type": "中文单字"}
264
+ {"id": 34547, "token": "后", "type": "中文单字"}
265
+ {"id": 34577, "token": "证", "type": "中文单字"}
266
+ {"id": 34676, "token": " 【", "type": "中文标点"}
267
+ {"id": 34690, "token": "”,", "type": "中文标点"}
268
+ {"id": 34972, "token": "题", "type": "中文单字"}
269
+ {"id": 35056, "token": "确", "type": "中文单字"}
270
+ {"id": 35083, "token": "格", "type": "中文单字"}
271
+ {"id": 35147, "token": ".“", "type": "中文标点"}
272
+ {"id": 35192, "token": ".—", "type": "中文标点"}
273
+ {"id": 35284, "token": ".”\n\n\n\n", "type": "中文标点"}
274
+ {"id": 35287, "token": "了", "type": "中文单字"}
275
+ {"id": 35304, "token": "���", "type": "中文单字"}
276
+ {"id": 35330, "token": "金", "type": "中文单字"}
277
+ {"id": 35417, "token": "公", "type": "中文单字"}
278
+ {"id": 35424, "token": "午", "type": "中文单字"}
279
+ {"id": 35757, "token": "円", "type": "中文单字"}
280
+ {"id": 35816, "token": "“There", "type": "中文标点"}
281
+ {"id": 35818, "token": "片", "type": "中文单字"}
282
+ {"id": 35894, "token": "空", "type": "中文单字"}
283
+ {"id": 35959, "token": "请求", "type": "中文多字"}
284
+ {"id": 36225, "token": "��加", "type": "中文多字"}
285
+ {"id": 36319, "token": ".’\n\n", "type": "中文标点"}
286
+ {"id": 36343, "token": "态", "type": "中文单字"}
287
+ {"id": 36515, "token": "登录", "type": "中文多字"}
288
+ {"id": 36577, "token": "’une", "type": "中文标点"}
289
+ {"id": 36651, "token": "管", "type": "中文单字"}
290
+ {"id": 36668, "token": "主", "type": "中文单字"}
291
+ {"id": 36761, "token": "』", "type": "中文标点"}
292
+ {"id": 36827, "token": "天", "type": "中文单字"}
293
+ {"id": 36896, "token": "、「", "type": "中文标点"}
294
+ {"id": 37026, "token": "自", "type": "中文单字"}
295
+ {"id": 37046, "token": "我", "type": "中文单字"}
296
+ {"id": 37087, "token": "全", "type": "中文单字"}
297
+ {"id": 37271, "token": "今", "type": "中文单字"}
298
+ {"id": 37395, "token": "页面", "type": "中文多字"}
299
+ {"id": 37507, "token": "来", "type": "中文单字"}
300
+ {"id": 37648, "token": "��作", "type": "中文多字"}
301
+ {"id": 37656, "token": "正", "type": "中文单字"}
302
+ {"id": 37687, "token": "说", "type": "中文单字"}
303
+ {"id": 37689, "token": "意", "type": "中文单字"}
304
+ {"id": 37705, "token": "送", "type": "中文单字"}
305
+ {"id": 37729, "token": "容", "type": "中文单字"}
306
+ {"id": 37767, "token": "已", "type": "中文单字"}
307
+ {"id": 37985, "token": "结", "type": "中文单字"}
308
+ {"id": 38087, "token": ":“", "type": "中文标点"}
309
+ {"id": 38093, "token": "会", "type": "中文单字"}
310
+ {"id": 38129, "token": "使用", "type": "中文多字"}
311
+ {"id": 38232, "token": "。</", "type": "中文标点"}
312
+ {"id": 38365, "token": "。\r\n", "type": "中文标点"}
313
+ {"id": 38542, "token": "—but", "type": "中文标点"}
314
+ {"id": 38574, "token": "段", "type": "中文单字"}
315
+ {"id": 38609, "token": "�认", "type": "中文多字"}
316
+ {"id": 38684, "token": "“If", "type": "中文标点"}
317
+ {"id": 38741, "token": "。,", "type": "中文标点"}
318
+ {"id": 38743, "token": "计", "type": "中文单字"}
319
+ {"id": 39045, "token": ",请", "type": "中文多字"}
320
+ {"id": 39084, "token": "源", "type": "中文单字"}
321
+ {"id": 39135, "token": "色", "type": "中文单字"}
322
+ {"id": 39177, "token": "時", "type": "中文单字"}
323
+ {"id": 39209, "token": "交", "type": "中文单字"}
324
+ {"id": 39276, "token": "系", "type": "中文单字"}
325
+ {"id": 39282, "token": "过", "type": "中文单字"}
326
+ {"id": 39312, "token": "电", "type": "中文单字"}
327
+ {"id": 39365, "token": "询", "type": "中文单字"}
328
+ {"id": 39404, "token": "符", "type": "中文单字"}
329
+ {"id": 39425, "token": "…………", "type": "中文标点"}
330
+ {"id": 39442, "token": "未", "type": "中文单字"}
331
+ {"id": 39607, "token": "程", "type": "中文单字"}
332
+ {"id": 40053, "token": "常", "type": "中文单字"}
333
+ {"id": 40089, "token": "条", "type": "中文单字"}
334
+ {"id": 40195, "token": "下", "type": "中文单字"}
335
+ {"id": 40265, "token": "当", "type": "中文单字"}
336
+ {"id": 40452, "token": "管理", "type": "中文多字"}
337
+ {"id": 40466, "token": "��态", "type": "中文多字"}
338
+ {"id": 40474, "token": "情", "type": "中文单字"}
339
+ {"id": 40526, "token": "口", "type": "中文单字"}
340
+ {"id": 40565, "token": "“He", "type": "中文标点"}
341
+ {"id": 40702, "token": "’S", "type": "中文标点"}
342
+ {"id": 40753, "token": "’a", "type": "中文标点"}
343
+ {"id": 40862, "token": "合", "type": "中文单字"}
344
+ {"id": 41007, "token": "方法", "type": "中文多字"}
345
+ {"id": 41053, "token": "车", "type": "中文单字"}
346
+ {"id": 41073, "token": "实", "type": "中文单字"}
347
+ {"id": 41127, "token": "组", "type": "中文单字"}
348
+ {"id": 41128, "token": "—that", "type": "中文标点"}
349
+ {"id": 41190, "token": "操作", "type": "中文多字"}
350
+ {"id": 41354, "token": "’.\n\n", "type": "中文标点"}
351
+ {"id": 41401, "token": "版", "type": "中文单字"}
352
+ {"id": 41642, "token": "周", "type": "中文单字"}
353
+ {"id": 41723, "token": "址", "type": "中文单字"}
354
+ {"id": 41771, "token": "获取", "type": "中文多字"}
355
+ {"id": 41827, "token": ":\"", "type": "中文标点"}
356
+ {"id": 41914, "token": "记", "type": "中文单字"}
357
+ {"id": 41920, "token": "二", "type": "中文单字"}
358
+ {"id": 42016, "token": "同", "type": "中文单字"}
359
+ {"id": 42052, "token": "业", "type": "中文单字"}
360
+ {"id": 42081, "token": "权", "type": "中文单字"}
361
+ {"id": 42246, "token": "其", "type": "中文单字"}
362
+ {"id": 42275, "token": " ,", "type": "中文标点"}
363
+ {"id": 42399, "token": "进", "type": "中文单字"}
364
+ {"id": 42421, "token": "试", "type": "中文单字"}
365
+ {"id": 42462, "token": "验", "type": "中文单字"}
366
+ {"id": 42506, "token": "料", "type": "中文单字"}
367
+ {"id": 42553, "token": ",\n", "type": "中文标点"}
368
+ {"id": 42605, "token": ",“", "type": "中文标点"}
369
+ {"id": 42783, "token": "传", "type": "中文单字"}
370
+ {"id": 43032, "token": "述", "type": "中文单字"}
371
+ {"id": 43167, "token": "集", "type": "中文单字"}
372
+ {"id": 43240, "token": "多", "type": "中文单字"}
373
+ {"id": 43292, "token": "无", "type": "中文单字"}
374
+ {"id": 43323, "token": "员", "type": "中文单字"}
375
+ {"id": 43378, "token": "报", "type": "中文单字"}
376
+ {"id": 43444, "token": " (", "type": "中文标点"}
377
+ {"id": 43511, "token": "他", "type": "中文单字"}
378
+ {"id": 43568, "token": "無", "type": "中文单字"}
379
+ {"id": 43741, "token": "‘s", "type": "中文标点"}
380
+ {"id": 43955, "token": "添加", "type": "中文多字"}
381
+ {"id": 44130, "token": "“What", "type": "中文标点"}
382
+ {"id": 44309, "token": "服", "type": "中文单字"}
383
+ {"id": 44368, "token": "线", "type": "中文单字"}
384
+ {"id": 44388, "token": "这", "type": "中文单字"}
385
+ {"id": 44416, "token": "制", "type": "中文单字"}
386
+ {"id": 44529, "token": "  ", "type": "中文标点"}
387
+ {"id": 44603, "token": "—it", "type": "中文标点"}
388
+ {"id": 44620, "token": "『", "type": "中文标点"}
389
+ {"id": 44689, "token": "的", "type": "中文单字"}
390
+ {"id": 44816, "token": "�始", "type": "中文多字"}
391
+ {"id": 44820, "token": "�单", "type": "中文多字"}
392
+ {"id": 44915, "token": "内容", "type": "中文多字"}
393
+ {"id": 44996, "token": "’il", "type": "中文标点"}
394
+ {"id": 45018, "token": "设置", "type": "中文多字"}
395
+ {"id": 45059, "token": "生成", "type": "中文多字"}
396
+ {"id": 45163, "token": "将", "type": "中文单字"}
397
+ {"id": 45191, "token": "状态", "type": "中文多字"}
398
+ {"id": 45221, "token": "=”", "type": "中文标点"}
399
+ {"id": 45258, "token": "?’", "type": "中文标点"}
400
+ {"id": 45277, "token": "列表", "type": "中文多字"}
401
+ {"id": 45390, "token": "处", "type": "中文单字"}
402
+ {"id": 45460, "token": "】\n\n", "type": "中文标点"}
403
+ {"id": 45472, "token": "输", "type": "中文单字"}
404
+ {"id": 45516, "token": "!\");\n", "type": "中文标点"}
405
+ {"id": 45631, "token": " 「", "type": "中文标点"}
406
+ {"id": 45736, "token": "高", "type": "中文单字"}
407
+ {"id": 45829, "token": "子", "type": "中文单字"}
408
+ {"id": 45893, "token": "道", "type": "中文单字"}
409
+ {"id": 45934, "token": "�述", "type": "中文多字"}
410
+ {"id": 46028, "token": "章", "type": "中文单字"}
411
+ {"id": 46031, "token": "字段", "type": "中文多字"}
412
+ {"id": 46034, "token": "手", "type": "中文单字"}
413
+ {"id": 46056, "token": "库", "type": "中文单字"}
414
+ {"id": 46091, "token": "三", "type": "中文单字"}
415
+ {"id": 46093, "token": "….\n\n", "type": "中文标点"}
416
+ {"id": 46233, "token": "“In", "type": "中文标点"}
417
+ {"id": 46239, "token": "提示", "type": "中文多字"}
418
+ {"id": 46281, "token": "从", "type": "中文单字"}
419
+ {"id": 46456, "token": "支", "type": "中文单字"}
420
+ {"id": 46690, "token": "“They", "type": "中文标点"}
421
+ {"id": 46729, "token": "家", "type": "中文单字"}
422
+ {"id": 46885, "token": "日期", "type": "中文多字"}
423
+ {"id": 46961, "token": "长", "type": "中文单字"}
424
+ {"id": 47000, "token": "付", "type": "中文单字"}
425
+ {"id": 47012, "token": "获取", "type": "中文多字"}
426
+ {"id": 47018, "token": "秒", "type": "中文单字"}
427
+ {"id": 47030, "token": "图片", "type": "中文多字"}
428
+ {"id": 47043, "token": "商品", "type": "中文多字"}
429
+ {"id": 47095, "token": "路", "type": "中文单字"}
430
+ {"id": 47200, "token": "代码", "type": "中文多字"}
431
+ {"id": 47406, "token": "完", "type": "中文单字"}
432
+ {"id": 47436, "token": ":</", "type": "中文标点"}
433
+ {"id": 47523, "token": "象", "type": "中文单字"}
434
+ {"id": 47548, "token": "则", "type": "中文单字"}
435
+ {"id": 47551, "token": "现", "type": "中文单字"}
436
+ {"id": 47566, "token": "设", "type": "中文单字"}
437
+ {"id": 47577, "token": "地址", "type": "中文多字"}
438
+ {"id": 47585, "token": "保存", "type": "中文多字"}
439
+ {"id": 47653, "token": "京", "type": "中文单字"}
440
+ {"id": 47770, "token": "转", "type": "中文单字"}
441
+ {"id": 47896, "token": " –\n\n", "type": "中文标点"}
442
+ {"id": 47971, "token": "�示", "type": "中文多字"}
443
+ {"id": 48039, "token": "辑", "type": "中文单字"}
444
+ {"id": 48044, "token": "一个", "type": "中文多字"}
445
+ {"id": 48249, "token": "限", "type": "中文单字"}
446
+ {"id": 48349, "token": "“A", "type": "中文标点"}
447
+ {"id": 48463, "token": "默认", "type": "中文多字"}
448
+ {"id": 48634, "token": "力", "type": "中文单字"}
449
+ {"id": 48706, "token": "存在", "type": "中文多字"}
450
+ {"id": 48785, "token": "数", "type": "中文单字"}
451
+ {"id": 48858, "token": "创建", "type": "中文多字"}
452
+ {"id": 48864, "token": "学", "type": "中文单字"}
453
+ {"id": 48915, "token": "外", "type": "中文单字"}
454
+ {"id": 48972, "token": "调", "type": "中文单字"}
455
+ {"id": 48974, "token": "服务", "type": "中文多字"}
456
+ {"id": 48982, "token": "项", "type": "中文单字"}
457
+ {"id": 49055, "token": "请输入", "type": "中文多字"}
458
+ {"id": 49216, "token": ".”\n", "type": "中文标点"}
459
+ {"id": 49372, "token": "),", "type": "中文标点"}
460
+ {"id": 49409, "token": "北", "type": "中文单字"}
461
+ {"id": 49491, "token": "字符", "type": "中文多字"}
462
+ {"id": 49525, "token": "—in", "type": "中文标点"}
463
+ {"id": 49543, "token": ":\n\n", "type": "中文标点"}
464
+ {"id": 49792, "token": "工", "type": "中文单字"}
465
+ {"id": 49838, "token": "笑", "type": "中文单字"}
466
+ {"id": 49928, "token": "监", "type": "中文单字"}
467
+ {"id": 49977, "token": "“That", "type": "中文标点"}
468
+ {"id": 49988, "token": "任", "type": "中文单字"}
469
+ {"id": 50004, "token": "—which", "type": "中文标点"}
470
+ {"id": 50021, "token": "相", "type": "中文单字"}
471
+ {"id": 50027, "token": "验证", "type": "中文多字"}
472
+ {"id": 50034, "token": "微", "type": "中文单字"}
473
+ {"id": 50126, "token": "册", "type": "中文单字"}
474
+ {"id": 50182, "token": "联", "type": "中文单字"}
475
+ {"id": 50211, "token": "平", "type": "中文单字"}
476
+ {"id": 50285, "token": "增", "type": "中文单字"}
477
+ {"id": 50287, "token": "听", "type": "中文单字"}
478
+ {"id": 50338, "token": "解", "type": "中文单字"}
479
+ {"id": 50617, "token": "—to", "type": "中文标点"}
480
+ {"id": 50667, "token": "等", "type": "中文单字"}
481
+ {"id": 50808, "token": "’ai", "type": "中文标点"}
482
+ {"id": 50928, "token": "得", "type": "中文单字"}
483
+ {"id": 51107, "token": "更新", "type": "中文多字"}
484
+ {"id": 51109, "token": "收", "type": "中文单字"}
485
+ {"id": 51142, "token": "用户", "type": "中文多字"}
486
+ {"id": 51202, "token": "选�", "type": "中文多字"}
487
+ {"id": 51279, "token": "…”", "type": "中文标点"}
488
+ {"id": 51385, "token": "安", "type": "中文单字"}
489
+ {"id": 51392, "token": "价", "type": "中文单字"}
490
+ {"id": 51431, "token": "第", "type": "中文单字"}
491
+ {"id": 51450, "token": "取消", "type": "中文多字"}
492
+ {"id": 51466, "token": "藏", "type": "中文单字"}
493
+ {"id": 51477, "token": "创建", "type": "中文多字"}
494
+ {"id": 51504, "token": "选择", "type": "中文多字"}
495
+ {"id": 51510, "token": "订单", "type": "中文多字"}
496
+ {"id": 51609, "token": "命", "type": "中文单字"}
497
+ {"id": 51611, "token": "应", "type": "中文单字"}
498
+ {"id": 51747, "token": "为空", "type": "中文多字"}
499
+ {"id": 51749, "token": "—or", "type": "中文标点"}
500
+ {"id": 51757, "token": "—I", "type": "中文标点"}
501
+ {"id": 51786, "token": "“,", "type": "中文标点"}
502
+ {"id": 51928, "token": "“When", "type": "中文标点"}
503
+ {"id": 52030, "token": "看", "type": "中文单字"}
504
+ {"id": 52084, "token": "索", "type": "中文单字"}
505
+ {"id": 52188, "token": "�始化", "type": "中文多字"}
506
+ {"id": 52225, "token": "资", "type": "中文单字"}
507
+ {"id": 52254, "token": "查询", "type": "中文多字"}
508
+ {"id": 52289, "token": "’en", "type": "中文标点"}
509
+ {"id": 52332, "token": "产", "type": "中文单字"}
510
+ {"id": 52563, "token": "表示", "type": "中文多字"}
511
+ {"id": 52675, "token": "串", "type": "中文单字"}
512
+ {"id": 52927, "token": "布", "type": "中文单字"}
513
+ {"id": 53229, "token": "原", "type": "中文单字"}
514
+ {"id": 53263, "token": "…..", "type": "中文标点"}
515
+ {"id": 53283, "token": "知", "type": "中文单字"}
516
+ {"id": 53434, "token": "级", "type": "中文单字"}
517
+ {"id": 53513, "token": "––", "type": "中文标点"}
518
+ {"id": 53610, "token": "水", "type": "中文单字"}
519
+ {"id": 53626, "token": "上传", "type": "中文多字"}
520
+ {"id": 53676, "token": "…and", "type": "中文标点"}
521
+ {"id": 53802, "token": "监听", "type": "中文多字"}
522
+ {"id": 53826, "token": "击", "type": "中文单字"}
523
+ {"id": 53901, "token": "好", "type": "中文单字"}
524
+ {"id": 53953, "token": "物", "type": "中文单字"}
525
+ {"id": 54140, "token": "文", "type": "中文单字"}
526
+ {"id": 54154, "token": "设置", "type": "中文多字"}
527
+ {"id": 54253, "token": "不能", "type": "中文多字"}
528
+ {"id": 54322, "token": "放", "type": "中文单字"}
529
+ {"id": 54456, "token": "亿", "type": "中文单字"}
530
+ {"id": 54493, "token": "经", "type": "中文单字"}
531
+ {"id": 54581, "token": "描述", "type": "中文多字"}
532
+ {"id": 54689, "token": "。。\n\n", "type": "中文标点"}
533
+ {"id": 54747, "token": "。“", "type": "中文标点"}
534
+ {"id": 54872, "token": "模", "type": "中文单字"}
535
+ {"id": 55030, "token": "之", "type": "中文单字"}
536
+ {"id": 55038, "token": "台", "type": "中文单字"}
537
+ {"id": 55080, "token": "…I", "type": "中文标点"}
538
+ {"id": 55121, "token": "显示", "type": "中文多字"}
539
+ {"id": 55139, "token": "州", "type": "中文单字"}
540
+ {"id": 55434, "token": "—is", "type": "中文标点"}
541
+ {"id": 55487, "token": "配", "type": "中文单字"}
542
+ {"id": 55642, "token": "处理", "type": "中文多字"}
543
+ {"id": 55723, "token": "画", "type": "中文单字"}
544
+ {"id": 55758, "token": "统", "type": "中文单字"}
545
+ {"id": 55951, "token": "是", "type": "中文单字"}
546
+ {"id": 55999, "token": "共", "type": "中文单字"}
547
+ {"id": 56026, "token": "连", "type": "中文单字"}
548
+ {"id": 56040, "token": "〜", "type": "中文标点"}
549
+ {"id": 56163, "token": "„", "type": "中文标点"}
550
+ {"id": 56209, "token": "…\"", "type": "中文标点"}
551
+ {"id": 56235, "token": "海", "type": "中文单字"}
552
+ {"id": 56386, "token": "开始", "type": "中文多字"}
553
+ {"id": 56438, "token": "所有", "type": "中文多字"}
554
+ {"id": 56602, "token": "节", "type": "中文单字"}
555
+ {"id": 56716, "token": "返回", "type": "中文多字"}
556
+ {"id": 56906, "token": "退", "type": "中文单字"}
557
+ {"id": 56907, "token": "”。", "type": "中文标点"}
558
+ {"id": 56955, "token": "”),", "type": "中文标点"}
559
+ {"id": 56965, "token": "間", "type": "中文单字"}
560
+ {"id": 57106, "token": "比", "type": "中文单字"}
561
+ {"id": 57107, "token": "问", "type": "中文单字"}
562
+ {"id": 57237, "token": "至", "type": "中文单字"}
563
+ {"id": 57287, "token": "’aut", "type": "中文标点"}
564
+ {"id": 57378, "token": "备", "type": "中文单字"}
565
+ {"id": 57633, "token": "”:", "type": "中文标点"}
566
+ {"id": 57668, "token": "你", "type": "中文单字"}
567
+ {"id": 57752, "token": "黑", "type": "中文单字"}
568
+ {"id": 57861, "token": "…”\n\n", "type": "中文标点"}
569
+ {"id": 57892, "token": "’av", "type": "中文标点"}
570
+ {"id": 58004, "token": "下午", "type": "中文多字"}
571
+ {"id": 58119, "token": "编辑", "type": "中文多字"}
572
+ {"id": 58291, "token": "或", "type": "中文单字"}
573
+ {"id": 58318, "token": "与", "type": "中文单字"}
574
+ {"id": 58322, "token": "影", "type": "中文单字"}
575
+ {"id": 58386, "token": "’h", "type": "中文标点"}
576
+ {"id": 58521, "token": "作者", "type": "中文多字"}
577
+ {"id": 58543, "token": "话", "type": "中文单字"}
578
+ {"id": 58552, "token": "视", "type": "中文单字"}
579
+ {"id": 58653, "token": "读", "type": "中文单字"}
580
+ {"id": 58655, "token": "告", "type": "中文单字"}
581
+ {"id": 58666, "token": "美", "type": "中文单字"}
582
+ {"id": 58721, "token": "事件", "type": "中文多字"}
583
+ {"id": 58850, "token": "女", "type": "中文单字"}
584
+ {"id": 58911, "token": "山", "type": "中文单字"}
585
+ {"id": 59243, "token": "和", "type": "中文单字"}
586
+ {"id": 59363, "token": "生", "type": "中文单字"}
587
+ {"id": 59459, "token": "。(", "type": "中文标点"}
588
+ {"id": 59462, "token": "需", "type": "中文单字"}
589
+ {"id": 59464, "token": "复", "type": "中文单字"}
590
+ {"id": 59505, "token": "手机", "type": "中文多字"}
591
+ {"id": 59563, "token": "南", "type": "中文单字"}
592
+ {"id": 59614, "token": "必", "type": "中文单字"}
593
+ {"id": 59622, "token": "�行", "type": "中文多字"}
594
+ {"id": 59712, "token": "」「", "type": "中文标点"}
595
+ {"id": 59757, "token": "分", "type": "中文单字"}
596
+ {"id": 59795, "token": "中国", "type": "中文多字"}
597
+ {"id": 59892, "token": "闭", "type": "中文单字"}
598
+ {"id": 59914, "token": "加载", "type": "中文多字"}
599
+ {"id": 60174, "token": "城", "type": "中文单字"}
600
+ {"id": 60205, "token": "用户名", "type": "中文多字"}
601
+ {"id": 60233, "token": " 。", "type": "中文标点"}
602
+ {"id": 60239, "token": "�性", "type": "中文多字"}
603
+ {"id": 60251, "token": "结果", "type": "中文多字"}
604
+ {"id": 60317, "token": ";\n", "type": "中文标点"}
605
+ {"id": 60358, "token": "近", "type": "中文单字"}
606
+ {"id": 60455, "token": "效", "type": "中文单字"}
607
+ {"id": 60632, "token": "利", "type": "中文单字"}
608
+ {"id": 60634, "token": "移", "type": "中文单字"}
609
+ {"id": 60654, "token": "—as", "type": "中文标点"}
610
+ {"id": 60656, "token": "’int", "type": "中文标点"}
611
+ {"id": 60710, "token": "–\n\n", "type": "中文标点"}
612
+ {"id": 60843, "token": "总", "type": "中文单字"}
613
+ {"id": 60979, "token": "按", "type": "中文单字"}
614
+ {"id": 61056, "token": "排", "type": "中文单字"}
615
+ {"id": 61075, "token": "首", "type": "中文单字"}
616
+ {"id": 61131, "token": "’n", "type": "中文标点"}
617
+ {"id": 61176, "token": "··", "type": "中文标点"}
618
+ {"id": 61304, "token": "記", "type": "中文单字"}
619
+ {"id": 61311, "token": "————————————————", "type": "中文标点"}
620
+ {"id": 61337, "token": "社", "type": "中文单字"}
621
+ {"id": 61496, "token": "标题", "type": "中文多字"}
622
+ {"id": 61553, "token": "“As", "type": "中文标点"}
623
+ {"id": 61559, "token": "“No", "type": "中文标点"}
624
+ {"id": 61603, "token": "“But", "type": "中文标点"}
625
+ {"id": 61633, "token": "注意", "type": "中文多字"}
626
+ {"id": 61648, "token": "完成", "type": "中文多字"}
627
+ {"id": 61710, "token": "确定", "type": "中文多字"}
628
+ {"id": 61786, "token": "西", "type": "中文单字"}
629
+ {"id": 61826, "token": "先", "type": "中文单字"}
630
+ {"id": 61903, "token": "…\"\n\n", "type": "中文标点"}
631
+ {"id": 61994, "token": "然", "type": "中文单字"}
632
+ {"id": 62049, "token": "键", "type": "中文单字"}
633
+ {"id": 62205, "token": "名", "type": "中文单字"}
634
+ {"id": 62249, "token": "周期", "type": "中文多字"}
635
+ {"id": 62291, "token": "额", "type": "中文单字"}
636
+ {"id": 62543, "token": "写", "type": "中文单字"}
637
+ {"id": 62597, "token": "“My", "type": "中文标点"}
638
+ {"id": 62717, "token": "�名", "type": "中文多字"}
639
+ {"id": 62789, "token": "注册", "type": "中文多字"}
640
+ {"id": 62855, "token": "签", "type": "中文单字"}
641
+ {"id": 63091, "token": "自", "type": "中文单字"}
642
+ {"id": 63093, "token": "。',\n", "type": "中文标点"}
643
+ {"id": 63212, "token": "因", "type": "中文单字"}
644
+ {"id": 63289, "token": "下载", "type": "中文多字"}
645
+ {"id": 63344, "token": "如果", "type": "中文多字"}
646
+ {"id": 63362, "token": "数据", "type": "中文多字"}
647
+ {"id": 63397, "token": "命周期", "type": "中文多字"}
648
+ {"id": 63679, "token": "注", "type": "中文单字"}
649
+ {"id": 63750, "token": "”—", "type": "中文标点"}
650
+ {"id": 63938, "token": "—not", "type": "中文标点"}
651
+ {"id": 63977, "token": " —\n\n", "type": "中文标点"}
652
+ {"id": 64022, "token": "别", "type": "中文单字"}
653
+ {"id": 64026, "token": "并", "type": "中文单字"}
654
+ {"id": 64045, "token": "异", "type": "中文单字"}
655
+ {"id": 64063, "token": "束", "type": "中文单字"}
656
+ {"id": 64171, "token": "修改", "type": "中文多字"}
657
+ {"id": 64173, "token": "删除", "type": "中文多字"}
658
+ {"id": 64179, "token": "生命周期", "type": "中文多字"}
659
+ {"id": 64209, "token": "心", "type": "中文单字"}
660
+ {"id": 64376, "token": "。\",\n", "type": "中文标点"}
661
+ {"id": 64414, "token": "链", "type": "中文单字"}
662
+ {"id": 64467, "token": "指", "type": "中文单字"}
663
+ {"id": 64479, "token": "评", "type": "中文单字"}
664
+ {"id": 64531, "token": "整", "type": "中文单字"}
665
+ {"id": 64623, "token": "’in", "type": "中文标点"}
666
+ {"id": 64803, "token": "四", "type": "中文单字"}
667
+ {"id": 64889, "token": "断", "type": "中文单字"}
668
+ {"id": 64936, "token": "角", "type": "中文单字"}
669
+ {"id": 64960, "token": "生命周期函数", "type": "中文多字"}
670
+ {"id": 65053, "token": "监听页面", "type": "中文多字"}
671
+ {"id": 65164, "token": "连接", "type": "中文多字"}
672
+ {"id": 65218, "token": "上", "type": "中文单字"}
673
+ {"id": 65305, "token": "消息", "type": "中文多字"}
674
+ {"id": 65312, "token": "”).", "type": "中文标点"}
675
+ {"id": 65372, "token": "软", "type": "中文单字"}
676
+ {"id": 65455, "token": "头", "type": "中文单字"}
677
+ {"id": 65459, "token": ")、", "type": "中文标点"}
678
+ {"id": 65529, "token": "对象", "type": "中文多字"}
679
+ {"id": 65571, "token": "是否", "type": "中文多字"}
680
+ {"id": 65573, "token": "邮", "type": "中文单字"}
681
+ {"id": 65659, "token": "义", "type": "中文单字"}
682
+ {"id": 65743, "token": "司", "type": "中文单字"}
683
+ {"id": 65782, "token": "步", "type": "中文单字"}
684
+ {"id": 65789, "token": "门", "type": "中文单字"}
685
+ {"id": 65820, "token": "导", "type": "中文单字"}
686
+ {"id": 65854, "token": "客", "type": "中文单字"}
687
+ {"id": 65884, "token": "不能为空", "type": "中文多字"}
688
+ {"id": 65917, "token": "右", "type": "中文单字"}
689
+ {"id": 66052, "token": "频", "type": "中文单字"}
690
+ {"id": 66101, "token": "\"—", "type": "中文标点"}
691
+ {"id": 66201, "token": "像", "type": "中文单字"}
692
+ {"id": 66327, "token": "。「", "type": "中文标点"}
693
+ {"id": 66378, "token": "特", "type": "中文单字"}
694
+ {"id": 66383, "token": "」と", "type": "中文标点"}
695
+ {"id": 66545, "token": "”;", "type": "中文标点"}
696
+ {"id": 66621, "token": " ….", "type": "中文标点"}
697
+ {"id": 66625, "token": "“Our", "type": "中文标点"}
698
+ {"id": 66677, "token": "记录", "type": "中文多字"}
699
+ {"id": 66679, "token": "…\n\n\n", "type": "中文标点"}
700
+ {"id": 66776, "token": "非", "type": "中文单字"}
701
+ {"id": 66850, "token": " “[", "type": "中文标点"}
702
+ {"id": 66870, "token": "省", "type": "中文单字"}
703
+ {"id": 67117, "token": "输出", "type": "中文多字"}
704
+ {"id": 67178, "token": "造", "type": "中文单字"}
705
+ {"id": 67282, "token": "’ét", "type": "中文标点"}
706
+ {"id": 67287, "token": "姓名", "type": "中文多字"}
707
+ {"id": 67494, "token": "说明", "type": "中文多字"}
708
+ {"id": 67658, "token": "字符串", "type": "中文多字"}
709
+ {"id": 67669, "token": "径", "type": "中文单字"}
710
+ {"id": 67735, "token": "�试", "type": "中文多字"}
711
+ {"id": 67870, "token": "’e", "type": "中文标点"}
712
+ {"id": 67886, "token": " ”\n\n", "type": "中文标点"}
713
+ {"id": 67933, "token": "详", "type": "中文单字"}
714
+ {"id": 67986, "token": "验证码", "type": "中文多字"}
715
+ {"id": 67998, "token": "。\\", "type": "中文标点"}
716
+ {"id": 68171, "token": "由", "type": "中文单字"}
717
+ {"id": 68230, "token": "^", "type": "中文标点"}
718
+ {"id": 68306, "token": "’on", "type": "中文标点"}
719
+ {"id": 68379, "token": "包", "type": "中文单字"}
720
+ {"id": 68438, "token": "通过", "type": "中文多字"}
721
+ {"id": 68464, "token": "东", "type": "中文单字"}
722
+ {"id": 68850, "token": ")—", "type": "中文标点"}
723
+ {"id": 68931, "token": "论", "type": "中文单字"}
724
+ {"id": 68932, "token": "“And", "type": "中文标点"}
725
+ {"id": 69049, "token": "当前", "type": "中文多字"}
726
+ {"id": 69165, "token": "络", "type": "中文单字"}
727
+ {"id": 69253, "token": "款", "type": "中文单字"}
728
+ {"id": 69272, "token": "�藏", "type": "中文多字"}
729
+ {"id": 69362, "token": "支付", "type": "中文多字"}
730
+ {"id": 69496, "token": "启", "type": "中文单字"}
731
+ {"id": 69636, "token": "而", "type": "中文单字"}
732
+ {"id": 69856, "token": "填", "type": "中文单字"}
733
+ {"id": 69905, "token": "格式", "type": "中文多字"}
734
+ {"id": 69962, "token": "释", "type": "中文单字"}
735
+ {"id": 69978, "token": "持", "type": "中文单字"}
736
+ {"id": 70041, "token": "��索", "type": "中文多字"}
737
+ {"id": 70090, "token": "北京", "type": "中文多字"}
738
+ {"id": 70141, "token": "向", "type": "中文单字"}
739
+ {"id": 70158, "token": "输入", "type": "中文多字"}
740
+ {"id": 70203, "token": "算", "type": "中文单字"}
741
+ {"id": 70214, "token": "“So", "type": "中文标点"}
742
+ {"id": 70262, "token": "对", "type": "中文单字"}
743
+ {"id": 70277, "token": "江", "type": "中文单字"}
744
+ {"id": 70284, "token": "不存在", "type": "中文多字"}
745
+ {"id": 70349, "token": "里", "type": "中文单字"}
746
+ {"id": 70453, "token": "查", "type": "中文单字"}
747
+ {"id": 70472, "token": "如", "type": "中文单字"}
748
+ {"id": 70525, "token": "发", "type": "中文单字"}
749
+ {"id": 70542, "token": "份", "type": "中文单字"}
750
+ {"id": 70615, "token": "),", "type": "中文标点"}
751
+ {"id": 70616, "token": "责", "type": "中文单字"}
752
+ {"id": 70626, "token": "科", "type": "中文单字"}
753
+ {"id": 70694, "token": "文件", "type": "中文多字"}
754
+ {"id": 70774, "token": "类", "type": "中文单字"}
755
+ {"id": 70821, "token": "民", "type": "中文单字"}
756
+ {"id": 70924, "token": "数组", "type": "中文多字"}
757
+ {"id": 71005, "token": "治", "type": "中文单字"}
758
+ {"id": 71082, "token": "%,", "type": "中文标点"}
759
+ {"id": 71174, "token": "声", "type": "中文单字"}
760
+ {"id": 71201, "token": "—they", "type": "中文标点"}
761
+ {"id": 71208, "token": "男", "type": "中文单字"}
762
+ {"id": 71270, "token": "“(", "type": "中文标点"}
763
+ {"id": 71298, "token": "[…", "type": "中文标点"}
764
+ {"id": 71461, "token": "重新", "type": "中文多字"}
765
+ {"id": 71480, "token": "—you", "type": "中文标点"}
766
+ {"id": 71600, "token": "设计", "type": "中文多字"}
767
+ {"id": 71638, "token": "分类", "type": "中文多字"}
768
+ {"id": 71668, "token": "输出", "type": "中文多字"}
769
+ {"id": 71689, "token": "以上", "type": "中文多字"}
770
+ {"id": 71733, "token": "异常", "type": "中文多字"}
771
+ {"id": 71869, "token": "族", "type": "中文单字"}
772
+ {"id": 71890, "token": "站", "type": "中文单字"}
773
+ {"id": 72027, "token": "没", "type": "中文单字"}
774
+ {"id": 72069, "token": "参数", "type": "中文多字"}
775
+ {"id": 72099, "token": "県", "type": "中文单字"}
776
+ {"id": 72125, "token": "雅", "type": "中文单字"}
777
+ {"id": 72209, "token": "版本", "type": "中文多字"}
778
+ {"id": 72234, "token": "换", "type": "中文单字"}
779
+ {"id": 72237, "token": "核", "type": "中文单字"}
780
+ {"id": 72238, "token": "素", "type": "中文单字"}
781
+ {"id": 72318, "token": "—for", "type": "中文标点"}
782
+ {"id": 72368, "token": "都", "type": "中文单字"}
783
+ {"id": 72404, "token": "超", "type": "中文单字"}
784
+ {"id": 72434, "token": "!’", "type": "中文标点"}
785
+ {"id": 72456, "token": "网络", "type": "中文多字"}
786
+ {"id": 72516, "token": "店", "type": "中文单字"}
787
+ {"id": 72718, "token": "起", "type": "中文单字"}
788
+ {"id": 72794, "token": "隐藏", "type": "中文多字"}
789
+ {"id": 72843, "token": "享", "type": "中文单字"}
790
+ {"id": 72873, "token": "方", "type": "中文单字"}
791
+ {"id": 72917, "token": "进行", "type": "中文多字"}
792
+ {"id": 73051, "token": "是否", "type": "中文多字"}
793
+ {"id": 73071, "token": "提交", "type": "中文多字"}
794
+ {"id": 73117, "token": "发送", "type": "中文多字"}
795
+ {"id": 73164, "token": "联系", "type": "中文多字"}
796
+ {"id": 73325, "token": "拉", "type": "中文单字"}
797
+ {"id": 73329, "token": "…\n\n\n\n", "type": "中文标点"}
798
+ {"id": 73361, "token": "米", "type": "中文单字"}
799
+ {"id": 73548, "token": "系统", "type": "中文多字"}
800
+ {"id": 73686, "token": "引", "type": "中文单字"}
801
+ {"id": 73740, "token": "编号", "type": "中文多字"}
802
+ {"id": 73751, "token": "点击", "type": "中文多字"}
803
+ {"id": 73769, "token": "更", "type": "中文单字"}
804
+ {"id": 73939, "token": "…)", "type": "中文标点"}
805
+ {"id": 73958, "token": "中", "type": "中文单字"}
806
+ {"id": 73981, "token": "语", "type": "中文单字"}
807
+ {"id": 74022, "token": "”?", "type": "中文标点"}
808
+ {"id": 74090, "token": "土", "type": "中文单字"}
809
+ {"id": 74138, "token": "宋", "type": "中文单字"}
810
+ {"id": 74245, "token": "直", "type": "中文单字"}
811
+ {"id": 74257, "token": "每", "type": "中文单字"}
812
+ {"id": 74318, "token": "公司", "type": "中文多字"}
813
+ {"id": 74396, "token": "箱", "type": "中文单字"}
814
+ {"id": 74412, "token": "字", "type": "中文单字"}
815
+ {"id": 74445, "token": "项目", "type": "中文多字"}
816
+ {"id": 74482, "token": "後", "type": "中文单字"}
817
+ {"id": 74662, "token": "在", "type": "中文单字"}
818
+ {"id": 74770, "token": "可以", "type": "中文多字"}
819
+ {"id": 74843, "token": "参", "type": "中文单字"}
820
+ {"id": 75140, "token": "变", "type": "中文单字"}
821
+ {"id": 75146, "token": "基", "type": "中文单字"}
822
+ {"id": 75259, "token": "页面", "type": "中文多字"}
823
+ {"id": 75267, "token": "場", "type": "中文单字"}
824
+ {"id": 75293, "token": "待", "type": "中文单字"}
825
+ {"id": 75320, "token": "程序", "type": "中文多字"}
826
+ {"id": 75376, "token": ")。", "type": "中文标点"}
827
+ {"id": 75486, "token": "规", "type": "中文单字"}
828
+ {"id": 75493, "token": "数据库", "type": "中文多字"}
829
+ {"id": 75513, "token": "政", "type": "中文单字"}
830
+ {"id": 75550, "token": "“For", "type": "中文标点"}
831
+ {"id": 75630, "token": "雅黑", "type": "中文多字"}
832
+ {"id": 75631, "token": "软雅黑", "type": "中文多字"}
833
+ {"id": 75761, "token": "排序", "type": "中文多字"}
834
+ {"id": 75787, "token": "。\n\n\n\n\n\n", "type": "中文标点"}
835
+ {"id": 75863, "token": "也", "type": "中文单字"}
836
+ {"id": 75910, "token": "介", "type": "中文单字"}
837
+ {"id": 75976, "token": "首页", "type": "中文多字"}
838
+ {"id": 76070, "token": "—including", "type": "中文标点"}
839
+ {"id": 76099, "token": "关闭", "type": "中文多字"}
840
+ {"id": 76148, "token": ",\n\n", "type": "中文标点"}
841
+ {"id": 76161, "token": "钟", "type": "中文单字"}
842
+ {"id": 76208, "token": "五", "type": "中文单字"}
843
+ {"id": 76217, "token": "执行", "type": "中文多字"}
844
+ {"id": 76323, "token": "审", "type": "中文单字"}
845
+ {"id": 76417, "token": "单位", "type": "中文多字"}
846
+ {"id": 76455, "token": "手机号", "type": "中文多字"}
847
+ {"id": 76502, "token": "日", "type": "中文单字"}
848
+ {"id": 76505, "token": "木", "type": "中文单字"}
849
+ {"id": 76537, "token": "打", "type": "中文单字"}
850
+ {"id": 76706, "token": "活", "type": "中文单字"}
851
+ {"id": 76718, "token": "微软雅黑", "type": "中文多字"}
852
+ {"id": 76750, "token": "播", "type": "中文单字"}
853
+ {"id": 76843, "token": "!!\n\n", "type": "中文标点"}
854
+ {"id": 76858, "token": "!”", "type": "中文标点"}
855
+ {"id": 76864, "token": "!」", "type": "中文标点"}
856
+ {"id": 76868, "token": "方式", "type": "中文多字"}
857
+ {"id": 76929, "token": "—he", "type": "中文标点"}
858
+ {"id": 76982, "token": "该", "type": "中文单字"}
859
+ {"id": 77138, "token": "’am", "type": "中文标点"}
860
+ {"id": 77158, "token": "…)\n\n", "type": "中文标点"}
861
+ {"id": 77190, "token": "初始化", "type": "中文多字"}
862
+ {"id": 77195, "token": "条件", "type": "中文多字"}
863
+ {"id": 77219, "token": "記事", "type": "中文多字"}
864
+ {"id": 77284, "token": "“.", "type": "中文标点"}
865
+ {"id": 77413, "token": "展", "type": "中文单字"}
866
+ {"id": 77479, "token": ",…\n\n", "type": "中文标点"}
867
+ {"id": 77748, "token": "钮", "type": "中文单字"}
868
+ {"id": 77913, "token": "具", "type": "中文单字"}
869
+ {"id": 77937, "token": "路径", "type": "中文多字"}
870
+ {"id": 78021, "token": "退出", "type": "中文多字"}
871
+ {"id": 78111, "token": "宋体", "type": "中文多字"}
872
+ {"id": 78228, "token": "志", "type": "中文单字"}
873
+ {"id": 78244, "token": "言", "type": "中文单字"}
874
+ {"id": 78272, "token": "购", "type": "中文单字"}
875
+ {"id": 78366, "token": "……………………", "type": "中文标点"}
876
+ {"id": 78388, "token": "但", "type": "中文单字"}
877
+ {"id": 78519, "token": "星", "type": "中文单字"}
878
+ {"id": 78640, "token": "两", "type": "中文单字"}
879
+ {"id": 78657, "token": "例如", "type": "中文多字"}
880
+ {"id": 78659, "token": "左", "type": "中文单字"}
881
+ {"id": 78698, "token": "考", "type": "中文单字"}
882
+ {"id": 78935, "token": "构", "type": "中文单字"}
883
+ {"id": 78943, "token": "報", "type": "中文单字"}
884
+ {"id": 79059, "token": "球", "type": "中文单字"}
885
+ {"id": 79108, "token": "设计器", "type": "中文多字"}
886
+ {"id": 79203, "token": "更新", "type": "中文多字"}
887
+ {"id": 79656, "token": "相关", "type": "中文多字"}
888
+ {"id": 79785, "token": "音", "type": "中文单字"}
889
+ {"id": 79908, "token": "动生成", "type": "中文多字"}
890
+ {"id": 79982, "token": "端", "type": "中文单字"}
891
+ {"id": 80000, "token": "。”\n\n", "type": "中文标点"}
892
+ {"id": 80003, "token": ",默认", "type": "中文多字"}
893
+ {"id": 80019, "token": "新", "type": "中文单字"}
894
+ {"id": 80073, "token": "搜索", "type": "中文多字"}
895
+ {"id": 80078, "token": "—even", "type": "中文标点"}
896
+ {"id": 80172, "token": "投", "type": "中文单字"}
897
+ {"id": 80195, "token": "立", "type": "中文单字"}
898
+ {"id": 80356, "token": "属性", "type": "中文多字"}
899
+ {"id": 80426, "token": "�断", "type": "中文多字"}
900
+ {"id": 80578, "token": "们", "type": "中文单字"}
901
+ {"id": 80615, "token": ".…\n\n", "type": "中文标点"}
902
+ {"id": 80699, "token": "火", "type": "中文单字"}
903
+ {"id": 80804, "token": "示", "type": "中文单字"}
904
+ {"id": 80866, "token": "清", "type": "中文单字"}
905
+ {"id": 81194, "token": "金额", "type": "中文多字"}
906
+ {"id": 81201, "token": "账", "type": "中文单字"}
907
+ {"id": 81258, "token": "就", "type": "中文单字"}
908
+ {"id": 81368, "token": "费", "type": "中文单字"}
909
+ {"id": 81506, "token": "请选择", "type": "中文多字"}
910
+ {"id": 81526, "token": "示例", "type": "中文多字"}
911
+ {"id": 81543, "token": "没有", "type": "中文多字"}
912
+ {"id": 81546, "token": ":\"+", "type": "中文标点"}
913
+ {"id": 81628, "token": "查询", "type": "中文多字"}
914
+ {"id": 81646, "token": "默认", "type": "中文多字"}
915
+ {"id": 81665, "token": "结束", "type": "中文多字"}
916
+ {"id": 81742, "token": "案", "type": "中文单字"}
917
+ {"id": 81902, "token": "—with", "type": "中文标点"}
918
+ {"id": 81951, "token": "控", "type": "中文单字"}
919
+ {"id": 81976, "token": "请求", "type": "中文多字"}
920
+ {"id": 82042, "token": "广", "type": "中文单字"}
921
+ {"id": 82175, "token": "’app", "type": "中文标点"}
922
+ {"id": 82267, "token": "确认", "type": "中文多字"}
923
+ {"id": 82302, "token": "历", "type": "中文单字"}
924
+ {"id": 82317, "token": "及", "type": "中文单字"}
925
+ {"id": 82363, "token": "如果", "type": "中文多字"}
926
+ {"id": 82364, "token": "?”", "type": "中文标点"}
927
+ {"id": 82420, "token": "計", "type": "中文单字"}
928
+ {"id": 82530, "token": "、、", "type": "中文标点"}
929
+ {"id": 82533, "token": "止", "type": "中文单字"}
930
+ {"id": 82554, "token": "方法", "type": "中文多字"}
931
+ {"id": 82696, "token": "么", "type": "中文单字"}
932
+ {"id": 82768, "token": "货", "type": "中文单字"}
933
+ {"id": 82805, "token": "测试", "type": "中文多字"}
934
+ {"id": 82900, "token": "数量", "type": "中文多字"}
935
+ {"id": 82912, "token": "位置", "type": "中文多字"}
936
+ {"id": 82973, "token": "時間", "type": "中文多字"}
937
+ {"id": 83042, "token": "�权", "type": "中文多字"}
938
+ {"id": 83047, "token": "开", "type": "中文单字"}
939
+ {"id": 83125, "token": "文章", "type": "中文多字"}
940
+ {"id": 83175, "token": "阳", "type": "中文单字"}
941
+ {"id": 83266, "token": "队", "type": "中文单字"}
942
+ {"id": 83301, "token": "技", "type": "中文单字"}
943
+ {"id": 83324, "token": "场", "type": "中文单字"}
944
+ {"id": 83337, "token": "链接", "type": "中文多字"}
945
+ {"id": 83354, "token": ">", "type": "中文标点"}
946
+ {"id": 83439, "token": "添加", "type": "中文多字"}
947
+ {"id": 83639, "token": "最", "type": "中文单字"}
948
+ {"id": 83687, "token": "数字", "type": "中文多字"}
949
+ {"id": 83741, "token": "声明", "type": "中文多字"}
950
+ {"id": 83747, "token": "少", "type": "中文单字"}
951
+ {"id": 83766, "token": "…but", "type": "中文标点"}
952
+ {"id": 83799, "token": "形", "type": "中文单字"}
953
+ {"id": 83800, "token": "产品", "type": "中文多字"}
954
+ {"id": 83872, "token": "—are", "type": "中文标点"}
955
+ {"id": 83932, "token": "稿", "type": "中文单字"}
956
+ {"id": 83947, "token": "英", "type": "中文单字"}
957
+ {"id": 83994, "token": "游", "type": "中文单字"}
958
+ {"id": 84095, "token": "亿元", "type": "中文多字"}
959
+ {"id": 84131, "token": "分钟", "type": "中文多字"}
960
+ {"id": 84341, "token": ".…", "type": "中文标点"}
961
+ {"id": 84410, "token": "商", "type": "中文单字"}
962
+ {"id": 84498, "token": "“She", "type": "中文标点"}
963
+ {"id": 84765, "token": "!\",", "type": "中文标点"}
964
+ {"id": 84844, "token": "供", "type": "中文单字"}
965
+ {"id": 84851, "token": "推", "type": "中文单字"}
966
+ {"id": 84875, "token": "!\n\n\n\n", "type": "中文标点"}
967
+ {"id": 84941, "token": "—who", "type": "中文标点"}
968
+ {"id": 85155, "token": "初始化", "type": "中文多字"}
969
+ {"id": 85188, "token": "税", "type": "中文单字"}
970
+ {"id": 85284, "token": "按钮", "type": "中文多字"}
971
+ {"id": 85366, "token": "—an", "type": "中文标点"}
972
+ {"id": 85663, "token": "無し�", "type": "中文多字"}
973
+ {"id": 85707, "token": "初", "type": "中文单字"}
974
+ {"id": 85997, "token": "当", "type": "中文单字"}
975
+ {"id": 85998, "token": "!');\n", "type": "中文标点"}
976
+ {"id": 86127, "token": "私", "type": "中文单字"}
977
+ {"id": 86206, "token": "需要", "type": "中文多字"}
978
+ {"id": 86222, "token": "解", "type": "中文单字"}
979
+ {"id": 86319, "token": "—we", "type": "中文标点"}
980
+ {"id": 86348, "token": "全部", "type": "中文多字"}
981
+ {"id": 86354, "token": "景", "type": "中文单字"}
982
+ {"id": 86429, "token": "资源", "type": "中文多字"}
983
+ {"id": 86436, "token": "去", "type": "中文单字"}
984
+ {"id": 86461, "token": "华", "type": "中文单字"}
985
+ {"id": 86508, "token": "“Yes", "type": "中文标点"}
986
+ {"id": 86601, "token": "’T", "type": "中文标点"}
987
+ {"id": 86741, "token": "评论", "type": "中文多字"}
988
+ {"id": 86758, "token": "使用", "type": "中文多字"}
989
+ {"id": 86846, "token": "’B", "type": "中文标点"}
990
+ {"id": 86867, "token": "配置", "type": "中文多字"}
991
+ {"id": 87023, "token": "–and", "type": "中文标点"}
992
+ {"id": 87109, "token": "不", "type": "中文单字"}
993
+ {"id": 87177, "token": "話", "type": "中文单字"}
994
+ {"id": 87217, "token": "番", "type": "中文单字"}
995
+ {"id": 87219, "token": "问题", "type": "中文多字"}
996
+ {"id": 87247, "token": "—all", "type": "中文标点"}
997
+ {"id": 87327, "token": "报道", "type": "中文多字"}
998
+ {"id": 87412, "token": "环", "type": "中文单字"}
999
+ {"id": 87441, "token": "张", "type": "中文单字"}
1000
+ {"id": 87447, "token": "開", "type": "中文单字"}
1001
+ {"id": 87474, "token": "無しさん", "type": "中文多字"}
1002
+ {"id": 87502, "token": "种", "type": "中文单字"}
1003
+ {"id": 87646, "token": "成", "type": "中文单字"}
1004
+ {"id": 87671, "token": "—one", "type": "中文标点"}
1005
+ {"id": 87844, "token": "易", "type": "中文单字"}
1006
+ {"id": 87990, "token": "“Oh", "type": "中文标点"}
1007
+ {"id": 88108, "token": "……\n\n", "type": "中文标点"}
1008
+ {"id": 88126, "token": "您", "type": "中文单字"}
1009
+ {"id": 88161, "token": "’an", "type": "中文标点"}
1010
+ {"id": 88240, "token": "视频", "type": "中文多字"}
1011
+ {"id": 88343, "token": "》,", "type": "中文标点"}
1012
+ {"id": 88348, "token": ".’”\n\n", "type": "中文标点"}
1013
+ {"id": 88356, "token": "再", "type": "中文单字"}
1014
+ {"id": 88367, "token": "可能", "type": "中文多字"}
1015
+ {"id": 88435, "token": "文字", "type": "中文多字"}
1016
+ {"id": 88631, "token": "板", "type": "中文单字"}
1017
+ {"id": 88851, "token": "’acc", "type": "中文标点"}
1018
+ {"id": 88852, "token": "以下", "type": "中文多字"}
1019
+ {"id": 88905, "token": "电话", "type": "中文多字"}
1020
+ {"id": 88925, "token": "“Well", "type": "中文标点"}
1021
+ {"id": 88958, "token": "—from", "type": "中文标点"}
1022
+ {"id": 89046, "token": "連", "type": "中文单字"}
1023
+ {"id": 89151, "token": "真", "type": "中文单字"}
1024
+ {"id": 89186, "token": "有效", "type": "中文多字"}
1025
+ {"id": 89213, "token": "’:", "type": "中文标点"}
1026
+ {"id": 89408, "token": "今年", "type": "中文多字"}
1027
+ {"id": 89575, "token": "€“", "type": "中文标点"}
1028
+ {"id": 89753, "token": "流", "type": "中文单字"}
1029
+ {"id": 89783, "token": "余", "type": "中文单字"}
1030
+ {"id": 89874, "token": "”\n", "type": "中文标点"}
1031
+ {"id": 89902, "token": "任务", "type": "中文多字"}
1032
+ {"id": 90070, "token": "见", "type": "中文单字"}
1033
+ {"id": 90091, "token": "正确", "type": "中文多字"}
1034
+ {"id": 90112, "token": "给", "type": "中文单字"}
1035
+ {"id": 90147, "token": "服务器", "type": "中文多字"}
1036
+ {"id": 90223, "token": "’es", "type": "中文标点"}
1037
+ {"id": 90261, "token": "来源", "type": "中文多字"}
1038
+ {"id": 90354, "token": "结", "type": "中文单字"}
1039
+ {"id": 90493, "token": "。<", "type": "中文标点"}
1040
+ {"id": 90578, "token": "…\n", "type": "中文标点"}
1041
+ {"id": 90581, "token": "-", "type": "中文标点"}
1042
+ {"id": 90756, "token": "详情", "type": "中文多字"}
1043
+ {"id": 90863, "token": "—if", "type": "中文标点"}
1044
+ {"id": 91006, "token": "?」", "type": "中文标点"}
1045
+ {"id": 91077, "token": "局", "type": "中文单字"}
1046
+ {"id": 91082, "token": "主", "type": "中文单字"}
1047
+ {"id": 91240, "token": "’à", "type": "中文标点"}
1048
+ {"id": 91272, "token": "优", "type": "中文单字"}
1049
+ {"id": 91386, "token": "书", "type": "中文单字"}
1050
+ {"id": 91417, "token": "’y", "type": "中文标点"}
1051
+ {"id": 91418, "token": "’util", "type": "中文标点"}
1052
+ {"id": 91443, "token": "’hui", "type": "中文标点"}
1053
+ {"id": 91466, "token": "一页", "type": "中文多字"}
1054
+ {"id": 91495, "token": ",并", "type": "中文多字"}
1055
+ {"id": 91547, "token": "发布", "type": "中文多字"}
1056
+ {"id": 91763, "token": "思", "type": "中文单字"}
1057
+ {"id": 91774, "token": "見", "type": "中文单字"}
1058
+ {"id": 91837, "token": ":<", "type": "中文标点"}
1059
+ {"id": 91875, "token": "動", "type": "中文单字"}
1060
+ {"id": 91940, "token": "运", "type": "中文单字"}
1061
+ {"id": 91951, "token": "审核", "type": "中文多字"}
1062
+ {"id": 91967, "token": "图", "type": "中文单字"}
1063
+ {"id": 91985, "token": "样", "type": "中文单字"}
1064
+ {"id": 92019, "token": "其中", "type": "中文多字"}
1065
+ {"id": 92056, "token": "权限", "type": "中文多字"}
1066
+ {"id": 92099, "token": "删除成功", "type": "中文多字"}
1067
+ {"id": 92113, "token": " “…", "type": "中文标点"}
1068
+ {"id": 92150, "token": "�新", "type": "中文多字"}
1069
+ {"id": 92193, "token": "(笑", "type": "中文多字"}
1070
+ {"id": 92211, "token": ",《", "type": "中文标点"}
1071
+ {"id": 92264, "token": ",’”", "type": "中文标点"}
1072
+ {"id": 92318, "token": "时间", "type": "中文多字"}
1073
+ {"id": 92366, "token": "】,", "type": "中文标点"}
1074
+ {"id": 92378, "token": ")\r\n", "type": "中文标点"}
1075
+ {"id": 92382, "token": "定义", "type": "中文多字"}
1076
+ {"id": 92517, "token": "关", "type": "中文单字"}
1077
+ {"id": 92527, "token": "登", "type": "中文单字"}
1078
+ {"id": 92553, "token": "销", "type": "中文单字"}
1079
+ {"id": 92555, "token": "万元", "type": "中文多字"}
1080
+ {"id": 92672, "token": "同时", "type": "中文多字"}
1081
+ {"id": 92693, "token": "無料", "type": "中文多字"}
1082
+ {"id": 92748, "token": "’all", "type": "中文标点"}
1083
+ {"id": 92776, "token": "即", "type": "中文单字"}
1084
+ {"id": 92780, "token": "只", "type": "中文单字"}
1085
+ {"id": 92877, "token": "老", "type": "中文单字"}
1086
+ {"id": 93056, "token": "、“", "type": "中文标点"}
1087
+ {"id": 93115, "token": "岁", "type": "中文单字"}
1088
+ {"id": 93126, "token": "’Brien", "type": "中文标点"}
1089
+ {"id": 93132, "token": "大小", "type": "中文多字"}
1090
+ {"id": 93233, "token": "找", "type": "中文单字"}
1091
+ {"id": 93269, "token": "“These", "type": "中文标点"}
1092
+ {"id": 93393, "token": "实", "type": "中文单字"}
1093
+ {"id": 93413, "token": "或", "type": "中文单字"}
1094
+ {"id": 93446, "token": "“\n\n", "type": "中文标点"}
1095
+ {"id": 93474, "token": "节点", "type": "中文多字"}
1096
+ {"id": 93598, "token": "若", "type": "中文单字"}
1097
+ {"id": 93636, "token": "小时", "type": "中文多字"}
1098
+ {"id": 93673, "token": "“To", "type": "中文标点"}
1099
+ {"id": 93830, "token": "—\"", "type": "中文标点"}
1100
+ {"id": 93922, "token": "’autres", "type": "中文标点"}
1101
+ {"id": 93994, "token": "其他", "type": "中文多字"}
1102
+ {"id": 94134, "token": "自治", "type": "中文多字"}
1103
+ {"id": 94249, "token": "分享", "type": "中文多字"}
1104
+ {"id": 94345, "token": "’ex", "type": "中文标点"}
1105
+ {"id": 94366, "token": "稍", "type": "中文单字"}
1106
+ {"id": 94518, "token": "…the", "type": "中文标点"}
1107
+ {"id": 94537, "token": "�件", "type": "中文多字"}
1108
+ {"id": 94588, "token": "达", "type": "中文单字"}
1109
+ {"id": 94668, "token": "邮箱", "type": "中文多字"}
1110
+ {"id": 94720, "token": "新增", "type": "中文多字"}
1111
+ {"id": 94785, "token": "提", "type": "中文单字"}
1112
+ {"id": 94895, "token": ":%", "type": "中文标点"}
1113
+ {"id": 94923, "token": "院", "type": "中文单字"}
1114
+ {"id": 94983, "token": "加", "type": "中文单字"}
1115
+ {"id": 95001, "token": "価", "type": "中文单字"}
1116
+ {"id": 95221, "token": "気", "type": "中文单字"}
1117
+ {"id": 95337, "token": "约", "type": "中文单字"}
1118
+ {"id": 95399, "token": "速", "type": "中文单字"}
1119
+ {"id": 95475, "token": "停", "type": "中文单字"}
1120
+ {"id": 95532, "token": "?\n", "type": "中文标点"}
1121
+ {"id": 95543, "token": "反", "type": "中文单字"}
1122
+ {"id": 95544, "token": "票", "type": "中文单字"}
1123
+ {"id": 95598, "token": "十", "type": "中文单字"}
1124
+ {"id": 96153, "token": ",则", "type": "中文多字"}
1125
+ {"id": 96197, "token": ",—", "type": "中文标点"}
1126
+ {"id": 96203, "token": "“At", "type": "中文标点"}
1127
+ {"id": 96206, "token": "’)", "type": "中文标点"}
1128
+ {"id": 96332, "token": "[…]", "type": "中文标点"}
1129
+ {"id": 96356, "token": "身", "type": "中文单字"}
1130
+ {"id": 96407, "token": "商品", "type": "中文多字"}
1131
+ {"id": 96412, "token": "含", "type": "中文单字"}
1132
+ {"id": 96455, "token": "率", "type": "中文单字"}
1133
+ {"id": 96500, "token": "汽", "type": "中文单字"}
1134
+ {"id": 96511, "token": "专", "type": "中文单字"}
1135
+ {"id": 96555, "token": "/", "type": "中文标点"}
1136
+ {"id": 96557, "token": "管理员", "type": "中文多字"}
1137
+ {"id": 97049, "token": "歳", "type": "中文单字"}
1138
+ {"id": 97150, "token": ",在", "type": "中文多字"}
1139
+ {"id": 97360, "token": ".–", "type": "中文标点"}
1140
+ {"id": 97432, "token": "”。\n\n", "type": "中文标点"}
1141
+ {"id": 97518, "token": "関", "type": "中文单字"}
1142
+ {"id": 97522, "token": "议", "type": "中文单字"}
1143
+ {"id": 97565, "token": "雷", "type": "中文单字"}
1144
+ {"id": 97655, "token": "正在", "type": "中文多字"}
1145
+ {"id": 97908, "token": "�能", "type": "中文多字"}
1146
+ {"id": 97999, "token": "。(", "type": "中文标点"}
1147
+ {"id": 98128, "token": "自动生成", "type": "中文多字"}
1148
+ {"id": 98134, "token": "’elle", "type": "中文标点"}
1149
+ {"id": 98184, "token": "些", "type": "中文单字"}
1150
+ {"id": 98220, "token": "界", "type": "中文单字"}
1151
+ {"id": 98245, "token": "陆", "type": "中文单字"}
1152
+ {"id": 98261, "token": "注意", "type": "中文多字"}
1153
+ {"id": 98390, "token": "备注", "type": "中文多字"}
1154
+ {"id": 98406, "token": "倍", "type": "中文单字"}
1155
+ {"id": 98458, "token": ",’’", "type": "中文标点"}
1156
+ {"id": 98476, "token": "“How", "type": "中文标点"}
1157
+ {"id": 98499, "token": "読", "type": "中文单字"}
1158
+ {"id": 98580, "token": "价格", "type": "中文多字"}
1159
+ {"id": 98657, "token": "检", "type": "中文单字"}
1160
+ {"id": 98711, "token": "我的", "type": "中文多字"}
1161
+ {"id": 98739, "token": "我们", "type": "中文多字"}
1162
+ {"id": 98806, "token": "还", "type": "中文单字"}
1163
+ {"id": 98871, "token": "析", "type": "中文单字"}
1164
+ {"id": 98897, "token": "企", "type": "中文单字"}
1165
+ {"id": 98915, "token": "友", "type": "中文单字"}
1166
+ {"id": 99007, "token": "”的", "type": "中文多字"}
1167
+ {"id": 99072, "token": "。www", "type": "中文标点"}
1168
+ {"id": 99083, "token": "“All", "type": "中文标点"}
1169
+ {"id": 99313, "token": ",…", "type": "中文标点"}
1170
+ {"id": 99337, "token": "简", "type": "中文单字"}
1171
+ {"id": 99379, "token": "移到", "type": "中文多字"}
1172
+ {"id": 99382, "token": ")”", "type": "中文标点"}
1173
+ {"id": 99397, "token": "問", "type": "中文单字"}
1174
+ {"id": 99480, "token": "功能", "type": "中文多字"}
1175
+ {"id": 99496, "token": "若要", "type": "中文多字"}
1176
+ {"id": 99502, "token": "长度", "type": "中文多字"}
1177
+ {"id": 99563, "token": "—at", "type": "中文标点"}
1178
+ {"id": 99643, "token": "】,【", "type": "中文标点"}
1179
+ {"id": 99741, "token": "装", "type": "中文单字"}
1180
+ {"id": 99750, "token": "感", "type": "中文单字"}
1181
+ {"id": 99771, "token": "哈", "type": "中文单字"}
1182
+ {"id": 99799, "token": "“One", "type": "中文标点"}
1183
+ {"id": 99849, "token": "何", "type": "中文单字"}
1184
+ {"id": 99941, "token": "预", "type": "中文单字"}
1185
+ {"id": 100065, "token": "~\n\n", "type": "中文标点"}
1186
+ {"id": 100066, "token": "送料", "type": "中文多字"}
1187
+ {"id": 100067, "token": "…it", "type": "中文标点"}
1188
+ {"id": 100179, "token": "尔", "type": "中文单字"}
1189
+ {"id": 100207, "token": "在线", "type": "中文多字"}
utils/log_util.py CHANGED
@@ -2,7 +2,7 @@
2
  import logging
3
 
4
  logging.basicConfig(
5
- format='%(asctime)s - %(filename)s - %(levelname)s - %(process)d - %(thread)d - %(message)s',
6
  level=logging.INFO,
7
  datefmt="%Y-%m-%d %H:%M:%S",
8
 
 
2
  import logging
3
 
4
  logging.basicConfig(
5
+ format='[%(asctime)s] [%(levelname)s] [%(process)d:%(thread)d] [%(filename)s:%(lineno)d:%(funcName)s] %(message)s',
6
  level=logging.INFO,
7
  datefmt="%Y-%m-%d %H:%M:%S",
8
 
utils/zh_util.py CHANGED
@@ -52,7 +52,7 @@ def iter_vocab(tokenizer, name="", from_cache=True):
52
  if has_chinese(decode_str):
53
  # the BERT vocab has tokens starting with ##
54
  # the byte-BPE vocab has tokens containing spaces
55
- decode_str = decode_str.strip().replace("#", "")
56
  zh_token_count["total"] += 1
57
  if len(decode_str) > 1:
58
  zh_token_count["中文多字"] += 1
@@ -93,4 +93,6 @@ if __name__ == "__main__":
93
  # test_coding_length(jd_vocab_tokens, filter=lambda k: not is_chinese(k))
94
  # test_coding_length(zh_punc)
95
  # test_coding_length(zh_iterator())
96
- iter_vocab()
 
 
 
52
  if has_chinese(decode_str):
53
  # the BERT vocab has tokens starting with ##
54
  # the byte-BPE vocab has tokens containing spaces
55
+ decode_str = decode_str.strip().replace("#", "")  # TODO: strip per tokenizer type
56
  zh_token_count["total"] += 1
57
  if len(decode_str) > 1:
58
  zh_token_count["中文多字"] += 1
 
93
  # test_coding_length(jd_vocab_tokens, filter=lambda k: not is_chinese(k))
94
  # test_coding_length(zh_punc)
95
  # test_coding_length(zh_iterator())
96
+
97
+ from vocab.gpt_35_turbo import tokenizer
98
+ iter_vocab(tokenizer)
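`has_chinese` is not shown in this hunk; a minimal sketch of how such a check is commonly written (the actual implementation in zh_util.py may differ), using the CJK Unified Ideographs range:

```python
def has_chinese(text: str) -> bool:
    # True if any character falls in the CJK Unified Ideographs block
    return any('\u4e00' <= ch <= '\u9fff' for ch in text)

assert has_chinese("数据") and not has_chinese("data")
```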
vocab/README.md CHANGED
@@ -86,4 +86,6 @@ https://github.com/pytorch/fairseq/blob/master/tests/test_noising.py#L37
86
 
87
 
88
 
89
- ##
 
 
 
86
 
87
 
88
 
89
+ ## Reversible and lossless
90
+
91
+ Tokenization is reversible and lossless, so you can convert tokens back into the original text.
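A minimal round-trip sketch with tiktoken that illustrates the claim:

```python
import tiktoken

enc = tiktoken.encoding_for_model("gpt-3.5-turbo")
text = "Tokenizer Arena ⚔️"
assert enc.decode(enc.encode(text)) == text  # lossless round-trip
```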
vocab/__init__.py CHANGED
@@ -24,8 +24,12 @@ tokenizer.implementation = TokenizerImpl.SentencePiece.name # https://github.c
24
  - tiktoken
25
  - icetk
26
  - hf_tokenizer
27
- - Features: .model is of type tokenizer.models.BPE; the vocab has entries starting with Ġ "\u0120"; there is a single tokenizer.json (including merges and vocab), or separate standalone files
28
- - Examples: gpt_neox_20b, moss
 
 
 
 
29
  - tiktoken
30
  - Features: a space is just a space,
31
  - Examples: gpt3.5 gpt4
@@ -57,8 +61,8 @@ all_tokenizers = [
57
  "moss",
58
  #
59
  # ######
60
- # "chatyuan_large_v2",
61
- # "prompt_clue",
62
  #
63
  # #### bloom 系列
64
  "bloom",
@@ -69,7 +73,7 @@ all_tokenizers = [
69
  # "gpt_neox_chinese_v1",
70
  #
71
  # ##### glm系列
72
- # "glm_chinese",
73
  "chatglm_6b",
74
  "chatglm2-6b",
75
  #
@@ -80,13 +84,14 @@ all_tokenizers = [
80
  # "chinese_alpaca_lora_7b", # 中文Alpaca模型在上述中文LLaMA模型的基础上进一步使用了指令数据进行精调。
81
  # "belle_llama_ext_7b",
82
  # "alpaca_7b",
83
- "baichuan_7b",
 
84
  "qwen",
85
  "internlm_chat_7b",
86
- "goat",
 
87
  ]
88
 
89
-
90
  class TokenizerType(Enum):
91
  """
92
  - https://huggingface.co/docs/transformers/tokenizer_summary
 
24
  - tiktoken
25
  - icetk
26
  - hf_tokenizer
27
+ - Features:
28
+ - .model is of type tokenizer.models.BPE
29
+ - the vocab has entries starting with Ġ "\u0120"
30
+ - a single tokenizer.json (including merges and vocab), or separate standalone files
31
+ - .model.from_file .model.save .model.token_to_id .model.tokenize
32
+ - Examples: gpt_neox_20b, moss, bloom
33
  - tiktoken
34
  - Features: a space is just a space,
35
  - Examples: gpt3.5 gpt4
 
61
  "moss",
62
  #
63
  # ######
64
+ "chatyuan_large_v2",
65
+ "prompt_clue",
66
  #
67
  # #### bloom 系列
68
  "bloom",
 
73
  # "gpt_neox_chinese_v1",
74
  #
75
  # ##### glm系列
76
+ "glm_chinese",
77
  "chatglm_6b",
78
  "chatglm2-6b",
79
  #
 
84
  # "chinese_alpaca_lora_7b", # 中文Alpaca模型在上述中文LLaMA模型的基础上进一步使用了指令数据进行精调。
85
  # "belle_llama_ext_7b",
86
  # "alpaca_7b",
87
+ "baichuan",
88
+ "baichuan2",
89
  "qwen",
90
  "internlm_chat_7b",
91
+ "falcon_180b",
92
+ # "goat",
93
  ]
94
 
 
95
  class TokenizerType(Enum):
96
  """
97
  - https://huggingface.co/docs/transformers/tokenizer_summary
vocab/{baichuan_7b → baichuan}/Baichuan-7B/config.json RENAMED
File without changes
vocab/{baichuan_7b → baichuan}/Baichuan-7B/configuration_baichuan.py RENAMED
File without changes
vocab/{baichuan_7b → baichuan}/Baichuan-7B/special_tokens_map.json RENAMED
File without changes
vocab/{baichuan_7b → baichuan}/Baichuan-7B/tokenization_baichuan.py RENAMED
File without changes
vocab/{baichuan_7b → baichuan}/Baichuan-7B/tokenizer.model RENAMED
File without changes
vocab/{baichuan_7b → baichuan}/Baichuan-7B/tokenizer_config.json RENAMED
File without changes
vocab/{baichuan_7b → baichuan}/__init__.py RENAMED
File without changes
vocab/{baichuan_7b → baichuan}/demo.py RENAMED
File without changes
vocab/baichuan2/__init__.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import AutoTokenizer
2
+ from vocab import TokenizerType
3
+
4
+ tokenizer = AutoTokenizer.from_pretrained("baichuan-inc/Baichuan2-7B-Chat", trust_remote_code=True)
5
+
6
+
7
+ # byte-bpe sentencepiece
8
+ tokenizer.type = TokenizerType.ByteBPE
9
+
10
+ tokenizer.comments = "expand the vocqbulary size from 64000 in Baichuan1 to 125696"
vocab/bloom/test_tokenizer.py CHANGED
@@ -12,6 +12,8 @@ print("vocab size:", tokenizer.vocab_size)
12
  tokens = tokenizer.encode("中")
13
  decode_line = tokenizer.decode(tokens)
14
 
 
 
15
 
16
  def id2token(ids):
17
  return tokenizer.convert_ids_to_tokens(ids)
 
12
  tokens = tokenizer.encode("中")
13
  decode_line = tokenizer.decode(tokens)
14
 
15
+ tokenizer.save_vocabulary("tmp", "ddd")
16
+
17
 
18
  def id2token(ids):
19
  return tokenizer.convert_ids_to_tokens(ids)
vocab/chinese_llama2/__init__.py CHANGED
@@ -1,3 +1,10 @@
 
 
 
 
 
 
 
1
  from transformers import LlamaTokenizer
2
 
3
  tokenizer = LlamaTokenizer.from_pretrained("ziqingyang/chinese-llama-2-7b")
 
1
+ """
2
+ ## 词典扩容
3
+ 32000 <pad>
4
+ 32001 但
5
+
6
+ """
7
+
8
  from transformers import LlamaTokenizer
9
 
10
  tokenizer = LlamaTokenizer.from_pretrained("ziqingyang/chinese-llama-2-7b")
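The docstring lists two of the expanded ids; a minimal sketch (assumes Hub access) to confirm them:

```python
from transformers import LlamaTokenizer

tokenizer = LlamaTokenizer.from_pretrained("ziqingyang/chinese-llama-2-7b")
# per the docstring: 32000 -> <pad>, 32001 -> 但
print(tokenizer.convert_ids_to_tokens([32000, 32001]))
```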
vocab/falcon_180b/__init__.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from transformers import AutoTokenizer
3
+
4
+
5
+ CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
6
+ TOKENIZER_DIR = os.path.join(CURRENT_DIR, "tokenizer")
7
+
8
+
9
+
10
+ # tokenizer = AutoTokenizer.from_pretrained("tiiuae/falcon-180b")  # gated repo, requires an access token
11
+ tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_DIR)
vocab/falcon_180b/tokenizer/special_tokens_map.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ ">>TITLE<<",
4
+ ">>ABSTRACT<<",
5
+ ">>INTRODUCTION<<",
6
+ ">>SUMMARY<<",
7
+ ">>COMMENT<<",
8
+ ">>ANSWER<<",
9
+ ">>QUESTION<<",
10
+ ">>DOMAIN<<",
11
+ ">>PREFIX<<",
12
+ ">>SUFFIX<<",
13
+ ">>MIDDLE<<"
14
+ ],
15
+ "eos_token": "<|endoftext|>"
16
+ }
vocab/falcon_180b/tokenizer/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
vocab/falcon_180b/tokenizer/tokenizer_config.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "eos_token": "<|endoftext|>",
4
+ "model_max_length": 2048,
5
+ "name_or_path": "tiiuae/falcon-40b",
6
+ "special_tokens_map_file": null,
7
+ "tokenizer_class": "PreTrainedTokenizerFast"
8
+ }
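A sketch that loads the bundled tokenizer and lists the special tokens defined in the files added above (path relative to the repo root):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("vocab/falcon_180b/tokenizer")
print(tokenizer.eos_token)                  # <|endoftext|>
print(tokenizer.additional_special_tokens)  # >>TITLE<<, >>ABSTRACT<<, ...
```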
vocab/gpt_35_turbo/__init__.py CHANGED
@@ -7,6 +7,8 @@ from utils.log_util import logger
7
  tokenizer = tiktoken.encoding_for_model('gpt-3.5-turbo')
8
  tokenizer.vocab_size = tokenizer.n_vocab
9
 
 
 
10
 
11
 
12
  def decode(self, tokens, errors="replace"):
@@ -20,8 +22,11 @@ def decode(self, tokens, errors="replace"):
20
  def convert_ids_to_tokens(self, tokens):
21
  return tokenizer.decode_tokens_bytes(tokens)
22
 
23
- def get_vocab(self):
24
- """Returns vocab as a dict"""
 
 
 
25
  vocab = {}
26
  key_error_list = []
27
  unicode_decode_error_list = []
@@ -29,11 +34,13 @@ def get_vocab(self):
29
  try:
30
  token_byte = self.convert_ids_to_tokens([i])[0]
31
  token_str = token_byte.decode("utf-8")
32
- vocab[token_str] = i
33
- except KeyError: # 100256 100261-100275
34
  key_error_list.append(i)
35
- except UnicodeDecodeError: # very common
 
36
  unicode_decode_error_list.append((i, str(token_byte)))
 
37
 
38
  # vocab.update(self.added_tokens_encoder)
39
  logger.info(f"gpt_35_turbo {len(key_error_list)} KeyError: {key_error_list}")
@@ -41,6 +48,8 @@ def get_vocab(self):
41
  return vocab
42
 
43
 
 
 
44
  Encoding.decode = decode
45
  Encoding.convert_ids_to_tokens = convert_ids_to_tokens
46
  Encoding.get_vocab = get_vocab
 
7
  tokenizer = tiktoken.encoding_for_model('gpt-3.5-turbo')
8
  tokenizer.vocab_size = tokenizer.n_vocab
9
 
10
+ tokenizer.comments = "tiktoken is a fast BPE tokeniser for use with OpenAI's models. 16 token ids raise KeyError when exporting the vocab"
11
+ tokenizer.reversible = True # It's reversible and lossless, so you can convert tokens back into the original text
12
 
13
 
14
  def decode(self, tokens, errors="replace"):
 
22
  def convert_ids_to_tokens(self, tokens):
23
  return tokenizer.decode_tokens_bytes(tokens)
24
 
25
+ def get_vocab(self, token_type="str"):
26
+ """Returns vocab as a dict
27
+ :param token_type: ["str", "byte"]
28
+ :return:
29
+ """
30
  vocab = {}
31
  key_error_list = []
32
  unicode_decode_error_list = []
 
34
  try:
35
  token_byte = self.convert_ids_to_tokens([i])[0]
36
  token_str = token_byte.decode("utf-8")
37
+ vocab[token_byte] = i
38
+ except KeyError: # 16 KeyError, 100256 100261-100275
39
  key_error_list.append(i)
40
+ # vocab[f"[KeyError]-{i}"] = i
41
+ except UnicodeDecodeError: # 773 UnicodeDecodeError
42
  unicode_decode_error_list.append((i, str(token_byte)))
43
+ vocab[token_byte] = i
44
 
45
  # vocab.update(self.added_tokens_encoder)
46
  logger.info(f"gpt_35_turbo {len(key_error_list)} KeyError: {key_error_list}")
 
48
  return vocab
49
 
50
 
51
+
52
+ # tiktoken patch
53
  Encoding.decode = decode
54
  Encoding.convert_ids_to_tokens = convert_ids_to_tokens
55
  Encoding.get_vocab = get_vocab
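With the patch applied, a minimal usage sketch of the exported vocab:

```python
from vocab.gpt_35_turbo import tokenizer

vocab = tokenizer.get_vocab(token_type="byte")  # {token_bytes: id}
print(len(vocab), tokenizer.vocab_size)  # slightly fewer entries than n_vocab due to the 16 KeyError ids
```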
vocab/gpt_35_turbo/aaa.py CHANGED
@@ -17,6 +17,11 @@ import tiktoken
17
 
18
  tokenizer = tiktoken.encoding_for_model('gpt-3.5-turbo')
19
 
 
 
 
 
 
20
 
21
  for token_id in [100263, 99834]: # special_tokens: 200257-100260 100276
22
  try:
 
17
 
18
  tokenizer = tiktoken.encoding_for_model('gpt-3.5-turbo')
19
 
20
+ tokens = [100263, 99834]
21
+
22
+ tokenizer.decode(tokens)
23
+
24
+ tokenizer._core_bpe.decode_bytes(tokens).decode("utf-8", errors="replace")
25
 
26
  for token_id in [100263, 99834]: # special_tokens: 200257-100260 100276
27
  try:
vocab/gpt_4/__init__.py CHANGED
@@ -1,48 +1,3 @@
1
 
2
-
3
- import tiktoken
4
- from tiktoken import Encoding
5
- from utils.log_util import logger
6
-
7
- tokenizer = tiktoken.encoding_for_model('gpt-4')
8
- tokenizer.vocab_size = tokenizer.n_vocab
9
-
10
-
11
-
12
- def decode(self, tokens, errors="replace"):
13
- # def decode(self, tokens: list[int], errors: str = "replace") -> str:
14
- try:
15
- decode_str = self._core_bpe.decode_bytes(tokens).decode("utf-8", errors=errors)
16
- except:
17
- decode_str = "null"
18
- return decode_str
19
-
20
- def convert_ids_to_tokens(self, tokens):
21
- return tokenizer.decode_tokens_bytes(tokens)
22
-
23
- def get_vocab(self):
24
- """Returns vocab as a dict"""
25
- vocab = {}
26
- key_error_list = []
27
- unicode_decode_error_list = []
28
- for i in range(self.vocab_size):
29
- try:
30
- token_byte = self.convert_ids_to_tokens([i])[0]
31
- token_str = token_byte.decode("utf-8")
32
- vocab[token_str] = i
33
- except KeyError: # 100256 100261-100275
34
- key_error_list.append(i)
35
- except UnicodeDecodeError: # very common
36
- unicode_decode_error_list.append((i, str(token_byte)))
37
-
38
- # vocab.update(self.added_tokens_encoder)
39
- logger.info(f"gpt-4 {len(key_error_list)} KeyError: {key_error_list}")
40
- logger.info(f"gpt-4 {len(unicode_decode_error_list)} UnicodeDecodeError: {unicode_decode_error_list[:5]}")
41
- return vocab
42
-
43
-
44
- Encoding.decode = decode
45
- Encoding.convert_ids_to_tokens = convert_ids_to_tokens
46
- Encoding.get_vocab = get_vocab
47
-
48
 
 
1
 
2
+ from vocab.gpt_35_turbo import tokenizer
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
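gpt_4 now simply reuses the gpt_35_turbo tokenizer object. That works because tiktoken maps both model names to the same cl100k_base encoding, which can be verified directly:

```python
import tiktoken

assert tiktoken.encoding_for_model("gpt-4").name == "cl100k_base"
assert tiktoken.encoding_for_model("gpt-3.5-turbo").name == "cl100k_base"
```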
 
vocab/gpt_neox_chinese_v1/20B_tokenizer_chinese.mock.json CHANGED
@@ -255,6 +255,8 @@
255
  "end_of_word_suffix": null,
256
  "fuse_unk": false,
257
  "vocab": {
 
 
258
  "531": 531,
259
  "541": 541,
260
  "566": 566,
 
255
  "end_of_word_suffix": null,
256
  "fuse_unk": false,
257
  "vocab": {
258
+ "<|endoftext|>": 0,
259
+ "<|padding|>": 1,
260
  "531": 531,
261
  "541": 541,
262
  "566": 566,
vocab/gpt_neox_chinese_v1/mock.py CHANGED
@@ -1,17 +1,32 @@
1
  import copy
2
  import json
 
3
 
4
- input_path = "20B_tokenizer_chinese.json"
 
5
 
6
- tokenizer = json.load(open(input_path, "r", encoding="utf-8"))
7
 
8
- vocab = tokenizer["model"]["vocab"]
 
9
 
 
 
 
 
10
 
11
- for k, v in copy.deepcopy(vocab).items():
12
- vocab[str(v)] = v
13
- vocab.pop(k)
14
 
15
- out_path = input_path.replace(".json", ".mock.json")
16
- with open(out_path, "w", encoding="utf-8") as f_out:
17
- f_out.write(json.dumps(tokenizer, ensure_ascii=False, indent=2))
 
 
 
 
 
 
 
 
 
1
  import copy
2
  import json
3
+ from tokenizers import Tokenizer
4
 
5
+ def export_mock_tokenizer():
6
+ input_path = "20B_tokenizer_chinese.json"
7
 
8
+ tokenizer = json.load(open(input_path, "r", encoding="utf-8"))
9
 
10
+ vocab = tokenizer["model"]["vocab"]
11
+ added_tokens = [token["id"] for token in tokenizer["added_tokens"]]
12
 
13
+ for k, v in copy.deepcopy(vocab).items():
14
+ if v not in added_tokens:
15
+ vocab[str(v)] = v
16
+ vocab.pop(k)
17
 
18
+ out_path = input_path.replace(".json", ".mock.json")
19
+ with open(out_path, "w", encoding="utf-8") as f_out:
20
+ f_out.write(json.dumps(tokenizer, ensure_ascii=False, indent=2))
21
 
22
+
23
+ def mock2():
24
+ pass
25
+
26
+
27
+ def load_mock_tokenizer():
28
+ tokenizer = Tokenizer.from_file("20B_tokenizer_chinese.mock.json")
29
+ print('')
30
+
31
+ export_mock_tokenizer()
32
+ load_mock_tokenizer()
vocab/gpt_neox_chinese_v1/trouble-shooting.md ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+
3
+ ## Exception: data did not match any variant of untagged enum ModelWrapper at line 108219 column 3
4
+
5
+
6
+
7
+
8
+ ## The OrderedVocab you are attempting to save contains a hole for index 50254, your vocabulary could be corrupted !
9
+
10
+
11
+ ```
12
+ The OrderedVocab you are attempting to save contains a hole for index 50254, your vocabulary could be corrupted !
13
+ The OrderedVocab you are attempting to save contains a hole for index 50255, your vocabulary could be corrupted !
14
+ The OrderedVocab you are attempting to save contains a hole for index 50256, your vocabulary could be corrupted !
15
+ ```
16
+
17
+
18
+ Cause: tokens such as 50254 are not defined in the vocab; they are only defined in `added_tokens`.
19
+
21
+
22
+
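The "hole" warning means some indices in the saved range have no vocab entry. A minimal sketch (names are hypothetical) for locating such holes in any token-to-id mapping:

```python
def find_holes(vocab: dict) -> list[int]:
    """Return the indices missing from a {token: id} mapping."""
    ids = set(vocab.values())
    return [i for i in range(max(ids) + 1) if i not in ids]

print(find_holes({"a": 0, "b": 2}))  # [1]
```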
vocab/llama/__init__.py CHANGED
@@ -1,7 +1,20 @@
1
 
2
  """
3
 
 
4
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  """
6
 
7
  import os
 
1
 
2
  """
3
 
4
+ ## Instruction special tokens
5
 
6
+ {"token_id": 29961, "decode_str": "[", "token": "["}
7
+ {"token_id": 25580, "decode_str": "INST", "token": "INST"}
8
+ {"token_id": 29962, "decode_str": "]", "token": "]"}
9
+
10
+ {"token_id": 3532, "decode_str": "<<", "token": "▁<<"}
11
+ {"token_id": 14816, "decode_str": "SY", "token": "SY"}
12
+ {"token_id": 29903, "decode_str": "S", "token": "S"}
13
+ {"token_id": 6778, "decode_str": ">>", "token": ">>"}
14
+
15
+ {"token_id": 13, "decode_str": "\n", "token": "<0x0A>"}
16
+
17
+ Question: why aren't <<SYS>> <</SYS>> [INST] [/INST] each made a single token id?
18
  """
19
 
20
  import os
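The split described in the docstring can be reproduced by encoding the markers directly; a minimal sketch (assumes the local tokenizer directory used by demo.py below):

```python
from transformers import LlamaTokenizer

tokenizer = LlamaTokenizer.from_pretrained("vocab/llama/tokenizer")
# <<SYS>> is not a single special token; it splits into several ids
print(tokenizer.encode("<<SYS>>", add_special_tokens=False))  # e.g. [3532, 14816, 29903, 6778] per the docstring
```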
vocab/llama/demo.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import json
3
+ import os
4
+ from transformers import LlamaTokenizer
5
+
6
+ CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
7
+ TOKENIZER_DIR = os.path.join(CURRENT_DIR, "tokenizer")
8
+
9
+
10
+
11
+ tokenizer = LlamaTokenizer.from_pretrained(TOKENIZER_DIR)
12
+
13
+
14
+ tokens = [ 1, 29961, 25580, 29962, 3532, 14816, 29903, 6778, 13, 3492,
15
+ 526, 263, 8444, 29892, 3390, 1319, 322, 15993, 20255, 29889,
16
+ 29849, 1234, 408, 1371, 3730, 408, 1950, 29892, 1550, 1641,
17
+ 9109, 29889, 29871, 3575, 6089, 881, 451, 3160, 738, 10311,
18
+ 1319, 29892, 443, 621, 936, 29892, 11021, 391, 29892, 7916,
19
+ 391, 29892, 304, 27375, 29892, 18215, 29892, 470, 27302, 2793,
20
+ 29889, 3529, 9801, 393, 596, 20890, 526, 5374, 635, 443,
21
+ 5365, 1463, 322, 6374, 297, 5469, 29889, 13, 13, 3644,
22
+ 263, 1139, 947, 451, 1207, 738, 4060, 29892, 470, 338,
23
+ 451, 2114, 1474, 16165, 261, 296, 29892, 5649, 2020, 2012,
24
+ 310, 22862, 1554, 451, 1959, 29889, 960, 366, 1016, 29915,
25
+ 29873, 1073, 278, 1234, 304, 263, 1139, 29892, 3113, 1016,
26
+ 29915, 29873, 6232, 2089, 2472, 29889, 13, 29966, 829, 14816,
27
+ 29903, 6778, 13, 13, 15970, 526, 366, 518, 29914, 25580,
28
+ 29962]
29
+
30
+ text = tokenizer.decode(tokens)
31
+ print(text)
32
+ for token_id in tokens:
33
+ print(json.dumps({"token_id": token_id, "decode_str": tokenizer.decode([token_id]), "token": tokenizer.convert_ids_to_tokens([token_id][0])}, ensure_ascii=False))