xu-song committed on
Commit
0177868
·
1 Parent(s): 79b95c3
Files changed (2) hide show
  1. app.py +9 -7
  2. util.py +1 -1
app.py CHANGED
@@ -17,6 +17,7 @@
17
  - 中文字词统计,是否要包括 _ G 等字符
18
  - baichuan的单字数量怎么两万多个?
19
  - OOV
 
20
 
21
 
22
  plots
@@ -40,11 +41,12 @@ from util import *
40
 
41
  # llama chatglm_6b gpt_nexo_20b baichuan baichuan_7b
42
  examples = [
43
- # ["空格测试: 2个空格 8个空格", "llama", "chatglm_6b"], # chatglm 有blank_n,
44
  ["标点测试:,。!?;", "baichuan_7b", "llama"],
45
  ["符号测试:🦙❤❥웃유♋☮✊☏☢☚✔☑♚▢♪✈✞÷↑↓▤▥⊙■□▣▽¿─│♥❣▬▫☿Ⓐ ✋✉☣☤", "baichuan_7b", "llama"],
46
- ["中文简体:宽带,繁体:樂來", "baichuan_7b", "llama"],
47
  ["数字测试:(10086 + 98) = 100184", "baichuan_7b", "llama"],
 
 
48
  ]
49
 
50
  # jieba.enable_parallel() # flask中没办法parallel
@@ -66,8 +68,8 @@ default_tokenizer_type_2 = "internlm_chat_7b"
66
  default_stats_vocab_size_1, default_stats_zh_token_size_1 = basic_count(default_tokenizer_type_1)
67
  default_stats_vocab_size_2, default_stats_zh_token_size_2 = basic_count(default_tokenizer_type_2)
68
  default_stats_overlap_token_size = get_overlap_token_size(default_tokenizer_type_1, default_tokenizer_type_2)[0]
69
- default_output_text_1, default_output_table_1 = tokenize(default_user_input, default_tokenizer_type_1, update=False)
70
- default_output_text_2, default_output_table_2 = tokenize(default_user_input, default_tokenizer_type_2, update=False)
71
 
72
  with gr.Blocks(css="style.css") as demo:
73
  gr.HTML("""<h1 align="center">Tokenizer Arena ⚔️</h1>""")
@@ -81,7 +83,7 @@ with gr.Blocks(css="style.css") as demo:
81
  with gr.Row():
82
  gr.Markdown("## Input Text")
83
  dropdown_examples = gr.Dropdown(
84
- ["Example1", "Example2", "Example3"],
85
  value="Examples",
86
  type="index",
87
  show_label=False,
@@ -181,14 +183,14 @@ with gr.Blocks(css="style.css") as demo:
181
  with gr.Column():
182
  output_text_1 = gr.Highlightedtext(
183
  value=default_output_text_1,
184
- label="Tokens 1",
185
  show_legend=True,
186
  elem_classes="space-show"
187
  )
188
  with gr.Column():
189
  output_text_2 = gr.Highlightedtext(
190
  value=default_output_text_2,
191
- label="Tokens 2",
192
  show_legend=True,
193
  elem_classes="space-show"
194
  )
 
17
  - 中文字词统计,是否要包括 _ G 等字符
18
  - baichuan的单字数量怎么两万多个?
19
  - OOV
20
+ - feedback位置
21
 
22
 
23
  plots
 
41
 
42
  # llama chatglm_6b gpt_nexo_20b baichuan baichuan_7b
43
  examples = [
44
+ ["空格测试: 2个空格 8个空格", "llama", "chatglm_6b"], # chatglm 有blank_n,
45
  ["标点测试:,。!?;", "baichuan_7b", "llama"],
46
  ["符号测试:🦙❤❥웃유♋☮✊☏☢☚✔☑♚▢♪✈✞÷↑↓▤▥⊙■□▣▽¿─│♥❣▬▫☿Ⓐ ✋✉☣☤", "baichuan_7b", "llama"],
 
47
  ["数字测试:(10086 + 98) = 100184", "baichuan_7b", "llama"],
48
+ ["中文简体:宽带,繁体:樂來", "baichuan_7b", "llama"],
49
+
50
  ]
51
 
52
  # jieba.enable_parallel() # flask中没办法parallel
 
68
  default_stats_vocab_size_1, default_stats_zh_token_size_1 = basic_count(default_tokenizer_type_1)
69
  default_stats_vocab_size_2, default_stats_zh_token_size_2 = basic_count(default_tokenizer_type_2)
70
  default_stats_overlap_token_size = get_overlap_token_size(default_tokenizer_type_1, default_tokenizer_type_2)[0]
71
+ default_output_text_1, default_output_table_1, default_output_len_1 = tokenize(default_user_input, default_tokenizer_type_1, update=False)
72
+ default_output_text_2, default_output_table_2, default_output_len_2 = tokenize(default_user_input, default_tokenizer_type_2, update=False)
73
 
74
  with gr.Blocks(css="style.css") as demo:
75
  gr.HTML("""<h1 align="center">Tokenizer Arena ⚔️</h1>""")
 
83
  with gr.Row():
84
  gr.Markdown("## Input Text")
85
  dropdown_examples = gr.Dropdown(
86
+ ["空格测试", "标点测试", "符号测试", "数字测试"],
87
  value="Examples",
88
  type="index",
89
  show_label=False,
 
183
  with gr.Column():
184
  output_text_1 = gr.Highlightedtext(
185
  value=default_output_text_1,
186
+ label=f"Tokens: {default_output_len_1}",
187
  show_legend=True,
188
  elem_classes="space-show"
189
  )
190
  with gr.Column():
191
  output_text_2 = gr.Highlightedtext(
192
  value=default_output_text_2,
193
+ label=f"Tokens: {default_output_len_2}",
194
  show_legend=True,
195
  elem_classes="space-show"
196
  )
util.py CHANGED
@@ -59,7 +59,7 @@ def tokenize(text, tokenizer_type, color_num=5, update=True):
59
  if update:
60
  return gr.update(value=pos_tokens, label=f"Tokens: {len(encoding)}"), table_df
61
  else:
62
- return pos_tokens, table_df
63
 
64
 
65
  def tokenize_pair(text, tokenizer_type_1, tokenizer_type_2):
 
59
  if update:
60
  return gr.update(value=pos_tokens, label=f"Tokens: {len(encoding)}"), table_df
61
  else:
62
+ return pos_tokens, table_df, len(encoding)
63
 
64
 
65
  def tokenize_pair(text, tokenizer_type_1, tokenizer_type_2):