eson committed on
Commit
309a593
1 Parent(s): b15345c
Files changed (1) hide show
  1. app.py +6 -8
app.py CHANGED
@@ -5,13 +5,16 @@
5
  """
6
  ## TODO:
7
  - http get方式获取参数,
8
- - 自启动
9
  - iter_vocab 的 warmup
10
  - add_special_token 开关
11
  - theme 开关 light/dark
12
  - token_id/tokens/bytes 开关
13
  - 通过 javascript 添加 hover_text
14
  - i18
 
 
 
 
15
 
16
 
17
 
@@ -30,7 +33,6 @@ table
30
  [ The, 2, QUICK, Brown, Foxes, jumped, over, the, lazy, dog's, bone ]
31
  """
32
 
33
-
34
  import gradio as gr
35
 
36
  from vocab import all_tokenizers
@@ -63,8 +65,6 @@ default_stats_overlap_token_size = get_overlap_token_size(default_tokenizer_type
63
  default_output_text_1, default_output_table_1 = tokenize(default_user_input, default_tokenizer_type_1, update=False)
64
  default_output_text_2, default_output_table_2 = tokenize(default_user_input, default_tokenizer_type_2, update=False)
65
 
66
-
67
-
68
  with gr.Blocks(css="style.css") as demo:
69
  gr.HTML("""<h1 align="center">Tokenizer Arena ⚔️</h1>""")
70
  # links: https://www.coderstool.com/utf8-encoding-decoding
@@ -97,7 +97,6 @@ with gr.Blocks(css="style.css") as demo:
97
  # None,
98
  # )
99
 
100
-
101
  gr.Markdown("## Tokenization")
102
 
103
  with gr.Row():
@@ -139,12 +138,12 @@ with gr.Blocks(css="style.css") as demo:
139
  # https://www.onlinewebfonts.com/icon/418591
140
  gr.Image("images/VS.svg", scale=1, show_label=False,
141
  show_download_button=False, container=False,
142
- show_share_button=False) # height=10,
143
  with gr.Column(scale=6):
144
  with gr.Group():
145
  tokenizer_type_2 = gr.Dropdown(
146
  all_tokenizers,
147
- value="baichuan_7b",
148
  label="Tokenizer 2",
149
  )
150
  with gr.Group():
@@ -229,7 +228,6 @@ with gr.Blocks(css="style.css") as demo:
229
  # start up 初始化
230
  # user_input.update(user_input.value + "___")
231
 
232
-
233
  if __name__ == "__main__":
234
  demo.queue(max_size=20).launch()
235
  # demo.launch()
 
5
  """
6
  ## TODO:
7
  - http get方式获取参数,
 
8
  - iter_vocab 的 warmup
9
  - add_special_token 开关
10
  - theme 开关 light/dark
11
  - token_id/tokens/bytes 开关
12
  - 通过 javascript 添加 hover_text
13
  - i18
14
+ - 给方法 + 缓存,避免重复调用
15
+ - 英文 utf-8编码
16
+ - 词典支持下载
17
+ - 中文字词统计,是否要包括 _ G 等字符
18
 
19
 
20
 
 
33
  [ The, 2, QUICK, Brown, Foxes, jumped, over, the, lazy, dog's, bone ]
34
  """
35
 
 
36
  import gradio as gr
37
 
38
  from vocab import all_tokenizers
 
65
  default_output_text_1, default_output_table_1 = tokenize(default_user_input, default_tokenizer_type_1, update=False)
66
  default_output_text_2, default_output_table_2 = tokenize(default_user_input, default_tokenizer_type_2, update=False)
67
 
 
 
68
  with gr.Blocks(css="style.css") as demo:
69
  gr.HTML("""<h1 align="center">Tokenizer Arena ⚔️</h1>""")
70
  # links: https://www.coderstool.com/utf8-encoding-decoding
 
97
  # None,
98
  # )
99
 
 
100
  gr.Markdown("## Tokenization")
101
 
102
  with gr.Row():
 
138
  # https://www.onlinewebfonts.com/icon/418591
139
  gr.Image("images/VS.svg", scale=1, show_label=False,
140
  show_download_button=False, container=False,
141
+ show_share_button=False)
142
  with gr.Column(scale=6):
143
  with gr.Group():
144
  tokenizer_type_2 = gr.Dropdown(
145
  all_tokenizers,
146
+ value=default_tokenizer_type_2,
147
  label="Tokenizer 2",
148
  )
149
  with gr.Group():
 
228
  # start up 初始化
229
  # user_input.update(user_input.value + "___")
230
 
 
231
  if __name__ == "__main__":
232
  demo.queue(max_size=20).launch()
233
  # demo.launch()