eson committed
Commit: f1b4ae2
Parent: f652c69

add access token

Files changed (5)
  1. README.md +2 -70
  2. app.py +9 -3
  3. character_util.py +2 -2
  4. compression_app.py +1 -1
  5. compression_util.py +2 -2
README.md CHANGED
@@ -1,6 +1,6 @@
  ---
  title: Tokenizer Arena
- emoji:
+ emoji: 📚
  colorFrom: red
  colorTo: gray
  sdk: gradio
@@ -12,72 +12,4 @@ datasets:
  ---


-
- ## Compression Rate
-
-
- On the [cc-100](https://huggingface.co/datasets/cc100) dataset, we take 10,000 samples per language and measure the compression rate of different tokenizers.
-
- > Compression rate example:
- llama3 has an expanded vocabulary and therefore a higher compression ratio. For the same 1 TB of Simplified Chinese corpus, llama tokenizes it into 0.56 trillion tokens, while llama3 needs only 0.31 trillion tokens.
-
- | tokenizer | vocab_size | t_bytes/t_tokens | t_tokens/t_bytes | n_chars/n_tokens |
- |:-----------------------------|-------------:|-------------------:|-------------------:|-------------------:|
- | llama | 32000 | 1.8 | 0.56 | 0.7 |
- | llama3 | 128000 | 3.2 | 0.31 | 1.24 |
-
- This can be reproduced with the following script:
- ```sh
- python utils/compress_rate_util.py
- ```
-
-
-
-
- <details> <summary>English compression rate</summary>
- Compression rate computed on the English dataset cc100-en
-
- | tokenizer | vocab_size | g_bytes/b_tokens | b_tokens/g_bytes | t_bytes/t_tokens | t_tokens/t_bytes | n_chars/n_tokens |
- |:----------------------------|-------------:|-------------------:|-------------------:|-------------------:|-------------------:|-------------------:|
- | amber | 32000 | 3.56 | 0.28 | 3.47 | 0.29 | 3.81 |
- | aya_101 | 250100 | 3.3 | 0.3 | 3.22 | 0.31 | 3.53 |
- | baichuan | 64000 | 3.74 | 0.27 | 3.65 | 0.27 | 4 |
- | baichuan2 | 125696 | 3.89 | 0.26 | 3.8 | 0.26 | 4.17 |
-
- </details>
-
-
- <details> <summary>Simplified Chinese compression rate</summary>
- Compression rate computed on the Simplified Chinese dataset cc100-zh-Hans
-
- | tokenizer | vocab_size | g_bytes/b_tokens | b_tokens/g_bytes | t_bytes/t_tokens | t_tokens/t_bytes | n_chars/n_tokens |
- |:----------------------------|-------------:|-------------------:|-------------------:|-------------------:|-------------------:|-------------------:|
- | amber | 32000 | 1.84 | 0.54 | 1.8 | 0.56 | 0.7 |
- | aya_101 | 250100 | 3.89 | 0.26 | 3.79 | 0.26 | 1.47 |
- | baichuan | 64000 | 3.92 | 0.26 | 3.82 | 0.26 | 1.48 |
-
- </details>
-
-
-
-
- ## Reference
-
- - Getting the most out of your tokenizer for pre-training and domain adaptation
- - Efficient and Effective Text Encoding for Chinese LLaMA and Alpaca
- - blog
- - https://help.openai.com/en/articles/4936856-what-are-tokens-and-how-to-count-them
- - https://huggingface.co/docs/transformers/tokenizer_summary#sentencepiece
- - https://www.huaxiaozhuan.com/%E5%B7%A5%E5%85%B7/huggingface_transformer/chapters/1_tokenizer.html
- - https://zhuanlan.zhihu.com/p/652520262
- - https://github.com/QwenLM/Qwen/blob/main/tokenization_note_zh.md
- - https://tonybaloney.github.io/posts/cjk-chinese-japanese-korean-llm-ai-best-practices.html
- -
- - demo
- - https://huggingface.co/spaces/Xenova/the-tokenizer-playground
- - https://github.com/dqbd/tiktokenizer
- - https://chat.lmsys.org/?leaderboard
- - https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard
- - paper
- - ss
- -
+ Please visit our GitHub repo for more information: https://github.com/xu-song/tokenizer-arena
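
The removed README section defines the compression columns only by example. As a rough illustration of what ratios like `t_bytes/t_tokens` and `n_chars/n_tokens` measure, here is a minimal sketch; the helper name and sample corpus are illustrative assumptions, not the repo's `utils/compress_rate_util.py`:

```python
from transformers import AutoTokenizer

def compression_stats(tokenizer_name: str, texts: list[str]) -> dict:
    """Bytes-per-token and chars-per-token over a small corpus sample."""
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
    n_bytes = sum(len(t.encode("utf-8")) for t in texts)
    n_chars = sum(len(t) for t in texts)
    n_tokens = sum(len(tokenizer.encode(t, add_special_tokens=False)) for t in texts)
    return {
        "bytes/token": n_bytes / n_tokens,   # same ratio the table quotes as t_bytes/t_tokens
        "tokens/byte": n_tokens / n_bytes,   # same ratio as t_tokens/t_bytes
        "chars/token": n_chars / n_tokens,   # same ratio as n_chars/n_tokens
    }

# Example with an ungated tokenizer and a tiny Chinese sample:
print(compression_stats("gpt2", ["今天天气不错", "分词器的压缩率比较"]))
```

On a large cc-100 sample these per-token ratios are what the removed table reports, just quoted at terabyte/trillion scale.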
 
app.py CHANGED
@@ -1,13 +1,19 @@
-
+ import os
  from playground_app import demo as playground_tab
  from compression_app import demo as compression_tab
  from character_app import demo as character_tab
  from patcher.gr_interface import TabbedInterface
+ from huggingface_hub import login
+
+ auth_token = os.environ.get('HF_TOKEN', None)
+ if auth_token:
+     login(token=auth_token)


+ # encoding speed, decoding speed, character categories (zh, num, etc., regex supported), supported languages
  demo = TabbedInterface(
      [playground_tab, compression_tab, character_tab],
-     [" ⚔️ Playground", "🏆 Compression Leaderboard", "📊 Character Statistics"],  # encoding speed, decoding speed, character categories (zh, num, etc., regex supported), supported languages, organization
+     [" ⚔️ Playground", "🏆 Compression Leaderboard", "📊 Character Statistics"],
      title='<div align="center">Tokenizer Arena ⚔️</div>',
      css="css/style.css"
  )
@@ -15,4 +21,4 @@ demo = TabbedInterface(
  demo.load(js=open("js/onload.js", "r", encoding="utf-8").read())

  if __name__ == "__main__":
-     demo.launch()
+     demo.launch()
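
Per the commit message, the new `HF_TOKEN` block presumably lets the Space authenticate so that gated tokenizers can be downloaded. A minimal sketch of that flow outside the app follows; the gated model ID below is only an illustrative example, not something this commit references:

```python
import os
from huggingface_hub import login
from transformers import AutoTokenizer

# Read the token from the environment (e.g. a Space secret named HF_TOKEN)
# and log in once at startup, mirroring what app.py now does.
token = os.environ.get("HF_TOKEN")
if token:
    login(token=token)

# After login, gated repos can be loaded; passing token= explicitly also works.
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B", token=token)
print(len(tokenizer))  # vocabulary size
```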
character_util.py CHANGED
@@ -1,7 +1,7 @@
  """
  TODO:
- 1. Traditional / Simplified Chinese, language,
- 2. confirm the number of space tokens in bert
+ 1. add more language
+ 2. check space count of bert
  3. add token_impl
  4.
  """
compression_app.py CHANGED
@@ -82,7 +82,7 @@ with gr.Blocks() as demo:
  # "- `g_bytes/b_tokens` measures how many gigabytes corpus per billion tokens.\n"
  # "- `t_bytes/t_tokens` measures how many terabytes corpus per trillion tokens.\n"
  "- `char/token` measures how many chars per token on the tokenized corpus.\n"
- "- `oov_ratio`: out-of-vocabulary ratio on the selected corpus. 👉 get [oov charset](https://huggingface.co/spaces/eson/tokenizer-arena/raw/main/stats/compression_rate.json)\n\n"
+ "- `oov_ratio`: out-of-vocabulary ratio on the selected corpus, 👉 get [oov charset](https://huggingface.co/spaces/eson/tokenizer-arena/raw/main/stats/compression_rate.json)\n\n"
  "You can reproduce this procedure with [compression_util.py](https://huggingface.co/spaces/eson/tokenizer-arena/blob/main/compression_util.py)."
  )

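The `oov_ratio` metric tweaked above is not defined in the diff itself; one plausible way to compute such a ratio is a character-level encode/decode round trip, sketched below. This illustrates the idea only and is not the exact formula in `compression_util.py`:

```python
def oov_ratio(tokenizer, texts):
    """Share of distinct characters that do not survive an encode/decode round trip."""
    seen, oov = set(), set()
    for text in texts:
        decoded = tokenizer.decode(tokenizer.encode(text, add_special_tokens=False))
        seen.update(text)
        oov.update(ch for ch in text if ch not in decoded)
    return len(oov) / max(len(seen), 1)

# e.g. oov_ratio(AutoTokenizer.from_pretrained("gpt2"), ["héllo ∑ 漢字"])
```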
compression_util.py CHANGED
@@ -1,9 +1,9 @@
  """

- Chinese data: clue superclue
- English data: glue cnn_dailymail gigaword
+ ## TODO
  code:
  math:
+ whitespace:

  """
