eson committed
Commit 7d2062e
1 Parent(s): 480ae5d
Files changed (3):
  1. README.md +1 -26
  2. config.py +1 -1
  3. utils/compress_rate_util.py +12 -9
README.md CHANGED
@@ -9,34 +9,9 @@ app_file: app.py
 pinned: false
 ---
 
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
-## ss
-
-
-## TODO
-
-
-搜索栏
--
-
-
-
-## 统计
-
-
-## vocabsize
-
-增大能提到压缩率,副作用是增大计算量和内存 (getting the most out of your tokenizer for pre-training and)
--
-
-
-https://huggingface.co/spaces/yenniejun/tokenizers-languages
-
-
-
-## Compress Rate
+## 压缩率 Compress Rate
 
 
 在 [cc-100](https://huggingface.co/datasets/cc100) 数据集,每个语言取1万条数据,测试不同tokenizer的压缩率。压缩率指标 `g_bytes/b_tokens`
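For reference, the compression-rate description kept in the README translates roughly as: on the [cc-100](https://huggingface.co/datasets/cc100) dataset, take 10,000 samples per language and measure each tokenizer's compression rate, using `g_bytes/b_tokens` (gigabytes of raw text per billion tokens) as the metric. Below is a minimal sketch of how such a metric can be computed; the `compress_rate` helper and the use of `AutoTokenizer` are illustrative assumptions, not this repository's actual `tokenize_corpus`/`load_tokener` implementation.

```python
from transformers import AutoTokenizer

def compress_rate(texts, tokenizer_name="gpt2"):
    """Sketch of the g_bytes/b_tokens metric: gigabytes of UTF-8 text per billion tokens."""
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
    n_bytes = sum(len(t.encode("utf-8")) for t in texts)
    n_tokens = sum(len(tokenizer.encode(t, add_special_tokens=False)) for t in texts)
    g_bytes = n_bytes / 1e9      # gigabytes of raw text
    b_tokens = n_tokens / 1e9    # billions of tokens
    return g_bytes / b_tokens    # reduces to bytes per token; higher means better compression

# Hypothetical usage on a few cc-100 style samples:
# print(compress_rate(["Hello world.", "华为发布Mate60手机。"]))
```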
config.py CHANGED
@@ -17,4 +17,4 @@ Buenos días!
 华为发布Mate60手机。
 ラグビーワールドカップ2023フランス"""
 default_tokenizer_type_1 = "llama3"
-default_tokenizer_type_2 = "gpt4"
+default_tokenizer_type_2 = "gpt_4"
utils/compress_rate_util.py CHANGED
@@ -41,6 +41,7 @@
 
 import json
 import os
+import sys
 import pandas as pd
 from datasets import load_dataset
 from utils.log_util import logger
@@ -94,13 +95,12 @@ def pprint(stats):
             if unit not in stat:
                 columns[unit] = unit_convertor(stat, unit)
             else:
-                pass
+                logger.error(f"unit {unit} not support")
 
         table.append(columns)
     df = pd.DataFrame(table)
     # print(df.to_markdown(index=False, tablefmt='fancy_grid'))
-    logger.info("\n{df.to_markdown(index=False)}")
-    return
+    logger.info(f"\n{df.to_markdown(index=False)}")
 
 
 cache = {}
@@ -163,17 +163,20 @@ def tokenize_corpus(tokenizer, lang, cache_dir="stats/compress_rate"):
 
 def main():
     from vocab import all_tokenizers
+    if len(sys.argv) == 3:
+        tokenizers = [sys.argv[1]]
+        corpuses = [sys.argv[2]]
+    else:
+        tokenizers = all_tokenizers
+        corpuses = ["en", "zh-Hans"]
+
     stats = {}
-    for lang in ["en", "zh-Hans"]:
+    for lang in corpuses:
         print("###" * 10 + lang)
-
-        # for tokenizer_name in ['llama', 'llama2', 'llama3']:
-        for tokenizer_name in all_tokenizers:
+        for tokenizer_name in tokenizers:
            tokenizer = load_tokener(tokenizer_name)
            stat = tokenize_corpus(tokenizer, lang)
-            # ["qwen1_5_14b_chat", "gpt_35_turbo",]:
            stats[tokenizer_name] = stat
-
    pprint(stats)
 
 
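With the `main()` change above, the stats script can presumably be invoked for a single tokenizer and corpus, e.g. `python utils/compress_rate_util.py gpt_4 en` (a hypothetical invocation; the argument order, tokenizer name followed by corpus language, is taken from the `sys.argv` handling in the diff). When called with no arguments it falls back to iterating `all_tokenizers` over `["en", "zh-Hans"]`, as before.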