xu-song committed on
Commit
1b7fc74
·
1 Parent(s): 367a536

add compression leaderboard

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. app.py +10 -249
  2. app_compression.py +127 -0
  3. app_playground.py +248 -0
  4. css/style.css +24 -1
  5. examples.py +1 -1
  6. patcher/gr_interface.py +59 -0
  7. tokenizer/sptokenizer_patch.py → patcher/sptokenizer_patch_deprecated.py +12 -4
  8. patcher/sptokenizer_wrapper.py +61 -0
  9. {tokenizer → patcher}/tiktoken_patch.py +5 -0
  10. stats/compress_rate.json +1868 -0
  11. stats/compress_rate/amber.en.json +0 -1
  12. stats/compress_rate/amber.zh-Hans.json +0 -1
  13. stats/compress_rate/aya_101.en.json +0 -1
  14. stats/compress_rate/aya_101.zh-Hans.json +0 -1
  15. stats/compress_rate/baichuan.en.json +0 -1
  16. stats/compress_rate/baichuan.zh-Hans.json +0 -1
  17. stats/compress_rate/baichuan2.en.json +0 -1
  18. stats/compress_rate/baichuan2.zh-Hans.json +0 -1
  19. stats/compress_rate/bert_base_cased.en.json +0 -1
  20. stats/compress_rate/bert_base_cased.zh-Hans.json +0 -1
  21. stats/compress_rate/bert_base_chinese.en.json +0 -1
  22. stats/compress_rate/bert_base_chinese.zh-Hans.json +0 -1
  23. stats/compress_rate/bert_base_uncased.en.json +0 -1
  24. stats/compress_rate/bert_base_uncased.zh-Hans.json +0 -1
  25. stats/compress_rate/bloom.en.json +0 -1
  26. stats/compress_rate/bloom.zh-Hans.json +0 -1
  27. stats/compress_rate/byt5_small.en.json +0 -1
  28. stats/compress_rate/byt5_small.zh-Hans.json +0 -1
  29. stats/compress_rate/character_glm_6b.en.json +0 -1
  30. stats/compress_rate/character_glm_6b.zh-Hans.json +0 -1
  31. stats/compress_rate/chatglm2_6b.en.json +0 -1
  32. stats/compress_rate/chatglm2_6b.zh-Hans.json +0 -1
  33. stats/compress_rate/chatglm3_6b.en.json +0 -1
  34. stats/compress_rate/chatglm3_6b.zh-Hans.json +0 -1
  35. stats/compress_rate/chatglm_6b.en.json +0 -1
  36. stats/compress_rate/chatglm_6b.zh-Hans.json +0 -1
  37. stats/compress_rate/chatyuan_large_v2.en.json +0 -1
  38. stats/compress_rate/chatyuan_large_v2.zh-Hans.json +0 -1
  39. stats/compress_rate/chinese_llama.en.json +0 -1
  40. stats/compress_rate/chinese_llama.zh-Hans.json +0 -1
  41. stats/compress_rate/chinese_llama2.en.json +0 -1
  42. stats/compress_rate/chinese_llama2.zh-Hans.json +0 -1
  43. stats/compress_rate/code_davinci_002.en.json +0 -1
  44. stats/compress_rate/code_davinci_002.zh-Hans.json +0 -1
  45. stats/compress_rate/crystal_coder.en.json +0 -1
  46. stats/compress_rate/crystal_coder.zh-Hans.json +0 -1
  47. stats/compress_rate/dbrx_instruct.en.json +0 -1
  48. stats/compress_rate/dbrx_instruct.zh-Hans.json +0 -1
  49. stats/compress_rate/deepseek_coder_33b_instruct.en.json +0 -1
  50. stats/compress_rate/deepseek_coder_33b_instruct.zh-Hans.json +0 -1
app.py CHANGED
@@ -1,255 +1,16 @@
1
- # coding=utf-8
2
- # author: xusong
3
- # time: 2022/8/23 16:06
4
-
5
- """
6
- ## TODO:
7
- - i18 国际化 https://blog.csdn.net/qq_26212731/article/details/78457198 request.header中也有language
8
- - iter_vocab 的 warmup
9
- - 开关
10
- - add_special_token 开关
11
- - theme 开关 light/dark
12
- - token_id/tokens/bytes 开关
13
- - 中文字词统计,是否要包括 _ G 等字符
14
- - 评测
15
- - OOV评测
16
- - 通过 javascript 添加 hover_text
17
- - 英文 utf-8编码
18
- - 词典支持下载,借用image下载的标签,
19
- - baichuan的单字数量怎么两万多个?
20
- - qwen: ValueError: Unclosed image token
21
- - 路径修改为全path meta-llama/Llama-2-13b-hf
22
-
23
- plots
24
-
25
- table
26
-
27
- ## related demo
28
- - [](http://text-processing.com/demo/tokenize/)
29
- - [gpt-tokenizer](https://gpt-tokenizer.dev/)
30
- - [llama-tokenizer-js](https://belladoreai.github.io/llama-tokenizer-js/example-demo/build/)
31
- - [](https://huggingface.co/spaces/Xenova/the-tokenizer-playground)
32
-
33
- ## 可视化
34
-
35
- [ The, 2, QUICK, Brown, Foxes, jumped, over, the, lazy, dog's, bone ]
36
- """
37
 
38
  import gradio as gr
39
- from vocab import all_tokenizers
40
- from util import *
41
- from examples import example_fn, example_types
42
- from utils.compress_rate_util import common_units, common_corpuses
43
-
44
- get_window_url_params = """
45
- function(url_params) {
46
- const params = new URLSearchParams(window.location.search);
47
- url_params = JSON.stringify(Object.fromEntries(params));
48
- return url_params;
49
- }
50
- """
51
-
52
- with gr.Blocks(css="css/style.css", title="Tokenizer Arena") as demo:
53
- gr.HTML("""<h1 align="center">Tokenizer Arena ⚔️</h1>""")
54
- # links: https://www.coderstool.com/utf8-encoding-decoding
55
- # 功能:输入文本,进行分词
56
- # 分词器:常见的分词器有几种,
57
- # 背景:方便分词、看词粒度、对比
58
-
59
- with gr.Row():
60
- gr.Markdown("## Input Text")
61
- dropdown_examples = gr.Dropdown(
62
- example_types,
63
- type="index",
64
- show_label=False,
65
- container=False,
66
- scale=0,
67
- elem_classes="example-style"
68
- )
69
- user_input = gr.Textbox(
70
- # value=default_user_input,
71
- label="Input Text",
72
- lines=5,
73
- show_label=False,
74
- )
75
- gr.Markdown("## Tokenization")
76
-
77
- # compress rate setting
78
- with gr.Accordion("Compress Rate Setting", open=True):
79
- gr.Markdown(
80
- "Please select corpus and unit of compress rate, get more details at [github](https://github.com/xu-song/tokenizer-arena/). ")
81
- with gr.Row():
82
- compress_rate_corpus = gr.CheckboxGroup(
83
- common_corpuses, # , "code"
84
- value=["cc100-en", "cc100-zh-Hans"],
85
- label="corpus",
86
- # info=""
87
- )
88
- compress_rate_unit = gr.Radio(
89
- common_units,
90
- value="b_tokens/g_bytes",
91
- label="unit",
92
- )
93
- # TODO: Token Setting
94
- # with gr.Accordion("Token Filter Setting", open=False):
95
- # gr.Markdown(
96
- # "Get total number of tokens which contain the following character)")
97
- # gr.Radio(
98
- # ["zh-Hans", "", "number", "space"],
99
- # value="zh",
100
- # )
101
-
102
- with gr.Row():
103
- with gr.Column(scale=6):
104
- with gr.Group():
105
- tokenizer_type_1 = gr.Dropdown(
106
- all_tokenizers,
107
- label="Tokenizer 1",
108
- )
109
- with gr.Group():
110
- """
111
- <div class="stat"><div class="stat-value">69</div><div class="stat-label">Characters</div></div>
112
- """
113
- with gr.Row():
114
- stats_vocab_size_1 = gr.TextArea(
115
- label="Vocab Size",
116
- lines=1,
117
- elem_classes="statistics"
118
- )
119
- stats_zh_token_size_1 = gr.TextArea(
120
- label="ZH char/word",
121
- lines=1,
122
- elem_classes="statistics",
123
- visible=False
124
- )
125
- stats_compress_rate_1 = gr.TextArea(
126
- label="Compress Rate",
127
- lines=1,
128
- elem_classes="statistics"
129
- )
130
- stats_overlap_token_size_1 = gr.TextArea(
131
- # value=default_stats_overlap_token_size,
132
- label="Overlap Tokens",
133
- lines=1,
134
- elem_classes="statistics"
135
- )
136
- # stats_3 = gr.TextArea(
137
- # label="Compress Rate",
138
- # lines=1,
139
- # elem_classes="statistics"
140
- # )
141
- # https://www.onlinewebfonts.com/icon/418591
142
- gr.Image("images/VS.svg", scale=1, show_label=False,
143
- show_download_button=False, container=False,
144
- show_share_button=False)
145
- with gr.Column(scale=6):
146
- with gr.Group():
147
- tokenizer_type_2 = gr.Dropdown(
148
- all_tokenizers,
149
- label="Tokenizer 2",
150
- )
151
- with gr.Group():
152
- with gr.Row():
153
- stats_vocab_size_2 = gr.TextArea(
154
- label="VocabSize",
155
- lines=1,
156
- elem_classes="statistics"
157
- )
158
- stats_zh_token_size_2 = gr.TextArea(
159
- label="ZH char/word", # 中文字/词
160
- lines=1,
161
- elem_classes="statistics",
162
- visible=False
163
- )
164
- stats_compress_rate_2 = gr.TextArea(
165
- label="Compress Rate",
166
- lines=1,
167
- elem_classes="statistics"
168
- )
169
- stats_filtered_token_2 = gr.TextArea(
170
- label="filtered tokens",
171
- lines=1,
172
- elem_classes="statistics",
173
- visible=False
174
- )
175
- stats_overlap_token_size_2 = gr.TextArea(
176
- label="Overlap Tokens",
177
- lines=1,
178
- elem_classes="statistics"
179
- )
180
-
181
- # TODO: 图 表 压缩率
182
- with gr.Row():
183
- # dynamic change label
184
- with gr.Column():
185
- output_text_1 = gr.Highlightedtext(
186
- show_legend=True,
187
- elem_classes="space-show"
188
- )
189
- with gr.Column():
190
- output_text_2 = gr.Highlightedtext(
191
- show_legend=True,
192
- elem_classes="space-show"
193
- )
194
-
195
- with gr.Row():
196
- output_table_1 = gr.Dataframe()
197
- output_table_2 = gr.Dataframe()
198
-
199
- # setting
200
- # compress_rate_unit.change(compress_rate_unit_change, [compress_rate_unit],
201
- # [stats_compress_rate_1, stats_compress_rate_2])
202
-
203
- tokenizer_type_1.change(tokenize, [user_input, tokenizer_type_1],
204
- [output_text_1, output_table_1])
205
- tokenizer_type_1.change(basic_count, [tokenizer_type_1], [stats_vocab_size_1, stats_zh_token_size_1])
206
- tokenizer_type_1.change(get_overlap_token_size, [tokenizer_type_1, tokenizer_type_2],
207
- [stats_overlap_token_size_1, stats_overlap_token_size_2])
208
- tokenizer_type_1.change(get_compress_rate, [tokenizer_type_1, compress_rate_corpus, compress_rate_unit],
209
- [stats_compress_rate_1])
210
-
211
- # TODO: every=3
212
- user_input.change(tokenize_pair,
213
- [user_input, tokenizer_type_1, tokenizer_type_2],
214
- [output_text_1, output_table_1, output_text_2, output_table_2]) # , pass_request=1
215
-
216
- tokenizer_type_2.change(tokenize, [user_input, tokenizer_type_2],
217
- [output_text_2, output_table_2])
218
- tokenizer_type_2.change(basic_count, [tokenizer_type_2], [stats_vocab_size_2, stats_zh_token_size_2])
219
- tokenizer_type_2.change(get_overlap_token_size, [tokenizer_type_1, tokenizer_type_2],
220
- [stats_overlap_token_size_1, stats_overlap_token_size_2])
221
- tokenizer_type_2.change(get_compress_rate,
222
- [tokenizer_type_2, compress_rate_corpus, compress_rate_unit],
223
- [stats_compress_rate_2])
224
-
225
- compress_rate_unit.change(get_compress_rate,
226
- [tokenizer_type_1, compress_rate_corpus, compress_rate_unit],
227
- [stats_compress_rate_1])
228
- compress_rate_unit.change(get_compress_rate,
229
- [tokenizer_type_2, compress_rate_corpus, compress_rate_unit],
230
- [stats_compress_rate_2])
231
- compress_rate_corpus.change(get_compress_rate,
232
- [tokenizer_type_1, compress_rate_corpus, compress_rate_unit],
233
- [stats_compress_rate_1])
234
- compress_rate_corpus.change(get_compress_rate,
235
- [tokenizer_type_2, compress_rate_corpus, compress_rate_unit],
236
- [stats_compress_rate_2])
237
 
238
- dropdown_examples.change(
239
- example_fn,
240
- dropdown_examples,
241
- [user_input, tokenizer_type_1, tokenizer_type_2]
242
- )
243
 
244
- demo.load(js=open("js/onload.js", "r", encoding="utf-8").read())
245
- demo.load(
246
- fn=on_load,
247
- inputs=[user_input], # 这里只需要传个空object即可。
248
- outputs=[user_input, tokenizer_type_1, tokenizer_type_2],
249
- js=get_window_url_params
250
- )
251
 
252
  if __name__ == "__main__":
253
- # demo.queue(max_size=20).launch()
254
- demo.launch()
255
- # demo.launch(share=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
 
2
  import gradio as gr
3
+ from app_playground import demo as tab_playground
4
+ from app_compression import demo as tab_compression
5
+ from patcher.gr_interface import TabbedInterface
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
 
 
 
 
 
 
7
 
8
+ demo = TabbedInterface(
9
+ [tab_playground, tab_compression],
10
+ [" ⚔️Playground", "🏆 Compression Leaderboard",], # 编码速度,解码速度,字符分类(zh、num等,支持正则),支持的语言,机构,。
11
+ title='<div align="center">Tokenizer Arena ⚔️</div>',
12
+ css="css/style.css"
13
+ )
 
14
 
15
  if __name__ == "__main__":
16
+ demo.launch()
 
 
app_compression.py ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from utils.compression_util import get_compression_leaderboard
3
+ from utils.compression_util import common_corpuses
4
+
5
+ with gr.Blocks() as demo:
6
+ # gr.Markdown("## Convertor")
7
+ # with gr.Accordion("Convertor", open=False):
8
+ # gr.Markdown("Tokenize {} corpus")
9
+ # with gr.Row(elem_classes="no-border"):
10
+ # gr.Button("File Size", min_width=50)
11
+ # file_size = gr.Textbox(
12
+ # show_label=False,
13
+ # min_width=50,
14
+ # # elem_classes="textbox-as-text"
15
+ # )
16
+ # gr.Dropdown(
17
+ # choices=['MB', 'GB', 'TB'],
18
+ # show_label=False,
19
+ # min_width=15,
20
+ # # elem_classes="textbox-as-text"
21
+ # )
22
+ # # gr.Markdown('<h2 align="center">≈</h2>')
23
+ # # gr.HTML('<h2 style="margin: auto;">≈</h2>')
24
+ # gr.Button(
25
+ # "≈",
26
+ # min_width=10,
27
+ # elem_classes="button-white h2-font"
28
+ #
29
+ # )
30
+ #
31
+ # gr.Button(
32
+ # "Tokens",
33
+ # min_width=50
34
+ # )
35
+ # gr.Textbox(
36
+ # show_label=False,
37
+ # min_width=50
38
+ # )
39
+ # gr.Dropdown(
40
+ # ['million', 'billion', 'trillion'],
41
+ # show_label=False,
42
+ # min_width=15,
43
+ # elem_classes="button-white"
44
+ # )
45
+
46
+ gr.Markdown("## 🛠️ Setting") # ⚙
47
+ with gr.Accordion("Please select corpus and measure of compression rate ...", open=True):
48
+ # file size 💽 🖴, tokens 🧮
49
+ # gr.Markdown(
50
+ # "Please select corpus and measure of compression rate.\n"
51
+ #"`num_of_trillion_tokens` `num_of_billion_tokens`\n"
52
+ # "- `b_tokens/g_bytes` measures how many billion tokens per gigabytes corpus. \n"
53
+ # "- `t_tokens/t_bytes` measures how many trillion tokens per terabytes corpus. \n"
54
+ # "- `n_chars/n_tokens` measures how many chars per token in the current corpus. \n\n"
55
+ # "All the above measures are depend on corpus. You can reproduce this "
56
+ # "procedure at [github](https://github.com/xu-song/tokenizer-arena/)."
57
+ # )
58
+
59
+ with gr.Row():
60
+ compress_rate_corpus = gr.Dropdown(
61
+ common_corpuses, # , "code"
62
+ value=["cc100-en", "cc100-zh-Hans"],
63
+ label="corpus",
64
+ multiselect=True
65
+ # info=""
66
+ )
67
+
68
+
69
+ # unit of file_size: gigabyte terabyte
70
+ # unit of token_num: million billion trillion
71
+ # The most common units of measurement include length (meter, inch, foot), weight (gram, kilogram, pound), volume (liter, gallon, milliliter), time (second, minute, hour)
72
+ compress_rate_unit = gr.Radio(
73
+ ["b_tokens/g_bytes", "t_tokens/t_bytes"],
74
+ value="b_tokens/g_bytes",
75
+ label="measure",
76
+ )
77
+
78
+ gr.Markdown(
79
+ # "`num_of_trillion_tokens` `num_of_billion_tokens`\n"
80
+ "- `b_tokens/g_bytes` measures how many billion tokens per gigabytes corpus. \n"
81
+ "- `t_tokens/t_bytes` measures how many trillion tokens per terabytes corpus. \n"
82
+ "- `n_chars/n_tokens` measures how many chars per token in the tokenized corpus. \n\n"
83
+ "All the above measures are depend on corpus. You can reproduce this "
84
+ "procedure at [github](https://github.com/xu-song/tokenizer-arena/)."
85
+ )
86
+
87
+ gr.Markdown("## 🏆 Compression Rate Leaderboard")
88
+ search_bar = gr.Textbox(
89
+ placeholder="🔍 Search tokenizers(e.g., 'llama') and press ENTER...",
90
+ show_label=False,
91
+ elem_id="search-bar",
92
+ )
93
+ compress_rate_table = gr.Dataframe()
94
+
95
+ # func call
96
+ compress_rate_corpus.change(
97
+ get_compression_leaderboard,
98
+ inputs=[compress_rate_corpus, compress_rate_unit],
99
+ outputs=compress_rate_table
100
+ )
101
+ compress_rate_unit.change(
102
+ get_compression_leaderboard,
103
+ inputs=[compress_rate_corpus, compress_rate_unit],
104
+ outputs=compress_rate_table
105
+ )
106
+ # file_size.change(
107
+ # get_all_compress_rate,
108
+ # outputs=compress_rate_table
109
+ # )
110
+
111
+ search_bar.submit(
112
+ get_compression_leaderboard,
113
+ inputs=[
114
+ compress_rate_corpus,
115
+ compress_rate_unit,
116
+ search_bar,
117
+ ],
118
+ outputs=compress_rate_table
119
+ )
120
+
121
+ demo.load(
122
+ get_compression_leaderboard,
123
+ inputs=[compress_rate_corpus, compress_rate_unit],
124
+ outputs=compress_rate_table
125
+ )
126
+ if __name__ == "__main__":
127
+ demo.launch()
app_playground.py ADDED
@@ -0,0 +1,248 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # author: xusong
3
+ # time: 2022/8/23 16:06
4
+
5
+ """
6
+ ## TODO:
7
+ - i18 国际化 https://blog.csdn.net/qq_26212731/article/details/78457198 request.header中也有language
8
+ - iter_vocab 的 warmup
9
+ - 开关
10
+ - add_special_token 开关
11
+ - theme 开关 light/dark
12
+ - token_id/tokens/bytes 开关
13
+ - 中文字词统计,是否要包括 _ G 等字符
14
+ - 评测
15
+ - OOV评测
16
+ - 通过 javascript 添加 hover_text
17
+ - 英文 utf-8编码
18
+ - 词典支持下载,借用image下载的标签,
19
+ - baichuan的单字数量怎么两万多个?
20
+ - qwen: ValueError: Unclosed image token
21
+ - 路径修改为全path meta-llama/Llama-2-13b-hf
22
+
23
+ plots
24
+
25
+ table
26
+
27
+ ## related demo
28
+ - [](http://text-processing.com/demo/tokenize/)
29
+ - [gpt-tokenizer](https://gpt-tokenizer.dev/)
30
+ - [llama-tokenizer-js](https://belladoreai.github.io/llama-tokenizer-js/example-demo/build/)
31
+ - [](https://huggingface.co/spaces/Xenova/the-tokenizer-playground)
32
+
33
+ ## 可视化
34
+
35
+ [ The, 2, QUICK, Brown, Foxes, jumped, over, the, lazy, dog's, bone ]
36
+ """
37
+
38
+ import gradio as gr
39
+ from vocab import all_tokenizers
40
+ from util import *
41
+ from examples import example_fn, example_types
42
+
43
+ get_window_url_params = """
44
+ function(url_params) {
45
+ const params = new URLSearchParams(window.location.search);
46
+ url_params = JSON.stringify(Object.fromEntries(params));
47
+ return url_params;
48
+ }
49
+ """
50
+
51
+ with gr.Blocks() as demo:
52
+ # links: https://www.coderstool.com/utf8-encoding-decoding
53
+ # 功能:输入文本,进行分词
54
+ # 分词器:常见的分词器有几种,
55
+ # 背景:方便分词、看词粒度、对比
56
+
57
+ with gr.Row():
58
+ gr.Markdown("## Input Text")
59
+ dropdown_examples = gr.Dropdown(
60
+ example_types,
61
+ type="index",
62
+ show_label=False,
63
+ container=False,
64
+ scale=0,
65
+ elem_classes="example-style"
66
+ )
67
+ user_input = gr.Textbox(
68
+ # value=default_user_input,
69
+ label="Input Text",
70
+ lines=5,
71
+ show_label=False,
72
+ )
73
+ gr.Markdown("## Tokenization")
74
+
75
+ # compress rate setting TODO: 将 这个模块调整到下面
76
+ # with gr.Accordion("Compress Rate Setting", open=True):
77
+ # gr.Markdown(
78
+ # "Please select corpus and unit of compress rate, get more details at [github](https://github.com/xu-song/tokenizer-arena/). ")
79
+ # with gr.Row():
80
+ # compress_rate_corpus = gr.CheckboxGroup(
81
+ # common_corpuses, # , "code"
82
+ # value=["cc100-en", "cc100-zh-Hans"],
83
+ # label="corpus",
84
+ # # info=""
85
+ # )
86
+ # compress_rate_unit = gr.Radio(
87
+ # common_units,
88
+ # value="b_tokens/g_bytes",
89
+ # label="unit",
90
+ # )
91
+ # TODO: Token Setting
92
+ # with gr.Accordion("Token Filter Setting", open=False):
93
+ # gr.Markdown(
94
+ # "Get total number of tokens which contain the following character)")
95
+ # gr.Radio(
96
+ # ["zh-Hans", "", "number", "space"],
97
+ # value="zh",
98
+ # )
99
+
100
+ with gr.Row():
101
+ with gr.Column(scale=6):
102
+ with gr.Group():
103
+ tokenizer_name_1 = gr.Dropdown(
104
+ all_tokenizers,
105
+ label="Tokenizer 1",
106
+ )
107
+ with gr.Group():
108
+ with gr.Row():
109
+ stats_vocab_size_1 = gr.TextArea(
110
+ label="Vocab Size",
111
+ lines=1,
112
+ elem_classes="statistics"
113
+ )
114
+ stats_zh_token_size_1 = gr.TextArea(
115
+ label="ZH char/word",
116
+ lines=1,
117
+ elem_classes="statistics",
118
+ )
119
+ # stats_compress_rate_1 = gr.TextArea(
120
+ # label="Compress Rate",
121
+ # lines=1,
122
+ # elem_classes="statistics",
123
+ # )
124
+ stats_overlap_token_size_1 = gr.TextArea(
125
+ # value=default_stats_overlap_token_size,
126
+ label="Overlap Tokens",
127
+ lines=1,
128
+ elem_classes="statistics"
129
+ )
130
+ # stats_3 = gr.TextArea(
131
+ # label="Compress Rate",
132
+ # lines=1,
133
+ # elem_classes="statistics"
134
+ # )
135
+ # https://www.onlinewebfonts.com/icon/418591
136
+ gr.Image("images/VS.svg", scale=1, show_label=False,
137
+ show_download_button=False, container=False,
138
+ show_share_button=False)
139
+ with gr.Column(scale=6):
140
+ with gr.Group():
141
+ tokenizer_name_2 = gr.Dropdown(
142
+ all_tokenizers,
143
+ label="Tokenizer 2",
144
+ )
145
+ with gr.Group():
146
+ with gr.Row():
147
+ stats_vocab_size_2 = gr.TextArea(
148
+ label="VocabSize",
149
+ lines=1,
150
+ elem_classes="statistics"
151
+ )
152
+ stats_zh_token_size_2 = gr.TextArea(
153
+ label="ZH char/word", # 中文字/词
154
+ lines=1,
155
+ elem_classes="statistics",
156
+ )
157
+ # stats_compress_rate_2 = gr.TextArea(
158
+ # label="Compress Rate",
159
+ # lines=1,
160
+ # elem_classes="statistics"
161
+ # )
162
+ stats_filtered_token_2 = gr.TextArea(
163
+ label="filtered tokens",
164
+ lines=1,
165
+ elem_classes="statistics",
166
+ visible=False
167
+ )
168
+ stats_overlap_token_size_2 = gr.TextArea(
169
+ label="Overlap Tokens",
170
+ lines=1,
171
+ elem_classes="statistics"
172
+ )
173
+
174
+ # TODO: 图 表 压缩率
175
+ with gr.Row():
176
+ # dynamic change label
177
+ with gr.Column():
178
+ output_text_1 = gr.Highlightedtext(
179
+ show_legend=True,
180
+ elem_classes="space-show"
181
+ )
182
+ with gr.Column():
183
+ output_text_2 = gr.Highlightedtext(
184
+ show_legend=True,
185
+ elem_classes="space-show"
186
+ )
187
+
188
+ with gr.Row():
189
+ output_table_1 = gr.Dataframe()
190
+ output_table_2 = gr.Dataframe()
191
+
192
+ # setting
193
+ # compress_rate_unit.change(compress_rate_unit_change, [compress_rate_unit],
194
+ # [stats_compress_rate_1, stats_compress_rate_2])
195
+
196
+ tokenizer_name_1.change(tokenize, [user_input, tokenizer_name_1],
197
+ [output_text_1, output_table_1])
198
+ tokenizer_name_1.change(basic_count, [tokenizer_name_1], [stats_vocab_size_1, stats_zh_token_size_1])
199
+ tokenizer_name_1.change(get_overlap_token_size, [tokenizer_name_1, tokenizer_name_2],
200
+ [stats_overlap_token_size_1, stats_overlap_token_size_2])
201
+ # tokenizer_type_1.change(get_compress_rate, [tokenizer_type_1, compress_rate_corpus, compress_rate_unit],
202
+ # [stats_compress_rate_1])
203
+
204
+ # TODO: every=3
205
+ user_input.change(tokenize_pair,
206
+ [user_input, tokenizer_name_1, tokenizer_name_2],
207
+ [output_text_1, output_table_1, output_text_2, output_table_2]) # , pass_request=1
208
+
209
+ tokenizer_name_2.change(tokenize, [user_input, tokenizer_name_2],
210
+ [output_text_2, output_table_2])
211
+ tokenizer_name_2.change(basic_count, [tokenizer_name_2], [stats_vocab_size_2, stats_zh_token_size_2])
212
+ tokenizer_name_2.change(get_overlap_token_size, [tokenizer_name_1, tokenizer_name_2],
213
+ [stats_overlap_token_size_1, stats_overlap_token_size_2])
214
+ # tokenizer_type_2.change(get_compress_rate,
215
+ # [tokenizer_type_2, compress_rate_corpus, compress_rate_unit],
216
+ # [stats_compress_rate_2])
217
+ #
218
+ # compress_rate_unit.change(get_compress_rate,
219
+ # [tokenizer_type_1, compress_rate_corpus, compress_rate_unit],
220
+ # [stats_compress_rate_1])
221
+ # compress_rate_unit.change(get_compress_rate,
222
+ # [tokenizer_type_2, compress_rate_corpus, compress_rate_unit],
223
+ # [stats_compress_rate_2])
224
+ # compress_rate_corpus.change(get_compress_rate,
225
+ # [tokenizer_type_1, compress_rate_corpus, compress_rate_unit],
226
+ # [stats_compress_rate_1])
227
+ # compress_rate_corpus.change(get_compress_rate,
228
+ # [tokenizer_type_2, compress_rate_corpus, compress_rate_unit],
229
+ # [stats_compress_rate_2])
230
+
231
+ dropdown_examples.change(
232
+ example_fn,
233
+ dropdown_examples,
234
+ [user_input, tokenizer_name_1, tokenizer_name_2]
235
+ )
236
+
237
+ demo.load(js=open("js/onload.js", "r", encoding="utf-8").read())
238
+ demo.load(
239
+ fn=on_load,
240
+ inputs=[user_input], # 这里只需要传个空object即可。
241
+ outputs=[user_input, tokenizer_name_1, tokenizer_name_2],
242
+ js=get_window_url_params
243
+ )
244
+
245
+ if __name__ == "__main__":
246
+ # demo.queue(max_size=20).launch()
247
+ demo.launch()
248
+ # demo.launch(share=True)
css/style.css CHANGED
@@ -8,6 +8,28 @@
8
  white-space: pre-wrap;
9
  }
10
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
  /* 隐藏legend */
12
  .category-legend {
13
  display: none !important;
@@ -33,4 +55,5 @@
33
  .example-style {
34
  max-width: 150px;
35
  align-self: self-end;
36
- }
 
 
8
  white-space: pre-wrap;
9
  }
10
 
11
+
12
+ /* white button */
13
+ .button-as-text {
14
+ background: #fff;
15
+ border-color: #fff;
16
+ }
17
+
18
+ .textbox-as-text {
19
+ border-style: hidden;
20
+ background: #fff;
21
+ border-color: #fff;
22
+ }
23
+
24
+
25
+ .h2-font {
26
+ font-size: 30px;
27
+ }
28
+
29
+ .no-border {
30
+ border: 0px none;
31
+ }
32
+
33
  /* 隐藏legend */
34
  .category-legend {
35
  display: none !important;
 
55
  .example-style {
56
  max-width: 150px;
57
  align-self: self-end;
58
+ }
59
+
examples.py CHANGED
@@ -24,7 +24,7 @@ examples = {
24
  # !?。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏.
25
  ["punctuation: ,.:/?+=\",。!?;【】〔〕〖〗", "gemma_7b", "llama"], # llama词典有点小
26
  ["symbol: 🦙❤❥웃유♋☮✊☏☢☚✔☑♚▢♪✈✞÷↑↓▤▥⊙■□▣▽¿─│♥❣▬▫☿Ⓐ ✋✉☣☤", "baichuan", "llama"],
27
- # ["special: [PAD] [UNK] [CLS] [SEP] [MASK] <|endoftext|>", "", ""],
28
  ],
29
  "zh": [
30
  ["空格测试: 2个空格 8个空格", "llama", "chatglm2_6b"], # chatglm 有blank_n,
 
24
  # !?。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏.
25
  ["punctuation: ,.:/?+=\",。!?;【】〔〕〖〗", "gemma_7b", "llama"], # llama词典有点小
26
  ["symbol: 🦙❤❥웃유♋☮✊☏☢☚✔☑♚▢♪✈✞÷↑↓▤▥⊙■□▣▽¿─│♥❣▬▫☿Ⓐ ✋✉☣☤", "baichuan", "llama"],
27
+ # ["special: [PAD] [UNK] [CLS] [SEP] [MASK] <|system|> <|user|> <|assistant|> <|endoftext|>", "", ""],
28
  ],
29
  "zh": [
30
  ["空格测试: 2个空格 8个空格", "llama", "chatglm2_6b"], # chatglm 有blank_n,
patcher/gr_interface.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ 原生 TabbedInterface 的 title采用markdown,不能实现居中,因此这里做了调整。
3
+ """
4
+
5
+ from gradio import Blocks, Interface, Theme, Tabs, Tab, HTML
6
+
7
+ class TabbedInterface(Blocks):
8
+ """
9
+ A TabbedInterface is created by providing a list of Interfaces or Blocks, each of which gets
10
+ rendered in a separate tab. Only the components from the Interface/Blocks will be rendered in the tab.
11
+ Certain high-level attributes of the Blocks (e.g. custom `css`, `js`, and `head` attributes) will not be loaded.
12
+
13
+ Demos: tabbed_interface_lite
14
+ """
15
+
16
+ def __init__(
17
+ self,
18
+ interface_list: list[Interface],
19
+ tab_names: list[str] | None = None,
20
+ title: str | None = None,
21
+ theme: Theme | str | None = None,
22
+ analytics_enabled: bool | None = None,
23
+ css: str | None = None,
24
+ js: str | None = None,
25
+ head: str | None = None,
26
+ ):
27
+ """
28
+ Parameters:
29
+ interface_list: A list of Interfaces (or Blocks) to be rendered in the tabs.
30
+ tab_names: A list of tab names. If None, the tab names will be "Tab 1", "Tab 2", etc.
31
+ title: The tab title to display when this demo is opened in a browser window.
32
+ theme: A Theme object or a string representing a theme. If a string, will look for a built-in theme with that name (e.g. "soft" or "default"), or will attempt to load a theme from the Hugging Face Hub (e.g. "gradio/monochrome"). If None, will use the Default theme.
33
+ analytics_enabled: Whether to allow basic telemetry. If None, will use GRADIO_ANALYTICS_ENABLED environment variable or default to True.
34
+ css: Custom css as a string or path to a css file. This css will be included in the demo webpage.
35
+ js: Custom js or path to js file to run when demo is first loaded. This javascript will be included in the demo webpage.
36
+ head: Custom html to insert into the head of the demo webpage. This can be used to add custom meta tags, scripts, stylesheets, etc. to the page.
37
+ Returns:
38
+ a Gradio Tabbed Interface for the given interfaces
39
+ """
40
+ super().__init__(
41
+ title=title or "Gradio",
42
+ theme=theme,
43
+ analytics_enabled=analytics_enabled,
44
+ mode="tabbed_interface",
45
+ css=css,
46
+ js=js,
47
+ head=head,
48
+ )
49
+ if tab_names is None:
50
+ tab_names = [f"Tab {i}" for i in range(len(interface_list))]
51
+ with self:
52
+ if title:
53
+ HTML(
54
+ f"<h1 style='text-align: center; margin-bottom: 1rem'>{title}</h1>"
55
+ )
56
+ with Tabs():
57
+ for interface, tab_name in zip(interface_list, tab_names):
58
+ with Tab(label=tab_name):
59
+ interface.render()
tokenizer/sptokenizer_patch.py → patcher/sptokenizer_patch_deprecated.py RENAMED
@@ -1,6 +1,8 @@
1
  """
2
 
 
3
 
 
4
 
5
  ## usage
6
 
@@ -8,11 +10,15 @@
8
 
9
  ## 风险评估
10
 
11
- - 会干扰 sentencepiece.SentencePieceProcessor的正常使用吗?
12
 
 
 
 
 
13
  """
14
- import sentencepiece
15
 
 
16
 
17
 
18
  @property
@@ -32,15 +38,18 @@ def _tokenize(self, text):
32
  """Returns a tokenized string."""
33
  return self.encode(text, out_type=str)
34
 
 
35
  def _convert_token_to_id(self, token):
36
  """Converts a token (str) in an id using the vocab."""
37
  return self.piece_to_id(token)
38
 
 
39
  def _convert_id_to_token(self, index):
40
  """Converts an index (integer) in a token (str) using the vocab."""
41
  token = self.IdToPiece(index)
42
  return token
43
 
 
44
  def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
45
  """ copy from transformers.PreTrainedTokenizer
46
  Converts a single index or a sequence of indices in a token or a sequence of tokens, using the vocabulary and
@@ -87,11 +96,10 @@ def decode(self, *args, **kwargs):
87
  return self.Decode(*args, **kwargs)
88
 
89
 
90
- sentencepiece.SentencePieceProcessor.vocab_size = vocab_size
91
  sentencepiece.SentencePieceProcessor.get_vocab = get_vocab
92
  sentencepiece.SentencePieceProcessor._convert_id_to_token = _convert_id_to_token
93
  sentencepiece.SentencePieceProcessor.convert_ids_to_tokens = convert_ids_to_tokens
94
  # sentencepiece.SentencePieceProcessor.tokenize = _tokenize
95
  sentencepiece.SentencePieceProcessor.encode = encode
96
  sentencepiece.SentencePieceProcessor.decode = decode
97
-
 
1
  """
2
 
3
+ ## adapt to transformer tokenizer
4
 
5
+ https://github.com/huggingface/transformers/blob/v4.40.1/src/transformers/tokenization_utils.py#L379
6
 
7
  ## usage
8
 
 
10
 
11
  ## 风险评估
12
 
13
+ - 可能会干扰 sentencepiece.SentencePieceProcessor的正常使用,比如 .vocab_size 原来是个方法,patch后是个property
14
 
15
+
16
+ ## TODO
17
+
18
+ 不用patch,改用wrapper。常见的 tokenizer通常是封装的 sentencepiece,
19
  """
 
20
 
21
+ import sentencepiece
22
 
23
 
24
  @property
 
38
  """Returns a tokenized string."""
39
  return self.encode(text, out_type=str)
40
 
41
+
42
  def _convert_token_to_id(self, token):
43
  """Converts a token (str) in an id using the vocab."""
44
  return self.piece_to_id(token)
45
 
46
+
47
  def _convert_id_to_token(self, index):
48
  """Converts an index (integer) in a token (str) using the vocab."""
49
  token = self.IdToPiece(index)
50
  return token
51
 
52
+
53
  def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
54
  """ copy from transformers.PreTrainedTokenizer
55
  Converts a single index or a sequence of indices in a token or a sequence of tokens, using the vocabulary and
 
96
  return self.Decode(*args, **kwargs)
97
 
98
 
99
+ sentencepiece.SentencePieceProcessor.vocab_size = vocab_size #
100
  sentencepiece.SentencePieceProcessor.get_vocab = get_vocab
101
  sentencepiece.SentencePieceProcessor._convert_id_to_token = _convert_id_to_token
102
  sentencepiece.SentencePieceProcessor.convert_ids_to_tokens = convert_ids_to_tokens
103
  # sentencepiece.SentencePieceProcessor.tokenize = _tokenize
104
  sentencepiece.SentencePieceProcessor.encode = encode
105
  sentencepiece.SentencePieceProcessor.decode = decode
 
patcher/sptokenizer_wrapper.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Wrap sentencepiece.SentencePieceProcessor so that it conforms to the tokenizer interface used in transformers.
2
+
3
+ ## reference
4
+
5
+
6
+ ## usage
7
+
8
+ - grok
9
+
10
+ """
11
+
12
+ import sentencepiece as spm
13
+ from transformers import PreTrainedTokenizer
14
+
15
+
16
class SPTokenizerWrapper(PreTrainedTokenizer):
    """Expose a raw SentencePiece model through the ``PreTrainedTokenizer`` API.

    ``convert_ids_to_tokens`` is not redefined here; it is inherited from
    ``PreTrainedTokenizer``. This class supplies the vocabulary hooks
    (``vocab_size``, ``get_vocab``, ``_tokenize``, ``_convert_token_to_id``,
    ``_convert_id_to_token``) and routes ``encode`` / ``decode`` directly to
    the underlying SentencePiece model.
    """

    def __init__(self, vocab_file):
        """Load the SentencePiece model stored at ``vocab_file``."""
        self.vocab_file = vocab_file
        # NOTE(review): the model is created before the base initializer runs,
        # exactly as in the original; recent transformers releases appear to
        # query the vocabulary during base initialization -- confirm before
        # reordering.
        self.sp_model = spm.SentencePieceProcessor(self.vocab_file)
        super().__init__()

    @property
    def vocab_size(self):
        """Return the number of pieces in the SentencePiece model."""
        return self.sp_model.get_piece_size()

    def get_vocab(self):
        """Return the vocabulary as a ``{token: id}`` dict."""
        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
        return vocab

    def _tokenize(self, text, **kwargs):
        """Split ``text`` into a list of SentencePiece pieces (str).

        Extra keyword arguments are accepted for call compatibility and are
        not forwarded to SentencePiece.
        """
        return self.sp_model.Encode(text, out_type=str)

    def _convert_token_to_id(self, token):
        """Convert a token (str) to its id using the SentencePiece vocab."""
        return self.sp_model.piece_to_id(token)

    def _convert_id_to_token(self, index):
        """Convert an index (integer) to its token (str) using the SentencePiece vocab."""
        token = self.sp_model.IdToPiece(index)
        return token

    def encode(self, *args, **kwargs):
        """Encode with the SentencePiece model's ``Encode``.

        ``add_special_tokens`` and ``allowed_special`` are removed from
        ``kwargs`` before the call, so they are never passed to SentencePiece.
        """
        kwargs.pop("add_special_tokens", None)
        kwargs.pop("allowed_special", None)
        return self.sp_model.Encode(*args, **kwargs)

    def decode(self, *args, **kwargs):
        """Decode with the SentencePiece model's ``Decode``.

        ``skip_special_tokens`` is removed from ``kwargs`` before the call, so
        it is never passed to SentencePiece.
        """
        kwargs.pop("skip_special_tokens", None)
        return self.sp_model.Decode(*args, **kwargs)
58
+
59
+
60
+
61
+ # PreTrainedTokenizer.convert_ids_to_tokens
{tokenizer → patcher}/tiktoken_patch.py RENAMED
@@ -83,6 +83,10 @@ def encode(self, *args, **kwargs):
83
  return self._encode(*args, **kwargs)
84
 
85
 
 
 
 
 
86
  # tiktoken patch
87
  Encoding._encode = Encoding.encode
88
  Encoding.encode = encode
@@ -90,3 +94,4 @@ Encoding.decode = decode
90
  Encoding.convert_ids_to_tokens = convert_ids_to_tokens
91
  Encoding.get_vocab = get_vocab
92
  Encoding.vocab_size = vocab_size
 
 
83
  return self._encode(*args, **kwargs)
84
 
85
 
86
def __len__(self):
    """Report ``self.n_vocab`` as the length of the object."""
    vocab_total = self.n_vocab
    return vocab_total
88
+
89
+
90
  # tiktoken patch
91
  Encoding._encode = Encoding.encode
92
  Encoding.encode = encode
 
94
  Encoding.convert_ids_to_tokens = convert_ids_to_tokens
95
  Encoding.get_vocab = get_vocab
96
  Encoding.vocab_size = vocab_size
97
+ Encoding.__len__ = __len__
stats/compress_rate.json ADDED
@@ -0,0 +1,1868 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "amber.cc100-en": {
3
+ "vocab_size": 32000,
4
+ "n_bytes": 1124813,
5
+ "n_tokens": 294627,
6
+ "n_chars": 1121360
7
+ },
8
+ "aya_101.cc100-en": {
9
+ "vocab_size": 250100,
10
+ "n_bytes": 1124813,
11
+ "n_tokens": 317881,
12
+ "n_chars": 1121360
13
+ },
14
+ "baichuan.cc100-en": {
15
+ "vocab_size": 64000,
16
+ "n_bytes": 1124813,
17
+ "n_tokens": 280108,
18
+ "n_chars": 1121360
19
+ },
20
+ "baichuan2.cc100-en": {
21
+ "vocab_size": 125696,
22
+ "n_bytes": 1124813,
23
+ "n_tokens": 269011,
24
+ "n_chars": 1121360
25
+ },
26
+ "bert_base_cased.cc100-en": {
27
+ "vocab_size": 28996,
28
+ "n_bytes": 1124813,
29
+ "n_tokens": 288022,
30
+ "n_chars": 1121360
31
+ },
32
+ "bert_base_chinese.cc100-en": {
33
+ "vocab_size": 21128,
34
+ "n_bytes": 1124813,
35
+ "n_tokens": 377068,
36
+ "n_chars": 1121360
37
+ },
38
+ "bert_base_uncased.cc100-en": {
39
+ "vocab_size": 30522,
40
+ "n_bytes": 1124813,
41
+ "n_tokens": 280575,
42
+ "n_chars": 1121360
43
+ },
44
+ "bloom.cc100-en": {
45
+ "vocab_size": 250680,
46
+ "n_bytes": 1124813,
47
+ "n_tokens": 257405,
48
+ "n_chars": 1121360
49
+ },
50
+ "byt5_small.cc100-en": {
51
+ "vocab_size": 384,
52
+ "n_bytes": 1124813,
53
+ "n_tokens": 1134813,
54
+ "n_chars": 1121360
55
+ },
56
+ "character_glm_6b.cc100-en": {
57
+ "vocab_size": 64789,
58
+ "n_bytes": 1124813,
59
+ "n_tokens": 289347,
60
+ "n_chars": 1121360
61
+ },
62
+ "chatglm2_6b.cc100-en": {
63
+ "vocab_size": 64787,
64
+ "n_bytes": 1124813,
65
+ "n_tokens": 289329,
66
+ "n_chars": 1121360
67
+ },
68
+ "chatglm3_6b.cc100-en": {
69
+ "vocab_size": 64796,
70
+ "n_bytes": 1124813,
71
+ "n_tokens": 289347,
72
+ "n_chars": 1121360
73
+ },
74
+ "chatglm_6b.cc100-en": {
75
+ "vocab_size": 150344,
76
+ "n_bytes": 1124813,
77
+ "n_tokens": 284761,
78
+ "n_chars": 1121360
79
+ },
80
+ "chatyuan_large_v2.cc100-en": {
81
+ "vocab_size": 32128,
82
+ "n_bytes": 1124813,
83
+ "n_tokens": 536033,
84
+ "n_chars": 1121360
85
+ },
86
+ "chinese_llama.cc100-en": {
87
+ "vocab_size": 49953,
88
+ "n_bytes": 1124813,
89
+ "n_tokens": 291514,
90
+ "n_chars": 1121360
91
+ },
92
+ "chinese_llama2.cc100-en": {
93
+ "vocab_size": 55296,
94
+ "n_bytes": 1124813,
95
+ "n_tokens": 294627,
96
+ "n_chars": 1121360
97
+ },
98
+ "code_davinci_002.cc100-en": {
99
+ "vocab_size": 50281,
100
+ "n_bytes": 1124813,
101
+ "n_tokens": 258403,
102
+ "n_chars": 1121360
103
+ },
104
+ "crystal_coder.cc100-en": {
105
+ "vocab_size": 32022,
106
+ "n_bytes": 1124813,
107
+ "n_tokens": 284627,
108
+ "n_chars": 1121360
109
+ },
110
+ "dbrx_instruct.cc100-en": {
111
+ "vocab_size": 100280,
112
+ "n_bytes": 1124813,
113
+ "n_tokens": 254985,
114
+ "n_chars": 1121360
115
+ },
116
+ "deepseek_coder_33b_instruct.cc100-en": {
117
+ "vocab_size": 32022,
118
+ "n_bytes": 1124813,
119
+ "n_tokens": 287408,
120
+ "n_chars": 1121360
121
+ },
122
+ "deepseek_llm_7b_base.cc100-en": {
123
+ "vocab_size": 100015,
124
+ "n_bytes": 1124813,
125
+ "n_tokens": 272324,
126
+ "n_chars": 1121360
127
+ },
128
+ "falcon_180b.cc100-en": {
129
+ "vocab_size": 65024,
130
+ "n_bytes": 1124813,
131
+ "n_tokens": 262509,
132
+ "n_chars": 1121360
133
+ },
134
+ "falcon_7b.cc100-en": {
135
+ "vocab_size": 65024,
136
+ "n_bytes": 1124813,
137
+ "n_tokens": 262509,
138
+ "n_chars": 1121360
139
+ },
140
+ "fastchat_t5_3b.cc100-en": {
141
+ "vocab_size": 32110,
142
+ "n_bytes": 1124813,
143
+ "n_tokens": 484941,
144
+ "n_chars": 1121360
145
+ },
146
+ "flan_t5_base.cc100-en": {
147
+ "vocab_size": 32100,
148
+ "n_bytes": 1124813,
149
+ "n_tokens": 290104,
150
+ "n_chars": 1121360
151
+ },
152
+ "gemma_7b.cc100-en": {
153
+ "vocab_size": 256000,
154
+ "n_bytes": 1124813,
155
+ "n_tokens": 268010,
156
+ "n_chars": 1121360
157
+ },
158
+ "gpt2.cc100-en": {
159
+ "vocab_size": 50257,
160
+ "n_bytes": 1124813,
161
+ "n_tokens": 258428,
162
+ "n_chars": 1121360
163
+ },
164
+ "gpt2_chinese.cc100-en": {
165
+ "vocab_size": 21128,
166
+ "n_bytes": 1124813,
167
+ "n_tokens": 392641,
168
+ "n_chars": 1121360
169
+ },
170
+ "gpt_35_turbo.cc100-en": {
171
+ "vocab_size": 100277,
172
+ "n_bytes": 1124813,
173
+ "n_tokens": 254985,
174
+ "n_chars": 1121360
175
+ },
176
+ "gpt_4.cc100-en": {
177
+ "vocab_size": 100277,
178
+ "n_bytes": 1124813,
179
+ "n_tokens": 254985,
180
+ "n_chars": 1121360
181
+ },
182
+ "gpt_nexo_20b.cc100-en": {
183
+ "vocab_size": 50277,
184
+ "n_bytes": 1124813,
185
+ "n_tokens": 259357,
186
+ "n_chars": 1121360
187
+ },
188
+ "grok_1.cc100-en": {
189
+ "vocab_size": 131072,
190
+ "n_bytes": 1124813,
191
+ "n_tokens": 258048,
192
+ "n_chars": 1121360
193
+ },
194
+ "internlm2_chat_7b.cc100-en": {
195
+ "vocab_size": 92544,
196
+ "n_bytes": 1124813,
197
+ "n_tokens": 271583,
198
+ "n_chars": 1121360
199
+ },
200
+ "internlm2_math_7b.cc100-en": {
201
+ "vocab_size": 92544,
202
+ "n_bytes": 1124813,
203
+ "n_tokens": 271583,
204
+ "n_chars": 1121360
205
+ },
206
+ "internlm_chat_7b.cc100-en": {
207
+ "vocab_size": 103168,
208
+ "n_bytes": 1124813,
209
+ "n_tokens": 271293,
210
+ "n_chars": 1121360
211
+ },
212
+ "internlm_xcomposer_7b.cc100-en": {
213
+ "vocab_size": 103168,
214
+ "n_bytes": 1124813,
215
+ "n_tokens": 271293,
216
+ "n_chars": 1121360
217
+ },
218
+ "jamba_v0_1.cc100-en": {
219
+ "vocab_size": 65536,
220
+ "n_bytes": 1124813,
221
+ "n_tokens": 274242,
222
+ "n_chars": 1121360
223
+ },
224
+ "kplug.cc100-en": {
225
+ "vocab_size": 10261,
226
+ "n_bytes": 1124813,
227
+ "n_tokens": 393564,
228
+ "n_chars": 1121360
229
+ },
230
+ "llama.cc100-en": {
231
+ "vocab_size": 32000,
232
+ "n_bytes": 1124813,
233
+ "n_tokens": 294627,
234
+ "n_chars": 1121360
235
+ },
236
+ "llama2.cc100-en": {
237
+ "vocab_size": 32001,
238
+ "n_bytes": 1124813,
239
+ "n_tokens": 294627,
240
+ "n_chars": 1121360
241
+ },
242
+ "llama3.cc100-en": {
243
+ "vocab_size": 128256,
244
+ "n_bytes": 1124813,
245
+ "n_tokens": 254944,
246
+ "n_chars": 1121360
247
+ },
248
+ "mistral_7b.cc100-en": {
249
+ "vocab_size": 32000,
250
+ "n_bytes": 1124813,
251
+ "n_tokens": 285801,
252
+ "n_chars": 1121360
253
+ },
254
+ "mixtral_8_7b.cc100-en": {
255
+ "vocab_size": 32000,
256
+ "n_bytes": 1124813,
257
+ "n_tokens": 285801,
258
+ "n_chars": 1121360
259
+ },
260
+ "mobilebert_uncased.cc100-en": {
261
+ "vocab_size": 30522,
262
+ "n_bytes": 1124813,
263
+ "n_tokens": 280575,
264
+ "n_chars": 1121360
265
+ },
266
+ "moss.cc100-en": {
267
+ "vocab_size": 106072,
268
+ "n_bytes": 1124813,
269
+ "n_tokens": 257070,
270
+ "n_chars": 1121360
271
+ },
272
+ "mt5_large.cc100-en": {
273
+ "vocab_size": 250100,
274
+ "n_bytes": 1124813,
275
+ "n_tokens": 317881,
276
+ "n_chars": 1121360
277
+ },
278
+ "olmo_7b.cc100-en": {
279
+ "vocab_size": 50280,
280
+ "n_bytes": 1124813,
281
+ "n_tokens": 259357,
282
+ "n_chars": 1121360
283
+ },
284
+ "orion_14b_chat.cc100-en": {
285
+ "vocab_size": 84608,
286
+ "n_bytes": 1124813,
287
+ "n_tokens": 265948,
288
+ "n_chars": 1121360
289
+ },
290
+ "phi_1.cc100-en": {
291
+ "vocab_size": 50295,
292
+ "n_bytes": 1124813,
293
+ "n_tokens": 258409,
294
+ "n_chars": 1121360
295
+ },
296
+ "phi_2.cc100-en": {
297
+ "vocab_size": 50295,
298
+ "n_bytes": 1124813,
299
+ "n_tokens": 258409,
300
+ "n_chars": 1121360
301
+ },
302
+ "phi_3_mini.cc100-en": {
303
+ "vocab_size": 32011,
304
+ "n_bytes": 1124813,
305
+ "n_tokens": 294627,
306
+ "n_chars": 1121360
307
+ },
308
+ "pko_t5_large.cc100-en": {
309
+ "vocab_size": 50358,
310
+ "n_bytes": 1124813,
311
+ "n_tokens": 658985,
312
+ "n_chars": 1121360
313
+ },
314
+ "prompt_clue.cc100-en": {
315
+ "vocab_size": 32128,
316
+ "n_bytes": 1124813,
317
+ "n_tokens": 536033,
318
+ "n_chars": 1121360
319
+ },
320
+ "qwen1_5_14b_chat.cc100-en": {
321
+ "vocab_size": 151646,
322
+ "n_bytes": 1124813,
323
+ "n_tokens": 257983,
324
+ "n_chars": 1121360
325
+ },
326
+ "qwen_1_8b_chat.cc100-en": {
327
+ "vocab_size": 151851,
328
+ "n_bytes": 1124813,
329
+ "n_tokens": 257983,
330
+ "n_chars": 1121360
331
+ },
332
+ "qwen_72b_chat.cc100-en": {
333
+ "vocab_size": 151851,
334
+ "n_bytes": 1124813,
335
+ "n_tokens": 257983,
336
+ "n_chars": 1121360
337
+ },
338
+ "qwen_7b_chat.cc100-en": {
339
+ "vocab_size": 151851,
340
+ "n_bytes": 1124813,
341
+ "n_tokens": 257983,
342
+ "n_chars": 1121360
343
+ },
344
+ "roberta_chinese_clue.cc100-en": {
345
+ "vocab_size": 8021,
346
+ "n_bytes": 1124813,
347
+ "n_tokens": 583058,
348
+ "n_chars": 1121360
349
+ },
350
+ "skywork_13b_base.cc100-en": {
351
+ "vocab_size": 65519,
352
+ "n_bytes": 1124813,
353
+ "n_tokens": 294617,
354
+ "n_chars": 1121360
355
+ },
356
+ "skywork_13b_math.cc100-en": {
357
+ "vocab_size": 65519,
358
+ "n_bytes": 1124813,
359
+ "n_tokens": 294617,
360
+ "n_chars": 1121360
361
+ },
362
+ "solar_10_7b.cc100-en": {
363
+ "vocab_size": 32000,
364
+ "n_bytes": 1124813,
365
+ "n_tokens": 285801,
366
+ "n_chars": 1121360
367
+ },
368
+ "starchat_alpha.cc100-en": {
369
+ "vocab_size": 49156,
370
+ "n_bytes": 1124813,
371
+ "n_tokens": 288965,
372
+ "n_chars": 1121360
373
+ },
374
+ "switch_c_2048.cc100-en": {
375
+ "vocab_size": 32100,
376
+ "n_bytes": 1124813,
377
+ "n_tokens": 290104,
378
+ "n_chars": 1121360
379
+ },
380
+ "t5_base.cc100-en": {
381
+ "vocab_size": 32100,
382
+ "n_bytes": 1124813,
383
+ "n_tokens": 290104,
384
+ "n_chars": 1121360
385
+ },
386
+ "t5_large.cc100-en": {
387
+ "vocab_size": 32100,
388
+ "n_bytes": 1124813,
389
+ "n_tokens": 290104,
390
+ "n_chars": 1121360
391
+ },
392
+ "t5_small.cc100-en": {
393
+ "vocab_size": 32100,
394
+ "n_bytes": 1124813,
395
+ "n_tokens": 290104,
396
+ "n_chars": 1121360
397
+ },
398
+ "text_davinci_003.cc100-en": {
399
+ "vocab_size": 50281,
400
+ "n_bytes": 1124813,
401
+ "n_tokens": 258403,
402
+ "n_chars": 1121360
403
+ },
404
+ "tigerbot_13b_chat_v2.cc100-en": {
405
+ "vocab_size": 60515,
406
+ "n_bytes": 1124813,
407
+ "n_tokens": 285652,
408
+ "n_chars": 1121360
409
+ },
410
+ "tigerbot_70b_chat_v4_4k.cc100-en": {
411
+ "vocab_size": 65110,
412
+ "n_bytes": 1124813,
413
+ "n_tokens": 286946,
414
+ "n_chars": 1121360
415
+ },
416
+ "wizardcoder_15b_v1.cc100-en": {
417
+ "vocab_size": 49153,
418
+ "n_bytes": 1124813,
419
+ "n_tokens": 288965,
420
+ "n_chars": 1121360
421
+ },
422
+ "wizardcoder_python_7b_v1.cc100-en": {
423
+ "vocab_size": 32001,
424
+ "n_bytes": 1124813,
425
+ "n_tokens": 294627,
426
+ "n_chars": 1121360
427
+ },
428
+ "wizardlm_7b_v1.cc100-en": {
429
+ "vocab_size": 32001,
430
+ "n_bytes": 1124813,
431
+ "n_tokens": 294627,
432
+ "n_chars": 1121360
433
+ },
434
+ "wizardmath_70b_v1.cc100-en": {
435
+ "vocab_size": 32002,
436
+ "n_bytes": 1124813,
437
+ "n_tokens": 294627,
438
+ "n_chars": 1121360
439
+ },
440
+ "xlm_roberta.cc100-en": {
441
+ "vocab_size": 250002,
442
+ "n_bytes": 1124813,
443
+ "n_tokens": 300026,
444
+ "n_chars": 1121360
445
+ },
446
+ "yi_34b.cc100-en": {
447
+ "vocab_size": 64000,
448
+ "n_bytes": 1124813,
449
+ "n_tokens": 270400,
450
+ "n_chars": 1121360
451
+ },
452
+ "yi_6b.cc100-en": {
453
+ "vocab_size": 64000,
454
+ "n_bytes": 1124813,
455
+ "n_tokens": 270400,
456
+ "n_chars": 1121360
457
+ },
458
+ "yi_vl34b.cc100-en": {
459
+ "vocab_size": 64000,
460
+ "n_bytes": 1124813,
461
+ "n_tokens": 269738,
462
+ "n_chars": 1121360
463
+ },
464
+ "zephyr_7b_beta.cc100-en": {
465
+ "vocab_size": 32000,
466
+ "n_bytes": 1124813,
467
+ "n_tokens": 285801,
468
+ "n_chars": 1121360
469
+ },
470
+ "amber.cc100-zh-Hans": {
471
+ "vocab_size": 32000,
472
+ "n_bytes": 2633047,
473
+ "n_tokens": 1330093,
474
+ "n_chars": 927311
475
+ },
476
+ "aya_101.cc100-zh-Hans": {
477
+ "vocab_size": 250100,
478
+ "n_bytes": 2633047,
479
+ "n_tokens": 631182,
480
+ "n_chars": 927311
481
+ },
482
+ "baichuan.cc100-zh-Hans": {
483
+ "vocab_size": 64000,
484
+ "n_bytes": 2633047,
485
+ "n_tokens": 626117,
486
+ "n_chars": 927311
487
+ },
488
+ "baichuan2.cc100-zh-Hans": {
489
+ "vocab_size": 125696,
490
+ "n_bytes": 2633047,
491
+ "n_tokens": 541464,
492
+ "n_chars": 927311
493
+ },
494
+ "bert_base_cased.cc100-zh-Hans": {
495
+ "vocab_size": 28996,
496
+ "n_bytes": 2633047,
497
+ "n_tokens": 899709,
498
+ "n_chars": 927311
499
+ },
500
+ "bert_base_chinese.cc100-zh-Hans": {
501
+ "vocab_size": 21128,
502
+ "n_bytes": 2633047,
503
+ "n_tokens": 896599,
504
+ "n_chars": 927311
505
+ },
506
+ "bert_base_uncased.cc100-zh-Hans": {
507
+ "vocab_size": 30522,
508
+ "n_bytes": 2633047,
509
+ "n_tokens": 898554,
510
+ "n_chars": 927311
511
+ },
512
+ "bloom.cc100-zh-Hans": {
513
+ "vocab_size": 250680,
514
+ "n_bytes": 2633047,
515
+ "n_tokens": 573008,
516
+ "n_chars": 927311
517
+ },
518
+ "byt5_small.cc100-zh-Hans": {
519
+ "vocab_size": 384,
520
+ "n_bytes": 2633047,
521
+ "n_tokens": 2643047,
522
+ "n_chars": 927311
523
+ },
524
+ "character_glm_6b.cc100-zh-Hans": {
525
+ "vocab_size": 64789,
526
+ "n_bytes": 2633047,
527
+ "n_tokens": 583646,
528
+ "n_chars": 927311
529
+ },
530
+ "chatglm2_6b.cc100-zh-Hans": {
531
+ "vocab_size": 64787,
532
+ "n_bytes": 2633047,
533
+ "n_tokens": 583646,
534
+ "n_chars": 927311
535
+ },
536
+ "chatglm3_6b.cc100-zh-Hans": {
537
+ "vocab_size": 64796,
538
+ "n_bytes": 2633047,
539
+ "n_tokens": 583646,
540
+ "n_chars": 927311
541
+ },
542
+ "chatglm_6b.cc100-zh-Hans": {
543
+ "vocab_size": 150344,
544
+ "n_bytes": 2633047,
545
+ "n_tokens": 527384,
546
+ "n_chars": 927311
547
+ },
548
+ "chatyuan_large_v2.cc100-zh-Hans": {
549
+ "vocab_size": 32128,
550
+ "n_bytes": 2633047,
551
+ "n_tokens": 564905,
552
+ "n_chars": 927311
553
+ },
554
+ "chinese_llama.cc100-zh-Hans": {
555
+ "vocab_size": 49953,
556
+ "n_bytes": 2633047,
557
+ "n_tokens": 623219,
558
+ "n_chars": 927311
559
+ },
560
+ "chinese_llama2.cc100-zh-Hans": {
561
+ "vocab_size": 55296,
562
+ "n_bytes": 2633047,
563
+ "n_tokens": 625766,
564
+ "n_chars": 927311
565
+ },
566
+ "code_davinci_002.cc100-zh-Hans": {
567
+ "vocab_size": 50281,
568
+ "n_bytes": 2633047,
569
+ "n_tokens": 1876809,
570
+ "n_chars": 927311
571
+ },
572
+ "crystal_coder.cc100-zh-Hans": {
573
+ "vocab_size": 32022,
574
+ "n_bytes": 2633047,
575
+ "n_tokens": 1320093,
576
+ "n_chars": 927311
577
+ },
578
+ "dbrx_instruct.cc100-zh-Hans": {
579
+ "vocab_size": 100280,
580
+ "n_bytes": 2633047,
581
+ "n_tokens": 1084939,
582
+ "n_chars": 927311
583
+ },
584
+ "deepseek_coder_33b_instruct.cc100-zh-Hans": {
585
+ "vocab_size": 32022,
586
+ "n_bytes": 2633047,
587
+ "n_tokens": 720577,
588
+ "n_chars": 927311
589
+ },
590
+ "deepseek_llm_7b_base.cc100-zh-Hans": {
591
+ "vocab_size": 100015,
592
+ "n_bytes": 2633047,
593
+ "n_tokens": 605081,
594
+ "n_chars": 927311
595
+ },
596
+ "falcon_180b.cc100-zh-Hans": {
597
+ "vocab_size": 65024,
598
+ "n_bytes": 2633047,
599
+ "n_tokens": 1124681,
600
+ "n_chars": 927311
601
+ },
602
+ "falcon_7b.cc100-zh-Hans": {
603
+ "vocab_size": 65024,
604
+ "n_bytes": 2633047,
605
+ "n_tokens": 1124681,
606
+ "n_chars": 927311
607
+ },
608
+ "fastchat_t5_3b.cc100-zh-Hans": {
609
+ "vocab_size": 32110,
610
+ "n_bytes": 2633047,
611
+ "n_tokens": 178974,
612
+ "n_chars": 927311
613
+ },
614
+ "flan_t5_base.cc100-zh-Hans": {
615
+ "vocab_size": 32100,
616
+ "n_bytes": 2633047,
617
+ "n_tokens": 173520,
618
+ "n_chars": 927311
619
+ },
620
+ "gemma_7b.cc100-zh-Hans": {
621
+ "vocab_size": 256000,
622
+ "n_bytes": 2633047,
623
+ "n_tokens": 641795,
624
+ "n_chars": 927311
625
+ },
626
+ "gpt2.cc100-zh-Hans": {
627
+ "vocab_size": 50257,
628
+ "n_bytes": 2633047,
629
+ "n_tokens": 1876809,
630
+ "n_chars": 927311
631
+ },
632
+ "gpt2_chinese.cc100-zh-Hans": {
633
+ "vocab_size": 21128,
634
+ "n_bytes": 2633047,
635
+ "n_tokens": 899506,
636
+ "n_chars": 927311
637
+ },
638
+ "gpt_35_turbo.cc100-zh-Hans": {
639
+ "vocab_size": 100277,
640
+ "n_bytes": 2633047,
641
+ "n_tokens": 1084939,
642
+ "n_chars": 927311
643
+ },
644
+ "gpt_4.cc100-zh-Hans": {
645
+ "vocab_size": 100277,
646
+ "n_bytes": 2633047,
647
+ "n_tokens": 1084939,
648
+ "n_chars": 927311
649
+ },
650
+ "gpt_nexo_20b.cc100-zh-Hans": {
651
+ "vocab_size": 50277,
652
+ "n_bytes": 2633047,
653
+ "n_tokens": 1220529,
654
+ "n_chars": 927311
655
+ },
656
+ "grok_1.cc100-zh-Hans": {
657
+ "vocab_size": 131072,
658
+ "n_bytes": 2633047,
659
+ "n_tokens": 1414508,
660
+ "n_chars": 927311
661
+ },
662
+ "internlm2_chat_7b.cc100-zh-Hans": {
663
+ "vocab_size": 92544,
664
+ "n_bytes": 2633047,
665
+ "n_tokens": 579976,
666
+ "n_chars": 927311
667
+ },
668
+ "internlm2_math_7b.cc100-zh-Hans": {
669
+ "vocab_size": 92544,
670
+ "n_bytes": 2633047,
671
+ "n_tokens": 579976,
672
+ "n_chars": 927311
673
+ },
674
+ "internlm_chat_7b.cc100-zh-Hans": {
675
+ "vocab_size": 103168,
676
+ "n_bytes": 2633047,
677
+ "n_tokens": 579109,
678
+ "n_chars": 927311
679
+ },
680
+ "internlm_xcomposer_7b.cc100-zh-Hans": {
681
+ "vocab_size": 103168,
682
+ "n_bytes": 2633047,
683
+ "n_tokens": 579109,
684
+ "n_chars": 927311
685
+ },
686
+ "jamba_v0_1.cc100-zh-Hans": {
687
+ "vocab_size": 65536,
688
+ "n_bytes": 2633047,
689
+ "n_tokens": 1067054,
690
+ "n_chars": 927311
691
+ },
692
+ "kplug.cc100-zh-Hans": {
693
+ "vocab_size": 10261,
694
+ "n_bytes": 2633047,
695
+ "n_tokens": 902451,
696
+ "n_chars": 927311
697
+ },
698
+ "llama.cc100-zh-Hans": {
699
+ "vocab_size": 32000,
700
+ "n_bytes": 2633047,
701
+ "n_tokens": 1330093,
702
+ "n_chars": 927311
703
+ },
704
+ "llama2.cc100-zh-Hans": {
705
+ "vocab_size": 32001,
706
+ "n_bytes": 2633047,
707
+ "n_tokens": 1330093,
708
+ "n_chars": 927311
709
+ },
710
+ "llama3.cc100-zh-Hans": {
711
+ "vocab_size": 128256,
712
+ "n_bytes": 2633047,
713
+ "n_tokens": 747405,
714
+ "n_chars": 927311
715
+ },
716
+ "mistral_7b.cc100-zh-Hans": {
717
+ "vocab_size": 32000,
718
+ "n_bytes": 2633047,
719
+ "n_tokens": 1041023,
720
+ "n_chars": 927311
721
+ },
722
+ "mixtral_8_7b.cc100-zh-Hans": {
723
+ "vocab_size": 32000,
724
+ "n_bytes": 2633047,
725
+ "n_tokens": 1041023,
726
+ "n_chars": 927311
727
+ },
728
+ "mobilebert_uncased.cc100-zh-Hans": {
729
+ "vocab_size": 30522,
730
+ "n_bytes": 2633047,
731
+ "n_tokens": 898554,
732
+ "n_chars": 927311
733
+ },
734
+ "moss.cc100-zh-Hans": {
735
+ "vocab_size": 106072,
736
+ "n_bytes": 2633047,
737
+ "n_tokens": 557455,
738
+ "n_chars": 927311
739
+ },
740
+ "mt5_large.cc100-zh-Hans": {
741
+ "vocab_size": 250100,
742
+ "n_bytes": 2633047,
743
+ "n_tokens": 631182,
744
+ "n_chars": 927311
745
+ },
746
+ "olmo_7b.cc100-zh-Hans": {
747
+ "vocab_size": 50280,
748
+ "n_bytes": 2633047,
749
+ "n_tokens": 1220529,
750
+ "n_chars": 927311
751
+ },
752
+ "orion_14b_chat.cc100-zh-Hans": {
753
+ "vocab_size": 84608,
754
+ "n_bytes": 2633047,
755
+ "n_tokens": 529926,
756
+ "n_chars": 927311
757
+ },
758
+ "phi_1.cc100-zh-Hans": {
759
+ "vocab_size": 50295,
760
+ "n_bytes": 2633047,
761
+ "n_tokens": 1876809,
762
+ "n_chars": 927311
763
+ },
764
+ "phi_2.cc100-zh-Hans": {
765
+ "vocab_size": 50295,
766
+ "n_bytes": 2633047,
767
+ "n_tokens": 1876809,
768
+ "n_chars": 927311
769
+ },
770
+ "phi_3_mini.cc100-zh-Hans": {
771
+ "vocab_size": 32011,
772
+ "n_bytes": 2633047,
773
+ "n_tokens": 1330093,
774
+ "n_chars": 927311
775
+ },
776
+ "pko_t5_large.cc100-zh-Hans": {
777
+ "vocab_size": 50358,
778
+ "n_bytes": 2633047,
779
+ "n_tokens": 2533519,
780
+ "n_chars": 927311
781
+ },
782
+ "prompt_clue.cc100-zh-Hans": {
783
+ "vocab_size": 32128,
784
+ "n_bytes": 2633047,
785
+ "n_tokens": 564905,
786
+ "n_chars": 927311
787
+ },
788
+ "qwen1_5_14b_chat.cc100-zh-Hans": {
789
+ "vocab_size": 151646,
790
+ "n_bytes": 2633047,
791
+ "n_tokens": 589211,
792
+ "n_chars": 927311
793
+ },
794
+ "qwen_1_8b_chat.cc100-zh-Hans": {
795
+ "vocab_size": 151851,
796
+ "n_bytes": 2633047,
797
+ "n_tokens": 589211,
798
+ "n_chars": 927311
799
+ },
800
+ "qwen_72b_chat.cc100-zh-Hans": {
801
+ "vocab_size": 151851,
802
+ "n_bytes": 2633047,
803
+ "n_tokens": 589211,
804
+ "n_chars": 927311
805
+ },
806
+ "qwen_7b_chat.cc100-zh-Hans": {
807
+ "vocab_size": 151851,
808
+ "n_bytes": 2633047,
809
+ "n_tokens": 589211,
810
+ "n_chars": 927311
811
+ },
812
+ "roberta_chinese_clue.cc100-zh-Hans": {
813
+ "vocab_size": 8021,
814
+ "n_bytes": 2633047,
815
+ "n_tokens": 907144,
816
+ "n_chars": 927311
817
+ },
818
+ "skywork_13b_base.cc100-zh-Hans": {
819
+ "vocab_size": 65519,
820
+ "n_bytes": 2633047,
821
+ "n_tokens": 663923,
822
+ "n_chars": 927311
823
+ },
824
+ "skywork_13b_math.cc100-zh-Hans": {
825
+ "vocab_size": 65519,
826
+ "n_bytes": 2633047,
827
+ "n_tokens": 663923,
828
+ "n_chars": 927311
829
+ },
830
+ "solar_10_7b.cc100-zh-Hans": {
831
+ "vocab_size": 32000,
832
+ "n_bytes": 2633047,
833
+ "n_tokens": 1041023,
834
+ "n_chars": 927311
835
+ },
836
+ "starchat_alpha.cc100-zh-Hans": {
837
+ "vocab_size": 49156,
838
+ "n_bytes": 2633047,
839
+ "n_tokens": 882018,
840
+ "n_chars": 927311
841
+ },
842
+ "switch_c_2048.cc100-zh-Hans": {
843
+ "vocab_size": 32100,
844
+ "n_bytes": 2633047,
845
+ "n_tokens": 173519,
846
+ "n_chars": 927311
847
+ },
848
+ "t5_base.cc100-zh-Hans": {
849
+ "vocab_size": 32100,
850
+ "n_bytes": 2633047,
851
+ "n_tokens": 173519,
852
+ "n_chars": 927311
853
+ },
854
+ "t5_large.cc100-zh-Hans": {
855
+ "vocab_size": 32100,
856
+ "n_bytes": 2633047,
857
+ "n_tokens": 173519,
858
+ "n_chars": 927311
859
+ },
860
+ "t5_small.cc100-zh-Hans": {
861
+ "vocab_size": 32100,
862
+ "n_bytes": 2633047,
863
+ "n_tokens": 173519,
864
+ "n_chars": 927311
865
+ },
866
+ "text_davinci_003.cc100-zh-Hans": {
867
+ "vocab_size": 50281,
868
+ "n_bytes": 2633047,
869
+ "n_tokens": 1876809,
870
+ "n_chars": 927311
871
+ },
872
+ "tigerbot_13b_chat_v2.cc100-zh-Hans": {
873
+ "vocab_size": 60515,
874
+ "n_bytes": 2633047,
875
+ "n_tokens": 577385,
876
+ "n_chars": 927311
877
+ },
878
+ "tigerbot_70b_chat_v4_4k.cc100-zh-Hans": {
879
+ "vocab_size": 65110,
880
+ "n_bytes": 2633047,
881
+ "n_tokens": 577211,
882
+ "n_chars": 927311
883
+ },
884
+ "wizardcoder_15b_v1.cc100-zh-Hans": {
885
+ "vocab_size": 49153,
886
+ "n_bytes": 2633047,
887
+ "n_tokens": 882018,
888
+ "n_chars": 927311
889
+ },
890
+ "wizardcoder_python_7b_v1.cc100-zh-Hans": {
891
+ "vocab_size": 32001,
892
+ "n_bytes": 2633047,
893
+ "n_tokens": 1330093,
894
+ "n_chars": 927311
895
+ },
896
+ "wizardlm_7b_v1.cc100-zh-Hans": {
897
+ "vocab_size": 32001,
898
+ "n_bytes": 2633047,
899
+ "n_tokens": 1330093,
900
+ "n_chars": 927311
901
+ },
902
+ "wizardmath_70b_v1.cc100-zh-Hans": {
903
+ "vocab_size": 32002,
904
+ "n_bytes": 2633047,
905
+ "n_tokens": 1330093,
906
+ "n_chars": 927311
907
+ },
908
+ "xlm_roberta.cc100-zh-Hans": {
909
+ "vocab_size": 250002,
910
+ "n_bytes": 2633047,
911
+ "n_tokens": 619844,
912
+ "n_chars": 927311
913
+ },
914
+ "yi_34b.cc100-zh-Hans": {
915
+ "vocab_size": 64000,
916
+ "n_bytes": 2633047,
917
+ "n_tokens": 588729,
918
+ "n_chars": 927311
919
+ },
920
+ "yi_6b.cc100-zh-Hans": {
921
+ "vocab_size": 64000,
922
+ "n_bytes": 2633047,
923
+ "n_tokens": 588729,
924
+ "n_chars": 927311
925
+ },
926
+ "yi_vl34b.cc100-zh-Hans": {
927
+ "vocab_size": 64000,
928
+ "n_bytes": 2633047,
929
+ "n_tokens": 596166,
930
+ "n_chars": 927311
931
+ },
932
+ "zephyr_7b_beta.cc100-zh-Hans": {
933
+ "vocab_size": 32000,
934
+ "n_bytes": 2633047,
935
+ "n_tokens": 1041023,
936
+ "n_chars": 927311
937
+ },
938
+ "amber.cc100-es": {
939
+ "vocab_size": 32000,
940
+ "n_bytes": 1664455,
941
+ "n_tokens": 492235,
942
+ "n_chars": 1630297
943
+ },
944
+ "aya_101.cc100-es": {
945
+ "vocab_size": 250100,
946
+ "n_bytes": 1664455,
947
+ "n_tokens": 472231,
948
+ "n_chars": 1630297
949
+ },
950
+ "baichuan.cc100-es": {
951
+ "vocab_size": 64000,
952
+ "n_bytes": 1664455,
953
+ "n_tokens": 585804,
954
+ "n_chars": 1630297
955
+ },
956
+ "baichuan2.cc100-es": {
957
+ "vocab_size": 125696,
958
+ "n_bytes": 1664455,
959
+ "n_tokens": 551326,
960
+ "n_chars": 1630297
961
+ },
962
+ "bert_base_cased.cc100-es": {
963
+ "vocab_size": 28996,
964
+ "n_bytes": 1664455,
965
+ "n_tokens": 630231,
966
+ "n_chars": 1630297
967
+ },
968
+ "bert_base_chinese.cc100-es": {
969
+ "vocab_size": 21128,
970
+ "n_bytes": 1664455,
971
+ "n_tokens": 609419,
972
+ "n_chars": 1630297
973
+ },
974
+ "bert_base_uncased.cc100-es": {
975
+ "vocab_size": 30522,
976
+ "n_bytes": 1664455,
977
+ "n_tokens": 558042,
978
+ "n_chars": 1630297
979
+ },
980
+ "bloom.cc100-es": {
981
+ "vocab_size": 250680,
982
+ "n_bytes": 1664455,
983
+ "n_tokens": 350793,
984
+ "n_chars": 1630297
985
+ },
986
+ "byt5_small.cc100-es": {
987
+ "vocab_size": 384,
988
+ "n_bytes": 1664455,
989
+ "n_tokens": 1674455,
990
+ "n_chars": 1630297
991
+ },
992
+ "character_glm_6b.cc100-es": {
993
+ "vocab_size": 64789,
994
+ "n_bytes": 1664455,
995
+ "n_tokens": 566501,
996
+ "n_chars": 1630297
997
+ },
998
+ "chatglm2_6b.cc100-es": {
999
+ "vocab_size": 64787,
1000
+ "n_bytes": 1664455,
1001
+ "n_tokens": 566476,
1002
+ "n_chars": 1630297
1003
+ },
1004
+ "chatglm3_6b.cc100-es": {
1005
+ "vocab_size": 64796,
1006
+ "n_bytes": 1664455,
1007
+ "n_tokens": 566501,
1008
+ "n_chars": 1630297
1009
+ },
1010
+ "chatglm_6b.cc100-es": {
1011
+ "vocab_size": 150344,
1012
+ "n_bytes": 1664455,
1013
+ "n_tokens": 514848,
1014
+ "n_chars": 1630297
1015
+ },
1016
+ "chatyuan_large_v2.cc100-es": {
1017
+ "vocab_size": 32128,
1018
+ "n_bytes": 1664455,
1019
+ "n_tokens": 889530,
1020
+ "n_chars": 1630297
1021
+ },
1022
+ "chinese_llama.cc100-es": {
1023
+ "vocab_size": 49953,
1024
+ "n_bytes": 1664455,
1025
+ "n_tokens": 486672,
1026
+ "n_chars": 1630297
1027
+ },
1028
+ "chinese_llama2.cc100-es": {
1029
+ "vocab_size": 55296,
1030
+ "n_bytes": 1664455,
1031
+ "n_tokens": 492235,
1032
+ "n_chars": 1630297
1033
+ },
1034
+ "code_davinci_002.cc100-es": {
1035
+ "vocab_size": 50281,
1036
+ "n_bytes": 1664455,
1037
+ "n_tokens": 569853,
1038
+ "n_chars": 1630297
1039
+ },
1040
+ "crystal_coder.cc100-es": {
1041
+ "vocab_size": 32022,
1042
+ "n_bytes": 1664455,
1043
+ "n_tokens": 482235,
1044
+ "n_chars": 1630297
1045
+ },
1046
+ "dbrx_instruct.cc100-es": {
1047
+ "vocab_size": 100280,
1048
+ "n_bytes": 1664455,
1049
+ "n_tokens": 433875,
1050
+ "n_chars": 1630297
1051
+ },
1052
+ "deepseek_coder_33b_instruct.cc100-es": {
1053
+ "vocab_size": 32022,
1054
+ "n_bytes": 1664455,
1055
+ "n_tokens": 523884,
1056
+ "n_chars": 1630297
1057
+ },
1058
+ "deepseek_llm_7b_base.cc100-es": {
1059
+ "vocab_size": 100015,
1060
+ "n_bytes": 1664455,
1061
+ "n_tokens": 480877,
1062
+ "n_chars": 1630297
1063
+ },
1064
+ "falcon_180b.cc100-es": {
1065
+ "vocab_size": 65024,
1066
+ "n_bytes": 1664455,
1067
+ "n_tokens": 442138,
1068
+ "n_chars": 1630297
1069
+ },
1070
+ "falcon_7b.cc100-es": {
1071
+ "vocab_size": 65024,
1072
+ "n_bytes": 1664455,
1073
+ "n_tokens": 442138,
1074
+ "n_chars": 1630297
1075
+ },
1076
+ "fastchat_t5_3b.cc100-es": {
1077
+ "vocab_size": 32110,
1078
+ "n_bytes": 1664455,
1079
+ "n_tokens": 970105,
1080
+ "n_chars": 1630297
1081
+ },
1082
+ "flan_t5_base.cc100-es": {
1083
+ "vocab_size": 32100,
1084
+ "n_bytes": 1664455,
1085
+ "n_tokens": 706405,
1086
+ "n_chars": 1630297
1087
+ },
1088
+ "gemma_7b.cc100-es": {
1089
+ "vocab_size": 256000,
1090
+ "n_bytes": 1664455,
1091
+ "n_tokens": 371321,
1092
+ "n_chars": 1630297
1093
+ },
1094
+ "gpt2.cc100-es": {
1095
+ "vocab_size": 50257,
1096
+ "n_bytes": 1664455,
1097
+ "n_tokens": 569853,
1098
+ "n_chars": 1630297
1099
+ },
1100
+ "gpt2_chinese.cc100-es": {
1101
+ "vocab_size": 21128,
1102
+ "n_bytes": 1664455,
1103
+ "n_tokens": 703390,
1104
+ "n_chars": 1630297
1105
+ },
1106
+ "gpt_35_turbo.cc100-es": {
1107
+ "vocab_size": 100277,
1108
+ "n_bytes": 1664455,
1109
+ "n_tokens": 433875,
1110
+ "n_chars": 1630297
1111
+ },
1112
+ "gpt_4.cc100-es": {
1113
+ "vocab_size": 100277,
1114
+ "n_bytes": 1664455,
1115
+ "n_tokens": 433875,
1116
+ "n_chars": 1630297
1117
+ },
1118
+ "gpt_nexo_20b.cc100-es": {
1119
+ "vocab_size": 50277,
1120
+ "n_bytes": 1664455,
1121
+ "n_tokens": 494577,
1122
+ "n_chars": 1630297
1123
+ },
1124
+ "grok_1.cc100-es": {
1125
+ "vocab_size": 131072,
1126
+ "n_bytes": 1664455,
1127
+ "n_tokens": 449392,
1128
+ "n_chars": 1630297
1129
+ },
1130
+ "internlm2_chat_7b.cc100-es": {
1131
+ "vocab_size": 92544,
1132
+ "n_bytes": 1664455,
1133
+ "n_tokens": 518871,
1134
+ "n_chars": 1630297
1135
+ },
1136
+ "internlm2_math_7b.cc100-es": {
1137
+ "vocab_size": 92544,
1138
+ "n_bytes": 1664455,
1139
+ "n_tokens": 518871,
1140
+ "n_chars": 1630297
1141
+ },
1142
+ "internlm_chat_7b.cc100-es": {
1143
+ "vocab_size": 103168,
1144
+ "n_bytes": 1664455,
1145
+ "n_tokens": 516572,
1146
+ "n_chars": 1630297
1147
+ },
1148
+ "internlm_xcomposer_7b.cc100-es": {
1149
+ "vocab_size": 103168,
1150
+ "n_bytes": 1664455,
1151
+ "n_tokens": 516572,
1152
+ "n_chars": 1630297
1153
+ },
1154
+ "jamba_v0_1.cc100-es": {
1155
+ "vocab_size": 65536,
1156
+ "n_bytes": 1664455,
1157
+ "n_tokens": 420883,
1158
+ "n_chars": 1630297
1159
+ },
1160
+ "kplug.cc100-es": {
1161
+ "vocab_size": 10261,
1162
+ "n_bytes": 1664455,
1163
+ "n_tokens": 704804,
1164
+ "n_chars": 1630297
1165
+ },
1166
+ "llama.cc100-es": {
1167
+ "vocab_size": 32000,
1168
+ "n_bytes": 1664455,
1169
+ "n_tokens": 492235,
1170
+ "n_chars": 1630297
1171
+ },
1172
+ "llama2.cc100-es": {
1173
+ "vocab_size": 32001,
1174
+ "n_bytes": 1664455,
1175
+ "n_tokens": 492235,
1176
+ "n_chars": 1630297
1177
+ },
1178
+ "llama3.cc100-es": {
1179
+ "vocab_size": 128256,
1180
+ "n_bytes": 1664455,
1181
+ "n_tokens": 433289,
1182
+ "n_chars": 1630297
1183
+ },
1184
+ "mistral_7b.cc100-es": {
1185
+ "vocab_size": 32000,
1186
+ "n_bytes": 1664455,
1187
+ "n_tokens": 513915,
1188
+ "n_chars": 1630297
1189
+ },
1190
+ "mixtral_8_7b.cc100-es": {
1191
+ "vocab_size": 32000,
1192
+ "n_bytes": 1664455,
1193
+ "n_tokens": 513915,
1194
+ "n_chars": 1630297
1195
+ },
1196
+ "mobilebert_uncased.cc100-es": {
1197
+ "vocab_size": 30522,
1198
+ "n_bytes": 1664455,
1199
+ "n_tokens": 558042,
1200
+ "n_chars": 1630297
1201
+ },
1202
+ "moss.cc100-es": {
1203
+ "vocab_size": 106072,
1204
+ "n_bytes": 1664455,
1205
+ "n_tokens": 568539,
1206
+ "n_chars": 1630297
1207
+ },
1208
+ "mt5_large.cc100-es": {
1209
+ "vocab_size": 250100,
1210
+ "n_bytes": 1664455,
1211
+ "n_tokens": 472231,
1212
+ "n_chars": 1630297
1213
+ },
1214
+ "olmo_7b.cc100-es": {
1215
+ "vocab_size": 50280,
1216
+ "n_bytes": 1664455,
1217
+ "n_tokens": 494577,
1218
+ "n_chars": 1630297
1219
+ },
1220
+ "orion_14b_chat.cc100-es": {
1221
+ "vocab_size": 84608,
1222
+ "n_bytes": 1664455,
1223
+ "n_tokens": 628571,
1224
+ "n_chars": 1630297
1225
+ },
1226
+ "phi_1.cc100-es": {
1227
+ "vocab_size": 50295,
1228
+ "n_bytes": 1664455,
1229
+ "n_tokens": 569853,
1230
+ "n_chars": 1630297
1231
+ },
1232
+ "phi_2.cc100-es": {
1233
+ "vocab_size": 50295,
1234
+ "n_bytes": 1664455,
1235
+ "n_tokens": 569853,
1236
+ "n_chars": 1630297
1237
+ },
1238
+ "phi_3_mini.cc100-es": {
1239
+ "vocab_size": 32011,
1240
+ "n_bytes": 1664455,
1241
+ "n_tokens": 492235,
1242
+ "n_chars": 1630297
1243
+ },
1244
+ "pko_t5_large.cc100-es": {
1245
+ "vocab_size": 50358,
1246
+ "n_bytes": 1664455,
1247
+ "n_tokens": 1134056,
1248
+ "n_chars": 1630297
1249
+ },
1250
+ "prompt_clue.cc100-es": {
1251
+ "vocab_size": 32128,
1252
+ "n_bytes": 1664455,
1253
+ "n_tokens": 889530,
1254
+ "n_chars": 1630297
1255
+ },
1256
+ "qwen1_5_14b_chat.cc100-es": {
1257
+ "vocab_size": 151646,
1258
+ "n_bytes": 1664455,
1259
+ "n_tokens": 434264,
1260
+ "n_chars": 1630297
1261
+ },
1262
+ "qwen_1_8b_chat.cc100-es": {
1263
+ "vocab_size": 151851,
1264
+ "n_bytes": 1664455,
1265
+ "n_tokens": 434264,
1266
+ "n_chars": 1630297
1267
+ },
1268
+ "qwen_72b_chat.cc100-es": {
1269
+ "vocab_size": 151851,
1270
+ "n_bytes": 1664455,
1271
+ "n_tokens": 434264,
1272
+ "n_chars": 1630297
1273
+ },
1274
+ "qwen_7b_chat.cc100-es": {
1275
+ "vocab_size": 151851,
1276
+ "n_bytes": 1664455,
1277
+ "n_tokens": 434264,
1278
+ "n_chars": 1630297
1279
+ },
1280
+ "roberta_chinese_clue.cc100-es": {
1281
+ "vocab_size": 8021,
1282
+ "n_bytes": 1664455,
1283
+ "n_tokens": 866564,
1284
+ "n_chars": 1630297
1285
+ },
1286
+ "skywork_13b_base.cc100-es": {
1287
+ "vocab_size": 65519,
1288
+ "n_bytes": 1664455,
1289
+ "n_tokens": 492211,
1290
+ "n_chars": 1630297
1291
+ },
1292
+ "skywork_13b_math.cc100-es": {
1293
+ "vocab_size": 65519,
1294
+ "n_bytes": 1664455,
1295
+ "n_tokens": 492211,
1296
+ "n_chars": 1630297
1297
+ },
1298
+ "solar_10_7b.cc100-es": {
1299
+ "vocab_size": 32000,
1300
+ "n_bytes": 1664455,
1301
+ "n_tokens": 513915,
1302
+ "n_chars": 1630297
1303
+ },
1304
+ "starchat_alpha.cc100-es": {
1305
+ "vocab_size": 49156,
1306
+ "n_bytes": 1664455,
1307
+ "n_tokens": 530592,
1308
+ "n_chars": 1630297
1309
+ },
1310
+ "switch_c_2048.cc100-es": {
1311
+ "vocab_size": 32100,
1312
+ "n_bytes": 1664455,
1313
+ "n_tokens": 706400,
1314
+ "n_chars": 1630297
1315
+ },
1316
+ "t5_base.cc100-es": {
1317
+ "vocab_size": 32100,
1318
+ "n_bytes": 1664455,
1319
+ "n_tokens": 706400,
1320
+ "n_chars": 1630297
1321
+ },
1322
+ "t5_large.cc100-es": {
1323
+ "vocab_size": 32100,
1324
+ "n_bytes": 1664455,
1325
+ "n_tokens": 706400,
1326
+ "n_chars": 1630297
1327
+ },
1328
+ "t5_small.cc100-es": {
1329
+ "vocab_size": 32100,
1330
+ "n_bytes": 1664455,
1331
+ "n_tokens": 706400,
1332
+ "n_chars": 1630297
1333
+ },
1334
+ "text_davinci_003.cc100-es": {
1335
+ "vocab_size": 50281,
1336
+ "n_bytes": 1664455,
1337
+ "n_tokens": 569853,
1338
+ "n_chars": 1630297
1339
+ },
1340
+ "tigerbot_13b_chat_v2.cc100-es": {
1341
+ "vocab_size": 60515,
1342
+ "n_bytes": 1664455,
1343
+ "n_tokens": 482553,
1344
+ "n_chars": 1630297
1345
+ },
1346
+ "tigerbot_70b_chat_v4_4k.cc100-es": {
1347
+ "vocab_size": 65110,
1348
+ "n_bytes": 1664455,
1349
+ "n_tokens": 484099,
1350
+ "n_chars": 1630297
1351
+ },
1352
+ "wizardcoder_15b_v1.cc100-es": {
1353
+ "vocab_size": 49153,
1354
+ "n_bytes": 1664455,
1355
+ "n_tokens": 530592,
1356
+ "n_chars": 1630297
1357
+ },
1358
+ "wizardcoder_python_7b_v1.cc100-es": {
1359
+ "vocab_size": 32001,
1360
+ "n_bytes": 1664455,
1361
+ "n_tokens": 492235,
1362
+ "n_chars": 1630297
1363
+ },
1364
+ "wizardlm_7b_v1.cc100-es": {
1365
+ "vocab_size": 32001,
1366
+ "n_bytes": 1664455,
1367
+ "n_tokens": 492235,
1368
+ "n_chars": 1630297
1369
+ },
1370
+ "wizardmath_70b_v1.cc100-es": {
1371
+ "vocab_size": 32002,
1372
+ "n_bytes": 1664455,
1373
+ "n_tokens": 492235,
1374
+ "n_chars": 1630297
1375
+ },
1376
+ "xlm_roberta.cc100-es": {
1377
+ "vocab_size": 250002,
1378
+ "n_bytes": 1664455,
1379
+ "n_tokens": 399850,
1380
+ "n_chars": 1630297
1381
+ },
1382
+ "yi_34b.cc100-es": {
1383
+ "vocab_size": 64000,
1384
+ "n_bytes": 1664455,
1385
+ "n_tokens": 577018,
1386
+ "n_chars": 1630297
1387
+ },
1388
+ "yi_6b.cc100-es": {
1389
+ "vocab_size": 64000,
1390
+ "n_bytes": 1664455,
1391
+ "n_tokens": 577018,
1392
+ "n_chars": 1630297
1393
+ },
1394
+ "yi_vl34b.cc100-es": {
1395
+ "vocab_size": 64000,
1396
+ "n_bytes": 1664455,
1397
+ "n_tokens": 576794,
1398
+ "n_chars": 1630297
1399
+ },
1400
+ "zephyr_7b_beta.cc100-es": {
1401
+ "vocab_size": 32000,
1402
+ "n_bytes": 1664455,
1403
+ "n_tokens": 513915,
1404
+ "n_chars": 1630297
1405
+ },
1406
+ "aya_101.cc100-fr": {
1407
+ "vocab_size": 250100,
1408
+ "n_bytes": 1540504,
1409
+ "n_tokens": 470944,
1410
+ "n_chars": 1484970
1411
+ },
1412
+ "baichuan.cc100-fr": {
1413
+ "vocab_size": 64000,
1414
+ "n_bytes": 1540504,
1415
+ "n_tokens": 540430,
1416
+ "n_chars": 1484970
1417
+ },
1418
+ "baichuan2.cc100-fr": {
1419
+ "vocab_size": 125696,
1420
+ "n_bytes": 1540504,
1421
+ "n_tokens": 512313,
1422
+ "n_chars": 1484970
1423
+ },
1424
+ "bert_base_cased.cc100-fr": {
1425
+ "vocab_size": 28996,
1426
+ "n_bytes": 1540504,
1427
+ "n_tokens": 583210,
1428
+ "n_chars": 1484970
1429
+ },
1430
+ "bert_base_chinese.cc100-fr": {
1431
+ "vocab_size": 21128,
1432
+ "n_bytes": 1540504,
1433
+ "n_tokens": 553134,
1434
+ "n_chars": 1484970
1435
+ },
1436
+ "bert_base_uncased.cc100-fr": {
1437
+ "vocab_size": 30522,
1438
+ "n_bytes": 1540504,
1439
+ "n_tokens": 504075,
1440
+ "n_chars": 1484970
1441
+ },
1442
+ "bloom.cc100-fr": {
1443
+ "vocab_size": 250680,
1444
+ "n_bytes": 1540504,
1445
+ "n_tokens": 321639,
1446
+ "n_chars": 1484970
1447
+ },
1448
+ "byt5_small.cc100-fr": {
1449
+ "vocab_size": 384,
1450
+ "n_bytes": 1540504,
1451
+ "n_tokens": 1550504,
1452
+ "n_chars": 1484970
1453
+ },
1454
+ "character_glm_6b.cc100-fr": {
1455
+ "vocab_size": 64789,
1456
+ "n_bytes": 1540504,
1457
+ "n_tokens": 515052,
1458
+ "n_chars": 1484970
1459
+ },
1460
+ "chatglm2_6b.cc100-fr": {
1461
+ "vocab_size": 64787,
1462
+ "n_bytes": 1540504,
1463
+ "n_tokens": 515028,
1464
+ "n_chars": 1484970
1465
+ },
1466
+ "chatglm3_6b.cc100-fr": {
1467
+ "vocab_size": 64796,
1468
+ "n_bytes": 1540504,
1469
+ "n_tokens": 515052,
1470
+ "n_chars": 1484970
1471
+ },
1472
+ "chatglm_6b.cc100-fr": {
1473
+ "vocab_size": 150344,
1474
+ "n_bytes": 1540504,
1475
+ "n_tokens": 499261,
1476
+ "n_chars": 1484970
1477
+ },
1478
+ "chatyuan_large_v2.cc100-fr": {
1479
+ "vocab_size": 32128,
1480
+ "n_bytes": 1540504,
1481
+ "n_tokens": 822012,
1482
+ "n_chars": 1484970
1483
+ },
1484
+ "chinese_llama.cc100-fr": {
1485
+ "vocab_size": 49953,
1486
+ "n_bytes": 1540504,
1487
+ "n_tokens": 450352,
1488
+ "n_chars": 1484970
1489
+ },
1490
+ "chinese_llama2.cc100-fr": {
1491
+ "vocab_size": 55296,
1492
+ "n_bytes": 1540504,
1493
+ "n_tokens": 457243,
1494
+ "n_chars": 1484970
1495
+ },
1496
+ "code_davinci_002.cc100-fr": {
1497
+ "vocab_size": 50281,
1498
+ "n_bytes": 1540504,
1499
+ "n_tokens": 521776,
1500
+ "n_chars": 1484970
1501
+ },
1502
+ "crystal_coder.cc100-fr": {
1503
+ "vocab_size": 32022,
1504
+ "n_bytes": 1540504,
1505
+ "n_tokens": 447243,
1506
+ "n_chars": 1484970
1507
+ },
1508
+ "dbrx_instruct.cc100-fr": {
1509
+ "vocab_size": 100280,
1510
+ "n_bytes": 1540504,
1511
+ "n_tokens": 412685,
1512
+ "n_chars": 1484970
1513
+ },
1514
+ "deepseek_coder_33b_instruct.cc100-fr": {
1515
+ "vocab_size": 32022,
1516
+ "n_bytes": 1540504,
1517
+ "n_tokens": 537538,
1518
+ "n_chars": 1484970
1519
+ },
1520
+ "deepseek_llm_7b_base.cc100-fr": {
1521
+ "vocab_size": 100015,
1522
+ "n_bytes": 1540504,
1523
+ "n_tokens": 507693,
1524
+ "n_chars": 1484970
1525
+ },
1526
+ "falcon_180b.cc100-fr": {
1527
+ "vocab_size": 65024,
1528
+ "n_bytes": 1540504,
1529
+ "n_tokens": 407853,
1530
+ "n_chars": 1484970
1531
+ },
1532
+ "falcon_7b.cc100-fr": {
1533
+ "vocab_size": 65024,
1534
+ "n_bytes": 1540504,
1535
+ "n_tokens": 407853,
1536
+ "n_chars": 1484970
1537
+ },
1538
+ "fastchat_t5_3b.cc100-fr": {
1539
+ "vocab_size": 32110,
1540
+ "n_bytes": 1540504,
1541
+ "n_tokens": 717675,
1542
+ "n_chars": 1484970
1543
+ },
1544
+ "flan_t5_base.cc100-fr": {
1545
+ "vocab_size": 32100,
1546
+ "n_bytes": 1540504,
1547
+ "n_tokens": 476135,
1548
+ "n_chars": 1484970
1549
+ },
1550
+ "gemma_7b.cc100-fr": {
1551
+ "vocab_size": 256000,
1552
+ "n_bytes": 1540504,
1553
+ "n_tokens": 374551,
1554
+ "n_chars": 1484970
1555
+ },
1556
+ "gpt2.cc100-fr": {
1557
+ "vocab_size": 50257,
1558
+ "n_bytes": 1540504,
1559
+ "n_tokens": 521776,
1560
+ "n_chars": 1484970
1561
+ },
1562
+ "gpt2_chinese.cc100-fr": {
1563
+ "vocab_size": 21128,
1564
+ "n_bytes": 1540504,
1565
+ "n_tokens": 636442,
1566
+ "n_chars": 1484970
1567
+ },
1568
+ "gpt_35_turbo.cc100-fr": {
1569
+ "vocab_size": 100277,
1570
+ "n_bytes": 1540504,
1571
+ "n_tokens": 412685,
1572
+ "n_chars": 1484970
1573
+ },
1574
+ "gpt_4.cc100-fr": {
1575
+ "vocab_size": 100277,
1576
+ "n_bytes": 1540504,
1577
+ "n_tokens": 412685,
1578
+ "n_chars": 1484970
1579
+ },
1580
+ "gpt_nexo_20b.cc100-fr": {
1581
+ "vocab_size": 50277,
1582
+ "n_bytes": 1540504,
1583
+ "n_tokens": 458961,
1584
+ "n_chars": 1484970
1585
+ },
1586
+ "grok_1.cc100-fr": {
1587
+ "vocab_size": 131072,
1588
+ "n_bytes": 1540504,
1589
+ "n_tokens": 428298,
1590
+ "n_chars": 1484970
1591
+ },
1592
+ "internlm2_chat_7b.cc100-fr": {
1593
+ "vocab_size": 92544,
1594
+ "n_bytes": 1540504,
1595
+ "n_tokens": 496629,
1596
+ "n_chars": 1484970
1597
+ },
1598
+ "internlm2_math_7b.cc100-fr": {
1599
+ "vocab_size": 92544,
1600
+ "n_bytes": 1540504,
1601
+ "n_tokens": 496629,
1602
+ "n_chars": 1484970
1603
+ },
1604
+ "internlm_chat_7b.cc100-fr": {
1605
+ "vocab_size": 103168,
1606
+ "n_bytes": 1540504,
1607
+ "n_tokens": 495045,
1608
+ "n_chars": 1484970
1609
+ },
1610
+ "internlm_xcomposer_7b.cc100-fr": {
1611
+ "vocab_size": 103168,
1612
+ "n_bytes": 1540504,
1613
+ "n_tokens": 495045,
1614
+ "n_chars": 1484970
1615
+ },
1616
+ "jamba_v0_1.cc100-fr": {
1617
+ "vocab_size": 65536,
1618
+ "n_bytes": 1540504,
1619
+ "n_tokens": 412899,
1620
+ "n_chars": 1484970
1621
+ },
1622
+ "kplug.cc100-fr": {
1623
+ "vocab_size": 10261,
1624
+ "n_bytes": 1540504,
1625
+ "n_tokens": 638107,
1626
+ "n_chars": 1484970
1627
+ },
1628
+ "llama.cc100-fr": {
1629
+ "vocab_size": 32000,
1630
+ "n_bytes": 1540504,
1631
+ "n_tokens": 457243,
1632
+ "n_chars": 1484970
1633
+ },
1634
+ "llama2.cc100-fr": {
1635
+ "vocab_size": 32001,
1636
+ "n_bytes": 1540504,
1637
+ "n_tokens": 457243,
1638
+ "n_chars": 1484970
1639
+ },
1640
+ "llama3.cc100-fr": {
1641
+ "vocab_size": 128256,
1642
+ "n_bytes": 1540504,
1643
+ "n_tokens": 412146,
1644
+ "n_chars": 1484970
1645
+ },
1646
+ "mistral_7b.cc100-fr": {
1647
+ "vocab_size": 32000,
1648
+ "n_bytes": 1540504,
1649
+ "n_tokens": 476666,
1650
+ "n_chars": 1484970
1651
+ },
1652
+ "mixtral_8_7b.cc100-fr": {
1653
+ "vocab_size": 32000,
1654
+ "n_bytes": 1540504,
1655
+ "n_tokens": 476666,
1656
+ "n_chars": 1484970
1657
+ },
1658
+ "mobilebert_uncased.cc100-fr": {
1659
+ "vocab_size": 30522,
1660
+ "n_bytes": 1540504,
1661
+ "n_tokens": 504075,
1662
+ "n_chars": 1484970
1663
+ },
1664
+ "moss.cc100-fr": {
1665
+ "vocab_size": 106072,
1666
+ "n_bytes": 1540504,
1667
+ "n_tokens": 515669,
1668
+ "n_chars": 1484970
1669
+ },
1670
+ "mt5_large.cc100-fr": {
1671
+ "vocab_size": 250100,
1672
+ "n_bytes": 1540504,
1673
+ "n_tokens": 470944,
1674
+ "n_chars": 1484970
1675
+ },
1676
+ "olmo_7b.cc100-fr": {
1677
+ "vocab_size": 50280,
1678
+ "n_bytes": 1540504,
1679
+ "n_tokens": 458961,
1680
+ "n_chars": 1484970
1681
+ },
1682
+ "orion_14b_chat.cc100-fr": {
1683
+ "vocab_size": 84608,
1684
+ "n_bytes": 1540504,
1685
+ "n_tokens": 564107,
1686
+ "n_chars": 1484970
1687
+ },
1688
+ "phi_1.cc100-fr": {
1689
+ "vocab_size": 50295,
1690
+ "n_bytes": 1540504,
1691
+ "n_tokens": 521776,
1692
+ "n_chars": 1484970
1693
+ },
1694
+ "phi_2.cc100-fr": {
1695
+ "vocab_size": 50295,
1696
+ "n_bytes": 1540504,
1697
+ "n_tokens": 521776,
1698
+ "n_chars": 1484970
1699
+ },
1700
+ "phi_3_mini.cc100-fr": {
1701
+ "vocab_size": 32011,
1702
+ "n_bytes": 1540504,
1703
+ "n_tokens": 457243,
1704
+ "n_chars": 1484970
1705
+ },
1706
+ "pko_t5_large.cc100-fr": {
1707
+ "vocab_size": 50358,
1708
+ "n_bytes": 1540504,
1709
+ "n_tokens": 1044665,
1710
+ "n_chars": 1484970
1711
+ },
1712
+ "prompt_clue.cc100-fr": {
1713
+ "vocab_size": 32128,
1714
+ "n_bytes": 1540504,
1715
+ "n_tokens": 822012,
1716
+ "n_chars": 1484970
1717
+ },
1718
+ "qwen1_5_14b_chat.cc100-fr": {
1719
+ "vocab_size": 151646,
1720
+ "n_bytes": 1540504,
1721
+ "n_tokens": 413637,
1722
+ "n_chars": 1484970
1723
+ },
1724
+ "qwen_1_8b_chat.cc100-fr": {
1725
+ "vocab_size": 151851,
1726
+ "n_bytes": 1540504,
1727
+ "n_tokens": 413637,
1728
+ "n_chars": 1484970
1729
+ },
1730
+ "qwen_72b_chat.cc100-fr": {
1731
+ "vocab_size": 151851,
1732
+ "n_bytes": 1540504,
1733
+ "n_tokens": 413637,
1734
+ "n_chars": 1484970
1735
+ },
1736
+ "qwen_7b_chat.cc100-fr": {
1737
+ "vocab_size": 151851,
1738
+ "n_bytes": 1540504,
1739
+ "n_tokens": 413637,
1740
+ "n_chars": 1484970
1741
+ },
1742
+ "roberta_chinese_clue.cc100-fr": {
1743
+ "vocab_size": 8021,
1744
+ "n_bytes": 1540504,
1745
+ "n_tokens": 787363,
1746
+ "n_chars": 1484970
1747
+ },
1748
+ "skywork_13b_base.cc100-fr": {
1749
+ "vocab_size": 65519,
1750
+ "n_bytes": 1540504,
1751
+ "n_tokens": 457233,
1752
+ "n_chars": 1484970
1753
+ },
1754
+ "skywork_13b_math.cc100-fr": {
1755
+ "vocab_size": 65519,
1756
+ "n_bytes": 1540504,
1757
+ "n_tokens": 457233,
1758
+ "n_chars": 1484970
1759
+ },
1760
+ "solar_10_7b.cc100-fr": {
1761
+ "vocab_size": 32000,
1762
+ "n_bytes": 1540504,
1763
+ "n_tokens": 476666,
1764
+ "n_chars": 1484970
1765
+ },
1766
+ "starchat_alpha.cc100-fr": {
1767
+ "vocab_size": 49156,
1768
+ "n_bytes": 1540504,
1769
+ "n_tokens": 509958,
1770
+ "n_chars": 1484970
1771
+ },
1772
+ "switch_c_2048.cc100-fr": {
1773
+ "vocab_size": 32100,
1774
+ "n_bytes": 1540504,
1775
+ "n_tokens": 476133,
1776
+ "n_chars": 1484970
1777
+ },
1778
+ "t5_base.cc100-fr": {
1779
+ "vocab_size": 32100,
1780
+ "n_bytes": 1540504,
1781
+ "n_tokens": 476133,
1782
+ "n_chars": 1484970
1783
+ },
1784
+ "t5_large.cc100-fr": {
1785
+ "vocab_size": 32100,
1786
+ "n_bytes": 1540504,
1787
+ "n_tokens": 476133,
1788
+ "n_chars": 1484970
1789
+ },
1790
+ "t5_small.cc100-fr": {
1791
+ "vocab_size": 32100,
1792
+ "n_bytes": 1540504,
1793
+ "n_tokens": 476133,
1794
+ "n_chars": 1484970
1795
+ },
1796
+ "text_davinci_003.cc100-fr": {
1797
+ "vocab_size": 50281,
1798
+ "n_bytes": 1540504,
1799
+ "n_tokens": 521776,
1800
+ "n_chars": 1484970
1801
+ },
1802
+ "tigerbot_13b_chat_v2.cc100-fr": {
1803
+ "vocab_size": 60515,
1804
+ "n_bytes": 1540504,
1805
+ "n_tokens": 447372,
1806
+ "n_chars": 1484970
1807
+ },
1808
+ "tigerbot_70b_chat_v4_4k.cc100-fr": {
1809
+ "vocab_size": 65110,
1810
+ "n_bytes": 1540504,
1811
+ "n_tokens": 448567,
1812
+ "n_chars": 1484970
1813
+ },
1814
+ "wizardcoder_15b_v1.cc100-fr": {
1815
+ "vocab_size": 49153,
1816
+ "n_bytes": 1540504,
1817
+ "n_tokens": 509958,
1818
+ "n_chars": 1484970
1819
+ },
1820
+ "wizardcoder_python_7b_v1.cc100-fr": {
1821
+ "vocab_size": 32001,
1822
+ "n_bytes": 1540504,
1823
+ "n_tokens": 457243,
1824
+ "n_chars": 1484970
1825
+ },
1826
+ "wizardlm_7b_v1.cc100-fr": {
1827
+ "vocab_size": 32001,
1828
+ "n_bytes": 1540504,
1829
+ "n_tokens": 457243,
1830
+ "n_chars": 1484970
1831
+ },
1832
+ "wizardmath_70b_v1.cc100-fr": {
1833
+ "vocab_size": 32002,
1834
+ "n_bytes": 1540504,
1835
+ "n_tokens": 457243,
1836
+ "n_chars": 1484970
1837
+ },
1838
+ "xlm_roberta.cc100-fr": {
1839
+ "vocab_size": 250002,
1840
+ "n_bytes": 1540504,
1841
+ "n_tokens": 405041,
1842
+ "n_chars": 1484970
1843
+ },
1844
+ "yi_34b.cc100-fr": {
1845
+ "vocab_size": 64000,
1846
+ "n_bytes": 1540504,
1847
+ "n_tokens": 533106,
1848
+ "n_chars": 1484970
1849
+ },
1850
+ "yi_6b.cc100-fr": {
1851
+ "vocab_size": 64000,
1852
+ "n_bytes": 1540504,
1853
+ "n_tokens": 533106,
1854
+ "n_chars": 1484970
1855
+ },
1856
+ "yi_vl34b.cc100-fr": {
1857
+ "vocab_size": 64000,
1858
+ "n_bytes": 1540504,
1859
+ "n_tokens": 532288,
1860
+ "n_chars": 1484970
1861
+ },
1862
+ "zephyr_7b_beta.cc100-fr": {
1863
+ "vocab_size": 32000,
1864
+ "n_bytes": 1540504,
1865
+ "n_tokens": 476666,
1866
+ "n_chars": 1484970
1867
+ }
1868
+ }
stats/compress_rate/amber.en.json DELETED
@@ -1 +0,0 @@
1
- {"vocab_size": 32000, "n_bytes": 1124813, "n_tokens": 294627, "n_chars": 1121360}
 
 
stats/compress_rate/amber.zh-Hans.json DELETED
@@ -1 +0,0 @@
1
- {"vocab_size": 32000, "n_bytes": 2633047, "n_tokens": 1330093, "n_chars": 927311}
 
 
stats/compress_rate/aya_101.en.json DELETED
@@ -1 +0,0 @@
1
- {"vocab_size": 250100, "n_bytes": 1124813, "n_tokens": 317881, "n_chars": 1121360}
 
 
stats/compress_rate/aya_101.zh-Hans.json DELETED
@@ -1 +0,0 @@
1
- {"vocab_size": 250100, "n_bytes": 2633047, "n_tokens": 631182, "n_chars": 927311}
 
 
stats/compress_rate/baichuan.en.json DELETED
@@ -1 +0,0 @@
1
- {"vocab_size": 64000, "n_bytes": 1124813, "n_tokens": 280108, "n_chars": 1121360}
 
 
stats/compress_rate/baichuan.zh-Hans.json DELETED
@@ -1 +0,0 @@
1
- {"vocab_size": 64000, "n_bytes": 2633047, "n_tokens": 626117, "n_chars": 927311}
 
 
stats/compress_rate/baichuan2.en.json DELETED
@@ -1 +0,0 @@
1
- {"vocab_size": 125696, "n_bytes": 1124813, "n_tokens": 269011, "n_chars": 1121360}
 
 
stats/compress_rate/baichuan2.zh-Hans.json DELETED
@@ -1 +0,0 @@
1
- {"vocab_size": 125696, "n_bytes": 2633047, "n_tokens": 541464, "n_chars": 927311}
 
 
stats/compress_rate/bert_base_cased.en.json DELETED
@@ -1 +0,0 @@
1
- {"vocab_size": 28996, "n_bytes": 1124813, "n_tokens": 288022, "n_chars": 1121360}
 
 
stats/compress_rate/bert_base_cased.zh-Hans.json DELETED
@@ -1 +0,0 @@
1
- {"vocab_size": 28996, "n_bytes": 2633047, "n_tokens": 899709, "n_chars": 927311}
 
 
stats/compress_rate/bert_base_chinese.en.json DELETED
@@ -1 +0,0 @@
1
- {"vocab_size": 21128, "n_bytes": 1124813, "n_tokens": 377068, "n_chars": 1121360}
 
 
stats/compress_rate/bert_base_chinese.zh-Hans.json DELETED
@@ -1 +0,0 @@
1
- {"vocab_size": 21128, "n_bytes": 2633047, "n_tokens": 896599, "n_chars": 927311}
 
 
stats/compress_rate/bert_base_uncased.en.json DELETED
@@ -1 +0,0 @@
1
- {"vocab_size": 30522, "n_bytes": 1124813, "n_tokens": 280575, "n_chars": 1121360}
 
 
stats/compress_rate/bert_base_uncased.zh-Hans.json DELETED
@@ -1 +0,0 @@
1
- {"vocab_size": 30522, "n_bytes": 2633047, "n_tokens": 898554, "n_chars": 927311}
 
 
stats/compress_rate/bloom.en.json DELETED
@@ -1 +0,0 @@
1
- {"vocab_size": 250680, "n_bytes": 1124813, "n_tokens": 257405, "n_chars": 1121360}
 
 
stats/compress_rate/bloom.zh-Hans.json DELETED
@@ -1 +0,0 @@
1
- {"vocab_size": 250680, "n_bytes": 2633047, "n_tokens": 573008, "n_chars": 927311}
 
 
stats/compress_rate/byt5_small.en.json DELETED
@@ -1 +0,0 @@
1
- {"vocab_size": 256, "n_bytes": 1124813, "n_tokens": 1134813, "n_chars": 1121360}
 
 
stats/compress_rate/byt5_small.zh-Hans.json DELETED
@@ -1 +0,0 @@
1
- {"vocab_size": 256, "n_bytes": 2633047, "n_tokens": 2643047, "n_chars": 927311}
 
 
stats/compress_rate/character_glm_6b.en.json DELETED
@@ -1 +0,0 @@
1
- {"vocab_size": 64794, "n_bytes": 1124813, "n_tokens": 289347, "n_chars": 1121360}
 
 
stats/compress_rate/character_glm_6b.zh-Hans.json DELETED
@@ -1 +0,0 @@
1
- {"vocab_size": 64794, "n_bytes": 2633047, "n_tokens": 583646, "n_chars": 927311}
 
 
stats/compress_rate/chatglm2_6b.en.json DELETED
@@ -1 +0,0 @@
1
- {"vocab_size": 64794, "n_bytes": 1124813, "n_tokens": 289329, "n_chars": 1121360}
 
 
stats/compress_rate/chatglm2_6b.zh-Hans.json DELETED
@@ -1 +0,0 @@
1
- {"vocab_size": 64794, "n_bytes": 2633047, "n_tokens": 583646, "n_chars": 927311}
 
 
stats/compress_rate/chatglm3_6b.en.json DELETED
@@ -1 +0,0 @@
1
- {"vocab_size": 64798, "n_bytes": 1124813, "n_tokens": 289347, "n_chars": 1121360}
 
 
stats/compress_rate/chatglm3_6b.zh-Hans.json DELETED
@@ -1 +0,0 @@
1
- {"vocab_size": 64798, "n_bytes": 2633047, "n_tokens": 583646, "n_chars": 927311}
 
 
stats/compress_rate/chatglm_6b.en.json DELETED
@@ -1 +0,0 @@
1
- {"vocab_size": 150344, "n_bytes": 1124813, "n_tokens": 284761, "n_chars": 1121360}
 
 
stats/compress_rate/chatglm_6b.zh-Hans.json DELETED
@@ -1 +0,0 @@
1
- {"vocab_size": 150344, "n_bytes": 2633047, "n_tokens": 527384, "n_chars": 927311}
 
 
stats/compress_rate/chatyuan_large_v2.en.json DELETED
@@ -1 +0,0 @@
1
- {"vocab_size": 32128, "n_bytes": 1124813, "n_tokens": 536033, "n_chars": 1121360}
 
 
stats/compress_rate/chatyuan_large_v2.zh-Hans.json DELETED
@@ -1 +0,0 @@
1
- {"vocab_size": 32128, "n_bytes": 2633047, "n_tokens": 564905, "n_chars": 927311}
 
 
stats/compress_rate/chinese_llama.en.json DELETED
@@ -1 +0,0 @@
1
- {"vocab_size": 49953, "n_bytes": 1124813, "n_tokens": 291514, "n_chars": 1121360}
 
 
stats/compress_rate/chinese_llama.zh-Hans.json DELETED
@@ -1 +0,0 @@
1
- {"vocab_size": 49953, "n_bytes": 2633047, "n_tokens": 623219, "n_chars": 927311}
 
 
stats/compress_rate/chinese_llama2.en.json DELETED
@@ -1 +0,0 @@
1
- {"vocab_size": 55296, "n_bytes": 1124813, "n_tokens": 294627, "n_chars": 1121360}
 
 
stats/compress_rate/chinese_llama2.zh-Hans.json DELETED
@@ -1 +0,0 @@
1
- {"vocab_size": 55296, "n_bytes": 2633047, "n_tokens": 625766, "n_chars": 927311}
 
 
stats/compress_rate/code_davinci_002.en.json DELETED
@@ -1 +0,0 @@
1
- {"vocab_size": 50281, "n_bytes": 1124813, "n_tokens": 258403, "n_chars": 1121360}
 
 
stats/compress_rate/code_davinci_002.zh-Hans.json DELETED
@@ -1 +0,0 @@
1
- {"vocab_size": 50281, "n_bytes": 2633047, "n_tokens": 1876809, "n_chars": 927311}
 
 
stats/compress_rate/crystal_coder.en.json DELETED
@@ -1 +0,0 @@
1
- {"vocab_size": 32000, "n_bytes": 1124813, "n_tokens": 284627, "n_chars": 1121360}
 
 
stats/compress_rate/crystal_coder.zh-Hans.json DELETED
@@ -1 +0,0 @@
1
- {"vocab_size": 32000, "n_bytes": 2633047, "n_tokens": 1320093, "n_chars": 927311}
 
 
stats/compress_rate/dbrx_instruct.en.json DELETED
@@ -1 +0,0 @@
1
- {"vocab_size": 100277, "n_bytes": 1124813, "n_tokens": 254985, "n_chars": 1121360}
 
 
stats/compress_rate/dbrx_instruct.zh-Hans.json DELETED
@@ -1 +0,0 @@
1
- {"vocab_size": 100277, "n_bytes": 2633047, "n_tokens": 1084939, "n_chars": 927311}
 
 
stats/compress_rate/deepseek_coder_33b_instruct.en.json DELETED
@@ -1 +0,0 @@
1
- {"vocab_size": 32000, "n_bytes": 1124813, "n_tokens": 287408, "n_chars": 1121360}
 
 
stats/compress_rate/deepseek_coder_33b_instruct.zh-Hans.json DELETED
@@ -1 +0,0 @@
1
- {"vocab_size": 32000, "n_bytes": 2633047, "n_tokens": 720577, "n_chars": 927311}