whizzzzkid commited on
Commit
6296c6f
1 Parent(s): ea9e55c

Upload tokenizer

Browse files
Files changed (5) hide show
  1. merges.txt +0 -0
  2. special_tokens_map.json +37 -6
  3. tokenizer.json +2 -2
  4. tokenizer_config.json +258 -32
  5. vocab.json +0 -0
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
special_tokens_map.json CHANGED
@@ -1,31 +1,62 @@
1
  {
2
  "additional_special_tokens": [
3
- "<start_of_turn>",
4
- "<end_of_turn>"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  ],
6
  "bos_token": {
7
- "content": "<s>",
8
  "lstrip": false,
9
  "normalized": false,
10
  "rstrip": false,
11
  "single_word": false
12
  },
13
  "eos_token": {
14
- "content": "</s>",
15
  "lstrip": false,
16
  "normalized": false,
17
  "rstrip": false,
18
  "single_word": false
19
  },
20
  "pad_token": {
21
- "content": "<pad>",
22
  "lstrip": false,
23
  "normalized": false,
24
  "rstrip": false,
25
  "single_word": false
26
  },
27
  "unk_token": {
28
- "content": "<unk>",
29
  "lstrip": false,
30
  "normalized": false,
31
  "rstrip": false,
 
1
  {
2
  "additional_special_tokens": [
3
+ "<|reg_extra|>",
4
+ "<|endoftext|>",
5
+ "<|fim_prefix|>",
6
+ "<|fim_middle|>",
7
+ "<|fim_suffix|>",
8
+ "<|fim_pad|>",
9
+ "<gh_stars>",
10
+ "<filename>",
11
+ "<issue_start>",
12
+ "<issue_comment>",
13
+ "<issue_closed>",
14
+ "<jupyter_start>",
15
+ "<jupyter_text>",
16
+ "<jupyter_code>",
17
+ "<jupyter_output>",
18
+ "<empty_output>",
19
+ "<commit_before>",
20
+ "<commit_msg>",
21
+ "<commit_after>",
22
+ "<reponame>",
23
+ "<|endofprompt|>",
24
+ "<|im_start|>",
25
+ "<|im_end|>",
26
+ "<|pause|>",
27
+ "<|reg0|>",
28
+ "<|reg1|>",
29
+ "<|reg2|>",
30
+ "<|reg3|>",
31
+ "<|reg4|>",
32
+ "<|reg5|>",
33
+ "<|reg6|>",
34
+ "<|reg7|>",
35
+ "<|extra0|>"
36
  ],
37
  "bos_token": {
38
+ "content": "<|endoftext|>",
39
  "lstrip": false,
40
  "normalized": false,
41
  "rstrip": false,
42
  "single_word": false
43
  },
44
  "eos_token": {
45
+ "content": "<|endoftext|>",
46
  "lstrip": false,
47
  "normalized": false,
48
  "rstrip": false,
49
  "single_word": false
50
  },
51
  "pad_token": {
52
+ "content": "<|endoftext|>",
53
  "lstrip": false,
54
  "normalized": false,
55
  "rstrip": false,
56
  "single_word": false
57
  },
58
  "unk_token": {
59
+ "content": "<|endoftext|>",
60
  "lstrip": false,
61
  "normalized": false,
62
  "rstrip": false,
tokenizer.json CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a2d83c8e6a960de3d36db939e71a1857057ea89ce784a795979fe885457c3e9c
3
- size 17478274
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:631fbd6e3f9415c226372013d876a6a5106009bb2637021ea2b73d524163dd5c
3
+ size 4239425
tokenizer_config.json CHANGED
@@ -1,65 +1,264 @@
1
  {
2
- "add_bos_token": true,
3
- "add_eos_token": false,
4
  "added_tokens_decoder": {
5
- "0": {
6
- "content": "<pad>",
7
  "lstrip": false,
8
  "normalized": false,
9
  "rstrip": false,
10
  "single_word": false,
11
  "special": true
12
  },
13
- "1": {
14
- "content": "<eos>",
15
  "lstrip": false,
16
  "normalized": false,
17
  "rstrip": false,
18
  "single_word": false,
19
  "special": true
20
  },
21
- "2": {
22
- "content": "<bos>",
23
  "lstrip": false,
24
  "normalized": false,
25
  "rstrip": false,
26
  "single_word": false,
27
  "special": true
28
  },
29
- "3": {
30
- "content": "<unk>",
31
  "lstrip": false,
32
  "normalized": false,
33
  "rstrip": false,
34
  "single_word": false,
35
  "special": true
36
  },
37
- "106": {
38
- "content": "<start_of_turn>",
39
  "lstrip": false,
40
  "normalized": false,
41
  "rstrip": false,
42
  "single_word": false,
43
  "special": true
44
  },
45
- "107": {
46
- "content": "<end_of_turn>",
47
  "lstrip": false,
48
  "normalized": false,
49
  "rstrip": false,
50
  "single_word": false,
51
  "special": true
52
  },
53
- "204": {
54
- "content": "<s>",
55
  "lstrip": false,
56
  "normalized": false,
57
  "rstrip": false,
58
  "single_word": false,
59
  "special": true
60
  },
61
- "213": {
62
- "content": "</s>",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63
  "lstrip": false,
64
  "normalized": false,
65
  "rstrip": false,
@@ -68,19 +267,46 @@
68
  }
69
  },
70
  "additional_special_tokens": [
71
- "<start_of_turn>",
72
- "<end_of_turn>"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73
  ],
74
- "bos_token": "<s>",
75
- "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '<start_of_turn>' + role + '\n' + message['content'] | trim + '<end_of_turn>\n' }}{% endfor %}{% if add_generation_prompt %}{{'<start_of_turn>model\n'}}{% endif %}",
76
- "clean_up_tokenization_spaces": false,
77
- "eos_token": "</s>",
78
- "legacy": null,
79
- "model_max_length": 1000000000000000019884624838656,
80
- "pad_token": "<pad>",
81
- "sp_model_kwargs": {},
82
- "spaces_between_special_tokens": false,
83
- "tokenizer_class": "GemmaTokenizer",
84
- "unk_token": "<unk>",
85
- "use_default_system_prompt": false
86
  }
 
1
  {
2
+ "add_prefix_space": false,
 
3
  "added_tokens_decoder": {
4
+ "100256": {
5
+ "content": "<|reg_extra|>",
6
  "lstrip": false,
7
  "normalized": false,
8
  "rstrip": false,
9
  "single_word": false,
10
  "special": true
11
  },
12
+ "100257": {
13
+ "content": "<|endoftext|>",
14
  "lstrip": false,
15
  "normalized": false,
16
  "rstrip": false,
17
  "single_word": false,
18
  "special": true
19
  },
20
+ "100258": {
21
+ "content": "<|fim_prefix|>",
22
  "lstrip": false,
23
  "normalized": false,
24
  "rstrip": false,
25
  "single_word": false,
26
  "special": true
27
  },
28
+ "100259": {
29
+ "content": "<|fim_middle|>",
30
  "lstrip": false,
31
  "normalized": false,
32
  "rstrip": false,
33
  "single_word": false,
34
  "special": true
35
  },
36
+ "100260": {
37
+ "content": "<|fim_suffix|>",
38
  "lstrip": false,
39
  "normalized": false,
40
  "rstrip": false,
41
  "single_word": false,
42
  "special": true
43
  },
44
+ "100261": {
45
+ "content": "<|fim_pad|>",
46
  "lstrip": false,
47
  "normalized": false,
48
  "rstrip": false,
49
  "single_word": false,
50
  "special": true
51
  },
52
+ "100262": {
53
+ "content": "<gh_stars>",
54
  "lstrip": false,
55
  "normalized": false,
56
  "rstrip": false,
57
  "single_word": false,
58
  "special": true
59
  },
60
+ "100263": {
61
+ "content": "<filename>",
62
+ "lstrip": false,
63
+ "normalized": false,
64
+ "rstrip": false,
65
+ "single_word": false,
66
+ "special": true
67
+ },
68
+ "100264": {
69
+ "content": "<issue_start>",
70
+ "lstrip": false,
71
+ "normalized": false,
72
+ "rstrip": false,
73
+ "single_word": false,
74
+ "special": true
75
+ },
76
+ "100265": {
77
+ "content": "<issue_comment>",
78
+ "lstrip": false,
79
+ "normalized": false,
80
+ "rstrip": false,
81
+ "single_word": false,
82
+ "special": true
83
+ },
84
+ "100266": {
85
+ "content": "<issue_closed>",
86
+ "lstrip": false,
87
+ "normalized": false,
88
+ "rstrip": false,
89
+ "single_word": false,
90
+ "special": true
91
+ },
92
+ "100267": {
93
+ "content": "<jupyter_start>",
94
+ "lstrip": false,
95
+ "normalized": false,
96
+ "rstrip": false,
97
+ "single_word": false,
98
+ "special": true
99
+ },
100
+ "100268": {
101
+ "content": "<jupyter_text>",
102
+ "lstrip": false,
103
+ "normalized": false,
104
+ "rstrip": false,
105
+ "single_word": false,
106
+ "special": true
107
+ },
108
+ "100269": {
109
+ "content": "<jupyter_code>",
110
+ "lstrip": false,
111
+ "normalized": false,
112
+ "rstrip": false,
113
+ "single_word": false,
114
+ "special": true
115
+ },
116
+ "100270": {
117
+ "content": "<jupyter_output>",
118
+ "lstrip": false,
119
+ "normalized": false,
120
+ "rstrip": false,
121
+ "single_word": false,
122
+ "special": true
123
+ },
124
+ "100271": {
125
+ "content": "<empty_output>",
126
+ "lstrip": false,
127
+ "normalized": false,
128
+ "rstrip": false,
129
+ "single_word": false,
130
+ "special": true
131
+ },
132
+ "100272": {
133
+ "content": "<commit_before>",
134
+ "lstrip": false,
135
+ "normalized": false,
136
+ "rstrip": false,
137
+ "single_word": false,
138
+ "special": true
139
+ },
140
+ "100273": {
141
+ "content": "<commit_msg>",
142
+ "lstrip": false,
143
+ "normalized": false,
144
+ "rstrip": false,
145
+ "single_word": false,
146
+ "special": true
147
+ },
148
+ "100274": {
149
+ "content": "<commit_after>",
150
+ "lstrip": false,
151
+ "normalized": false,
152
+ "rstrip": false,
153
+ "single_word": false,
154
+ "special": true
155
+ },
156
+ "100275": {
157
+ "content": "<reponame>",
158
+ "lstrip": false,
159
+ "normalized": false,
160
+ "rstrip": false,
161
+ "single_word": false,
162
+ "special": true
163
+ },
164
+ "100276": {
165
+ "content": "<|endofprompt|>",
166
+ "lstrip": false,
167
+ "normalized": false,
168
+ "rstrip": false,
169
+ "single_word": false,
170
+ "special": true
171
+ },
172
+ "100277": {
173
+ "content": "<|im_start|>",
174
+ "lstrip": false,
175
+ "normalized": false,
176
+ "rstrip": false,
177
+ "single_word": false,
178
+ "special": true
179
+ },
180
+ "100278": {
181
+ "content": "<|im_end|>",
182
+ "lstrip": false,
183
+ "normalized": false,
184
+ "rstrip": false,
185
+ "single_word": false,
186
+ "special": true
187
+ },
188
+ "100279": {
189
+ "content": "<|pause|>",
190
+ "lstrip": false,
191
+ "normalized": false,
192
+ "rstrip": false,
193
+ "single_word": false,
194
+ "special": true
195
+ },
196
+ "100280": {
197
+ "content": "<|reg0|>",
198
+ "lstrip": false,
199
+ "normalized": false,
200
+ "rstrip": false,
201
+ "single_word": false,
202
+ "special": true
203
+ },
204
+ "100281": {
205
+ "content": "<|reg1|>",
206
+ "lstrip": false,
207
+ "normalized": false,
208
+ "rstrip": false,
209
+ "single_word": false,
210
+ "special": true
211
+ },
212
+ "100282": {
213
+ "content": "<|reg2|>",
214
+ "lstrip": false,
215
+ "normalized": false,
216
+ "rstrip": false,
217
+ "single_word": false,
218
+ "special": true
219
+ },
220
+ "100283": {
221
+ "content": "<|reg3|>",
222
+ "lstrip": false,
223
+ "normalized": false,
224
+ "rstrip": false,
225
+ "single_word": false,
226
+ "special": true
227
+ },
228
+ "100284": {
229
+ "content": "<|reg4|>",
230
+ "lstrip": false,
231
+ "normalized": false,
232
+ "rstrip": false,
233
+ "single_word": false,
234
+ "special": true
235
+ },
236
+ "100285": {
237
+ "content": "<|reg5|>",
238
+ "lstrip": false,
239
+ "normalized": false,
240
+ "rstrip": false,
241
+ "single_word": false,
242
+ "special": true
243
+ },
244
+ "100286": {
245
+ "content": "<|reg6|>",
246
+ "lstrip": false,
247
+ "normalized": false,
248
+ "rstrip": false,
249
+ "single_word": false,
250
+ "special": true
251
+ },
252
+ "100287": {
253
+ "content": "<|reg7|>",
254
+ "lstrip": false,
255
+ "normalized": false,
256
+ "rstrip": false,
257
+ "single_word": false,
258
+ "special": true
259
+ },
260
+ "100288": {
261
+ "content": "<|extra0|>",
262
  "lstrip": false,
263
  "normalized": false,
264
  "rstrip": false,
 
267
  }
268
  },
269
  "additional_special_tokens": [
270
+ "<|reg_extra|>",
271
+ "<|endoftext|>",
272
+ "<|fim_prefix|>",
273
+ "<|fim_middle|>",
274
+ "<|fim_suffix|>",
275
+ "<|fim_pad|>",
276
+ "<gh_stars>",
277
+ "<filename>",
278
+ "<issue_start>",
279
+ "<issue_comment>",
280
+ "<issue_closed>",
281
+ "<jupyter_start>",
282
+ "<jupyter_text>",
283
+ "<jupyter_code>",
284
+ "<jupyter_output>",
285
+ "<empty_output>",
286
+ "<commit_before>",
287
+ "<commit_msg>",
288
+ "<commit_after>",
289
+ "<reponame>",
290
+ "<|endofprompt|>",
291
+ "<|im_start|>",
292
+ "<|im_end|>",
293
+ "<|pause|>",
294
+ "<|reg0|>",
295
+ "<|reg1|>",
296
+ "<|reg2|>",
297
+ "<|reg3|>",
298
+ "<|reg4|>",
299
+ "<|reg5|>",
300
+ "<|reg6|>",
301
+ "<|reg7|>",
302
+ "<|extra0|>"
303
  ],
304
+ "bos_token": "<|endoftext|>",
305
+ "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}",
306
+ "clean_up_tokenization_spaces": true,
307
+ "eos_token": "<|endoftext|>",
308
+ "model_max_length": 2048,
309
+ "pad_token": "<|endoftext|>",
310
+ "tokenizer_class": "GPT2Tokenizer",
311
+ "unk_token": "<|endoftext|>"
 
 
 
 
312
  }
vocab.json ADDED
The diff for this file is too large to render. See raw diff