ddobokki committed on
Commit
563b168
·
verified ·
1 Parent(s): e3ba203

Upload tokenizer

Browse files
Files changed (5) hide show
  1. merges.txt +0 -0
  2. special_tokens_map.json +7 -56
  3. tokenizer.json +0 -0
  4. tokenizer_config.json +11 -516
  5. vocab.json +0 -0
merges.txt CHANGED
The diff for this file is too large to render. See raw diff
 
special_tokens_map.json CHANGED
@@ -1,78 +1,29 @@
1
  {
2
- "additional_special_tokens": [
3
- "<|bos|>",
4
- "<|pad|>",
5
- "<|unk|>",
6
- "<|start_header_id|>",
7
- "<|end_header_id|>",
8
- "<|eot_id|>",
9
- "<|unused_1|>",
10
- "<|unused_2|>",
11
- "<|unused_3|>",
12
- "<|unused_4|>",
13
- "<|unused_5|>",
14
- "<|unused_6|>",
15
- "<|unused_7|>",
16
- "<|unused_8|>",
17
- "<|unused_9|>",
18
- "<|unused_10|>",
19
- "<|unused_11|>",
20
- "<|unused_12|>",
21
- "<|unused_13|>",
22
- "<|unused_14|>",
23
- "<|unused_15|>",
24
- "<|unused_16|>",
25
- "<|unused_17|>",
26
- "<|unused_18|>",
27
- "<|unused_19|>",
28
- "<|unused_20|>",
29
- "<|unused_21|>",
30
- "<|unused_22|>",
31
- "<|unused_23|>",
32
- "<|unused_24|>",
33
- "<|unused_25|>",
34
- "<|unused_26|>",
35
- "<|unused_27|>",
36
- "<|unused_28|>",
37
- "<|unused_29|>",
38
- "<|unused_30|>",
39
- "<|unused_31|>",
40
- "<|unused_32|>",
41
- "<|unused_33|>",
42
- "<|unused_34|>",
43
- "<|unused_35|>",
44
- "<|unused_36|>",
45
- "<|unused_37|>",
46
- "<|unused_38|>",
47
- "<|unused_39|>",
48
- "<|unused_40|>",
49
- "<|unused_41|>"
50
- ],
51
  "bos_token": {
52
- "content": "<|bos|>",
53
  "lstrip": false,
54
- "normalized": false,
55
  "rstrip": false,
56
  "single_word": false
57
  },
58
  "eos_token": {
59
- "content": "<|endoftext|>",
60
  "lstrip": false,
61
  "normalized": true,
62
  "rstrip": false,
63
  "single_word": false
64
  },
65
  "pad_token": {
66
- "content": "<|pad|>",
67
  "lstrip": false,
68
- "normalized": false,
69
  "rstrip": false,
70
  "single_word": false
71
  },
72
  "unk_token": {
73
- "content": "<|unk|>",
74
  "lstrip": false,
75
- "normalized": false,
76
  "rstrip": false,
77
  "single_word": false
78
  }
 
1
  {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  "bos_token": {
3
+ "content": "<|begin_of_text|>",
4
  "lstrip": false,
5
+ "normalized": true,
6
  "rstrip": false,
7
  "single_word": false
8
  },
9
  "eos_token": {
10
+ "content": "<|end_of_text|>",
11
  "lstrip": false,
12
  "normalized": true,
13
  "rstrip": false,
14
  "single_word": false
15
  },
16
  "pad_token": {
17
+ "content": "<|end_of_text|>",
18
  "lstrip": false,
19
+ "normalized": true,
20
  "rstrip": false,
21
  "single_word": false
22
  },
23
  "unk_token": {
24
+ "content": "<|end_of_text|>",
25
  "lstrip": false,
26
+ "normalized": true,
27
  "rstrip": false,
28
  "single_word": false
29
  }
tokenizer.json CHANGED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json CHANGED
@@ -2,536 +2,31 @@
2
  "add_bos_token": true,
3
  "add_prefix_space": true,
4
  "added_tokens_decoder": {
5
- "15": {
6
- "content": "0",
7
- "lstrip": false,
8
- "normalized": false,
9
- "rstrip": false,
10
- "single_word": false,
11
- "special": true
12
- },
13
- "16": {
14
- "content": "1",
15
- "lstrip": false,
16
- "normalized": false,
17
- "rstrip": false,
18
- "single_word": false,
19
- "special": true
20
- },
21
- "17": {
22
- "content": "2",
23
- "lstrip": false,
24
- "normalized": false,
25
- "rstrip": false,
26
- "single_word": false,
27
- "special": true
28
- },
29
- "18": {
30
- "content": "3",
31
- "lstrip": false,
32
- "normalized": false,
33
- "rstrip": false,
34
- "single_word": false,
35
- "special": true
36
- },
37
- "19": {
38
- "content": "4",
39
- "lstrip": false,
40
- "normalized": false,
41
- "rstrip": false,
42
- "single_word": false,
43
- "special": true
44
- },
45
- "20": {
46
- "content": "5",
47
- "lstrip": false,
48
- "normalized": false,
49
- "rstrip": false,
50
- "single_word": false,
51
- "special": true
52
- },
53
- "21": {
54
- "content": "6",
55
- "lstrip": false,
56
- "normalized": false,
57
- "rstrip": false,
58
- "single_word": false,
59
- "special": true
60
- },
61
- "22": {
62
- "content": "7",
63
- "lstrip": false,
64
- "normalized": false,
65
- "rstrip": false,
66
- "single_word": false,
67
- "special": true
68
- },
69
- "23": {
70
- "content": "8",
71
- "lstrip": false,
72
- "normalized": false,
73
- "rstrip": false,
74
- "single_word": false,
75
- "special": true
76
- },
77
- "24": {
78
- "content": "9",
79
- "lstrip": false,
80
- "normalized": false,
81
- "rstrip": false,
82
- "single_word": false,
83
- "special": true
84
- },
85
- "50256": {
86
- "content": "<|endoftext|>",
87
  "lstrip": false,
88
  "normalized": true,
89
  "rstrip": false,
90
  "single_word": false,
91
  "special": true
92
  },
93
- "50257": {
94
- "content": "<|bos|>",
95
  "lstrip": false,
96
- "normalized": false,
97
- "rstrip": false,
98
- "single_word": false,
99
- "special": true
100
- },
101
- "50258": {
102
- "content": "<|pad|>",
103
- "lstrip": false,
104
- "normalized": false,
105
- "rstrip": false,
106
- "single_word": false,
107
- "special": true
108
- },
109
- "50259": {
110
- "content": "<|unk|>",
111
- "lstrip": false,
112
- "normalized": false,
113
- "rstrip": false,
114
- "single_word": false,
115
- "special": true
116
- },
117
- "50260": {
118
- "content": "<|start_header_id|>",
119
- "lstrip": false,
120
- "normalized": false,
121
- "rstrip": false,
122
- "single_word": false,
123
- "special": true
124
- },
125
- "50261": {
126
- "content": "<|end_header_id|>",
127
- "lstrip": false,
128
- "normalized": false,
129
- "rstrip": false,
130
- "single_word": false,
131
- "special": true
132
- },
133
- "50262": {
134
- "content": "<|eot_id|>",
135
- "lstrip": false,
136
- "normalized": false,
137
- "rstrip": false,
138
- "single_word": false,
139
- "special": true
140
- },
141
- "50263": {
142
- "content": "<|unused_1|>",
143
- "lstrip": false,
144
- "normalized": false,
145
- "rstrip": false,
146
- "single_word": false,
147
- "special": true
148
- },
149
- "50264": {
150
- "content": "<|unused_2|>",
151
- "lstrip": false,
152
- "normalized": false,
153
- "rstrip": false,
154
- "single_word": false,
155
- "special": true
156
- },
157
- "50265": {
158
- "content": "<|unused_3|>",
159
- "lstrip": false,
160
- "normalized": false,
161
- "rstrip": false,
162
- "single_word": false,
163
- "special": true
164
- },
165
- "50266": {
166
- "content": "<|unused_4|>",
167
- "lstrip": false,
168
- "normalized": false,
169
- "rstrip": false,
170
- "single_word": false,
171
- "special": true
172
- },
173
- "50267": {
174
- "content": "<|unused_5|>",
175
- "lstrip": false,
176
- "normalized": false,
177
- "rstrip": false,
178
- "single_word": false,
179
- "special": true
180
- },
181
- "50268": {
182
- "content": "<|unused_6|>",
183
- "lstrip": false,
184
- "normalized": false,
185
- "rstrip": false,
186
- "single_word": false,
187
- "special": true
188
- },
189
- "50269": {
190
- "content": "<|unused_7|>",
191
- "lstrip": false,
192
- "normalized": false,
193
- "rstrip": false,
194
- "single_word": false,
195
- "special": true
196
- },
197
- "50270": {
198
- "content": "<|unused_8|>",
199
- "lstrip": false,
200
- "normalized": false,
201
- "rstrip": false,
202
- "single_word": false,
203
- "special": true
204
- },
205
- "50271": {
206
- "content": "<|unused_9|>",
207
- "lstrip": false,
208
- "normalized": false,
209
- "rstrip": false,
210
- "single_word": false,
211
- "special": true
212
- },
213
- "50272": {
214
- "content": "<|unused_10|>",
215
- "lstrip": false,
216
- "normalized": false,
217
- "rstrip": false,
218
- "single_word": false,
219
- "special": true
220
- },
221
- "50273": {
222
- "content": "<|unused_11|>",
223
- "lstrip": false,
224
- "normalized": false,
225
- "rstrip": false,
226
- "single_word": false,
227
- "special": true
228
- },
229
- "50274": {
230
- "content": "<|unused_12|>",
231
- "lstrip": false,
232
- "normalized": false,
233
- "rstrip": false,
234
- "single_word": false,
235
- "special": true
236
- },
237
- "50275": {
238
- "content": "<|unused_13|>",
239
- "lstrip": false,
240
- "normalized": false,
241
- "rstrip": false,
242
- "single_word": false,
243
- "special": true
244
- },
245
- "50276": {
246
- "content": "<|unused_14|>",
247
- "lstrip": false,
248
- "normalized": false,
249
- "rstrip": false,
250
- "single_word": false,
251
- "special": true
252
- },
253
- "50277": {
254
- "content": "<|unused_15|>",
255
- "lstrip": false,
256
- "normalized": false,
257
- "rstrip": false,
258
- "single_word": false,
259
- "special": true
260
- },
261
- "50278": {
262
- "content": "<|unused_16|>",
263
- "lstrip": false,
264
- "normalized": false,
265
- "rstrip": false,
266
- "single_word": false,
267
- "special": true
268
- },
269
- "50279": {
270
- "content": "<|unused_17|>",
271
- "lstrip": false,
272
- "normalized": false,
273
- "rstrip": false,
274
- "single_word": false,
275
- "special": true
276
- },
277
- "50280": {
278
- "content": "<|unused_18|>",
279
- "lstrip": false,
280
- "normalized": false,
281
- "rstrip": false,
282
- "single_word": false,
283
- "special": true
284
- },
285
- "50281": {
286
- "content": "<|unused_19|>",
287
- "lstrip": false,
288
- "normalized": false,
289
- "rstrip": false,
290
- "single_word": false,
291
- "special": true
292
- },
293
- "50282": {
294
- "content": "<|unused_20|>",
295
- "lstrip": false,
296
- "normalized": false,
297
- "rstrip": false,
298
- "single_word": false,
299
- "special": true
300
- },
301
- "50283": {
302
- "content": "<|unused_21|>",
303
- "lstrip": false,
304
- "normalized": false,
305
- "rstrip": false,
306
- "single_word": false,
307
- "special": true
308
- },
309
- "50284": {
310
- "content": "<|unused_22|>",
311
- "lstrip": false,
312
- "normalized": false,
313
- "rstrip": false,
314
- "single_word": false,
315
- "special": true
316
- },
317
- "50285": {
318
- "content": "<|unused_23|>",
319
- "lstrip": false,
320
- "normalized": false,
321
- "rstrip": false,
322
- "single_word": false,
323
- "special": true
324
- },
325
- "50286": {
326
- "content": "<|unused_24|>",
327
- "lstrip": false,
328
- "normalized": false,
329
- "rstrip": false,
330
- "single_word": false,
331
- "special": true
332
- },
333
- "50287": {
334
- "content": "<|unused_25|>",
335
- "lstrip": false,
336
- "normalized": false,
337
- "rstrip": false,
338
- "single_word": false,
339
- "special": true
340
- },
341
- "50288": {
342
- "content": "<|unused_26|>",
343
- "lstrip": false,
344
- "normalized": false,
345
- "rstrip": false,
346
- "single_word": false,
347
- "special": true
348
- },
349
- "50289": {
350
- "content": "<|unused_27|>",
351
- "lstrip": false,
352
- "normalized": false,
353
- "rstrip": false,
354
- "single_word": false,
355
- "special": true
356
- },
357
- "50290": {
358
- "content": "<|unused_28|>",
359
- "lstrip": false,
360
- "normalized": false,
361
- "rstrip": false,
362
- "single_word": false,
363
- "special": true
364
- },
365
- "50291": {
366
- "content": "<|unused_29|>",
367
- "lstrip": false,
368
- "normalized": false,
369
- "rstrip": false,
370
- "single_word": false,
371
- "special": true
372
- },
373
- "50292": {
374
- "content": "<|unused_30|>",
375
- "lstrip": false,
376
- "normalized": false,
377
- "rstrip": false,
378
- "single_word": false,
379
- "special": true
380
- },
381
- "50293": {
382
- "content": "<|unused_31|>",
383
- "lstrip": false,
384
- "normalized": false,
385
- "rstrip": false,
386
- "single_word": false,
387
- "special": true
388
- },
389
- "50294": {
390
- "content": "<|unused_32|>",
391
- "lstrip": false,
392
- "normalized": false,
393
- "rstrip": false,
394
- "single_word": false,
395
- "special": true
396
- },
397
- "50295": {
398
- "content": "<|unused_33|>",
399
- "lstrip": false,
400
- "normalized": false,
401
- "rstrip": false,
402
- "single_word": false,
403
- "special": true
404
- },
405
- "50296": {
406
- "content": "<|unused_34|>",
407
- "lstrip": false,
408
- "normalized": false,
409
- "rstrip": false,
410
- "single_word": false,
411
- "special": true
412
- },
413
- "50297": {
414
- "content": "<|unused_35|>",
415
- "lstrip": false,
416
- "normalized": false,
417
- "rstrip": false,
418
- "single_word": false,
419
- "special": true
420
- },
421
- "50298": {
422
- "content": "<|unused_36|>",
423
- "lstrip": false,
424
- "normalized": false,
425
- "rstrip": false,
426
- "single_word": false,
427
- "special": true
428
- },
429
- "50299": {
430
- "content": "<|unused_37|>",
431
- "lstrip": false,
432
- "normalized": false,
433
- "rstrip": false,
434
- "single_word": false,
435
- "special": true
436
- },
437
- "50300": {
438
- "content": "<|unused_38|>",
439
- "lstrip": false,
440
- "normalized": false,
441
- "rstrip": false,
442
- "single_word": false,
443
- "special": true
444
- },
445
- "50301": {
446
- "content": "<|unused_39|>",
447
- "lstrip": false,
448
- "normalized": false,
449
- "rstrip": false,
450
- "single_word": false,
451
- "special": true
452
- },
453
- "50302": {
454
- "content": "<|unused_40|>",
455
- "lstrip": false,
456
- "normalized": false,
457
- "rstrip": false,
458
- "single_word": false,
459
- "special": true
460
- },
461
- "50303": {
462
- "content": "<|unused_41|>",
463
- "lstrip": false,
464
- "normalized": false,
465
  "rstrip": false,
466
  "single_word": false,
467
  "special": true
468
  }
469
  },
470
- "additional_special_tokens": [
471
- "<|bos|>",
472
- "<|pad|>",
473
- "<|unk|>",
474
- "<|start_header_id|>",
475
- "<|end_header_id|>",
476
- "<|eot_id|>",
477
- "0",
478
- "1",
479
- "2",
480
- "3",
481
- "4",
482
- "5",
483
- "6",
484
- "7",
485
- "8",
486
- "9",
487
- "<|unused_1|>",
488
- "<|unused_2|>",
489
- "<|unused_3|>",
490
- "<|unused_4|>",
491
- "<|unused_5|>",
492
- "<|unused_6|>",
493
- "<|unused_7|>",
494
- "<|unused_8|>",
495
- "<|unused_9|>",
496
- "<|unused_10|>",
497
- "<|unused_11|>",
498
- "<|unused_12|>",
499
- "<|unused_13|>",
500
- "<|unused_14|>",
501
- "<|unused_15|>",
502
- "<|unused_16|>",
503
- "<|unused_17|>",
504
- "<|unused_18|>",
505
- "<|unused_19|>",
506
- "<|unused_20|>",
507
- "<|unused_21|>",
508
- "<|unused_22|>",
509
- "<|unused_23|>",
510
- "<|unused_24|>",
511
- "<|unused_25|>",
512
- "<|unused_26|>",
513
- "<|unused_27|>",
514
- "<|unused_28|>",
515
- "<|unused_29|>",
516
- "<|unused_30|>",
517
- "<|unused_31|>",
518
- "<|unused_32|>",
519
- "<|unused_33|>",
520
- "<|unused_34|>",
521
- "<|unused_35|>",
522
- "<|unused_36|>",
523
- "<|unused_37|>",
524
- "<|unused_38|>",
525
- "<|unused_39|>",
526
- "<|unused_40|>",
527
- "<|unused_41|>"
528
- ],
529
- "bos_token": "<|bos|>",
530
  "clean_up_tokenization_spaces": true,
531
- "eos_token": "<|endoftext|>",
532
  "errors": "replace",
533
  "model_max_length": 4096,
534
- "pad_token": "<|pad|>",
 
535
  "tokenizer_class": "GPT2Tokenizer",
536
- "unk_token": "<|unk|>"
 
537
  }
 
2
  "add_bos_token": true,
3
  "add_prefix_space": true,
4
  "added_tokens_decoder": {
5
+ "0": {
6
+ "content": "<|begin_of_text|>",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
  "lstrip": false,
8
  "normalized": true,
9
  "rstrip": false,
10
  "single_word": false,
11
  "special": true
12
  },
13
+ "1": {
14
+ "content": "<|end_of_text|>",
15
  "lstrip": false,
16
+ "normalized": true,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
  "rstrip": false,
18
  "single_word": false,
19
  "special": true
20
  }
21
  },
22
+ "bos_token": "<|begin_of_text|>",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
  "clean_up_tokenization_spaces": true,
24
+ "eos_token": "<|end_of_text|>",
25
  "errors": "replace",
26
  "model_max_length": 4096,
27
+ "pad_token": "<|end_of_text|>",
28
+ "padding_side": "left",
29
  "tokenizer_class": "GPT2Tokenizer",
30
+ "truncation_side": "left",
31
+ "unk_token": "<|end_of_text|>"
32
  }
vocab.json CHANGED
The diff for this file is too large to render. See raw diff