GinnM committed on
Commit
f703b6e
1 Parent(s): 3864e99

Upload tokenizer

Browse files
Files changed (3) hide show
  1. special_tokens_map.json +9 -0
  2. tokenizer.json +540 -0
  3. tokenizer_config.json +5 -0
special_tokens_map.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<cls>",
3
+ "cls_token": "<cls>",
4
+ "eos_token": "<sep>",
5
+ "mask_token": "<mask>",
6
+ "pad_token": "<pad>",
7
+ "sep_token": "<sep>",
8
+ "unk_token": "<unk>"
9
+ }
tokenizer.json ADDED
@@ -0,0 +1,540 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "version": "1.0",
3
+ "truncation": null,
4
+ "padding": null,
5
+ "added_tokens": [
6
+ {
7
+ "id": 0,
8
+ "content": "<pad>",
9
+ "single_word": false,
10
+ "lstrip": false,
11
+ "rstrip": false,
12
+ "normalized": false,
13
+ "special": true
14
+ },
15
+ {
16
+ "id": 1,
17
+ "content": "<cls>",
18
+ "single_word": false,
19
+ "lstrip": false,
20
+ "rstrip": false,
21
+ "normalized": false,
22
+ "special": true
23
+ },
24
+ {
25
+ "id": 2,
26
+ "content": "<sep>",
27
+ "single_word": false,
28
+ "lstrip": false,
29
+ "rstrip": false,
30
+ "normalized": false,
31
+ "special": true
32
+ },
33
+ {
34
+ "id": 3,
35
+ "content": "<unk>",
36
+ "single_word": false,
37
+ "lstrip": false,
38
+ "rstrip": false,
39
+ "normalized": false,
40
+ "special": true
41
+ },
42
+ {
43
+ "id": 4,
44
+ "content": "<mask>",
45
+ "single_word": false,
46
+ "lstrip": false,
47
+ "rstrip": false,
48
+ "normalized": false,
49
+ "special": true
50
+ }
51
+ ],
52
+ "normalizer": null,
53
+ "pre_tokenizer": null,
54
+ "post_processor": {
55
+ "type": "TemplateProcessing",
56
+ "single": [
57
+ {
58
+ "SpecialToken": {
59
+ "id": "<cls>",
60
+ "type_id": 0
61
+ }
62
+ },
63
+ {
64
+ "Sequence": {
65
+ "id": "A",
66
+ "type_id": 0
67
+ }
68
+ },
69
+ {
70
+ "SpecialToken": {
71
+ "id": "<sep>",
72
+ "type_id": 0
73
+ }
74
+ }
75
+ ],
76
+ "pair": [
77
+ {
78
+ "SpecialToken": {
79
+ "id": "<cls>",
80
+ "type_id": 0
81
+ }
82
+ },
83
+ {
84
+ "Sequence": {
85
+ "id": "A",
86
+ "type_id": 0
87
+ }
88
+ },
89
+ {
90
+ "SpecialToken": {
91
+ "id": "<sep>",
92
+ "type_id": 0
93
+ }
94
+ },
95
+ {
96
+ "Sequence": {
97
+ "id": "B",
98
+ "type_id": 1
99
+ }
100
+ },
101
+ {
102
+ "SpecialToken": {
103
+ "id": "<sep>",
104
+ "type_id": 1
105
+ }
106
+ }
107
+ ],
108
+ "special_tokens": {
109
+ "<cls>": {
110
+ "id": "<cls>",
111
+ "ids": [
112
+ 1
113
+ ],
114
+ "tokens": [
115
+ "<cls>"
116
+ ]
117
+ },
118
+ "<sep>": {
119
+ "id": "<sep>",
120
+ "ids": [
121
+ 2
122
+ ],
123
+ "tokens": [
124
+ "<sep>"
125
+ ]
126
+ }
127
+ }
128
+ },
129
+ "decoder": {
130
+ "type": "Metaspace",
131
+ "replacement": "▁",
132
+ "add_prefix_space": true
133
+ },
134
+ "model": {
135
+ "type": "Unigram",
136
+ "unk_id": 3,
137
+ "vocab": [
138
+ [
139
+ "<pad>",
140
+ 0.0
141
+ ],
142
+ [
143
+ "<cls>",
144
+ 0.0
145
+ ],
146
+ [
147
+ "<sep>",
148
+ 0.0
149
+ ],
150
+ [
151
+ "<unk>",
152
+ 0.0
153
+ ],
154
+ [
155
+ "<mask>",
156
+ 0.0
157
+ ],
158
+ [
159
+ "K",
160
+ -2.8926598139924984
161
+ ],
162
+ [
163
+ "P",
164
+ -2.937301100450547
165
+ ],
166
+ [
167
+ "R",
168
+ -2.981133587027994
169
+ ],
170
+ [
171
+ "N",
172
+ -2.984398973523902
173
+ ],
174
+ [
175
+ "F",
176
+ -3.0405482512940942
177
+ ],
178
+ [
179
+ "Q",
180
+ -3.0522167753465475
181
+ ],
182
+ [
183
+ "T",
184
+ -3.057893521866589
185
+ ],
186
+ [
187
+ "D",
188
+ -3.221572989674069
189
+ ],
190
+ [
191
+ "V",
192
+ -3.242957729449966
193
+ ],
194
+ [
195
+ "I",
196
+ -3.2639228157131353
197
+ ],
198
+ [
199
+ "G",
200
+ -3.292222729720054
201
+ ],
202
+ [
203
+ "Y",
204
+ -3.3242430893206976
205
+ ],
206
+ [
207
+ "E",
208
+ -3.329998422834164
209
+ ],
210
+ [
211
+ "A",
212
+ -3.3315100842234955
213
+ ],
214
+ [
215
+ "L",
216
+ -3.354133164459414
217
+ ],
218
+ [
219
+ "S",
220
+ -3.374619245946816
221
+ ],
222
+ [
223
+ "H",
224
+ -3.6029119248880974
225
+ ],
226
+ [
227
+ "M",
228
+ -3.6330880431340073
229
+ ],
230
+ [
231
+ "C",
232
+ -4.03859206801765
233
+ ],
234
+ [
235
+ "W",
236
+ -4.123791187360837
237
+ ],
238
+ [
239
+ "AA",
240
+ -4.973167425197216
241
+ ],
242
+ [
243
+ "LL",
244
+ -5.036049835340242
245
+ ],
246
+ [
247
+ "LA",
248
+ -5.194188976627833
249
+ ],
250
+ [
251
+ "LS",
252
+ -5.284020121682021
253
+ ],
254
+ [
255
+ "AL",
256
+ -5.320469682162143
257
+ ],
258
+ [
259
+ "SS",
260
+ -5.32453022570105
261
+ ],
262
+ [
263
+ "SL",
264
+ -5.499862731096803
265
+ ],
266
+ [
267
+ "LV",
268
+ -5.564795933199976
269
+ ],
270
+ [
271
+ "AG",
272
+ -5.570161883025232
273
+ ],
274
+ [
275
+ "GG",
276
+ -5.586688874755673
277
+ ],
278
+ [
279
+ "AV",
280
+ -5.633866882153034
281
+ ],
282
+ [
283
+ "LG",
284
+ -5.6617838822297415
285
+ ],
286
+ [
287
+ "EL",
288
+ -5.710679099236735
289
+ ],
290
+ [
291
+ "AS",
292
+ -5.716264001404388
293
+ ],
294
+ [
295
+ "VL",
296
+ -5.7256437689651545
297
+ ],
298
+ [
299
+ "SG",
300
+ -5.727475385041664
301
+ ],
302
+ [
303
+ "LR",
304
+ -5.728549131116866
305
+ ],
306
+ [
307
+ "LE",
308
+ -5.743482374609412
309
+ ],
310
+ [
311
+ "GL",
312
+ -5.743495521291919
313
+ ],
314
+ [
315
+ "SA",
316
+ -5.761398906433778
317
+ ],
318
+ [
319
+ "GA",
320
+ -5.779501575037608
321
+ ],
322
+ [
323
+ "EE",
324
+ -5.783251373522621
325
+ ],
326
+ [
327
+ "RL",
328
+ -5.787802487132645
329
+ ],
330
+ [
331
+ "VA",
332
+ -5.822397606320607
333
+ ],
334
+ [
335
+ "TL",
336
+ -5.826633127578567
337
+ ],
338
+ [
339
+ "GS",
340
+ -5.836663706756646
341
+ ],
342
+ [
343
+ "EA",
344
+ -5.854012392063035
345
+ ],
346
+ [
347
+ "TA",
348
+ -5.87623583133286
349
+ ],
350
+ [
351
+ "LD",
352
+ -5.884057317313408
353
+ ],
354
+ [
355
+ "VV",
356
+ -5.886122471861352
357
+ ],
358
+ [
359
+ "LT",
360
+ -5.890501009220628
361
+ ],
362
+ [
363
+ "IL",
364
+ -5.893641541215938
365
+ ],
366
+ [
367
+ "DL",
368
+ -5.908589432452223
369
+ ],
370
+ [
371
+ "SV",
372
+ -5.928508582466128
373
+ ],
374
+ [
375
+ "LP",
376
+ -5.930607331306064
377
+ ],
378
+ [
379
+ "GV",
380
+ -5.931353514901755
381
+ ],
382
+ [
383
+ "IS",
384
+ -5.934577159636316
385
+ ],
386
+ [
387
+ "RR",
388
+ -5.937181139038799
389
+ ],
390
+ [
391
+ "PA",
392
+ -5.95165892093522
393
+ ],
394
+ [
395
+ "LI",
396
+ -5.956444805023796
397
+ ],
398
+ [
399
+ "DG",
400
+ -5.959872500616557
401
+ ],
402
+ [
403
+ "LK",
404
+ -5.977419747714027
405
+ ],
406
+ [
407
+ "AE",
408
+ -5.988346330589572
409
+ ],
410
+ [
411
+ "VS",
412
+ -5.992644676581763
413
+ ],
414
+ [
415
+ "TV",
416
+ -5.992771047038934
417
+ ],
418
+ [
419
+ "TS",
420
+ -5.992862744926201
421
+ ],
422
+ [
423
+ "DA",
424
+ -6.005980312620514
425
+ ],
426
+ [
427
+ "TG",
428
+ -6.020054424800335
429
+ ],
430
+ [
431
+ "IA",
432
+ -6.029275865888938
433
+ ],
434
+ [
435
+ "PS",
436
+ -6.0558861763675385
437
+ ],
438
+ [
439
+ "RA",
440
+ -6.091301969244897
441
+ ],
442
+ [
443
+ "EK",
444
+ -6.143807832748726
445
+ ],
446
+ [
447
+ "EV",
448
+ -6.144172113809702
449
+ ],
450
+ [
451
+ "ST",
452
+ -6.1659183042917824
453
+ ],
454
+ [
455
+ "DE",
456
+ -6.175007806118581
457
+ ],
458
+ [
459
+ "SI",
460
+ -6.196141783532548
461
+ ],
462
+ [
463
+ "IV",
464
+ -6.203941668151083
465
+ ],
466
+ [
467
+ "DV",
468
+ -6.2062585412907065
469
+ ],
470
+ [
471
+ "AD",
472
+ -6.214482574668974
473
+ ],
474
+ [
475
+ "RS",
476
+ -6.219463683778585
477
+ ],
478
+ [
479
+ "EI",
480
+ -6.2248807356059555
481
+ ],
482
+ [
483
+ "GE",
484
+ -6.253174496738863
485
+ ],
486
+ [
487
+ "IE",
488
+ -6.264016428440042
489
+ ],
490
+ [
491
+ "ES",
492
+ -6.272022775598392
493
+ ],
494
+ [
495
+ "SE",
496
+ -6.288530115689479
497
+ ],
498
+ [
499
+ "IG",
500
+ -6.300749800545876
501
+ ],
502
+ [
503
+ "SD",
504
+ -6.305378344633647
505
+ ],
506
+ [
507
+ "VG",
508
+ -6.307254140261982
509
+ ],
510
+ [
511
+ "VE",
512
+ -6.322491387400987
513
+ ],
514
+ [
515
+ "GI",
516
+ -6.346224238884332
517
+ ],
518
+ [
519
+ "X",
520
+ -7.588889207971681
521
+ ],
522
+ [
523
+ "B",
524
+ -14.434103952435104
525
+ ],
526
+ [
527
+ "Z",
528
+ -15.661068824545652
529
+ ],
530
+ [
531
+ "U",
532
+ -16.34051994314413
533
+ ],
534
+ [
535
+ "O",
536
+ -19.83562802135349
537
+ ]
538
+ ]
539
+ }
540
+ }
tokenizer_config.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "clean_up_tokenization_spaces": true,
3
+ "model_max_length": 1000000000000000019884624838656,
4
+ "tokenizer_class": "PreTrainedTokenizerFast"
5
+ }