procesaur committed on
Commit
dd1c587
1 Parent(s): cf4f846

Upload tokenizer.json

Browse files
Files changed (1) hide show
  1. tokenizer.json +453 -27
tokenizer.json CHANGED
@@ -5,73 +5,499 @@
5
  "added_tokens": [
6
  {
7
  "id": 0,
8
- "special": true,
9
  "content": "<s>",
10
  "single_word": false,
11
  "lstrip": false,
12
  "rstrip": false,
13
- "normalized": false
 
14
  },
15
  {
16
  "id": 1,
17
- "special": true,
18
  "content": "<pad>",
19
  "single_word": false,
20
  "lstrip": false,
21
  "rstrip": false,
22
- "normalized": false
 
23
  },
24
  {
25
  "id": 2,
26
- "special": true,
27
  "content": "</s>",
28
  "single_word": false,
29
  "lstrip": false,
30
  "rstrip": false,
31
- "normalized": false
 
32
  },
33
  {
34
  "id": 3,
35
- "special": true,
36
  "content": "<unk>",
37
  "single_word": false,
38
  "lstrip": false,
39
  "rstrip": false,
40
- "normalized": false
 
41
  },
42
  {
43
  "id": 4,
44
- "special": true,
45
  "content": "<mask>",
46
  "single_word": false,
47
  "lstrip": false,
48
  "rstrip": false,
49
- "normalized": false
 
50
  }
51
  ],
52
- "normalizer": null,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
  "pre_tokenizer": {
54
  "type": "ByteLevel",
55
- "add_prefix_space": true,
56
- "trim_offsets": true
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
  },
58
- "post_processor":{
59
- "type":"RobertaProcessing",
60
- "sep":[
61
- "</s>",
62
- 2
63
- ],
64
- "cls":[
65
- "<s>",
66
- 0
67
- ],
68
- "trim_offsets":true,
69
- "add_prefix_space":false
70
- },
71
  "decoder": {
72
  "type": "ByteLevel",
73
  "add_prefix_space": true,
74
- "trim_offsets": true
 
75
  },
76
  "model": {
77
  "type": "BPE",
 
5
  "added_tokens": [
6
  {
7
  "id": 0,
 
8
  "content": "<s>",
9
  "single_word": false,
10
  "lstrip": false,
11
  "rstrip": false,
12
+ "normalized": false,
13
+ "special": true
14
  },
15
  {
16
  "id": 1,
 
17
  "content": "<pad>",
18
  "single_word": false,
19
  "lstrip": false,
20
  "rstrip": false,
21
+ "normalized": false,
22
+ "special": true
23
  },
24
  {
25
  "id": 2,
 
26
  "content": "</s>",
27
  "single_word": false,
28
  "lstrip": false,
29
  "rstrip": false,
30
+ "normalized": false,
31
+ "special": true
32
  },
33
  {
34
  "id": 3,
 
35
  "content": "<unk>",
36
  "single_word": false,
37
  "lstrip": false,
38
  "rstrip": false,
39
+ "normalized": false,
40
+ "special": true
41
  },
42
  {
43
  "id": 4,
 
44
  "content": "<mask>",
45
  "single_word": false,
46
  "lstrip": false,
47
  "rstrip": false,
48
+ "normalized": false,
49
+ "special": true
50
  }
51
  ],
52
+ "normalizer": {
53
+ "type": "Sequence",
54
+ "normalizers": [
55
+ {
56
+ "type": "Replace",
57
+ "pattern": {
58
+ "String": "а"
59
+ },
60
+ "content": "a"
61
+ },
62
+ {
63
+ "type": "Replace",
64
+ "pattern": {
65
+ "String": "б"
66
+ },
67
+ "content": "b"
68
+ },
69
+ {
70
+ "type": "Replace",
71
+ "pattern": {
72
+ "String": "в"
73
+ },
74
+ "content": "v"
75
+ },
76
+ {
77
+ "type": "Replace",
78
+ "pattern": {
79
+ "String": "г"
80
+ },
81
+ "content": "g"
82
+ },
83
+ {
84
+ "type": "Replace",
85
+ "pattern": {
86
+ "String": "д"
87
+ },
88
+ "content": "d"
89
+ },
90
+ {
91
+ "type": "Replace",
92
+ "pattern": {
93
+ "String": "ђ"
94
+ },
95
+ "content": "đ"
96
+ },
97
+ {
98
+ "type": "Replace",
99
+ "pattern": {
100
+ "String": "е"
101
+ },
102
+ "content": "e"
103
+ },
104
+ {
105
+ "type": "Replace",
106
+ "pattern": {
107
+ "String": "ж"
108
+ },
109
+ "content": "ž"
110
+ },
111
+ {
112
+ "type": "Replace",
113
+ "pattern": {
114
+ "String": "з"
115
+ },
116
+ "content": "z"
117
+ },
118
+ {
119
+ "type": "Replace",
120
+ "pattern": {
121
+ "String": "и"
122
+ },
123
+ "content": "i"
124
+ },
125
+ {
126
+ "type": "Replace",
127
+ "pattern": {
128
+ "String": "ј"
129
+ },
130
+ "content": "j"
131
+ },
132
+ {
133
+ "type": "Replace",
134
+ "pattern": {
135
+ "String": "к"
136
+ },
137
+ "content": "k"
138
+ },
139
+ {
140
+ "type": "Replace",
141
+ "pattern": {
142
+ "String": "л"
143
+ },
144
+ "content": "l"
145
+ },
146
+ {
147
+ "type": "Replace",
148
+ "pattern": {
149
+ "String": "љ"
150
+ },
151
+ "content": "lj"
152
+ },
153
+ {
154
+ "type": "Replace",
155
+ "pattern": {
156
+ "String": "м"
157
+ },
158
+ "content": "m"
159
+ },
160
+ {
161
+ "type": "Replace",
162
+ "pattern": {
163
+ "String": "н"
164
+ },
165
+ "content": "n"
166
+ },
167
+ {
168
+ "type": "Replace",
169
+ "pattern": {
170
+ "String": "њ"
171
+ },
172
+ "content": "nj"
173
+ },
174
+ {
175
+ "type": "Replace",
176
+ "pattern": {
177
+ "String": "о"
178
+ },
179
+ "content": "o"
180
+ },
181
+ {
182
+ "type": "Replace",
183
+ "pattern": {
184
+ "String": "п"
185
+ },
186
+ "content": "p"
187
+ },
188
+ {
189
+ "type": "Replace",
190
+ "pattern": {
191
+ "String": "р"
192
+ },
193
+ "content": "r"
194
+ },
195
+ {
196
+ "type": "Replace",
197
+ "pattern": {
198
+ "String": "с"
199
+ },
200
+ "content": "s"
201
+ },
202
+ {
203
+ "type": "Replace",
204
+ "pattern": {
205
+ "String": "т"
206
+ },
207
+ "content": "t"
208
+ },
209
+ {
210
+ "type": "Replace",
211
+ "pattern": {
212
+ "String": "ћ"
213
+ },
214
+ "content": "ć"
215
+ },
216
+ {
217
+ "type": "Replace",
218
+ "pattern": {
219
+ "String": "у"
220
+ },
221
+ "content": "u"
222
+ },
223
+ {
224
+ "type": "Replace",
225
+ "pattern": {
226
+ "String": "ф"
227
+ },
228
+ "content": "f"
229
+ },
230
+ {
231
+ "type": "Replace",
232
+ "pattern": {
233
+ "String": "х"
234
+ },
235
+ "content": "h"
236
+ },
237
+ {
238
+ "type": "Replace",
239
+ "pattern": {
240
+ "String": "ц"
241
+ },
242
+ "content": "c"
243
+ },
244
+ {
245
+ "type": "Replace",
246
+ "pattern": {
247
+ "String": "ч"
248
+ },
249
+ "content": "č"
250
+ },
251
+ {
252
+ "type": "Replace",
253
+ "pattern": {
254
+ "String": "џ"
255
+ },
256
+ "content": "dž"
257
+ },
258
+ {
259
+ "type": "Replace",
260
+ "pattern": {
261
+ "String": "ш"
262
+ },
263
+ "content": "š"
264
+ },
265
+ {
266
+ "type": "Replace",
267
+ "pattern": {
268
+ "String": "А"
269
+ },
270
+ "content": "A"
271
+ },
272
+ {
273
+ "type": "Replace",
274
+ "pattern": {
275
+ "String": "Б"
276
+ },
277
+ "content": "B"
278
+ },
279
+ {
280
+ "type": "Replace",
281
+ "pattern": {
282
+ "String": "В"
283
+ },
284
+ "content": "V"
285
+ },
286
+ {
287
+ "type": "Replace",
288
+ "pattern": {
289
+ "String": "Г"
290
+ },
291
+ "content": "G"
292
+ },
293
+ {
294
+ "type": "Replace",
295
+ "pattern": {
296
+ "String": "Д"
297
+ },
298
+ "content": "D"
299
+ },
300
+ {
301
+ "type": "Replace",
302
+ "pattern": {
303
+ "String": "Ђ"
304
+ },
305
+ "content": "Đ"
306
+ },
307
+ {
308
+ "type": "Replace",
309
+ "pattern": {
310
+ "String": "Е"
311
+ },
312
+ "content": "E"
313
+ },
314
+ {
315
+ "type": "Replace",
316
+ "pattern": {
317
+ "String": "Ж"
318
+ },
319
+ "content": "Ž"
320
+ },
321
+ {
322
+ "type": "Replace",
323
+ "pattern": {
324
+ "String": "З"
325
+ },
326
+ "content": "Z"
327
+ },
328
+ {
329
+ "type": "Replace",
330
+ "pattern": {
331
+ "String": "И"
332
+ },
333
+ "content": "I"
334
+ },
335
+ {
336
+ "type": "Replace",
337
+ "pattern": {
338
+ "String": "Ј"
339
+ },
340
+ "content": "J"
341
+ },
342
+ {
343
+ "type": "Replace",
344
+ "pattern": {
345
+ "String": "К"
346
+ },
347
+ "content": "K"
348
+ },
349
+ {
350
+ "type": "Replace",
351
+ "pattern": {
352
+ "String": "Л"
353
+ },
354
+ "content": "L"
355
+ },
356
+ {
357
+ "type": "Replace",
358
+ "pattern": {
359
+ "String": "Љ"
360
+ },
361
+ "content": "Lj"
362
+ },
363
+ {
364
+ "type": "Replace",
365
+ "pattern": {
366
+ "String": "М"
367
+ },
368
+ "content": "M"
369
+ },
370
+ {
371
+ "type": "Replace",
372
+ "pattern": {
373
+ "String": "Н"
374
+ },
375
+ "content": "N"
376
+ },
377
+ {
378
+ "type": "Replace",
379
+ "pattern": {
380
+ "String": "Њ"
381
+ },
382
+ "content": "Nj"
383
+ },
384
+ {
385
+ "type": "Replace",
386
+ "pattern": {
387
+ "String": "О"
388
+ },
389
+ "content": "O"
390
+ },
391
+ {
392
+ "type": "Replace",
393
+ "pattern": {
394
+ "String": "П"
395
+ },
396
+ "content": "P"
397
+ },
398
+ {
399
+ "type": "Replace",
400
+ "pattern": {
401
+ "String": "Р"
402
+ },
403
+ "content": "R"
404
+ },
405
+ {
406
+ "type": "Replace",
407
+ "pattern": {
408
+ "String": "С"
409
+ },
410
+ "content": "S"
411
+ },
412
+ {
413
+ "type": "Replace",
414
+ "pattern": {
415
+ "String": "Т"
416
+ },
417
+ "content": "T"
418
+ },
419
+ {
420
+ "type": "Replace",
421
+ "pattern": {
422
+ "String": "Ћ"
423
+ },
424
+ "content": "Ć"
425
+ },
426
+ {
427
+ "type": "Replace",
428
+ "pattern": {
429
+ "String": "У"
430
+ },
431
+ "content": "U"
432
+ },
433
+ {
434
+ "type": "Replace",
435
+ "pattern": {
436
+ "String": "Ф"
437
+ },
438
+ "content": "F"
439
+ },
440
+ {
441
+ "type": "Replace",
442
+ "pattern": {
443
+ "String": "Х"
444
+ },
445
+ "content": "H"
446
+ },
447
+ {
448
+ "type": "Replace",
449
+ "pattern": {
450
+ "String": "Ц"
451
+ },
452
+ "content": "C"
453
+ },
454
+ {
455
+ "type": "Replace",
456
+ "pattern": {
457
+ "String": "Ч"
458
+ },
459
+ "content": "Č"
460
+ },
461
+ {
462
+ "type": "Replace",
463
+ "pattern": {
464
+ "String": "Џ"
465
+ },
466
+ "content": "Dž"
467
+ },
468
+ {
469
+ "type": "Replace",
470
+ "pattern": {
471
+ "String": "Ш"
472
+ },
473
+ "content": "Š"
474
+ }
475
+ ]
476
+ },
477
  "pre_tokenizer": {
478
  "type": "ByteLevel",
479
+ "add_prefix_space": false,
480
+ "trim_offsets": true,
481
+ "use_regex": true
482
+ },
483
+ "post_processor": {
484
+ "type": "RobertaProcessing",
485
+ "sep": [
486
+ "</s>",
487
+ 2
488
+ ],
489
+ "cls": [
490
+ "<s>",
491
+ 0
492
+ ],
493
+ "trim_offsets": true,
494
+ "add_prefix_space": false
495
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
496
  "decoder": {
497
  "type": "ByteLevel",
498
  "add_prefix_space": true,
499
+ "trim_offsets": true,
500
+ "use_regex": true
501
  },
502
  "model": {
503
  "type": "BPE",