Tanor committed on
Commit
de7a64d
1 Parent(s): 61e6105

Upload tokenizer

Browse files
Files changed (1) hide show
  1. tokenizer.json +426 -7
tokenizer.json CHANGED
@@ -1,11 +1,6 @@
1
  {
2
  "version": "1.0",
3
- "truncation": {
4
- "direction": "Right",
5
- "max_length": 300,
6
- "strategy": "LongestFirst",
7
- "stride": 0
8
- },
9
  "padding": null,
10
  "added_tokens": [
11
  {
@@ -63,7 +58,431 @@
63
  "special": true
64
  }
65
  ],
66
- "normalizer": null,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67
  "pre_tokenizer": {
68
  "type": "ByteLevel",
69
  "add_prefix_space": false,
 
1
  {
2
  "version": "1.0",
3
+ "truncation": null,
 
 
 
 
 
4
  "padding": null,
5
  "added_tokens": [
6
  {
 
58
  "special": true
59
  }
60
  ],
61
+ "normalizer": {
62
+ "type": "Sequence",
63
+ "normalizers": [
64
+ {
65
+ "type": "Replace",
66
+ "pattern": {
67
+ "String": "а"
68
+ },
69
+ "content": "a"
70
+ },
71
+ {
72
+ "type": "Replace",
73
+ "pattern": {
74
+ "String": "б"
75
+ },
76
+ "content": "b"
77
+ },
78
+ {
79
+ "type": "Replace",
80
+ "pattern": {
81
+ "String": "в"
82
+ },
83
+ "content": "v"
84
+ },
85
+ {
86
+ "type": "Replace",
87
+ "pattern": {
88
+ "String": "г"
89
+ },
90
+ "content": "g"
91
+ },
92
+ {
93
+ "type": "Replace",
94
+ "pattern": {
95
+ "String": "д"
96
+ },
97
+ "content": "d"
98
+ },
99
+ {
100
+ "type": "Replace",
101
+ "pattern": {
102
+ "String": "ђ"
103
+ },
104
+ "content": "đ"
105
+ },
106
+ {
107
+ "type": "Replace",
108
+ "pattern": {
109
+ "String": "е"
110
+ },
111
+ "content": "e"
112
+ },
113
+ {
114
+ "type": "Replace",
115
+ "pattern": {
116
+ "String": "ж"
117
+ },
118
+ "content": "ž"
119
+ },
120
+ {
121
+ "type": "Replace",
122
+ "pattern": {
123
+ "String": "з"
124
+ },
125
+ "content": "z"
126
+ },
127
+ {
128
+ "type": "Replace",
129
+ "pattern": {
130
+ "String": "и"
131
+ },
132
+ "content": "i"
133
+ },
134
+ {
135
+ "type": "Replace",
136
+ "pattern": {
137
+ "String": "ј"
138
+ },
139
+ "content": "j"
140
+ },
141
+ {
142
+ "type": "Replace",
143
+ "pattern": {
144
+ "String": "к"
145
+ },
146
+ "content": "k"
147
+ },
148
+ {
149
+ "type": "Replace",
150
+ "pattern": {
151
+ "String": "л"
152
+ },
153
+ "content": "l"
154
+ },
155
+ {
156
+ "type": "Replace",
157
+ "pattern": {
158
+ "String": "љ"
159
+ },
160
+ "content": "lj"
161
+ },
162
+ {
163
+ "type": "Replace",
164
+ "pattern": {
165
+ "String": "м"
166
+ },
167
+ "content": "m"
168
+ },
169
+ {
170
+ "type": "Replace",
171
+ "pattern": {
172
+ "String": "н"
173
+ },
174
+ "content": "n"
175
+ },
176
+ {
177
+ "type": "Replace",
178
+ "pattern": {
179
+ "String": "њ"
180
+ },
181
+ "content": "nj"
182
+ },
183
+ {
184
+ "type": "Replace",
185
+ "pattern": {
186
+ "String": "о"
187
+ },
188
+ "content": "o"
189
+ },
190
+ {
191
+ "type": "Replace",
192
+ "pattern": {
193
+ "String": "п"
194
+ },
195
+ "content": "p"
196
+ },
197
+ {
198
+ "type": "Replace",
199
+ "pattern": {
200
+ "String": "р"
201
+ },
202
+ "content": "r"
203
+ },
204
+ {
205
+ "type": "Replace",
206
+ "pattern": {
207
+ "String": "с"
208
+ },
209
+ "content": "s"
210
+ },
211
+ {
212
+ "type": "Replace",
213
+ "pattern": {
214
+ "String": "т"
215
+ },
216
+ "content": "t"
217
+ },
218
+ {
219
+ "type": "Replace",
220
+ "pattern": {
221
+ "String": "ћ"
222
+ },
223
+ "content": "ć"
224
+ },
225
+ {
226
+ "type": "Replace",
227
+ "pattern": {
228
+ "String": "у"
229
+ },
230
+ "content": "u"
231
+ },
232
+ {
233
+ "type": "Replace",
234
+ "pattern": {
235
+ "String": "ф"
236
+ },
237
+ "content": "f"
238
+ },
239
+ {
240
+ "type": "Replace",
241
+ "pattern": {
242
+ "String": "х"
243
+ },
244
+ "content": "h"
245
+ },
246
+ {
247
+ "type": "Replace",
248
+ "pattern": {
249
+ "String": "ц"
250
+ },
251
+ "content": "c"
252
+ },
253
+ {
254
+ "type": "Replace",
255
+ "pattern": {
256
+ "String": "ч"
257
+ },
258
+ "content": "č"
259
+ },
260
+ {
261
+ "type": "Replace",
262
+ "pattern": {
263
+ "String": "џ"
264
+ },
265
+ "content": "dž"
266
+ },
267
+ {
268
+ "type": "Replace",
269
+ "pattern": {
270
+ "String": "ш"
271
+ },
272
+ "content": "š"
273
+ },
274
+ {
275
+ "type": "Replace",
276
+ "pattern": {
277
+ "String": "А"
278
+ },
279
+ "content": "A"
280
+ },
281
+ {
282
+ "type": "Replace",
283
+ "pattern": {
284
+ "String": "Б"
285
+ },
286
+ "content": "B"
287
+ },
288
+ {
289
+ "type": "Replace",
290
+ "pattern": {
291
+ "String": "В"
292
+ },
293
+ "content": "V"
294
+ },
295
+ {
296
+ "type": "Replace",
297
+ "pattern": {
298
+ "String": "Г"
299
+ },
300
+ "content": "G"
301
+ },
302
+ {
303
+ "type": "Replace",
304
+ "pattern": {
305
+ "String": "Д"
306
+ },
307
+ "content": "D"
308
+ },
309
+ {
310
+ "type": "Replace",
311
+ "pattern": {
312
+ "String": "Ђ"
313
+ },
314
+ "content": "Đ"
315
+ },
316
+ {
317
+ "type": "Replace",
318
+ "pattern": {
319
+ "String": "Е"
320
+ },
321
+ "content": "E"
322
+ },
323
+ {
324
+ "type": "Replace",
325
+ "pattern": {
326
+ "String": "Ж"
327
+ },
328
+ "content": "Ž"
329
+ },
330
+ {
331
+ "type": "Replace",
332
+ "pattern": {
333
+ "String": "З"
334
+ },
335
+ "content": "Z"
336
+ },
337
+ {
338
+ "type": "Replace",
339
+ "pattern": {
340
+ "String": "И"
341
+ },
342
+ "content": "I"
343
+ },
344
+ {
345
+ "type": "Replace",
346
+ "pattern": {
347
+ "String": "Ј"
348
+ },
349
+ "content": "J"
350
+ },
351
+ {
352
+ "type": "Replace",
353
+ "pattern": {
354
+ "String": "К"
355
+ },
356
+ "content": "K"
357
+ },
358
+ {
359
+ "type": "Replace",
360
+ "pattern": {
361
+ "String": "Л"
362
+ },
363
+ "content": "L"
364
+ },
365
+ {
366
+ "type": "Replace",
367
+ "pattern": {
368
+ "String": "Љ"
369
+ },
370
+ "content": "Lj"
371
+ },
372
+ {
373
+ "type": "Replace",
374
+ "pattern": {
375
+ "String": "М"
376
+ },
377
+ "content": "M"
378
+ },
379
+ {
380
+ "type": "Replace",
381
+ "pattern": {
382
+ "String": "Н"
383
+ },
384
+ "content": "N"
385
+ },
386
+ {
387
+ "type": "Replace",
388
+ "pattern": {
389
+ "String": "Њ"
390
+ },
391
+ "content": "Nj"
392
+ },
393
+ {
394
+ "type": "Replace",
395
+ "pattern": {
396
+ "String": "О"
397
+ },
398
+ "content": "O"
399
+ },
400
+ {
401
+ "type": "Replace",
402
+ "pattern": {
403
+ "String": "П"
404
+ },
405
+ "content": "P"
406
+ },
407
+ {
408
+ "type": "Replace",
409
+ "pattern": {
410
+ "String": "Р"
411
+ },
412
+ "content": "R"
413
+ },
414
+ {
415
+ "type": "Replace",
416
+ "pattern": {
417
+ "String": "С"
418
+ },
419
+ "content": "S"
420
+ },
421
+ {
422
+ "type": "Replace",
423
+ "pattern": {
424
+ "String": "Т"
425
+ },
426
+ "content": "T"
427
+ },
428
+ {
429
+ "type": "Replace",
430
+ "pattern": {
431
+ "String": "Ћ"
432
+ },
433
+ "content": "Ć"
434
+ },
435
+ {
436
+ "type": "Replace",
437
+ "pattern": {
438
+ "String": "У"
439
+ },
440
+ "content": "U"
441
+ },
442
+ {
443
+ "type": "Replace",
444
+ "pattern": {
445
+ "String": "Ф"
446
+ },
447
+ "content": "F"
448
+ },
449
+ {
450
+ "type": "Replace",
451
+ "pattern": {
452
+ "String": "Х"
453
+ },
454
+ "content": "H"
455
+ },
456
+ {
457
+ "type": "Replace",
458
+ "pattern": {
459
+ "String": "Ц"
460
+ },
461
+ "content": "C"
462
+ },
463
+ {
464
+ "type": "Replace",
465
+ "pattern": {
466
+ "String": "Ч"
467
+ },
468
+ "content": "Č"
469
+ },
470
+ {
471
+ "type": "Replace",
472
+ "pattern": {
473
+ "String": "Џ"
474
+ },
475
+ "content": "Dž"
476
+ },
477
+ {
478
+ "type": "Replace",
479
+ "pattern": {
480
+ "String": "Ш"
481
+ },
482
+ "content": "Š"
483
+ }
484
+ ]
485
+ },
486
  "pre_tokenizer": {
487
  "type": "ByteLevel",
488
  "add_prefix_space": false,