michelleyunun commited on
Commit
e508a2e
1 Parent(s): 5d9a85a

Upload tokenizer

Browse files
Files changed (3) hide show
  1. special_tokens_map.json +4 -0
  2. tokenizer.json +823 -0
  3. tokenizer_config.json +8 -0
special_tokens_map.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "eos_token": "<end>",
3
+ "pad_token": "<pad>"
4
+ }
tokenizer.json ADDED
@@ -0,0 +1,823 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "version": "1.0",
3
+ "truncation": null,
4
+ "padding": null,
5
+ "added_tokens": [
6
+ {
7
+ "id": 0,
8
+ "content": "<start>",
9
+ "single_word": false,
10
+ "lstrip": false,
11
+ "rstrip": false,
12
+ "normalized": false,
13
+ "special": true
14
+ },
15
+ {
16
+ "id": 1,
17
+ "content": "<end>",
18
+ "single_word": false,
19
+ "lstrip": false,
20
+ "rstrip": false,
21
+ "normalized": false,
22
+ "special": true
23
+ },
24
+ {
25
+ "id": 2,
26
+ "content": "<pad>",
27
+ "single_word": false,
28
+ "lstrip": false,
29
+ "rstrip": false,
30
+ "normalized": false,
31
+ "special": true
32
+ }
33
+ ],
34
+ "normalizer": null,
35
+ "pre_tokenizer": {
36
+ "type": "ByteLevel",
37
+ "add_prefix_space": false,
38
+ "trim_offsets": true,
39
+ "use_regex": true
40
+ },
41
+ "post_processor": {
42
+ "type": "ByteLevel",
43
+ "add_prefix_space": true,
44
+ "trim_offsets": false,
45
+ "use_regex": true
46
+ },
47
+ "decoder": {
48
+ "type": "ByteLevel",
49
+ "add_prefix_space": true,
50
+ "trim_offsets": true,
51
+ "use_regex": true
52
+ },
53
+ "model": {
54
+ "type": "BPE",
55
+ "dropout": null,
56
+ "unk_token": null,
57
+ "continuing_subword_prefix": null,
58
+ "end_of_word_suffix": null,
59
+ "fuse_unk": false,
60
+ "byte_fallback": false,
61
+ "vocab": {
62
+ "<start>": 0,
63
+ "<end>": 1,
64
+ "<pad>": 2,
65
+ "\"": 3,
66
+ "'": 4,
67
+ ",": 5,
68
+ "-": 6,
69
+ ".": 7,
70
+ "<": 8,
71
+ ">": 9,
72
+ "A": 10,
73
+ "B": 11,
74
+ "C": 12,
75
+ "D": 13,
76
+ "G": 14,
77
+ "H": 15,
78
+ "I": 16,
79
+ "M": 17,
80
+ "N": 18,
81
+ "O": 19,
82
+ "S": 20,
83
+ "a": 21,
84
+ "b": 22,
85
+ "c": 23,
86
+ "d": 24,
87
+ "e": 25,
88
+ "f": 26,
89
+ "g": 27,
90
+ "h": 28,
91
+ "i": 29,
92
+ "j": 30,
93
+ "k": 31,
94
+ "l": 32,
95
+ "m": 33,
96
+ "n": 34,
97
+ "o": 35,
98
+ "p": 36,
99
+ "r": 37,
100
+ "s": 38,
101
+ "t": 39,
102
+ "u": 40,
103
+ "w": 41,
104
+ "x": 42,
105
+ "y": 43,
106
+ "²": 44,
107
+ "Ì": 45,
108
+ "Ġ": 46,
109
+ "st": 47,
110
+ "nd": 48,
111
+ "ar": 49,
112
+ "end": 50,
113
+ "Ġ<": 51,
114
+ "star": 52,
115
+ "start": 53,
116
+ "hl": 54,
117
+ "̲": 55,
118
+ "wi": 56,
119
+ "ii": 57,
120
+ "Ġg": 58,
121
+ "aa": 59,
122
+ "oo": 60,
123
+ "Ġn": 61,
124
+ "Ġwi": 62,
125
+ "Ġ'": 63,
126
+ "Ġii": 64,
127
+ "an": 65,
128
+ "Ġy": 66,
129
+ "Ġl": 67,
130
+ "Ii": 68,
131
+ "ĠIi": 69,
132
+ "oohl": 70,
133
+ "ee": 71,
134
+ "im": 72,
135
+ "Ġwil": 73,
136
+ "Ġh": 74,
137
+ "whl": 75,
138
+ "Ġhl": 76,
139
+ "ag": 77,
140
+ "dii": 78,
141
+ "nii": 79,
142
+ "ts": 80,
143
+ "xwi": 81,
144
+ "Ġd": 82,
145
+ "Ġha": 83,
146
+ "uu": 84,
147
+ "Ġnee": 85,
148
+ "xs": 86,
149
+ "Ġyu": 87,
150
+ "Ġa": 88,
151
+ "ip": 89,
152
+ "kwhl": 90,
153
+ "wihl": 91,
154
+ "gi": 92,
155
+ "Ġk": 93,
156
+ "xw": 94,
157
+ "'m": 95,
158
+ "Ġxs": 96,
159
+ "Ġdim": 97,
160
+ "Ġneedii": 98,
161
+ "igi": 99,
162
+ "Ġb": 100,
163
+ "Ġligi": 101,
164
+ "Ġwili": 102,
165
+ "di": 103,
166
+ "Ġj": 104,
167
+ "Ġp": 105,
168
+ "Ġt": 106,
169
+ "Ġwihl": 107,
170
+ "sxwi": 108,
171
+ "Ġs": 109,
172
+ "Ġya": 110,
173
+ "in": 111,
174
+ "Ġhlaa": 112,
175
+ "Ġna": 113,
176
+ "Ġan": 114,
177
+ "ax": 115,
178
+ "ay": 116,
179
+ "ahl": 117,
180
+ "oot": 118,
181
+ "ni": 119,
182
+ "ol": 120,
183
+ "Ġyukwhl": 121,
184
+ "Ġnii": 122,
185
+ "Ġnaa": 123,
186
+ "Ġwilp": 124,
187
+ "ipe": 125,
188
+ "Ġpipe": 126,
189
+ "uxw": 127,
190
+ "tshl": 128,
191
+ "Ġyatshl": 129,
192
+ "ĠS": 130,
193
+ "na": 131,
194
+ "hli": 132,
195
+ "Ġaa": 133,
196
+ "Ġneediit": 134,
197
+ "Ġ\"": 135,
198
+ "̲'": 136,
199
+ "il": 137,
200
+ "Ġw": 138,
201
+ "Ġyee": 139,
202
+ "Ġloot": 140,
203
+ "at": 141,
204
+ "ck": 142,
205
+ "hol": 143,
206
+ "ka": 144,
207
+ "lhl": 145,
208
+ "ock": 146,
209
+ "tock": 147,
210
+ "ya": 148,
211
+ "wil": 149,
212
+ "Ġgya": 150,
213
+ "Ġiin": 151,
214
+ "Ġluu": 152,
215
+ "uuhl": 153,
216
+ "ĠStock": 154,
217
+ "holm": 155,
218
+ "ĠStockholm": 156,
219
+ "ad": 157,
220
+ "ls": 158,
221
+ "xu": 159,
222
+ "Ġts": 160,
223
+ "hla": 161,
224
+ "Ġwina": 162,
225
+ "Ġhlg": 163,
226
+ "Ġhahla": 164,
227
+ "uut": 165,
228
+ "Ġbag": 166,
229
+ "ayt": 167,
230
+ "Ġwag": 168,
231
+ "lsdi": 169,
232
+ "as": 170,
233
+ "ok": 171,
234
+ "Ġhe": 172,
235
+ "diit": 173,
236
+ "ain": 174,
237
+ "wit": 175,
238
+ "Ġxsa": 176,
239
+ "Ġxsi": 177,
240
+ "Ġja": 178,
241
+ "nit": 179,
242
+ "xhl": 180,
243
+ "xwhl": 181,
244
+ "iihli": 182,
245
+ "Ġgiihli": 183,
246
+ "Ġlax": 184,
247
+ "ak": 185,
248
+ "̲.": 186,
249
+ "eek": 187,
250
+ "Ġap": 188,
251
+ "Ġxseek": 189,
252
+ "Ġji": 190,
253
+ "Ġaats": 191,
254
+ "hahl": 192,
255
+ "un": 193,
256
+ "waa": 194,
257
+ "oos": 195,
258
+ "Ġanhahl": 196,
259
+ "Ġanhahla": 197,
260
+ "ĠA": 198,
261
+ "Ġneet": 199,
262
+ "Ġam": 200,
263
+ "akwhl": 201,
264
+ "Ġak": 202,
265
+ "--": 203,
266
+ "Can": 204,
267
+ "Dim": 205,
268
+ "bi": 206,
269
+ "da": 207,
270
+ "fl": 208,
271
+ "gwaa": 209,
272
+ "isxwi": 210,
273
+ "ika": 211,
274
+ "ja": 212,
275
+ "kst": 213,
276
+ "lt": 214,
277
+ "lst": 215,
278
+ "nag": 216,
279
+ "pja": 217,
280
+ "rain": 218,
281
+ "sii": 219,
282
+ "ska": 220,
283
+ "sgwaa": 221,
284
+ "upja": 222,
285
+ "yt": 223,
286
+ "Ġag": 224,
287
+ "ĠCan": 225,
288
+ "Ġfl": 226,
289
+ "Ġisxwi": 227,
290
+ "Ġupja": 228,
291
+ "ndoos": 229,
292
+ "Ġgi": 230,
293
+ "Ġgwil": 231,
294
+ "Ġguuhl": 232,
295
+ "aahli": 233,
296
+ "oodi": 234,
297
+ "Ġno": 235,
298
+ "anhl": 236,
299
+ "anwil": 237,
300
+ "anuut": 238,
301
+ "anska": 239,
302
+ "Ġlip": 240,
303
+ "imil": 241,
304
+ "niig": 242,
305
+ "niisgwaa": 243,
306
+ "Ġyuwi": 244,
307
+ "Ġandoos": 245,
308
+ "gihl": 246,
309
+ "Ġky": 247,
310
+ "dilhl": 248,
311
+ "Ġpol": 249,
312
+ "Ġtun": 250,
313
+ "Ġtrain": 251,
314
+ "Ġsgihl": 252,
315
+ "Ġsdilhl": 253,
316
+ "Ġyalt": 254,
317
+ "insxwi": 255,
318
+ "Ġnakst": 256,
319
+ "Ġant": 257,
320
+ "Ġansii": 258,
321
+ "ayoo": 259,
322
+ "uxwt": 260,
323
+ "Ġaam": 261,
324
+ "adanska": 262,
325
+ "Ġhlgu": 263,
326
+ "Ġxsawi": 264,
327
+ "Ġjabi": 265,
328
+ "nagwit": 266,
329
+ "Ġagwihl": 267,
330
+ "ĠCanadanska": 268,
331
+ "Ġflika": 269,
332
+ "Ġgwila": 270,
333
+ "aahlihl": 271,
334
+ "anwilat": 272,
335
+ "anuutxw": 273,
336
+ "Ġandoosda": 274,
337
+ "Ġpole": 275,
338
+ "Ġyaltxu": 276,
339
+ "Ġansiip": 277,
340
+ "Hl": 278,
341
+ "Nii": 279,
342
+ "Oo": 280,
343
+ "nim": 281,
344
+ "wahl": 282,
345
+ "yhl": 283,
346
+ "ĠHl": 284,
347
+ "ĠNii": 285,
348
+ "wii": 286,
349
+ "Ġguxw": 287,
350
+ "Ġguut": 288,
351
+ "aaxhl": 289,
352
+ "Ġyuxwhl": 290,
353
+ "Ġkw": 291,
354
+ "Ġbas": 292,
355
+ "inhl": 293,
356
+ "ootxwi": 294,
357
+ "nisxwi": 295,
358
+ "uxwsxwi": 296,
359
+ "ilx": 297,
360
+ "adaaxhl": 298,
361
+ "Ġhlguxwsxwi": 299,
362
+ "Ġbagu": 300,
363
+ "asinhl": 301,
364
+ "Ġamhl": 302,
365
+ "ĠHlaa": 303,
366
+ "Ġguxws": 304,
367
+ "Ġbasax": 305,
368
+ ".\"": 306,
369
+ "daa": 307,
370
+ "ix": 308,
371
+ "idaa": 309,
372
+ "loohl": 310,
373
+ "phl": 311,
374
+ "pain": 312,
375
+ "sx": 313,
376
+ "yim": 314,
377
+ "Ġxhl": 315,
378
+ "aahl": 316,
379
+ "aasx": 317,
380
+ "aayim": 318,
381
+ "ook": 319,
382
+ "Ġhlag": 320,
383
+ "Ġhlidaa": 321,
384
+ "xwit": 322,
385
+ "Ġdok": 323,
386
+ "Ġdaayim": 324,
387
+ "Ġyuxw": 325,
388
+ "Ġaloohl": 326,
389
+ "Ġbax": 327,
390
+ "Ġbaasx": 328,
391
+ "Ġligit": 329,
392
+ "Ġjok": 330,
393
+ "Ġsg": 331,
394
+ "Ġsi": 332,
395
+ "ĠSpain": 333,
396
+ "nakwhl": 334,
397
+ "Ġhehl": 335,
398
+ "Ġhediit": 336,
399
+ "diithl": 337,
400
+ "witxwit": 338,
401
+ "Ġjaphl": 339,
402
+ "nithl": 340,
403
+ "ytxwhl": 341,
404
+ "Ġxhlii": 342,
405
+ "Ġdaayimaahl": 343,
406
+ "Ġyuxwdiithl": 344,
407
+ "Ġbaasxi": 345,
408
+ "Nakwhl": 346,
409
+ "gwi": 347,
410
+ "ukwhl": 348,
411
+ "yukwhl": 349,
412
+ "ĠAk": 350,
413
+ "ĠAgwi": 351,
414
+ "ĠAgwiyukwhl": 352,
415
+ "BM": 353,
416
+ "De": 354,
417
+ "Gi": 355,
418
+ "IBM": 356,
419
+ "aw": 357,
420
+ "ail": 358,
421
+ "ce": 359,
422
+ "ff": 360,
423
+ "gee": 361,
424
+ "it": 362,
425
+ "iwaa": 363,
426
+ "ice": 364,
427
+ "jit": 365,
428
+ "ljit": 366,
429
+ "mar": 367,
430
+ "mail": 368,
431
+ "nmar": 369,
432
+ "oxs": 370,
433
+ "off": 371,
434
+ "si": 372,
435
+ "wan": 373,
436
+ "way": 374,
437
+ "yo": 375,
438
+ "ĠDe": 376,
439
+ "ĠGi": 377,
440
+ "ĠIBM": 378,
441
+ "Ġmail": 379,
442
+ "Ġoff": 380,
443
+ "niiwan": 381,
444
+ "niiyo": 382,
445
+ "xsiwaa": 383,
446
+ "Ġsaw": 384,
447
+ "nix": 385,
448
+ "Ġwok": 386,
449
+ "atdiit": 387,
450
+ "̲.\"": 388,
451
+ "oosun": 389,
452
+ "ĠAp": 390,
453
+ "Ġamxsiwaa": 391,
454
+ "Ġaks": 392,
455
+ "geenix": 393,
456
+ "nmark": 394,
457
+ "oxsxw": 395,
458
+ "wayi": 396,
459
+ "ĠDenmark": 397,
460
+ "ĠGigeenix": 398,
461
+ "Ġoffice": 399,
462
+ "Ġsawatdiit": 400,
463
+ "ytxw": 401
464
+ },
465
+ "merges": [
466
+ "s t",
467
+ "n d",
468
+ "a r",
469
+ "e nd",
470
+ "Ġ <",
471
+ "st ar",
472
+ "star t",
473
+ "h l",
474
+ "Ì ²",
475
+ "w i",
476
+ "i i",
477
+ "Ġ g",
478
+ "a a",
479
+ "o o",
480
+ "Ġ n",
481
+ "Ġ wi",
482
+ "Ġ '",
483
+ "Ġ ii",
484
+ "a n",
485
+ "Ġ y",
486
+ "Ġ l",
487
+ "I i",
488
+ "Ġ Ii",
489
+ "oo hl",
490
+ "e e",
491
+ "i m",
492
+ "Ġwi l",
493
+ "Ġ h",
494
+ "w hl",
495
+ "Ġ hl",
496
+ "a g",
497
+ "d ii",
498
+ "n ii",
499
+ "t s",
500
+ "x wi",
501
+ "Ġ d",
502
+ "Ġh a",
503
+ "u u",
504
+ "Ġn ee",
505
+ "x s",
506
+ "Ġy u",
507
+ "Ġ a",
508
+ "i p",
509
+ "k whl",
510
+ "wi hl",
511
+ "g i",
512
+ "Ġ k",
513
+ "x w",
514
+ "' m",
515
+ "Ġ xs",
516
+ "Ġd im",
517
+ "Ġnee dii",
518
+ "i gi",
519
+ "Ġ b",
520
+ "Ġl igi",
521
+ "Ġwil i",
522
+ "d i",
523
+ "Ġ j",
524
+ "Ġ p",
525
+ "Ġ t",
526
+ "Ġwi hl",
527
+ "s xwi",
528
+ "Ġ s",
529
+ "Ġy a",
530
+ "i n",
531
+ "Ġhl aa",
532
+ "Ġn a",
533
+ "Ġ an",
534
+ "a x",
535
+ "a y",
536
+ "a hl",
537
+ "oo t",
538
+ "n i",
539
+ "o l",
540
+ "Ġyu kwhl",
541
+ "Ġn ii",
542
+ "Ġn aa",
543
+ "Ġwil p",
544
+ "ip e",
545
+ "Ġp ipe",
546
+ "u xw",
547
+ "ts hl",
548
+ "Ġya tshl",
549
+ "Ġ S",
550
+ "n a",
551
+ "hl i",
552
+ "Ġ aa",
553
+ "Ġneedii t",
554
+ "Ġ \"",
555
+ "̲ '",
556
+ "i l",
557
+ "Ġ w",
558
+ "Ġy ee",
559
+ "Ġl oot",
560
+ "a t",
561
+ "c k",
562
+ "h ol",
563
+ "k a",
564
+ "l hl",
565
+ "o ck",
566
+ "t ock",
567
+ "y a",
568
+ "wi l",
569
+ "Ġg ya",
570
+ "Ġii n",
571
+ "Ġl uu",
572
+ "uu hl",
573
+ "ĠS tock",
574
+ "hol m",
575
+ "ĠStock holm",
576
+ "a d",
577
+ "l s",
578
+ "x u",
579
+ "Ġ ts",
580
+ "hl a",
581
+ "Ġwi na",
582
+ "Ġhl g",
583
+ "Ġha hla",
584
+ "uu t",
585
+ "Ġb ag",
586
+ "ay t",
587
+ "Ġw ag",
588
+ "ls di",
589
+ "a s",
590
+ "o k",
591
+ "Ġh e",
592
+ "dii t",
593
+ "a in",
594
+ "wi t",
595
+ "Ġxs a",
596
+ "Ġxs i",
597
+ "Ġj a",
598
+ "ni t",
599
+ "x hl",
600
+ "x whl",
601
+ "ii hli",
602
+ "Ġg iihli",
603
+ "Ġl ax",
604
+ "a k",
605
+ "̲ .",
606
+ "ee k",
607
+ "Ġa p",
608
+ "Ġxs eek",
609
+ "Ġj i",
610
+ "Ġaa ts",
611
+ "h ahl",
612
+ "u n",
613
+ "w aa",
614
+ "oo s",
615
+ "Ġan hahl",
616
+ "Ġanhahl a",
617
+ "Ġ A",
618
+ "Ġnee t",
619
+ "Ġa m",
620
+ "a kwhl",
621
+ "Ġa k",
622
+ "- -",
623
+ "C an",
624
+ "D im",
625
+ "b i",
626
+ "d a",
627
+ "f l",
628
+ "g waa",
629
+ "i sxwi",
630
+ "i ka",
631
+ "j a",
632
+ "k st",
633
+ "l t",
634
+ "l st",
635
+ "n ag",
636
+ "p ja",
637
+ "r ain",
638
+ "s ii",
639
+ "s ka",
640
+ "s gwaa",
641
+ "u pja",
642
+ "y t",
643
+ "Ġ ag",
644
+ "Ġ Can",
645
+ "Ġ fl",
646
+ "Ġ isxwi",
647
+ "Ġ upja",
648
+ "nd oos",
649
+ "Ġg i",
650
+ "Ġg wil",
651
+ "Ġg uuhl",
652
+ "aa hli",
653
+ "oo di",
654
+ "Ġn o",
655
+ "an hl",
656
+ "an wil",
657
+ "an uut",
658
+ "an ska",
659
+ "Ġl ip",
660
+ "im il",
661
+ "nii g",
662
+ "nii sgwaa",
663
+ "Ġyu wi",
664
+ "Ġa ndoos",
665
+ "gi hl",
666
+ "Ġk y",
667
+ "di lhl",
668
+ "Ġp ol",
669
+ "Ġt un",
670
+ "Ġt rain",
671
+ "Ġs gihl",
672
+ "Ġs dilhl",
673
+ "Ġya lt",
674
+ "in sxwi",
675
+ "Ġna kst",
676
+ "Ġan t",
677
+ "Ġan sii",
678
+ "ay oo",
679
+ "uxw t",
680
+ "Ġaa m",
681
+ "ad anska",
682
+ "Ġhlg u",
683
+ "Ġxsa wi",
684
+ "Ġja bi",
685
+ "nag wit",
686
+ "Ġag wihl",
687
+ "ĠCan adanska",
688
+ "Ġfl ika",
689
+ "Ġgwil a",
690
+ "aahli hl",
691
+ "anwil at",
692
+ "anuut xw",
693
+ "Ġandoos da",
694
+ "Ġpol e",
695
+ "Ġyalt xu",
696
+ "Ġansii p",
697
+ "H l",
698
+ "N ii",
699
+ "O o",
700
+ "n im",
701
+ "w ahl",
702
+ "y hl",
703
+ "Ġ Hl",
704
+ "Ġ Nii",
705
+ "wi i",
706
+ "Ġg uxw",
707
+ "Ġg uut",
708
+ "aa xhl",
709
+ "Ġyu xwhl",
710
+ "Ġk w",
711
+ "Ġb as",
712
+ "in hl",
713
+ "oot xwi",
714
+ "ni sxwi",
715
+ "uxw sxwi",
716
+ "il x",
717
+ "ad aaxhl",
718
+ "Ġhlg uxwsxwi",
719
+ "Ġbag u",
720
+ "as inhl",
721
+ "Ġam hl",
722
+ "ĠHl aa",
723
+ "Ġguxw s",
724
+ "Ġbas ax",
725
+ ". \"",
726
+ "d aa",
727
+ "i x",
728
+ "i daa",
729
+ "l oohl",
730
+ "p hl",
731
+ "p ain",
732
+ "s x",
733
+ "y im",
734
+ "Ġ xhl",
735
+ "aa hl",
736
+ "aa sx",
737
+ "aa yim",
738
+ "oo k",
739
+ "Ġhl ag",
740
+ "Ġhl idaa",
741
+ "xwi t",
742
+ "Ġd ok",
743
+ "Ġd aayim",
744
+ "Ġyu xw",
745
+ "Ġa loohl",
746
+ "Ġb ax",
747
+ "Ġb aasx",
748
+ "Ġligi t",
749
+ "Ġj ok",
750
+ "Ġs g",
751
+ "Ġs i",
752
+ "ĠS pain",
753
+ "na kwhl",
754
+ "Ġhe hl",
755
+ "Ġhe diit",
756
+ "diit hl",
757
+ "wit xwit",
758
+ "Ġja phl",
759
+ "nit hl",
760
+ "yt xwhl",
761
+ "Ġxhl ii",
762
+ "Ġdaayim aahl",
763
+ "Ġyuxw diithl",
764
+ "Ġbaasx i",
765
+ "N akwhl",
766
+ "g wi",
767
+ "u kwhl",
768
+ "y ukwhl",
769
+ "ĠA k",
770
+ "ĠA gwi",
771
+ "ĠAgwi yukwhl",
772
+ "B M",
773
+ "D e",
774
+ "G i",
775
+ "I BM",
776
+ "a w",
777
+ "a il",
778
+ "c e",
779
+ "f f",
780
+ "g ee",
781
+ "i t",
782
+ "i waa",
783
+ "i ce",
784
+ "j it",
785
+ "l jit",
786
+ "m ar",
787
+ "m ail",
788
+ "n mar",
789
+ "o xs",
790
+ "o ff",
791
+ "s i",
792
+ "w an",
793
+ "w ay",
794
+ "y o",
795
+ "Ġ De",
796
+ "Ġ Gi",
797
+ "Ġ IBM",
798
+ "Ġ mail",
799
+ "Ġ off",
800
+ "nii wan",
801
+ "nii yo",
802
+ "xs iwaa",
803
+ "Ġs aw",
804
+ "ni x",
805
+ "Ġw ok",
806
+ "at diit",
807
+ "̲. \"",
808
+ "oos un",
809
+ "ĠA p",
810
+ "Ġam xsiwaa",
811
+ "Ġak s",
812
+ "gee nix",
813
+ "nmar k",
814
+ "oxs xw",
815
+ "way i",
816
+ "ĠDe nmark",
817
+ "ĠGi geenix",
818
+ "Ġoff ice",
819
+ "Ġsaw atdiit",
820
+ "yt xw"
821
+ ]
822
+ }
823
+ }
tokenizer_config.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "clean_up_tokenization_spaces": true,
3
+ "eos_token": "<end>",
4
+ "model_max_length": 1000000000000000019884624838656,
5
+ "pad_token": "<pad>",
6
+ "sos_token": "<start>",
7
+ "tokenizer_class": "PreTrainedTokenizerFast"
8
+ }