kiansheik commited on
Commit
fe9c799
1 Parent(s): d0dfd61

Add special character tokens for broken words

Browse files
added_tokens.json CHANGED
@@ -166,27 +166,59 @@
166
  "[SUBJECT_PREFIX:2ps]": 32130,
167
  "[SUBJECT_PREFIX:3p]": 32171,
168
  "[SUB_VERB]": 32197,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
169
  "a'e": 32127,
 
170
  "amo": 32104,
171
  "bo": 32110,
172
  "e'ym": 32139,
 
 
173
  "endé": 32128,
174
  "ere": 32157,
 
175
  "gûi": 32185,
 
176
  "ixé": 32165,
177
  "mo": 32164,
178
  "n'": 32126,
 
179
  "na": 32155,
180
  "namo": 32142,
181
  "nde": 32181,
182
  "opo": 32183,
 
183
  "oro": 32105,
184
  "oré": 32169,
185
  "pa": 32136,
 
 
186
  "peîepé": 32117,
187
  "peẽ": 32138,
188
  "ramo": 32161,
189
  "ta": 32129,
 
190
  "umẽ": 32191,
191
  "xe": 32182,
192
  "í": 32121,
 
166
  "[SUBJECT_PREFIX:2ps]": 32130,
167
  "[SUBJECT_PREFIX:3p]": 32171,
168
  "[SUB_VERB]": 32197,
169
+ "[w0q]": 32216,
170
+ "[w10q]": 32224,
171
+ "[w11q]": 32225,
172
+ "[w12q]": 32226,
173
+ "[w13q]": 32227,
174
+ "[w14q]": 32228,
175
+ "[w15q]": 32229,
176
+ "[w1q]": 32204,
177
+ "[w1q]a": 32211,
178
+ "[w1q]and[w4q]": 32200,
179
+ "[w1q]e": 32209,
180
+ "[w1q]ep[w4q]": 32205,
181
+ "[w1q]o": 32207,
182
+ "[w1q]os": 32199,
183
+ "[w2q]": 32217,
184
+ "[w3q]": 32218,
185
+ "[w4q]": 32219,
186
+ "[w5q]": 32214,
187
+ "[w6q]": 32220,
188
+ "[w7q]": 32221,
189
+ "[w8q]": 32222,
190
+ "[w9q]": 32223,
191
  "a'e": 32127,
192
+ "a[w15q]e": 32213,
193
  "amo": 32104,
194
  "bo": 32110,
195
  "e'ym": 32139,
196
+ "e[w15q]ym": 32201,
197
+ "end[w4q]": 32215,
198
  "endé": 32128,
199
  "ere": 32157,
200
+ "g[w0q]i": 32212,
201
  "gûi": 32185,
202
+ "ix[w4q]": 32208,
203
  "ixé": 32165,
204
  "mo": 32164,
205
  "n'": 32126,
206
+ "n[w15q]": 32202,
207
  "na": 32155,
208
  "namo": 32142,
209
  "nde": 32181,
210
  "opo": 32183,
211
+ "or[w4q]": 32210,
212
  "oro": 32105,
213
  "oré": 32169,
214
  "pa": 32136,
215
+ "pe[w10q]": 32206,
216
+ "pe[w1q]ep[w4q]": 32198,
217
  "peîepé": 32117,
218
  "peẽ": 32138,
219
  "ramo": 32161,
220
  "ta": 32129,
221
+ "um[w10q]": 32203,
222
  "umẽ": 32191,
223
  "xe": 32182,
224
  "í": 32121,
config.json CHANGED
@@ -57,5 +57,5 @@
57
  "torch_dtype": "float32",
58
  "transformers_version": "4.38.1",
59
  "use_cache": true,
60
- "vocab_size": 32198
61
  }
 
57
  "torch_dtype": "float32",
58
  "transformers_version": "4.38.1",
59
  "use_cache": true,
60
+ "vocab_size": 32230
61
  }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d80dfecd26b9f6c7e6fecc204679f434def1ff66f0c81494fdbb784d1234b6a9
3
- size 242185256
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e50d118ba9f820756e704f94946c0589e8e6def3296aafae22476dfd1eb1fcc5
3
+ size 242250792
special_tokens_map.json CHANGED
@@ -21,20 +21,6 @@
21
  "rstrip": false,
22
  "single_word": false
23
  },
24
- {
25
- "content": "î",
26
- "lstrip": false,
27
- "normalized": false,
28
- "rstrip": false,
29
- "single_word": false
30
- },
31
- {
32
- "content": "îepé",
33
- "lstrip": false,
34
- "normalized": false,
35
- "rstrip": false,
36
- "single_word": false
37
- },
38
  {
39
  "content": "e",
40
  "lstrip": false,
@@ -50,21 +36,21 @@
50
  "single_word": false
51
  },
52
  {
53
- "content": "ta",
54
  "lstrip": false,
55
  "normalized": false,
56
  "rstrip": false,
57
  "single_word": false
58
  },
59
  {
60
- "content": "n'",
61
  "lstrip": false,
62
  "normalized": false,
63
  "rstrip": false,
64
  "single_word": false
65
  },
66
  {
67
- "content": "peẽ",
68
  "lstrip": false,
69
  "normalized": false,
70
  "rstrip": false,
@@ -78,7 +64,7 @@
78
  "single_word": false
79
  },
80
  {
81
- "content": "îe",
82
  "lstrip": false,
83
  "normalized": false,
84
  "rstrip": false,
@@ -112,13 +98,6 @@
112
  "rstrip": false,
113
  "single_word": false
114
  },
115
- {
116
- "content": "îos",
117
- "lstrip": false,
118
- "normalized": false,
119
- "rstrip": false,
120
- "single_word": false
121
- },
122
  {
123
  "content": "[MAIN_VERB]",
124
  "lstrip": false,
@@ -141,7 +120,7 @@
141
  "single_word": false
142
  },
143
  {
144
- "content": "û",
145
  "lstrip": false,
146
  "normalized": false,
147
  "rstrip": false,
@@ -183,14 +162,14 @@
183
  "single_word": false
184
  },
185
  {
186
- "content": "[GERUND_SUFFIX:CLASS_1:IYU]",
187
  "lstrip": false,
188
  "normalized": false,
189
  "rstrip": false,
190
  "single_word": false
191
  },
192
  {
193
- "content": "îo",
194
  "lstrip": false,
195
  "normalized": false,
196
  "rstrip": false,
@@ -239,14 +218,21 @@
239
  "single_word": false
240
  },
241
  {
242
- "content": "xe",
243
  "lstrip": false,
244
  "normalized": false,
245
  "rstrip": false,
246
  "single_word": false
247
  },
248
  {
249
- "content": "a'e",
 
 
 
 
 
 
 
250
  "lstrip": false,
251
  "normalized": false,
252
  "rstrip": false,
@@ -280,6 +266,13 @@
280
  "rstrip": false,
281
  "single_word": false
282
  },
 
 
 
 
 
 
 
283
  {
284
  "content": "[SUBJECT_PREFIX:1ppe]",
285
  "lstrip": false,
@@ -301,6 +294,13 @@
301
  "rstrip": false,
302
  "single_word": false
303
  },
 
 
 
 
 
 
 
304
  {
305
  "content": "[OBJECT:1ppe]",
306
  "lstrip": false,
@@ -329,6 +329,13 @@
329
  "rstrip": false,
330
  "single_word": false
331
  },
 
 
 
 
 
 
 
332
  {
333
  "content": "[NEGATION_PREFIX]",
334
  "lstrip": false,
@@ -336,6 +343,13 @@
336
  "rstrip": false,
337
  "single_word": false
338
  },
 
 
 
 
 
 
 
339
  {
340
  "content": "[CIRCUMSTANTIAL_SUFFIX:NULL_ENDING]",
341
  "lstrip": false,
@@ -357,6 +371,13 @@
357
  "rstrip": false,
358
  "single_word": false
359
  },
 
 
 
 
 
 
 
360
  {
361
  "content": "pe",
362
  "lstrip": false,
@@ -386,70 +407,70 @@
386
  "single_word": false
387
  },
388
  {
389
- "content": "[GERUND_SUBJECT_PREFIX:2ps]",
390
  "lstrip": false,
391
  "normalized": false,
392
  "rstrip": false,
393
  "single_word": false
394
  },
395
  {
396
- "content": "[OBJECT:3p]",
397
  "lstrip": false,
398
  "normalized": false,
399
  "rstrip": false,
400
  "single_word": false
401
  },
402
  {
403
- "content": "îandé",
404
  "lstrip": false,
405
  "normalized": false,
406
  "rstrip": false,
407
  "single_word": false
408
  },
409
  {
410
- "content": "gûi",
411
  "lstrip": false,
412
  "normalized": false,
413
  "rstrip": false,
414
  "single_word": false
415
  },
416
  {
417
- "content": "[OBJECT:2pp:SUBJECT_1P]",
418
  "lstrip": false,
419
  "normalized": false,
420
  "rstrip": false,
421
  "single_word": false
422
  },
423
  {
424
- "content": "[CIRCUMSTANTIAL_SUFFIX:VOWEL_ENDING]",
425
  "lstrip": false,
426
  "normalized": false,
427
  "rstrip": false,
428
  "single_word": false
429
  },
430
  {
431
- "content": "[OBJECT_MARKER:3p:MONOSYLLABIC]",
432
  "lstrip": false,
433
  "normalized": false,
434
  "rstrip": false,
435
  "single_word": false
436
  },
437
  {
438
- "content": "peîepé",
439
  "lstrip": false,
440
  "normalized": false,
441
  "rstrip": false,
442
  "single_word": false
443
  },
444
  {
445
- "content": "[GERUND_SUFFIX:CLASS_1:R]",
446
  "lstrip": false,
447
  "normalized": false,
448
  "rstrip": false,
449
  "single_word": false
450
  },
451
  {
452
- "content": "endé",
453
  "lstrip": false,
454
  "normalized": false,
455
  "rstrip": false,
@@ -511,13 +532,6 @@
511
  "rstrip": false,
512
  "single_word": false
513
  },
514
- {
515
- "content": "e'ym",
516
- "lstrip": false,
517
- "normalized": false,
518
- "rstrip": false,
519
- "single_word": false
520
- },
521
  {
522
  "content": "[SUBJECT_PREFIX:3p]",
523
  "lstrip": false,
@@ -547,42 +561,42 @@
547
  "single_word": false
548
  },
549
  {
550
- "content": "ixé",
551
  "lstrip": false,
552
  "normalized": false,
553
  "rstrip": false,
554
  "single_word": false
555
  },
556
  {
557
- "content": "[GERUND_SUFFIX:CLASS_1:ORAL_VOWEL]",
558
  "lstrip": false,
559
  "normalized": false,
560
  "rstrip": false,
561
  "single_word": false
562
  },
563
  {
564
- "content": "pa",
565
  "lstrip": false,
566
  "normalized": false,
567
  "rstrip": false,
568
  "single_word": false
569
  },
570
  {
571
- "content": "ramo",
572
  "lstrip": false,
573
  "normalized": false,
574
  "rstrip": false,
575
  "single_word": false
576
  },
577
  {
578
- "content": "ere",
579
  "lstrip": false,
580
  "normalized": false,
581
  "rstrip": false,
582
  "single_word": false
583
  },
584
  {
585
- "content": "[OBJECT:REFLEXIVE]",
586
  "lstrip": false,
587
  "normalized": false,
588
  "rstrip": false,
@@ -652,105 +666,203 @@
652
  "single_word": false
653
  },
654
  {
655
- "content": "í",
656
  "lstrip": false,
657
  "normalized": false,
658
  "rstrip": false,
659
  "single_word": false
660
  },
661
  {
662
- "content": "oré",
663
  "lstrip": false,
664
  "normalized": false,
665
  "rstrip": false,
666
  "single_word": false
667
  },
668
  {
669
- "content": "umẽ",
670
  "lstrip": false,
671
  "normalized": false,
672
  "rstrip": false,
673
  "single_word": false
674
  },
675
  {
676
- "content": "[PLURIFORM_PREFIX:R]",
677
  "lstrip": false,
678
  "normalized": false,
679
  "rstrip": false,
680
  "single_word": false
681
  },
682
  {
683
- "content": "[SUBJECT:2ps]",
684
  "lstrip": false,
685
  "normalized": false,
686
  "rstrip": false,
687
  "single_word": false
688
  },
689
  {
690
- "content": "o",
691
  "lstrip": false,
692
  "normalized": false,
693
  "rstrip": false,
694
  "single_word": false
695
  },
696
  {
697
- "content": "[SUBJECT:2ps:OBJECT_1P]",
698
  "lstrip": false,
699
  "normalized": false,
700
  "rstrip": false,
701
  "single_word": false
702
  },
703
  {
704
- "content": "îa",
705
  "lstrip": false,
706
  "normalized": false,
707
  "rstrip": false,
708
  "single_word": false
709
  },
710
  {
711
- "content": "[OBJECT:1ps]",
712
  "lstrip": false,
713
  "normalized": false,
714
  "rstrip": false,
715
  "single_word": false
716
  },
717
  {
718
- "content": "[GERUND_SUFFIX:CLASS_2:DEFAULT]",
719
  "lstrip": false,
720
  "normalized": false,
721
  "rstrip": false,
722
  "single_word": false
723
  },
724
  {
725
- "content": "[GERUND_SUFFIX:CLASS_2:NASAL_VOWEL_ENDING]",
726
  "lstrip": false,
727
  "normalized": false,
728
  "rstrip": false,
729
  "single_word": false
730
  },
731
  {
732
- "content": "[SUBJECT_PREFIX:1ppi]",
733
  "lstrip": false,
734
  "normalized": false,
735
  "rstrip": false,
736
  "single_word": false
737
  },
738
  {
739
- "content": "[SUBJECT_PREFIX:2ps]",
740
  "lstrip": false,
741
  "normalized": false,
742
  "rstrip": false,
743
  "single_word": false
744
  },
745
  {
746
- "content": "s",
747
  "lstrip": false,
748
  "normalized": false,
749
  "rstrip": false,
750
  "single_word": false
751
  },
752
  {
753
- "content": "[SPACE]",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
754
  "lstrip": false,
755
  "normalized": false,
756
  "rstrip": false,
 
21
  "rstrip": false,
22
  "single_word": false
23
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
  {
25
  "content": "e",
26
  "lstrip": false,
 
36
  "single_word": false
37
  },
38
  {
39
+ "content": "pe[w1q]ep[w4q]",
40
  "lstrip": false,
41
  "normalized": false,
42
  "rstrip": false,
43
  "single_word": false
44
  },
45
  {
46
+ "content": "ta",
47
  "lstrip": false,
48
  "normalized": false,
49
  "rstrip": false,
50
  "single_word": false
51
  },
52
  {
53
+ "content": "[w1q]os",
54
  "lstrip": false,
55
  "normalized": false,
56
  "rstrip": false,
 
64
  "single_word": false
65
  },
66
  {
67
+ "content": "[w1q]and[w4q]",
68
  "lstrip": false,
69
  "normalized": false,
70
  "rstrip": false,
 
98
  "rstrip": false,
99
  "single_word": false
100
  },
 
 
 
 
 
 
 
101
  {
102
  "content": "[MAIN_VERB]",
103
  "lstrip": false,
 
120
  "single_word": false
121
  },
122
  {
123
+ "content": "e[w15q]ym",
124
  "lstrip": false,
125
  "normalized": false,
126
  "rstrip": false,
 
162
  "single_word": false
163
  },
164
  {
165
+ "content": "n[w15q]",
166
  "lstrip": false,
167
  "normalized": false,
168
  "rstrip": false,
169
  "single_word": false
170
  },
171
  {
172
+ "content": "[GERUND_SUFFIX:CLASS_1:IYU]",
173
  "lstrip": false,
174
  "normalized": false,
175
  "rstrip": false,
 
218
  "single_word": false
219
  },
220
  {
221
+ "content": "um[w10q]",
222
  "lstrip": false,
223
  "normalized": false,
224
  "rstrip": false,
225
  "single_word": false
226
  },
227
  {
228
+ "content": "[w1q]",
229
+ "lstrip": false,
230
+ "normalized": false,
231
+ "rstrip": false,
232
+ "single_word": false
233
+ },
234
+ {
235
+ "content": "xe",
236
  "lstrip": false,
237
  "normalized": false,
238
  "rstrip": false,
 
266
  "rstrip": false,
267
  "single_word": false
268
  },
269
+ {
270
+ "content": "[w1q]ep[w4q]",
271
+ "lstrip": false,
272
+ "normalized": false,
273
+ "rstrip": false,
274
+ "single_word": false
275
+ },
276
  {
277
  "content": "[SUBJECT_PREFIX:1ppe]",
278
  "lstrip": false,
 
294
  "rstrip": false,
295
  "single_word": false
296
  },
297
+ {
298
+ "content": "pe[w10q]",
299
+ "lstrip": false,
300
+ "normalized": false,
301
+ "rstrip": false,
302
+ "single_word": false
303
+ },
304
  {
305
  "content": "[OBJECT:1ppe]",
306
  "lstrip": false,
 
329
  "rstrip": false,
330
  "single_word": false
331
  },
332
+ {
333
+ "content": "[w1q]o",
334
+ "lstrip": false,
335
+ "normalized": false,
336
+ "rstrip": false,
337
+ "single_word": false
338
+ },
339
  {
340
  "content": "[NEGATION_PREFIX]",
341
  "lstrip": false,
 
343
  "rstrip": false,
344
  "single_word": false
345
  },
346
+ {
347
+ "content": "ix[w4q]",
348
+ "lstrip": false,
349
+ "normalized": false,
350
+ "rstrip": false,
351
+ "single_word": false
352
+ },
353
  {
354
  "content": "[CIRCUMSTANTIAL_SUFFIX:NULL_ENDING]",
355
  "lstrip": false,
 
371
  "rstrip": false,
372
  "single_word": false
373
  },
374
+ {
375
+ "content": "[w1q]e",
376
+ "lstrip": false,
377
+ "normalized": false,
378
+ "rstrip": false,
379
+ "single_word": false
380
+ },
381
  {
382
  "content": "pe",
383
  "lstrip": false,
 
407
  "single_word": false
408
  },
409
  {
410
+ "content": "or[w4q]",
411
  "lstrip": false,
412
  "normalized": false,
413
  "rstrip": false,
414
  "single_word": false
415
  },
416
  {
417
+ "content": "[GERUND_SUBJECT_PREFIX:2ps]",
418
  "lstrip": false,
419
  "normalized": false,
420
  "rstrip": false,
421
  "single_word": false
422
  },
423
  {
424
+ "content": "[w1q]a",
425
  "lstrip": false,
426
  "normalized": false,
427
  "rstrip": false,
428
  "single_word": false
429
  },
430
  {
431
+ "content": "g[w0q]i",
432
  "lstrip": false,
433
  "normalized": false,
434
  "rstrip": false,
435
  "single_word": false
436
  },
437
  {
438
+ "content": "[OBJECT:3p]",
439
  "lstrip": false,
440
  "normalized": false,
441
  "rstrip": false,
442
  "single_word": false
443
  },
444
  {
445
+ "content": "[OBJECT:2pp:SUBJECT_1P]",
446
  "lstrip": false,
447
  "normalized": false,
448
  "rstrip": false,
449
  "single_word": false
450
  },
451
  {
452
+ "content": "[CIRCUMSTANTIAL_SUFFIX:VOWEL_ENDING]",
453
  "lstrip": false,
454
  "normalized": false,
455
  "rstrip": false,
456
  "single_word": false
457
  },
458
  {
459
+ "content": "[OBJECT_MARKER:3p:MONOSYLLABIC]",
460
  "lstrip": false,
461
  "normalized": false,
462
  "rstrip": false,
463
  "single_word": false
464
  },
465
  {
466
+ "content": "a[w15q]e",
467
  "lstrip": false,
468
  "normalized": false,
469
  "rstrip": false,
470
  "single_word": false
471
  },
472
  {
473
+ "content": "[GERUND_SUFFIX:CLASS_1:R]",
474
  "lstrip": false,
475
  "normalized": false,
476
  "rstrip": false,
 
532
  "rstrip": false,
533
  "single_word": false
534
  },
 
 
 
 
 
 
 
535
  {
536
  "content": "[SUBJECT_PREFIX:3p]",
537
  "lstrip": false,
 
561
  "single_word": false
562
  },
563
  {
564
+ "content": "[GERUND_SUFFIX:CLASS_1:ORAL_VOWEL]",
565
  "lstrip": false,
566
  "normalized": false,
567
  "rstrip": false,
568
  "single_word": false
569
  },
570
  {
571
+ "content": "pa",
572
  "lstrip": false,
573
  "normalized": false,
574
  "rstrip": false,
575
  "single_word": false
576
  },
577
  {
578
+ "content": "ramo",
579
  "lstrip": false,
580
  "normalized": false,
581
  "rstrip": false,
582
  "single_word": false
583
  },
584
  {
585
+ "content": "ere",
586
  "lstrip": false,
587
  "normalized": false,
588
  "rstrip": false,
589
  "single_word": false
590
  },
591
  {
592
+ "content": "[OBJECT:REFLEXIVE]",
593
  "lstrip": false,
594
  "normalized": false,
595
  "rstrip": false,
596
  "single_word": false
597
  },
598
  {
599
+ "content": "[w5q]",
600
  "lstrip": false,
601
  "normalized": false,
602
  "rstrip": false,
 
666
  "single_word": false
667
  },
668
  {
669
+ "content": "[PLURIFORM_PREFIX:R]",
670
  "lstrip": false,
671
  "normalized": false,
672
  "rstrip": false,
673
  "single_word": false
674
  },
675
  {
676
+ "content": "[SUBJECT:2ps]",
677
  "lstrip": false,
678
  "normalized": false,
679
  "rstrip": false,
680
  "single_word": false
681
  },
682
  {
683
+ "content": "o",
684
  "lstrip": false,
685
  "normalized": false,
686
  "rstrip": false,
687
  "single_word": false
688
  },
689
  {
690
+ "content": "[SUBJECT:2ps:OBJECT_1P]",
691
  "lstrip": false,
692
  "normalized": false,
693
  "rstrip": false,
694
  "single_word": false
695
  },
696
  {
697
+ "content": "end[w4q]",
698
  "lstrip": false,
699
  "normalized": false,
700
  "rstrip": false,
701
  "single_word": false
702
  },
703
  {
704
+ "content": "[OBJECT:1ps]",
705
  "lstrip": false,
706
  "normalized": false,
707
  "rstrip": false,
708
  "single_word": false
709
  },
710
  {
711
+ "content": "[w0q]",
712
  "lstrip": false,
713
  "normalized": false,
714
  "rstrip": false,
715
  "single_word": false
716
  },
717
  {
718
+ "content": "[GERUND_SUFFIX:CLASS_2:DEFAULT]",
719
  "lstrip": false,
720
  "normalized": false,
721
  "rstrip": false,
722
  "single_word": false
723
  },
724
  {
725
+ "content": "[GERUND_SUFFIX:CLASS_2:NASAL_VOWEL_ENDING]",
726
  "lstrip": false,
727
  "normalized": false,
728
  "rstrip": false,
729
  "single_word": false
730
  },
731
  {
732
+ "content": "[SUBJECT_PREFIX:1ppi]",
733
  "lstrip": false,
734
  "normalized": false,
735
  "rstrip": false,
736
  "single_word": false
737
  },
738
  {
739
+ "content": "[SUBJECT_PREFIX:2ps]",
740
  "lstrip": false,
741
  "normalized": false,
742
  "rstrip": false,
743
  "single_word": false
744
  },
745
  {
746
+ "content": "s",
747
  "lstrip": false,
748
  "normalized": false,
749
  "rstrip": false,
750
  "single_word": false
751
  },
752
  {
753
+ "content": "[SPACE]",
754
  "lstrip": false,
755
  "normalized": false,
756
  "rstrip": false,
757
  "single_word": false
758
  },
759
  {
760
+ "content": "[w0q]",
761
  "lstrip": false,
762
  "normalized": false,
763
  "rstrip": false,
764
  "single_word": false
765
  },
766
  {
767
+ "content": "[w1q]",
768
+ "lstrip": false,
769
+ "normalized": false,
770
+ "rstrip": false,
771
+ "single_word": false
772
+ },
773
+ {
774
+ "content": "[w2q]",
775
+ "lstrip": false,
776
+ "normalized": false,
777
+ "rstrip": false,
778
+ "single_word": false
779
+ },
780
+ {
781
+ "content": "[w3q]",
782
+ "lstrip": false,
783
+ "normalized": false,
784
+ "rstrip": false,
785
+ "single_word": false
786
+ },
787
+ {
788
+ "content": "[w4q]",
789
+ "lstrip": false,
790
+ "normalized": false,
791
+ "rstrip": false,
792
+ "single_word": false
793
+ },
794
+ {
795
+ "content": "[w5q]",
796
+ "lstrip": false,
797
+ "normalized": false,
798
+ "rstrip": false,
799
+ "single_word": false
800
+ },
801
+ {
802
+ "content": "[w6q]",
803
+ "lstrip": false,
804
+ "normalized": false,
805
+ "rstrip": false,
806
+ "single_word": false
807
+ },
808
+ {
809
+ "content": "[w7q]",
810
+ "lstrip": false,
811
+ "normalized": false,
812
+ "rstrip": false,
813
+ "single_word": false
814
+ },
815
+ {
816
+ "content": "[w8q]",
817
+ "lstrip": false,
818
+ "normalized": false,
819
+ "rstrip": false,
820
+ "single_word": false
821
+ },
822
+ {
823
+ "content": "[w9q]",
824
+ "lstrip": false,
825
+ "normalized": false,
826
+ "rstrip": false,
827
+ "single_word": false
828
+ },
829
+ {
830
+ "content": "[w10q]",
831
+ "lstrip": false,
832
+ "normalized": false,
833
+ "rstrip": false,
834
+ "single_word": false
835
+ },
836
+ {
837
+ "content": "[w11q]",
838
+ "lstrip": false,
839
+ "normalized": false,
840
+ "rstrip": false,
841
+ "single_word": false
842
+ },
843
+ {
844
+ "content": "[w12q]",
845
+ "lstrip": false,
846
+ "normalized": false,
847
+ "rstrip": false,
848
+ "single_word": false
849
+ },
850
+ {
851
+ "content": "[w13q]",
852
+ "lstrip": false,
853
+ "normalized": false,
854
+ "rstrip": false,
855
+ "single_word": false
856
+ },
857
+ {
858
+ "content": "[w14q]",
859
+ "lstrip": false,
860
+ "normalized": false,
861
+ "rstrip": false,
862
+ "single_word": false
863
+ },
864
+ {
865
+ "content": "[w15q]",
866
  "lstrip": false,
867
  "normalized": false,
868
  "rstrip": false,
tokenizer_config.json CHANGED
@@ -1696,74 +1696,333 @@
1696
  "rstrip": false,
1697
  "single_word": false,
1698
  "special": true
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1699
  }
1700
  },
1701
  "additional_special_tokens": [
1702
  "[SUBJECT:3p:DIRECT]",
1703
  "[GERUND_SUBJECT_PREFIX:1ppi]",
1704
  "[OBJECT:DIRECT]",
1705
- "î",
1706
- "îepé",
1707
  "e",
1708
  "[OBJECT:2pp]",
 
1709
  "ta",
1710
- "n'",
1711
- "peẽ",
1712
  "[SUBJECT:2pp]",
1713
- "îe",
1714
  "[GERUND_SUBJECT_PREFIX:3p]",
1715
  "[GERUND_SUBJECT_PREFIX:2pp]",
1716
  "[NEGATION_PARTICLE:NA]",
1717
  "[IMPERATIVE_PREFIX:2ps]",
1718
- "îos",
1719
  "[MAIN_VERB]",
1720
  "[GERUND_SUFFIX:CLASS_1:NASAL_IYU]",
1721
  "[PLURIFORM_PREFIX:S]",
1722
- "û",
1723
  "[OBJECT_MARKER:3p:PLURIFORM_PREFIX:MONOSYLLABIC]",
1724
  "[OBJECT:1ppi]",
1725
  "abo",
1726
  "[GERUND_SUFFIX:CLASS_1]",
1727
  "[SUB_VERB]",
 
1728
  "[GERUND_SUFFIX:CLASS_1:IYU]",
1729
- "îo",
1730
  "[NEGATION_SUFFIX]",
1731
  "[GERUND_SUBJECT_PREFIX:1ppe]",
1732
  "i",
1733
  "[SUBJECT:3p]",
1734
  "[IMPERATIVE_PREFIX:2pp]",
1735
  "́",
 
 
1736
  "xe",
1737
- "a'e",
1738
  "t",
1739
  "[SUBJECT:1ppe]",
1740
  "[SUBJECT:1ps]",
1741
  "a",
 
1742
  "[SUBJECT_PREFIX:1ppe]",
1743
  "amo",
1744
  "[OBJECT:2ps]",
 
1745
  "[OBJECT:1ppe]",
1746
  "[NEGATION_SUFFIX:CONSONANT_ENDING]",
1747
  "[OBJECT:MUTUAL]",
1748
  "[GERUND_SUFFIX:CLASS_2:ORAL_VOWEL_ENDING]",
 
1749
  "[NEGATION_PREFIX]",
 
1750
  "[CIRCUMSTANTIAL_SUFFIX:NULL_ENDING]",
1751
  "[SUBJECT:1ppi]",
1752
  "[PERMISSIVE_PREFIX:VOWEL]",
 
1753
  "pe",
1754
  "nde",
1755
  "[OBJECT_MARKER:3p:DEFAULT]",
1756
  "opo",
 
1757
  "[GERUND_SUBJECT_PREFIX:2ps]",
 
 
1758
  "[OBJECT:3p]",
1759
- "îandé",
1760
- "gûi",
1761
  "[OBJECT:2pp:SUBJECT_1P]",
1762
  "[CIRCUMSTANTIAL_SUFFIX:VOWEL_ENDING]",
1763
  "[OBJECT_MARKER:3p:MONOSYLLABIC]",
1764
- "peîepé",
1765
  "[GERUND_SUFFIX:CLASS_1:R]",
1766
- "endé",
1767
  "namo",
1768
  "[SUBJECT:2pp:OBJECT_1P]",
1769
  "mo",
@@ -1772,17 +2031,16 @@
1772
  "[CIRCUMSTANTIAL_SUFFIX:CONSONANT_ENDING]",
1773
  "[ROOT]",
1774
  "[NEGATION_PARTICLE:UME]",
1775
- "e'ym",
1776
  "[SUBJECT_PREFIX:3p]",
1777
  "r",
1778
  "[NEGATION_SUFFIX:VOWEL_ENDING]",
1779
  "[SUBJECT_PREFIX:1ps]",
1780
- "ixé",
1781
  "[GERUND_SUFFIX:CLASS_1:ORAL_VOWEL]",
1782
  "pa",
1783
  "ramo",
1784
  "ere",
1785
  "[OBJECT:REFLEXIVE]",
 
1786
  "[GERUND_SUFFIX:CLASS_1:NASAL_VOWEL]",
1787
  "na",
1788
  "[GERUND_SUFFIX:CLASS_1:B]",
@@ -1792,21 +2050,35 @@
1792
  "[OBJECT:2ps:SUBJECT_1P]",
1793
  "[GERUND_SUBJECT_PREFIX:1ps]",
1794
  "[PERMISSIVE_PREFIX:CONSONANT]",
1795
- "í",
1796
- "oré",
1797
- "umẽ",
1798
  "[PLURIFORM_PREFIX:R]",
1799
  "[SUBJECT:2ps]",
1800
  "o",
1801
  "[SUBJECT:2ps:OBJECT_1P]",
1802
- "îa",
1803
  "[OBJECT:1ps]",
 
1804
  "[GERUND_SUFFIX:CLASS_2:DEFAULT]",
1805
  "[GERUND_SUFFIX:CLASS_2:NASAL_VOWEL_ENDING]",
1806
  "[SUBJECT_PREFIX:1ppi]",
1807
  "[SUBJECT_PREFIX:2ps]",
1808
  "s",
1809
- "[SPACE]"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1810
  ],
1811
  "clean_up_tokenization_spaces": true,
1812
  "eos_token": "</s>",
 
1696
  "rstrip": false,
1697
  "single_word": false,
1698
  "special": true
1699
+ },
1700
+ "32198": {
1701
+ "content": "pe[w1q]ep[w4q]",
1702
+ "lstrip": false,
1703
+ "normalized": false,
1704
+ "rstrip": false,
1705
+ "single_word": false,
1706
+ "special": true
1707
+ },
1708
+ "32199": {
1709
+ "content": "[w1q]os",
1710
+ "lstrip": false,
1711
+ "normalized": false,
1712
+ "rstrip": false,
1713
+ "single_word": false,
1714
+ "special": true
1715
+ },
1716
+ "32200": {
1717
+ "content": "[w1q]and[w4q]",
1718
+ "lstrip": false,
1719
+ "normalized": false,
1720
+ "rstrip": false,
1721
+ "single_word": false,
1722
+ "special": true
1723
+ },
1724
+ "32201": {
1725
+ "content": "e[w15q]ym",
1726
+ "lstrip": false,
1727
+ "normalized": false,
1728
+ "rstrip": false,
1729
+ "single_word": false,
1730
+ "special": true
1731
+ },
1732
+ "32202": {
1733
+ "content": "n[w15q]",
1734
+ "lstrip": false,
1735
+ "normalized": false,
1736
+ "rstrip": false,
1737
+ "single_word": false,
1738
+ "special": true
1739
+ },
1740
+ "32203": {
1741
+ "content": "um[w10q]",
1742
+ "lstrip": false,
1743
+ "normalized": false,
1744
+ "rstrip": false,
1745
+ "single_word": false,
1746
+ "special": true
1747
+ },
1748
+ "32204": {
1749
+ "content": "[w1q]",
1750
+ "lstrip": false,
1751
+ "normalized": false,
1752
+ "rstrip": false,
1753
+ "single_word": false,
1754
+ "special": true
1755
+ },
1756
+ "32205": {
1757
+ "content": "[w1q]ep[w4q]",
1758
+ "lstrip": false,
1759
+ "normalized": false,
1760
+ "rstrip": false,
1761
+ "single_word": false,
1762
+ "special": true
1763
+ },
1764
+ "32206": {
1765
+ "content": "pe[w10q]",
1766
+ "lstrip": false,
1767
+ "normalized": false,
1768
+ "rstrip": false,
1769
+ "single_word": false,
1770
+ "special": true
1771
+ },
1772
+ "32207": {
1773
+ "content": "[w1q]o",
1774
+ "lstrip": false,
1775
+ "normalized": false,
1776
+ "rstrip": false,
1777
+ "single_word": false,
1778
+ "special": true
1779
+ },
1780
+ "32208": {
1781
+ "content": "ix[w4q]",
1782
+ "lstrip": false,
1783
+ "normalized": false,
1784
+ "rstrip": false,
1785
+ "single_word": false,
1786
+ "special": true
1787
+ },
1788
+ "32209": {
1789
+ "content": "[w1q]e",
1790
+ "lstrip": false,
1791
+ "normalized": false,
1792
+ "rstrip": false,
1793
+ "single_word": false,
1794
+ "special": true
1795
+ },
1796
+ "32210": {
1797
+ "content": "or[w4q]",
1798
+ "lstrip": false,
1799
+ "normalized": false,
1800
+ "rstrip": false,
1801
+ "single_word": false,
1802
+ "special": true
1803
+ },
1804
+ "32211": {
1805
+ "content": "[w1q]a",
1806
+ "lstrip": false,
1807
+ "normalized": false,
1808
+ "rstrip": false,
1809
+ "single_word": false,
1810
+ "special": true
1811
+ },
1812
+ "32212": {
1813
+ "content": "g[w0q]i",
1814
+ "lstrip": false,
1815
+ "normalized": false,
1816
+ "rstrip": false,
1817
+ "single_word": false,
1818
+ "special": true
1819
+ },
1820
+ "32213": {
1821
+ "content": "a[w15q]e",
1822
+ "lstrip": false,
1823
+ "normalized": false,
1824
+ "rstrip": false,
1825
+ "single_word": false,
1826
+ "special": true
1827
+ },
1828
+ "32214": {
1829
+ "content": "[w5q]",
1830
+ "lstrip": false,
1831
+ "normalized": false,
1832
+ "rstrip": false,
1833
+ "single_word": false,
1834
+ "special": true
1835
+ },
1836
+ "32215": {
1837
+ "content": "end[w4q]",
1838
+ "lstrip": false,
1839
+ "normalized": false,
1840
+ "rstrip": false,
1841
+ "single_word": false,
1842
+ "special": true
1843
+ },
1844
+ "32216": {
1845
+ "content": "[w0q]",
1846
+ "lstrip": false,
1847
+ "normalized": false,
1848
+ "rstrip": false,
1849
+ "single_word": false,
1850
+ "special": true
1851
+ },
1852
+ "32217": {
1853
+ "content": "[w2q]",
1854
+ "lstrip": false,
1855
+ "normalized": false,
1856
+ "rstrip": false,
1857
+ "single_word": false,
1858
+ "special": true
1859
+ },
1860
+ "32218": {
1861
+ "content": "[w3q]",
1862
+ "lstrip": false,
1863
+ "normalized": false,
1864
+ "rstrip": false,
1865
+ "single_word": false,
1866
+ "special": true
1867
+ },
1868
+ "32219": {
1869
+ "content": "[w4q]",
1870
+ "lstrip": false,
1871
+ "normalized": false,
1872
+ "rstrip": false,
1873
+ "single_word": false,
1874
+ "special": true
1875
+ },
1876
+ "32220": {
1877
+ "content": "[w6q]",
1878
+ "lstrip": false,
1879
+ "normalized": false,
1880
+ "rstrip": false,
1881
+ "single_word": false,
1882
+ "special": true
1883
+ },
1884
+ "32221": {
1885
+ "content": "[w7q]",
1886
+ "lstrip": false,
1887
+ "normalized": false,
1888
+ "rstrip": false,
1889
+ "single_word": false,
1890
+ "special": true
1891
+ },
1892
+ "32222": {
1893
+ "content": "[w8q]",
1894
+ "lstrip": false,
1895
+ "normalized": false,
1896
+ "rstrip": false,
1897
+ "single_word": false,
1898
+ "special": true
1899
+ },
1900
+ "32223": {
1901
+ "content": "[w9q]",
1902
+ "lstrip": false,
1903
+ "normalized": false,
1904
+ "rstrip": false,
1905
+ "single_word": false,
1906
+ "special": true
1907
+ },
1908
+ "32224": {
1909
+ "content": "[w10q]",
1910
+ "lstrip": false,
1911
+ "normalized": false,
1912
+ "rstrip": false,
1913
+ "single_word": false,
1914
+ "special": true
1915
+ },
1916
+ "32225": {
1917
+ "content": "[w11q]",
1918
+ "lstrip": false,
1919
+ "normalized": false,
1920
+ "rstrip": false,
1921
+ "single_word": false,
1922
+ "special": true
1923
+ },
1924
+ "32226": {
1925
+ "content": "[w12q]",
1926
+ "lstrip": false,
1927
+ "normalized": false,
1928
+ "rstrip": false,
1929
+ "single_word": false,
1930
+ "special": true
1931
+ },
1932
+ "32227": {
1933
+ "content": "[w13q]",
1934
+ "lstrip": false,
1935
+ "normalized": false,
1936
+ "rstrip": false,
1937
+ "single_word": false,
1938
+ "special": true
1939
+ },
1940
+ "32228": {
1941
+ "content": "[w14q]",
1942
+ "lstrip": false,
1943
+ "normalized": false,
1944
+ "rstrip": false,
1945
+ "single_word": false,
1946
+ "special": true
1947
+ },
1948
+ "32229": {
1949
+ "content": "[w15q]",
1950
+ "lstrip": false,
1951
+ "normalized": false,
1952
+ "rstrip": false,
1953
+ "single_word": false,
1954
+ "special": true
1955
  }
1956
  },
1957
  "additional_special_tokens": [
1958
  "[SUBJECT:3p:DIRECT]",
1959
  "[GERUND_SUBJECT_PREFIX:1ppi]",
1960
  "[OBJECT:DIRECT]",
 
 
1961
  "e",
1962
  "[OBJECT:2pp]",
1963
+ "pe[w1q]ep[w4q]",
1964
  "ta",
1965
+ "[w1q]os",
 
1966
  "[SUBJECT:2pp]",
1967
+ "[w1q]and[w4q]",
1968
  "[GERUND_SUBJECT_PREFIX:3p]",
1969
  "[GERUND_SUBJECT_PREFIX:2pp]",
1970
  "[NEGATION_PARTICLE:NA]",
1971
  "[IMPERATIVE_PREFIX:2ps]",
 
1972
  "[MAIN_VERB]",
1973
  "[GERUND_SUFFIX:CLASS_1:NASAL_IYU]",
1974
  "[PLURIFORM_PREFIX:S]",
1975
+ "e[w15q]ym",
1976
  "[OBJECT_MARKER:3p:PLURIFORM_PREFIX:MONOSYLLABIC]",
1977
  "[OBJECT:1ppi]",
1978
  "abo",
1979
  "[GERUND_SUFFIX:CLASS_1]",
1980
  "[SUB_VERB]",
1981
+ "n[w15q]",
1982
  "[GERUND_SUFFIX:CLASS_1:IYU]",
 
1983
  "[NEGATION_SUFFIX]",
1984
  "[GERUND_SUBJECT_PREFIX:1ppe]",
1985
  "i",
1986
  "[SUBJECT:3p]",
1987
  "[IMPERATIVE_PREFIX:2pp]",
1988
  "́",
1989
+ "um[w10q]",
1990
+ "[w1q]",
1991
  "xe",
 
1992
  "t",
1993
  "[SUBJECT:1ppe]",
1994
  "[SUBJECT:1ps]",
1995
  "a",
1996
+ "[w1q]ep[w4q]",
1997
  "[SUBJECT_PREFIX:1ppe]",
1998
  "amo",
1999
  "[OBJECT:2ps]",
2000
+ "pe[w10q]",
2001
  "[OBJECT:1ppe]",
2002
  "[NEGATION_SUFFIX:CONSONANT_ENDING]",
2003
  "[OBJECT:MUTUAL]",
2004
  "[GERUND_SUFFIX:CLASS_2:ORAL_VOWEL_ENDING]",
2005
+ "[w1q]o",
2006
  "[NEGATION_PREFIX]",
2007
+ "ix[w4q]",
2008
  "[CIRCUMSTANTIAL_SUFFIX:NULL_ENDING]",
2009
  "[SUBJECT:1ppi]",
2010
  "[PERMISSIVE_PREFIX:VOWEL]",
2011
+ "[w1q]e",
2012
  "pe",
2013
  "nde",
2014
  "[OBJECT_MARKER:3p:DEFAULT]",
2015
  "opo",
2016
+ "or[w4q]",
2017
  "[GERUND_SUBJECT_PREFIX:2ps]",
2018
+ "[w1q]a",
2019
+ "g[w0q]i",
2020
  "[OBJECT:3p]",
 
 
2021
  "[OBJECT:2pp:SUBJECT_1P]",
2022
  "[CIRCUMSTANTIAL_SUFFIX:VOWEL_ENDING]",
2023
  "[OBJECT_MARKER:3p:MONOSYLLABIC]",
2024
+ "a[w15q]e",
2025
  "[GERUND_SUFFIX:CLASS_1:R]",
 
2026
  "namo",
2027
  "[SUBJECT:2pp:OBJECT_1P]",
2028
  "mo",
 
2031
  "[CIRCUMSTANTIAL_SUFFIX:CONSONANT_ENDING]",
2032
  "[ROOT]",
2033
  "[NEGATION_PARTICLE:UME]",
 
2034
  "[SUBJECT_PREFIX:3p]",
2035
  "r",
2036
  "[NEGATION_SUFFIX:VOWEL_ENDING]",
2037
  "[SUBJECT_PREFIX:1ps]",
 
2038
  "[GERUND_SUFFIX:CLASS_1:ORAL_VOWEL]",
2039
  "pa",
2040
  "ramo",
2041
  "ere",
2042
  "[OBJECT:REFLEXIVE]",
2043
+ "[w5q]",
2044
  "[GERUND_SUFFIX:CLASS_1:NASAL_VOWEL]",
2045
  "na",
2046
  "[GERUND_SUFFIX:CLASS_1:B]",
 
2050
  "[OBJECT:2ps:SUBJECT_1P]",
2051
  "[GERUND_SUBJECT_PREFIX:1ps]",
2052
  "[PERMISSIVE_PREFIX:CONSONANT]",
 
 
 
2053
  "[PLURIFORM_PREFIX:R]",
2054
  "[SUBJECT:2ps]",
2055
  "o",
2056
  "[SUBJECT:2ps:OBJECT_1P]",
2057
+ "end[w4q]",
2058
  "[OBJECT:1ps]",
2059
+ "[w0q]",
2060
  "[GERUND_SUFFIX:CLASS_2:DEFAULT]",
2061
  "[GERUND_SUFFIX:CLASS_2:NASAL_VOWEL_ENDING]",
2062
  "[SUBJECT_PREFIX:1ppi]",
2063
  "[SUBJECT_PREFIX:2ps]",
2064
  "s",
2065
+ "[SPACE]",
2066
+ "[w0q]",
2067
+ "[w1q]",
2068
+ "[w2q]",
2069
+ "[w3q]",
2070
+ "[w4q]",
2071
+ "[w5q]",
2072
+ "[w6q]",
2073
+ "[w7q]",
2074
+ "[w8q]",
2075
+ "[w9q]",
2076
+ "[w10q]",
2077
+ "[w11q]",
2078
+ "[w12q]",
2079
+ "[w13q]",
2080
+ "[w14q]",
2081
+ "[w15q]"
2082
  ],
2083
  "clean_up_tokenization_spaces": true,
2084
  "eos_token": "</s>",