Correct added token ids

#35
by sanchit-gandhi HF staff - opened
Files changed (3) hide show
  1. special_tokens_map.json +3 -21
  2. tokenizer.json +213 -213
  3. tokenizer_config.json +0 -0
special_tokens_map.json CHANGED
@@ -108,26 +108,8 @@
108
  "<|nocaptions|>",
109
  "<|notimestamps|>"
110
  ],
111
- "bos_token": {
112
- "content": "<|endoftext|>",
113
- "lstrip": false,
114
- "normalized": true,
115
- "rstrip": false,
116
- "single_word": false
117
- },
118
- "eos_token": {
119
- "content": "<|endoftext|>",
120
- "lstrip": false,
121
- "normalized": true,
122
- "rstrip": false,
123
- "single_word": false
124
- },
125
  "pad_token": "<|endoftext|>",
126
- "unk_token": {
127
- "content": "<|endoftext|>",
128
- "lstrip": false,
129
- "normalized": true,
130
- "rstrip": false,
131
- "single_word": false
132
- }
133
  }
 
108
  "<|nocaptions|>",
109
  "<|notimestamps|>"
110
  ],
111
+ "bos_token": "<|endoftext|>",
112
+ "eos_token": "<|endoftext|>",
 
 
 
 
 
 
 
 
 
 
 
 
113
  "pad_token": "<|endoftext|>",
114
+ "unk_token": "<|endoftext|>"
 
 
 
 
 
 
115
  }
tokenizer.json CHANGED
@@ -9,15 +9,15 @@
9
  "single_word": false,
10
  "lstrip": false,
11
  "rstrip": false,
12
- "normalized": false,
13
  "special": true
14
  },
15
  {
16
  "id": 50258,
17
  "content": "<|startoftranscript|>",
18
  "single_word": false,
19
- "lstrip": false,
20
- "rstrip": false,
21
  "normalized": false,
22
  "special": true
23
  },
@@ -25,8 +25,8 @@
25
  "id": 50259,
26
  "content": "<|en|>",
27
  "single_word": false,
28
- "lstrip": false,
29
- "rstrip": false,
30
  "normalized": false,
31
  "special": true
32
  },
@@ -34,8 +34,8 @@
34
  "id": 50260,
35
  "content": "<|zh|>",
36
  "single_word": false,
37
- "lstrip": false,
38
- "rstrip": false,
39
  "normalized": false,
40
  "special": true
41
  },
@@ -43,8 +43,8 @@
43
  "id": 50261,
44
  "content": "<|de|>",
45
  "single_word": false,
46
- "lstrip": false,
47
- "rstrip": false,
48
  "normalized": false,
49
  "special": true
50
  },
@@ -52,8 +52,8 @@
52
  "id": 50262,
53
  "content": "<|es|>",
54
  "single_word": false,
55
- "lstrip": false,
56
- "rstrip": false,
57
  "normalized": false,
58
  "special": true
59
  },
@@ -61,8 +61,8 @@
61
  "id": 50263,
62
  "content": "<|ru|>",
63
  "single_word": false,
64
- "lstrip": false,
65
- "rstrip": false,
66
  "normalized": false,
67
  "special": true
68
  },
@@ -70,8 +70,8 @@
70
  "id": 50264,
71
  "content": "<|ko|>",
72
  "single_word": false,
73
- "lstrip": false,
74
- "rstrip": false,
75
  "normalized": false,
76
  "special": true
77
  },
@@ -79,8 +79,8 @@
79
  "id": 50265,
80
  "content": "<|fr|>",
81
  "single_word": false,
82
- "lstrip": false,
83
- "rstrip": false,
84
  "normalized": false,
85
  "special": true
86
  },
@@ -88,8 +88,8 @@
88
  "id": 50266,
89
  "content": "<|ja|>",
90
  "single_word": false,
91
- "lstrip": false,
92
- "rstrip": false,
93
  "normalized": false,
94
  "special": true
95
  },
@@ -97,8 +97,8 @@
97
  "id": 50267,
98
  "content": "<|pt|>",
99
  "single_word": false,
100
- "lstrip": false,
101
- "rstrip": false,
102
  "normalized": false,
103
  "special": true
104
  },
@@ -106,8 +106,8 @@
106
  "id": 50268,
107
  "content": "<|tr|>",
108
  "single_word": false,
109
- "lstrip": false,
110
- "rstrip": false,
111
  "normalized": false,
112
  "special": true
113
  },
@@ -115,8 +115,8 @@
115
  "id": 50269,
116
  "content": "<|pl|>",
117
  "single_word": false,
118
- "lstrip": false,
119
- "rstrip": false,
120
  "normalized": false,
121
  "special": true
122
  },
@@ -124,8 +124,8 @@
124
  "id": 50270,
125
  "content": "<|ca|>",
126
  "single_word": false,
127
- "lstrip": false,
128
- "rstrip": false,
129
  "normalized": false,
130
  "special": true
131
  },
@@ -133,8 +133,8 @@
133
  "id": 50271,
134
  "content": "<|nl|>",
135
  "single_word": false,
136
- "lstrip": false,
137
- "rstrip": false,
138
  "normalized": false,
139
  "special": true
140
  },
@@ -142,8 +142,8 @@
142
  "id": 50272,
143
  "content": "<|ar|>",
144
  "single_word": false,
145
- "lstrip": false,
146
- "rstrip": false,
147
  "normalized": false,
148
  "special": true
149
  },
@@ -151,8 +151,8 @@
151
  "id": 50273,
152
  "content": "<|sv|>",
153
  "single_word": false,
154
- "lstrip": false,
155
- "rstrip": false,
156
  "normalized": false,
157
  "special": true
158
  },
@@ -160,8 +160,8 @@
160
  "id": 50274,
161
  "content": "<|it|>",
162
  "single_word": false,
163
- "lstrip": false,
164
- "rstrip": false,
165
  "normalized": false,
166
  "special": true
167
  },
@@ -169,8 +169,8 @@
169
  "id": 50275,
170
  "content": "<|id|>",
171
  "single_word": false,
172
- "lstrip": false,
173
- "rstrip": false,
174
  "normalized": false,
175
  "special": true
176
  },
@@ -178,8 +178,8 @@
178
  "id": 50276,
179
  "content": "<|hi|>",
180
  "single_word": false,
181
- "lstrip": false,
182
- "rstrip": false,
183
  "normalized": false,
184
  "special": true
185
  },
@@ -187,8 +187,8 @@
187
  "id": 50277,
188
  "content": "<|fi|>",
189
  "single_word": false,
190
- "lstrip": false,
191
- "rstrip": false,
192
  "normalized": false,
193
  "special": true
194
  },
@@ -196,8 +196,8 @@
196
  "id": 50278,
197
  "content": "<|vi|>",
198
  "single_word": false,
199
- "lstrip": false,
200
- "rstrip": false,
201
  "normalized": false,
202
  "special": true
203
  },
@@ -205,8 +205,8 @@
205
  "id": 50279,
206
  "content": "<|he|>",
207
  "single_word": false,
208
- "lstrip": false,
209
- "rstrip": false,
210
  "normalized": false,
211
  "special": true
212
  },
@@ -214,8 +214,8 @@
214
  "id": 50280,
215
  "content": "<|uk|>",
216
  "single_word": false,
217
- "lstrip": false,
218
- "rstrip": false,
219
  "normalized": false,
220
  "special": true
221
  },
@@ -223,8 +223,8 @@
223
  "id": 50281,
224
  "content": "<|el|>",
225
  "single_word": false,
226
- "lstrip": false,
227
- "rstrip": false,
228
  "normalized": false,
229
  "special": true
230
  },
@@ -232,8 +232,8 @@
232
  "id": 50282,
233
  "content": "<|ms|>",
234
  "single_word": false,
235
- "lstrip": false,
236
- "rstrip": false,
237
  "normalized": false,
238
  "special": true
239
  },
@@ -241,8 +241,8 @@
241
  "id": 50283,
242
  "content": "<|cs|>",
243
  "single_word": false,
244
- "lstrip": false,
245
- "rstrip": false,
246
  "normalized": false,
247
  "special": true
248
  },
@@ -250,8 +250,8 @@
250
  "id": 50284,
251
  "content": "<|ro|>",
252
  "single_word": false,
253
- "lstrip": false,
254
- "rstrip": false,
255
  "normalized": false,
256
  "special": true
257
  },
@@ -259,8 +259,8 @@
259
  "id": 50285,
260
  "content": "<|da|>",
261
  "single_word": false,
262
- "lstrip": false,
263
- "rstrip": false,
264
  "normalized": false,
265
  "special": true
266
  },
@@ -268,8 +268,8 @@
268
  "id": 50286,
269
  "content": "<|hu|>",
270
  "single_word": false,
271
- "lstrip": false,
272
- "rstrip": false,
273
  "normalized": false,
274
  "special": true
275
  },
@@ -277,8 +277,8 @@
277
  "id": 50287,
278
  "content": "<|ta|>",
279
  "single_word": false,
280
- "lstrip": false,
281
- "rstrip": false,
282
  "normalized": false,
283
  "special": true
284
  },
@@ -286,8 +286,8 @@
286
  "id": 50288,
287
  "content": "<|no|>",
288
  "single_word": false,
289
- "lstrip": false,
290
- "rstrip": false,
291
  "normalized": false,
292
  "special": true
293
  },
@@ -295,8 +295,8 @@
295
  "id": 50289,
296
  "content": "<|th|>",
297
  "single_word": false,
298
- "lstrip": false,
299
- "rstrip": false,
300
  "normalized": false,
301
  "special": true
302
  },
@@ -304,8 +304,8 @@
304
  "id": 50290,
305
  "content": "<|ur|>",
306
  "single_word": false,
307
- "lstrip": false,
308
- "rstrip": false,
309
  "normalized": false,
310
  "special": true
311
  },
@@ -313,8 +313,8 @@
313
  "id": 50291,
314
  "content": "<|hr|>",
315
  "single_word": false,
316
- "lstrip": false,
317
- "rstrip": false,
318
  "normalized": false,
319
  "special": true
320
  },
@@ -322,8 +322,8 @@
322
  "id": 50292,
323
  "content": "<|bg|>",
324
  "single_word": false,
325
- "lstrip": false,
326
- "rstrip": false,
327
  "normalized": false,
328
  "special": true
329
  },
@@ -331,8 +331,8 @@
331
  "id": 50293,
332
  "content": "<|lt|>",
333
  "single_word": false,
334
- "lstrip": false,
335
- "rstrip": false,
336
  "normalized": false,
337
  "special": true
338
  },
@@ -340,8 +340,8 @@
340
  "id": 50294,
341
  "content": "<|la|>",
342
  "single_word": false,
343
- "lstrip": false,
344
- "rstrip": false,
345
  "normalized": false,
346
  "special": true
347
  },
@@ -349,8 +349,8 @@
349
  "id": 50295,
350
  "content": "<|mi|>",
351
  "single_word": false,
352
- "lstrip": false,
353
- "rstrip": false,
354
  "normalized": false,
355
  "special": true
356
  },
@@ -358,8 +358,8 @@
358
  "id": 50296,
359
  "content": "<|ml|>",
360
  "single_word": false,
361
- "lstrip": false,
362
- "rstrip": false,
363
  "normalized": false,
364
  "special": true
365
  },
@@ -367,8 +367,8 @@
367
  "id": 50297,
368
  "content": "<|cy|>",
369
  "single_word": false,
370
- "lstrip": false,
371
- "rstrip": false,
372
  "normalized": false,
373
  "special": true
374
  },
@@ -376,8 +376,8 @@
376
  "id": 50298,
377
  "content": "<|sk|>",
378
  "single_word": false,
379
- "lstrip": false,
380
- "rstrip": false,
381
  "normalized": false,
382
  "special": true
383
  },
@@ -385,8 +385,8 @@
385
  "id": 50299,
386
  "content": "<|te|>",
387
  "single_word": false,
388
- "lstrip": false,
389
- "rstrip": false,
390
  "normalized": false,
391
  "special": true
392
  },
@@ -394,8 +394,8 @@
394
  "id": 50300,
395
  "content": "<|fa|>",
396
  "single_word": false,
397
- "lstrip": false,
398
- "rstrip": false,
399
  "normalized": false,
400
  "special": true
401
  },
@@ -403,8 +403,8 @@
403
  "id": 50301,
404
  "content": "<|lv|>",
405
  "single_word": false,
406
- "lstrip": false,
407
- "rstrip": false,
408
  "normalized": false,
409
  "special": true
410
  },
@@ -412,8 +412,8 @@
412
  "id": 50302,
413
  "content": "<|bn|>",
414
  "single_word": false,
415
- "lstrip": false,
416
- "rstrip": false,
417
  "normalized": false,
418
  "special": true
419
  },
@@ -421,8 +421,8 @@
421
  "id": 50303,
422
  "content": "<|sr|>",
423
  "single_word": false,
424
- "lstrip": false,
425
- "rstrip": false,
426
  "normalized": false,
427
  "special": true
428
  },
@@ -430,8 +430,8 @@
430
  "id": 50304,
431
  "content": "<|az|>",
432
  "single_word": false,
433
- "lstrip": false,
434
- "rstrip": false,
435
  "normalized": false,
436
  "special": true
437
  },
@@ -439,8 +439,8 @@
439
  "id": 50305,
440
  "content": "<|sl|>",
441
  "single_word": false,
442
- "lstrip": false,
443
- "rstrip": false,
444
  "normalized": false,
445
  "special": true
446
  },
@@ -448,8 +448,8 @@
448
  "id": 50306,
449
  "content": "<|kn|>",
450
  "single_word": false,
451
- "lstrip": false,
452
- "rstrip": false,
453
  "normalized": false,
454
  "special": true
455
  },
@@ -457,8 +457,8 @@
457
  "id": 50307,
458
  "content": "<|et|>",
459
  "single_word": false,
460
- "lstrip": false,
461
- "rstrip": false,
462
  "normalized": false,
463
  "special": true
464
  },
@@ -466,8 +466,8 @@
466
  "id": 50308,
467
  "content": "<|mk|>",
468
  "single_word": false,
469
- "lstrip": false,
470
- "rstrip": false,
471
  "normalized": false,
472
  "special": true
473
  },
@@ -475,8 +475,8 @@
475
  "id": 50309,
476
  "content": "<|br|>",
477
  "single_word": false,
478
- "lstrip": false,
479
- "rstrip": false,
480
  "normalized": false,
481
  "special": true
482
  },
@@ -484,8 +484,8 @@
484
  "id": 50310,
485
  "content": "<|eu|>",
486
  "single_word": false,
487
- "lstrip": false,
488
- "rstrip": false,
489
  "normalized": false,
490
  "special": true
491
  },
@@ -493,8 +493,8 @@
493
  "id": 50311,
494
  "content": "<|is|>",
495
  "single_word": false,
496
- "lstrip": false,
497
- "rstrip": false,
498
  "normalized": false,
499
  "special": true
500
  },
@@ -502,8 +502,8 @@
502
  "id": 50312,
503
  "content": "<|hy|>",
504
  "single_word": false,
505
- "lstrip": false,
506
- "rstrip": false,
507
  "normalized": false,
508
  "special": true
509
  },
@@ -511,8 +511,8 @@
511
  "id": 50313,
512
  "content": "<|ne|>",
513
  "single_word": false,
514
- "lstrip": false,
515
- "rstrip": false,
516
  "normalized": false,
517
  "special": true
518
  },
@@ -520,8 +520,8 @@
520
  "id": 50314,
521
  "content": "<|mn|>",
522
  "single_word": false,
523
- "lstrip": false,
524
- "rstrip": false,
525
  "normalized": false,
526
  "special": true
527
  },
@@ -529,8 +529,8 @@
529
  "id": 50315,
530
  "content": "<|bs|>",
531
  "single_word": false,
532
- "lstrip": false,
533
- "rstrip": false,
534
  "normalized": false,
535
  "special": true
536
  },
@@ -538,8 +538,8 @@
538
  "id": 50316,
539
  "content": "<|kk|>",
540
  "single_word": false,
541
- "lstrip": false,
542
- "rstrip": false,
543
  "normalized": false,
544
  "special": true
545
  },
@@ -547,8 +547,8 @@
547
  "id": 50317,
548
  "content": "<|sq|>",
549
  "single_word": false,
550
- "lstrip": false,
551
- "rstrip": false,
552
  "normalized": false,
553
  "special": true
554
  },
@@ -556,8 +556,8 @@
556
  "id": 50318,
557
  "content": "<|sw|>",
558
  "single_word": false,
559
- "lstrip": false,
560
- "rstrip": false,
561
  "normalized": false,
562
  "special": true
563
  },
@@ -565,8 +565,8 @@
565
  "id": 50319,
566
  "content": "<|gl|>",
567
  "single_word": false,
568
- "lstrip": false,
569
- "rstrip": false,
570
  "normalized": false,
571
  "special": true
572
  },
@@ -574,8 +574,8 @@
574
  "id": 50320,
575
  "content": "<|mr|>",
576
  "single_word": false,
577
- "lstrip": false,
578
- "rstrip": false,
579
  "normalized": false,
580
  "special": true
581
  },
@@ -583,8 +583,8 @@
583
  "id": 50321,
584
  "content": "<|pa|>",
585
  "single_word": false,
586
- "lstrip": false,
587
- "rstrip": false,
588
  "normalized": false,
589
  "special": true
590
  },
@@ -592,8 +592,8 @@
592
  "id": 50322,
593
  "content": "<|si|>",
594
  "single_word": false,
595
- "lstrip": false,
596
- "rstrip": false,
597
  "normalized": false,
598
  "special": true
599
  },
@@ -601,8 +601,8 @@
601
  "id": 50323,
602
  "content": "<|km|>",
603
  "single_word": false,
604
- "lstrip": false,
605
- "rstrip": false,
606
  "normalized": false,
607
  "special": true
608
  },
@@ -610,8 +610,8 @@
610
  "id": 50324,
611
  "content": "<|sn|>",
612
  "single_word": false,
613
- "lstrip": false,
614
- "rstrip": false,
615
  "normalized": false,
616
  "special": true
617
  },
@@ -619,8 +619,8 @@
619
  "id": 50325,
620
  "content": "<|yo|>",
621
  "single_word": false,
622
- "lstrip": false,
623
- "rstrip": false,
624
  "normalized": false,
625
  "special": true
626
  },
@@ -628,8 +628,8 @@
628
  "id": 50326,
629
  "content": "<|so|>",
630
  "single_word": false,
631
- "lstrip": false,
632
- "rstrip": false,
633
  "normalized": false,
634
  "special": true
635
  },
@@ -637,8 +637,8 @@
637
  "id": 50327,
638
  "content": "<|af|>",
639
  "single_word": false,
640
- "lstrip": false,
641
- "rstrip": false,
642
  "normalized": false,
643
  "special": true
644
  },
@@ -646,8 +646,8 @@
646
  "id": 50328,
647
  "content": "<|oc|>",
648
  "single_word": false,
649
- "lstrip": false,
650
- "rstrip": false,
651
  "normalized": false,
652
  "special": true
653
  },
@@ -655,8 +655,8 @@
655
  "id": 50329,
656
  "content": "<|ka|>",
657
  "single_word": false,
658
- "lstrip": false,
659
- "rstrip": false,
660
  "normalized": false,
661
  "special": true
662
  },
@@ -664,8 +664,8 @@
664
  "id": 50330,
665
  "content": "<|be|>",
666
  "single_word": false,
667
- "lstrip": false,
668
- "rstrip": false,
669
  "normalized": false,
670
  "special": true
671
  },
@@ -673,8 +673,8 @@
673
  "id": 50331,
674
  "content": "<|tg|>",
675
  "single_word": false,
676
- "lstrip": false,
677
- "rstrip": false,
678
  "normalized": false,
679
  "special": true
680
  },
@@ -682,8 +682,8 @@
682
  "id": 50332,
683
  "content": "<|sd|>",
684
  "single_word": false,
685
- "lstrip": false,
686
- "rstrip": false,
687
  "normalized": false,
688
  "special": true
689
  },
@@ -691,8 +691,8 @@
691
  "id": 50333,
692
  "content": "<|gu|>",
693
  "single_word": false,
694
- "lstrip": false,
695
- "rstrip": false,
696
  "normalized": false,
697
  "special": true
698
  },
@@ -700,8 +700,8 @@
700
  "id": 50334,
701
  "content": "<|am|>",
702
  "single_word": false,
703
- "lstrip": false,
704
- "rstrip": false,
705
  "normalized": false,
706
  "special": true
707
  },
@@ -709,8 +709,8 @@
709
  "id": 50335,
710
  "content": "<|yi|>",
711
  "single_word": false,
712
- "lstrip": false,
713
- "rstrip": false,
714
  "normalized": false,
715
  "special": true
716
  },
@@ -718,8 +718,8 @@
718
  "id": 50336,
719
  "content": "<|lo|>",
720
  "single_word": false,
721
- "lstrip": false,
722
- "rstrip": false,
723
  "normalized": false,
724
  "special": true
725
  },
@@ -727,8 +727,8 @@
727
  "id": 50337,
728
  "content": "<|uz|>",
729
  "single_word": false,
730
- "lstrip": false,
731
- "rstrip": false,
732
  "normalized": false,
733
  "special": true
734
  },
@@ -736,8 +736,8 @@
736
  "id": 50338,
737
  "content": "<|fo|>",
738
  "single_word": false,
739
- "lstrip": false,
740
- "rstrip": false,
741
  "normalized": false,
742
  "special": true
743
  },
@@ -745,8 +745,8 @@
745
  "id": 50339,
746
  "content": "<|ht|>",
747
  "single_word": false,
748
- "lstrip": false,
749
- "rstrip": false,
750
  "normalized": false,
751
  "special": true
752
  },
@@ -754,8 +754,8 @@
754
  "id": 50340,
755
  "content": "<|ps|>",
756
  "single_word": false,
757
- "lstrip": false,
758
- "rstrip": false,
759
  "normalized": false,
760
  "special": true
761
  },
@@ -763,8 +763,8 @@
763
  "id": 50341,
764
  "content": "<|tk|>",
765
  "single_word": false,
766
- "lstrip": false,
767
- "rstrip": false,
768
  "normalized": false,
769
  "special": true
770
  },
@@ -772,8 +772,8 @@
772
  "id": 50342,
773
  "content": "<|nn|>",
774
  "single_word": false,
775
- "lstrip": false,
776
- "rstrip": false,
777
  "normalized": false,
778
  "special": true
779
  },
@@ -781,8 +781,8 @@
781
  "id": 50343,
782
  "content": "<|mt|>",
783
  "single_word": false,
784
- "lstrip": false,
785
- "rstrip": false,
786
  "normalized": false,
787
  "special": true
788
  },
@@ -790,8 +790,8 @@
790
  "id": 50344,
791
  "content": "<|sa|>",
792
  "single_word": false,
793
- "lstrip": false,
794
- "rstrip": false,
795
  "normalized": false,
796
  "special": true
797
  },
@@ -799,8 +799,8 @@
799
  "id": 50345,
800
  "content": "<|lb|>",
801
  "single_word": false,
802
- "lstrip": false,
803
- "rstrip": false,
804
  "normalized": false,
805
  "special": true
806
  },
@@ -808,8 +808,8 @@
808
  "id": 50346,
809
  "content": "<|my|>",
810
  "single_word": false,
811
- "lstrip": false,
812
- "rstrip": false,
813
  "normalized": false,
814
  "special": true
815
  },
@@ -817,8 +817,8 @@
817
  "id": 50347,
818
  "content": "<|bo|>",
819
  "single_word": false,
820
- "lstrip": false,
821
- "rstrip": false,
822
  "normalized": false,
823
  "special": true
824
  },
@@ -826,8 +826,8 @@
826
  "id": 50348,
827
  "content": "<|tl|>",
828
  "single_word": false,
829
- "lstrip": false,
830
- "rstrip": false,
831
  "normalized": false,
832
  "special": true
833
  },
@@ -835,8 +835,8 @@
835
  "id": 50349,
836
  "content": "<|mg|>",
837
  "single_word": false,
838
- "lstrip": false,
839
- "rstrip": false,
840
  "normalized": false,
841
  "special": true
842
  },
@@ -844,8 +844,8 @@
844
  "id": 50350,
845
  "content": "<|as|>",
846
  "single_word": false,
847
- "lstrip": false,
848
- "rstrip": false,
849
  "normalized": false,
850
  "special": true
851
  },
@@ -853,8 +853,8 @@
853
  "id": 50351,
854
  "content": "<|tt|>",
855
  "single_word": false,
856
- "lstrip": false,
857
- "rstrip": false,
858
  "normalized": false,
859
  "special": true
860
  },
@@ -862,8 +862,8 @@
862
  "id": 50352,
863
  "content": "<|haw|>",
864
  "single_word": false,
865
- "lstrip": false,
866
- "rstrip": false,
867
  "normalized": false,
868
  "special": true
869
  },
@@ -871,8 +871,8 @@
871
  "id": 50353,
872
  "content": "<|ln|>",
873
  "single_word": false,
874
- "lstrip": false,
875
- "rstrip": false,
876
  "normalized": false,
877
  "special": true
878
  },
@@ -880,8 +880,8 @@
880
  "id": 50354,
881
  "content": "<|ha|>",
882
  "single_word": false,
883
- "lstrip": false,
884
- "rstrip": false,
885
  "normalized": false,
886
  "special": true
887
  },
@@ -889,8 +889,8 @@
889
  "id": 50355,
890
  "content": "<|ba|>",
891
  "single_word": false,
892
- "lstrip": false,
893
- "rstrip": false,
894
  "normalized": false,
895
  "special": true
896
  },
@@ -898,8 +898,8 @@
898
  "id": 50356,
899
  "content": "<|jw|>",
900
  "single_word": false,
901
- "lstrip": false,
902
- "rstrip": false,
903
  "normalized": false,
904
  "special": true
905
  },
@@ -907,8 +907,8 @@
907
  "id": 50357,
908
  "content": "<|su|>",
909
  "single_word": false,
910
- "lstrip": false,
911
- "rstrip": false,
912
  "normalized": false,
913
  "special": true
914
  },
@@ -916,8 +916,8 @@
916
  "id": 50358,
917
  "content": "<|translate|>",
918
  "single_word": false,
919
- "lstrip": false,
920
- "rstrip": false,
921
  "normalized": false,
922
  "special": true
923
  },
@@ -925,8 +925,8 @@
925
  "id": 50359,
926
  "content": "<|transcribe|>",
927
  "single_word": false,
928
- "lstrip": false,
929
- "rstrip": false,
930
  "normalized": false,
931
  "special": true
932
  },
@@ -934,8 +934,8 @@
934
  "id": 50360,
935
  "content": "<|startoflm|>",
936
  "single_word": false,
937
- "lstrip": false,
938
- "rstrip": false,
939
  "normalized": false,
940
  "special": true
941
  },
@@ -943,8 +943,8 @@
943
  "id": 50361,
944
  "content": "<|startofprev|>",
945
  "single_word": false,
946
- "lstrip": false,
947
- "rstrip": false,
948
  "normalized": false,
949
  "special": true
950
  },
@@ -952,8 +952,8 @@
952
  "id": 50362,
953
  "content": "<|nocaptions|>",
954
  "single_word": false,
955
- "lstrip": false,
956
- "rstrip": false,
957
  "normalized": false,
958
  "special": true
959
  },
@@ -961,8 +961,8 @@
961
  "id": 50363,
962
  "content": "<|notimestamps|>",
963
  "single_word": false,
964
- "lstrip": false,
965
- "rstrip": false,
966
  "normalized": false,
967
  "special": true
968
  },
 
9
  "single_word": false,
10
  "lstrip": false,
11
  "rstrip": false,
12
+ "normalized": true,
13
  "special": true
14
  },
15
  {
16
  "id": 50258,
17
  "content": "<|startoftranscript|>",
18
  "single_word": false,
19
+ "lstrip": true,
20
+ "rstrip": true,
21
  "normalized": false,
22
  "special": true
23
  },
 
25
  "id": 50259,
26
  "content": "<|en|>",
27
  "single_word": false,
28
+ "lstrip": true,
29
+ "rstrip": true,
30
  "normalized": false,
31
  "special": true
32
  },
 
34
  "id": 50260,
35
  "content": "<|zh|>",
36
  "single_word": false,
37
+ "lstrip": true,
38
+ "rstrip": true,
39
  "normalized": false,
40
  "special": true
41
  },
 
43
  "id": 50261,
44
  "content": "<|de|>",
45
  "single_word": false,
46
+ "lstrip": true,
47
+ "rstrip": true,
48
  "normalized": false,
49
  "special": true
50
  },
 
52
  "id": 50262,
53
  "content": "<|es|>",
54
  "single_word": false,
55
+ "lstrip": true,
56
+ "rstrip": true,
57
  "normalized": false,
58
  "special": true
59
  },
 
61
  "id": 50263,
62
  "content": "<|ru|>",
63
  "single_word": false,
64
+ "lstrip": true,
65
+ "rstrip": true,
66
  "normalized": false,
67
  "special": true
68
  },
 
70
  "id": 50264,
71
  "content": "<|ko|>",
72
  "single_word": false,
73
+ "lstrip": true,
74
+ "rstrip": true,
75
  "normalized": false,
76
  "special": true
77
  },
 
79
  "id": 50265,
80
  "content": "<|fr|>",
81
  "single_word": false,
82
+ "lstrip": true,
83
+ "rstrip": true,
84
  "normalized": false,
85
  "special": true
86
  },
 
88
  "id": 50266,
89
  "content": "<|ja|>",
90
  "single_word": false,
91
+ "lstrip": true,
92
+ "rstrip": true,
93
  "normalized": false,
94
  "special": true
95
  },
 
97
  "id": 50267,
98
  "content": "<|pt|>",
99
  "single_word": false,
100
+ "lstrip": true,
101
+ "rstrip": true,
102
  "normalized": false,
103
  "special": true
104
  },
 
106
  "id": 50268,
107
  "content": "<|tr|>",
108
  "single_word": false,
109
+ "lstrip": true,
110
+ "rstrip": true,
111
  "normalized": false,
112
  "special": true
113
  },
 
115
  "id": 50269,
116
  "content": "<|pl|>",
117
  "single_word": false,
118
+ "lstrip": true,
119
+ "rstrip": true,
120
  "normalized": false,
121
  "special": true
122
  },
 
124
  "id": 50270,
125
  "content": "<|ca|>",
126
  "single_word": false,
127
+ "lstrip": true,
128
+ "rstrip": true,
129
  "normalized": false,
130
  "special": true
131
  },
 
133
  "id": 50271,
134
  "content": "<|nl|>",
135
  "single_word": false,
136
+ "lstrip": true,
137
+ "rstrip": true,
138
  "normalized": false,
139
  "special": true
140
  },
 
142
  "id": 50272,
143
  "content": "<|ar|>",
144
  "single_word": false,
145
+ "lstrip": true,
146
+ "rstrip": true,
147
  "normalized": false,
148
  "special": true
149
  },
 
151
  "id": 50273,
152
  "content": "<|sv|>",
153
  "single_word": false,
154
+ "lstrip": true,
155
+ "rstrip": true,
156
  "normalized": false,
157
  "special": true
158
  },
 
160
  "id": 50274,
161
  "content": "<|it|>",
162
  "single_word": false,
163
+ "lstrip": true,
164
+ "rstrip": true,
165
  "normalized": false,
166
  "special": true
167
  },
 
169
  "id": 50275,
170
  "content": "<|id|>",
171
  "single_word": false,
172
+ "lstrip": true,
173
+ "rstrip": true,
174
  "normalized": false,
175
  "special": true
176
  },
 
178
  "id": 50276,
179
  "content": "<|hi|>",
180
  "single_word": false,
181
+ "lstrip": true,
182
+ "rstrip": true,
183
  "normalized": false,
184
  "special": true
185
  },
 
187
  "id": 50277,
188
  "content": "<|fi|>",
189
  "single_word": false,
190
+ "lstrip": true,
191
+ "rstrip": true,
192
  "normalized": false,
193
  "special": true
194
  },
 
196
  "id": 50278,
197
  "content": "<|vi|>",
198
  "single_word": false,
199
+ "lstrip": true,
200
+ "rstrip": true,
201
  "normalized": false,
202
  "special": true
203
  },
 
205
  "id": 50279,
206
  "content": "<|he|>",
207
  "single_word": false,
208
+ "lstrip": true,
209
+ "rstrip": true,
210
  "normalized": false,
211
  "special": true
212
  },
 
214
  "id": 50280,
215
  "content": "<|uk|>",
216
  "single_word": false,
217
+ "lstrip": true,
218
+ "rstrip": true,
219
  "normalized": false,
220
  "special": true
221
  },
 
223
  "id": 50281,
224
  "content": "<|el|>",
225
  "single_word": false,
226
+ "lstrip": true,
227
+ "rstrip": true,
228
  "normalized": false,
229
  "special": true
230
  },
 
232
  "id": 50282,
233
  "content": "<|ms|>",
234
  "single_word": false,
235
+ "lstrip": true,
236
+ "rstrip": true,
237
  "normalized": false,
238
  "special": true
239
  },
 
241
  "id": 50283,
242
  "content": "<|cs|>",
243
  "single_word": false,
244
+ "lstrip": true,
245
+ "rstrip": true,
246
  "normalized": false,
247
  "special": true
248
  },
 
250
  "id": 50284,
251
  "content": "<|ro|>",
252
  "single_word": false,
253
+ "lstrip": true,
254
+ "rstrip": true,
255
  "normalized": false,
256
  "special": true
257
  },
 
259
  "id": 50285,
260
  "content": "<|da|>",
261
  "single_word": false,
262
+ "lstrip": true,
263
+ "rstrip": true,
264
  "normalized": false,
265
  "special": true
266
  },
 
268
  "id": 50286,
269
  "content": "<|hu|>",
270
  "single_word": false,
271
+ "lstrip": true,
272
+ "rstrip": true,
273
  "normalized": false,
274
  "special": true
275
  },
 
277
  "id": 50287,
278
  "content": "<|ta|>",
279
  "single_word": false,
280
+ "lstrip": true,
281
+ "rstrip": true,
282
  "normalized": false,
283
  "special": true
284
  },
 
286
  "id": 50288,
287
  "content": "<|no|>",
288
  "single_word": false,
289
+ "lstrip": true,
290
+ "rstrip": true,
291
  "normalized": false,
292
  "special": true
293
  },
 
295
  "id": 50289,
296
  "content": "<|th|>",
297
  "single_word": false,
298
+ "lstrip": true,
299
+ "rstrip": true,
300
  "normalized": false,
301
  "special": true
302
  },
 
304
  "id": 50290,
305
  "content": "<|ur|>",
306
  "single_word": false,
307
+ "lstrip": true,
308
+ "rstrip": true,
309
  "normalized": false,
310
  "special": true
311
  },
 
313
  "id": 50291,
314
  "content": "<|hr|>",
315
  "single_word": false,
316
+ "lstrip": true,
317
+ "rstrip": true,
318
  "normalized": false,
319
  "special": true
320
  },
 
322
  "id": 50292,
323
  "content": "<|bg|>",
324
  "single_word": false,
325
+ "lstrip": true,
326
+ "rstrip": true,
327
  "normalized": false,
328
  "special": true
329
  },
 
331
  "id": 50293,
332
  "content": "<|lt|>",
333
  "single_word": false,
334
+ "lstrip": true,
335
+ "rstrip": true,
336
  "normalized": false,
337
  "special": true
338
  },
 
340
  "id": 50294,
341
  "content": "<|la|>",
342
  "single_word": false,
343
+ "lstrip": true,
344
+ "rstrip": true,
345
  "normalized": false,
346
  "special": true
347
  },
 
349
  "id": 50295,
350
  "content": "<|mi|>",
351
  "single_word": false,
352
+ "lstrip": true,
353
+ "rstrip": true,
354
  "normalized": false,
355
  "special": true
356
  },
 
358
  "id": 50296,
359
  "content": "<|ml|>",
360
  "single_word": false,
361
+ "lstrip": true,
362
+ "rstrip": true,
363
  "normalized": false,
364
  "special": true
365
  },
 
367
  "id": 50297,
368
  "content": "<|cy|>",
369
  "single_word": false,
370
+ "lstrip": true,
371
+ "rstrip": true,
372
  "normalized": false,
373
  "special": true
374
  },
 
376
  "id": 50298,
377
  "content": "<|sk|>",
378
  "single_word": false,
379
+ "lstrip": true,
380
+ "rstrip": true,
381
  "normalized": false,
382
  "special": true
383
  },
 
385
  "id": 50299,
386
  "content": "<|te|>",
387
  "single_word": false,
388
+ "lstrip": true,
389
+ "rstrip": true,
390
  "normalized": false,
391
  "special": true
392
  },
 
394
  "id": 50300,
395
  "content": "<|fa|>",
396
  "single_word": false,
397
+ "lstrip": true,
398
+ "rstrip": true,
399
  "normalized": false,
400
  "special": true
401
  },
 
403
  "id": 50301,
404
  "content": "<|lv|>",
405
  "single_word": false,
406
+ "lstrip": true,
407
+ "rstrip": true,
408
  "normalized": false,
409
  "special": true
410
  },
 
412
  "id": 50302,
413
  "content": "<|bn|>",
414
  "single_word": false,
415
+ "lstrip": true,
416
+ "rstrip": true,
417
  "normalized": false,
418
  "special": true
419
  },
 
421
  "id": 50303,
422
  "content": "<|sr|>",
423
  "single_word": false,
424
+ "lstrip": true,
425
+ "rstrip": true,
426
  "normalized": false,
427
  "special": true
428
  },
 
430
  "id": 50304,
431
  "content": "<|az|>",
432
  "single_word": false,
433
+ "lstrip": true,
434
+ "rstrip": true,
435
  "normalized": false,
436
  "special": true
437
  },
 
439
  "id": 50305,
440
  "content": "<|sl|>",
441
  "single_word": false,
442
+ "lstrip": true,
443
+ "rstrip": true,
444
  "normalized": false,
445
  "special": true
446
  },
 
448
  "id": 50306,
449
  "content": "<|kn|>",
450
  "single_word": false,
451
+ "lstrip": true,
452
+ "rstrip": true,
453
  "normalized": false,
454
  "special": true
455
  },
 
457
  "id": 50307,
458
  "content": "<|et|>",
459
  "single_word": false,
460
+ "lstrip": true,
461
+ "rstrip": true,
462
  "normalized": false,
463
  "special": true
464
  },
 
466
  "id": 50308,
467
  "content": "<|mk|>",
468
  "single_word": false,
469
+ "lstrip": true,
470
+ "rstrip": true,
471
  "normalized": false,
472
  "special": true
473
  },
 
475
  "id": 50309,
476
  "content": "<|br|>",
477
  "single_word": false,
478
+ "lstrip": true,
479
+ "rstrip": true,
480
  "normalized": false,
481
  "special": true
482
  },
 
484
  "id": 50310,
485
  "content": "<|eu|>",
486
  "single_word": false,
487
+ "lstrip": true,
488
+ "rstrip": true,
489
  "normalized": false,
490
  "special": true
491
  },
 
493
  "id": 50311,
494
  "content": "<|is|>",
495
  "single_word": false,
496
+ "lstrip": true,
497
+ "rstrip": true,
498
  "normalized": false,
499
  "special": true
500
  },
 
502
  "id": 50312,
503
  "content": "<|hy|>",
504
  "single_word": false,
505
+ "lstrip": true,
506
+ "rstrip": true,
507
  "normalized": false,
508
  "special": true
509
  },
 
511
  "id": 50313,
512
  "content": "<|ne|>",
513
  "single_word": false,
514
+ "lstrip": true,
515
+ "rstrip": true,
516
  "normalized": false,
517
  "special": true
518
  },
 
520
  "id": 50314,
521
  "content": "<|mn|>",
522
  "single_word": false,
523
+ "lstrip": true,
524
+ "rstrip": true,
525
  "normalized": false,
526
  "special": true
527
  },
 
529
  "id": 50315,
530
  "content": "<|bs|>",
531
  "single_word": false,
532
+ "lstrip": true,
533
+ "rstrip": true,
534
  "normalized": false,
535
  "special": true
536
  },
 
538
  "id": 50316,
539
  "content": "<|kk|>",
540
  "single_word": false,
541
+ "lstrip": true,
542
+ "rstrip": true,
543
  "normalized": false,
544
  "special": true
545
  },
 
547
  "id": 50317,
548
  "content": "<|sq|>",
549
  "single_word": false,
550
+ "lstrip": true,
551
+ "rstrip": true,
552
  "normalized": false,
553
  "special": true
554
  },
 
556
  "id": 50318,
557
  "content": "<|sw|>",
558
  "single_word": false,
559
+ "lstrip": true,
560
+ "rstrip": true,
561
  "normalized": false,
562
  "special": true
563
  },
 
565
  "id": 50319,
566
  "content": "<|gl|>",
567
  "single_word": false,
568
+ "lstrip": true,
569
+ "rstrip": true,
570
  "normalized": false,
571
  "special": true
572
  },
 
574
  "id": 50320,
575
  "content": "<|mr|>",
576
  "single_word": false,
577
+ "lstrip": true,
578
+ "rstrip": true,
579
  "normalized": false,
580
  "special": true
581
  },
 
583
  "id": 50321,
584
  "content": "<|pa|>",
585
  "single_word": false,
586
+ "lstrip": true,
587
+ "rstrip": true,
588
  "normalized": false,
589
  "special": true
590
  },
 
592
  "id": 50322,
593
  "content": "<|si|>",
594
  "single_word": false,
595
+ "lstrip": true,
596
+ "rstrip": true,
597
  "normalized": false,
598
  "special": true
599
  },
 
601
  "id": 50323,
602
  "content": "<|km|>",
603
  "single_word": false,
604
+ "lstrip": true,
605
+ "rstrip": true,
606
  "normalized": false,
607
  "special": true
608
  },
 
610
  "id": 50324,
611
  "content": "<|sn|>",
612
  "single_word": false,
613
+ "lstrip": true,
614
+ "rstrip": true,
615
  "normalized": false,
616
  "special": true
617
  },
 
619
  "id": 50325,
620
  "content": "<|yo|>",
621
  "single_word": false,
622
+ "lstrip": true,
623
+ "rstrip": true,
624
  "normalized": false,
625
  "special": true
626
  },
 
628
  "id": 50326,
629
  "content": "<|so|>",
630
  "single_word": false,
631
+ "lstrip": true,
632
+ "rstrip": true,
633
  "normalized": false,
634
  "special": true
635
  },
 
637
  "id": 50327,
638
  "content": "<|af|>",
639
  "single_word": false,
640
+ "lstrip": true,
641
+ "rstrip": true,
642
  "normalized": false,
643
  "special": true
644
  },
 
646
  "id": 50328,
647
  "content": "<|oc|>",
648
  "single_word": false,
649
+ "lstrip": true,
650
+ "rstrip": true,
651
  "normalized": false,
652
  "special": true
653
  },
 
655
  "id": 50329,
656
  "content": "<|ka|>",
657
  "single_word": false,
658
+ "lstrip": true,
659
+ "rstrip": true,
660
  "normalized": false,
661
  "special": true
662
  },
 
664
  "id": 50330,
665
  "content": "<|be|>",
666
  "single_word": false,
667
+ "lstrip": true,
668
+ "rstrip": true,
669
  "normalized": false,
670
  "special": true
671
  },
 
673
  "id": 50331,
674
  "content": "<|tg|>",
675
  "single_word": false,
676
+ "lstrip": true,
677
+ "rstrip": true,
678
  "normalized": false,
679
  "special": true
680
  },
 
682
  "id": 50332,
683
  "content": "<|sd|>",
684
  "single_word": false,
685
+ "lstrip": true,
686
+ "rstrip": true,
687
  "normalized": false,
688
  "special": true
689
  },
 
691
  "id": 50333,
692
  "content": "<|gu|>",
693
  "single_word": false,
694
+ "lstrip": true,
695
+ "rstrip": true,
696
  "normalized": false,
697
  "special": true
698
  },
 
700
  "id": 50334,
701
  "content": "<|am|>",
702
  "single_word": false,
703
+ "lstrip": true,
704
+ "rstrip": true,
705
  "normalized": false,
706
  "special": true
707
  },
 
709
  "id": 50335,
710
  "content": "<|yi|>",
711
  "single_word": false,
712
+ "lstrip": true,
713
+ "rstrip": true,
714
  "normalized": false,
715
  "special": true
716
  },
 
718
  "id": 50336,
719
  "content": "<|lo|>",
720
  "single_word": false,
721
+ "lstrip": true,
722
+ "rstrip": true,
723
  "normalized": false,
724
  "special": true
725
  },
 
727
  "id": 50337,
728
  "content": "<|uz|>",
729
  "single_word": false,
730
+ "lstrip": true,
731
+ "rstrip": true,
732
  "normalized": false,
733
  "special": true
734
  },
 
736
  "id": 50338,
737
  "content": "<|fo|>",
738
  "single_word": false,
739
+ "lstrip": true,
740
+ "rstrip": true,
741
  "normalized": false,
742
  "special": true
743
  },
 
745
  "id": 50339,
746
  "content": "<|ht|>",
747
  "single_word": false,
748
+ "lstrip": true,
749
+ "rstrip": true,
750
  "normalized": false,
751
  "special": true
752
  },
 
754
  "id": 50340,
755
  "content": "<|ps|>",
756
  "single_word": false,
757
+ "lstrip": true,
758
+ "rstrip": true,
759
  "normalized": false,
760
  "special": true
761
  },
 
763
  "id": 50341,
764
  "content": "<|tk|>",
765
  "single_word": false,
766
+ "lstrip": true,
767
+ "rstrip": true,
768
  "normalized": false,
769
  "special": true
770
  },
 
772
  "id": 50342,
773
  "content": "<|nn|>",
774
  "single_word": false,
775
+ "lstrip": true,
776
+ "rstrip": true,
777
  "normalized": false,
778
  "special": true
779
  },
 
781
  "id": 50343,
782
  "content": "<|mt|>",
783
  "single_word": false,
784
+ "lstrip": true,
785
+ "rstrip": true,
786
  "normalized": false,
787
  "special": true
788
  },
 
790
  "id": 50344,
791
  "content": "<|sa|>",
792
  "single_word": false,
793
+ "lstrip": true,
794
+ "rstrip": true,
795
  "normalized": false,
796
  "special": true
797
  },
 
799
  "id": 50345,
800
  "content": "<|lb|>",
801
  "single_word": false,
802
+ "lstrip": true,
803
+ "rstrip": true,
804
  "normalized": false,
805
  "special": true
806
  },
 
808
  "id": 50346,
809
  "content": "<|my|>",
810
  "single_word": false,
811
+ "lstrip": true,
812
+ "rstrip": true,
813
  "normalized": false,
814
  "special": true
815
  },
 
817
  "id": 50347,
818
  "content": "<|bo|>",
819
  "single_word": false,
820
+ "lstrip": true,
821
+ "rstrip": true,
822
  "normalized": false,
823
  "special": true
824
  },
 
826
  "id": 50348,
827
  "content": "<|tl|>",
828
  "single_word": false,
829
+ "lstrip": true,
830
+ "rstrip": true,
831
  "normalized": false,
832
  "special": true
833
  },
 
835
  "id": 50349,
836
  "content": "<|mg|>",
837
  "single_word": false,
838
+ "lstrip": true,
839
+ "rstrip": true,
840
  "normalized": false,
841
  "special": true
842
  },
 
844
  "id": 50350,
845
  "content": "<|as|>",
846
  "single_word": false,
847
+ "lstrip": true,
848
+ "rstrip": true,
849
  "normalized": false,
850
  "special": true
851
  },
 
853
  "id": 50351,
854
  "content": "<|tt|>",
855
  "single_word": false,
856
+ "lstrip": true,
857
+ "rstrip": true,
858
  "normalized": false,
859
  "special": true
860
  },
 
862
  "id": 50352,
863
  "content": "<|haw|>",
864
  "single_word": false,
865
+ "lstrip": true,
866
+ "rstrip": true,
867
  "normalized": false,
868
  "special": true
869
  },
 
871
  "id": 50353,
872
  "content": "<|ln|>",
873
  "single_word": false,
874
+ "lstrip": true,
875
+ "rstrip": true,
876
  "normalized": false,
877
  "special": true
878
  },
 
880
  "id": 50354,
881
  "content": "<|ha|>",
882
  "single_word": false,
883
+ "lstrip": true,
884
+ "rstrip": true,
885
  "normalized": false,
886
  "special": true
887
  },
 
889
  "id": 50355,
890
  "content": "<|ba|>",
891
  "single_word": false,
892
+ "lstrip": true,
893
+ "rstrip": true,
894
  "normalized": false,
895
  "special": true
896
  },
 
898
  "id": 50356,
899
  "content": "<|jw|>",
900
  "single_word": false,
901
+ "lstrip": true,
902
+ "rstrip": true,
903
  "normalized": false,
904
  "special": true
905
  },
 
907
  "id": 50357,
908
  "content": "<|su|>",
909
  "single_word": false,
910
+ "lstrip": true,
911
+ "rstrip": true,
912
  "normalized": false,
913
  "special": true
914
  },
 
916
  "id": 50358,
917
  "content": "<|translate|>",
918
  "single_word": false,
919
+ "lstrip": true,
920
+ "rstrip": true,
921
  "normalized": false,
922
  "special": true
923
  },
 
925
  "id": 50359,
926
  "content": "<|transcribe|>",
927
  "single_word": false,
928
+ "lstrip": true,
929
+ "rstrip": true,
930
  "normalized": false,
931
  "special": true
932
  },
 
934
  "id": 50360,
935
  "content": "<|startoflm|>",
936
  "single_word": false,
937
+ "lstrip": true,
938
+ "rstrip": true,
939
  "normalized": false,
940
  "special": true
941
  },
 
943
  "id": 50361,
944
  "content": "<|startofprev|>",
945
  "single_word": false,
946
+ "lstrip": true,
947
+ "rstrip": true,
948
  "normalized": false,
949
  "special": true
950
  },
 
952
  "id": 50362,
953
  "content": "<|nocaptions|>",
954
  "single_word": false,
955
+ "lstrip": true,
956
+ "rstrip": true,
957
  "normalized": false,
958
  "special": true
959
  },
 
961
  "id": 50363,
962
  "content": "<|notimestamps|>",
963
  "single_word": false,
964
+ "lstrip": true,
965
+ "rstrip": true,
966
  "normalized": false,
967
  "special": true
968
  },
tokenizer_config.json CHANGED
The diff for this file is too large to render. See raw diff