silabi-tokenizer / tokenizer.json
nguthiru's picture
Upload tokenizer
241cc30 verified
{
"version": "1.0",
"truncation": null,
"padding": null,
"added_tokens": [
{
"id": 0,
"content": "[PAD]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 1,
"content": "[UNK]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 2,
"content": "[CLS]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 3,
"content": "[SEP]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 4,
"content": "[MASK]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 5,
"content": "[SOS]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 6,
"content": "[EOS]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 7,
"content": "[SPACE]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
}
],
"normalizer": null,
"pre_tokenizer": {
"type": "Whitespace"
},
"post_processor": null,
"decoder": null,
"model": {
"type": "WordLevel",
"vocab": {
"[PAD]": 0,
"[UNK]": 1,
"[CLS]": 2,
"[SEP]": 3,
"[MASK]": 4,
"[SOS]": 5,
"[EOS]": 6,
"[SPACE]": 7,
"’": 8,
"Ng": 9,
"a": 10,
"e": 11,
"ng": 12,
"o": 13,
"\u0000": 14,
"\u0001": 15,
"\u0002": 16,
"\u0003": 17,
"\u0004": 18,
"\u0005": 19,
"\u0006": 20,
"\u0007": 21,
"\b": 22,
"\u000e": 23,
"\u000f": 24,
"\u0010": 25,
"\u0011": 26,
"\u0012": 27,
"\u0013": 28,
"\u0014": 29,
"\u0015": 30,
"\u0016": 31,
"\u0017": 32,
"\u0018": 33,
"\u0019": 34,
"\u001a": 35,
"\u001b": 36,
"\u001c": 37,
"\u001d": 38,
"\u001e": 39,
"\u001f": 40,
"!": 41,
"\"": 42,
"#": 43,
"$": 44,
"%": 45,
"&": 46,
"'": 47,
"(": 48,
")": 49,
"*": 50,
"+": 51,
",": 52,
"-": 53,
".": 54,
"/": 55,
"0": 56,
"1": 57,
"2": 58,
"3": 59,
"4": 60,
"5": 61,
"6": 62,
"7": 63,
"8": 64,
"9": 65,
":": 66,
";": 67,
"<": 68,
"=": 69,
">": 70,
"?": 71,
"@": 72,
"A": 73,
"B": 74,
"Ba": 75,
"Be": 76,
"Bi": 77,
"Bo": 78,
"Bu": 79,
"Bwa": 80,
"Bwe": 81,
"Bwi": 82,
"C": 83,
"Cha": 84,
"Che": 85,
"Chi": 86,
"Cho": 87,
"Chu": 88,
"Chwa": 89,
"Chwe": 90,
"Chwi": 91,
"D": 92,
"Da": 93,
"De": 94,
"Dha": 95,
"Dhe": 96,
"Dhi": 97,
"Dho": 98,
"Dhu": 99,
"Di": 100,
"Do": 101,
"Du": 102,
"E": 103,
"F": 104,
"Fa": 105,
"Fe": 106,
"Fi": 107,
"Fo": 108,
"Fu": 109,
"G": 110,
"Ga": 111,
"Ge": 112,
"Gha": 113,
"Ghe": 114,
"Ghi": 115,
"Gho": 116,
"Ghu": 117,
"Gi": 118,
"Go": 119,
"Gu": 120,
"Gwa": 121,
"Gwe": 122,
"Gwi": 123,
"H": 124,
"Ha": 125,
"He": 126,
"Hi": 127,
"Ho": 128,
"Hu": 129,
"I": 130,
"J": 131,
"Ja": 132,
"Je": 133,
"Ji": 134,
"Jo": 135,
"Ju": 136,
"Jwa": 137,
"Jwe": 138,
"Jwi": 139,
"K": 140,
"Ka": 141,
"Ke": 142,
"Kha": 143,
"Khe": 144,
"Kho": 145,
"Khu": 146,
"Ki": 147,
"Ko": 148,
"Ku": 149,
"Kwa": 150,
"Kwe": 151,
"Kwi": 152,
"L": 153,
"La": 154,
"Le": 155,
"Li": 156,
"Lo": 157,
"Lu": 158,
"Lwa": 159,
"Lwe": 160,
"Lwi": 161,
"M": 162,
"Ma": 163,
"Mba": 164,
"Mbe": 165,
"Mbi": 166,
"Mbo": 167,
"Mbu": 168,
"Mbwa": 169,
"Mbwe": 170,
"Mbwi": 171,
"Me": 172,
"Mi": 173,
"Mo": 174,
"Mu": 175,
"Mwa": 176,
"Mwe": 177,
"Mwi": 178,
"N": 179,
"Na": 180,
"Nda": 181,
"Nde": 182,
"Ndi": 183,
"Ndo": 184,
"Ndu": 185,
"Ndwa": 186,
"Ndwe": 187,
"Ndwi": 188,
"Ne": 189,
"Nga": 190,
"Nge": 191,
"Ngi": 192,
"Ngo": 193,
"Ngu": 194,
"Ngwa": 195,
"Ngwe": 196,
"Ngwi": 197,
"Ni": 198,
"Nja": 199,
"Nje": 200,
"Nji": 201,
"Njo": 202,
"Nju": 203,
"Njwa": 204,
"Njwe": 205,
"Njwi": 206,
"No": 207,
"Nu": 208,
"Nya": 209,
"Nye": 210,
"Nyi": 211,
"Nyo": 212,
"Nyu": 213,
"Nywa": 214,
"Nywe": 215,
"Nza": 216,
"Nze": 217,
"Nzi": 218,
"Nzo": 219,
"Nzu": 220,
"O": 221,
"P": 222,
"Pa": 223,
"Pe": 224,
"Pi": 225,
"Po": 226,
"Pu": 227,
"Pwa": 228,
"Pwe": 229,
"Pwi": 230,
"Pwo": 231,
"Q": 232,
"R": 233,
"Ra": 234,
"Re": 235,
"Ri": 236,
"Ro": 237,
"Ru": 238,
"S": 239,
"Sa": 240,
"Se": 241,
"Sha": 242,
"She": 243,
"Shi": 244,
"Sho": 245,
"Shu": 246,
"Shwa": 247,
"Shwe": 248,
"Shwi": 249,
"Si": 250,
"So": 251,
"Su": 252,
"Swa": 253,
"Swe": 254,
"Swi": 255,
"T": 256,
"Ta": 257,
"Te": 258,
"Tha": 259,
"The": 260,
"Thi": 261,
"Tho": 262,
"Thu": 263,
"Ti": 264,
"To": 265,
"Twa": 266,
"Twe": 267,
"Twi": 268,
"U": 269,
"V": 270,
"Va": 271,
"Ve": 272,
"Vi": 273,
"Vo": 274,
"Vu": 275,
"Vya": 276,
"Vye": 277,
"Vyo": 278,
"W": 279,
"Wa": 280,
"We": 281,
"Wi": 282,
"Wo": 283,
"Wu": 284,
"X": 285,
"Y": 286,
"Ya": 287,
"Ye": 288,
"Yi": 289,
"Yo": 290,
"Yu": 291,
"Z": 292,
"Za": 293,
"Ze": 294,
"Zi": 295,
"Zo": 296,
"Zu": 297,
"Zwa": 298,
"Zwe": 299,
"Zwi": 300,
"[": 301,
"\\": 302,
"]": 303,
"^": 304,
"_": 305,
"`": 306,
"b": 307,
"ba": 308,
"be": 309,
"bi": 310,
"bo": 311,
"bu": 312,
"bwa": 313,
"bwe": 314,
"bwi": 315,
"c": 316,
"cha": 317,
"che": 318,
"chi": 319,
"cho": 320,
"chu": 321,
"chwa": 322,
"chwe": 323,
"chwi": 324,
"d": 325,
"da": 326,
"de": 327,
"dha": 328,
"dhe": 329,
"dhi": 330,
"dho": 331,
"dhu": 332,
"di": 333,
"do": 334,
"du": 335,
"f": 336,
"fa": 337,
"fe": 338,
"fi": 339,
"fo": 340,
"fu": 341,
"g": 342,
"ga": 343,
"ge": 344,
"gha": 345,
"ghe": 346,
"ghi": 347,
"gho": 348,
"ghu": 349,
"gi": 350,
"go": 351,
"gu": 352,
"gwa": 353,
"gwe": 354,
"gwi": 355,
"h": 356,
"ha": 357,
"he": 358,
"hi": 359,
"ho": 360,
"hu": 361,
"i": 362,
"j": 363,
"ja": 364,
"je": 365,
"ji": 366,
"jo": 367,
"ju": 368,
"jwa": 369,
"jwe": 370,
"jwi": 371,
"k": 372,
"ka": 373,
"ke": 374,
"kha": 375,
"khe": 376,
"kho": 377,
"khu": 378,
"ki": 379,
"ko": 380,
"ku": 381,
"kwa": 382,
"kwe": 383,
"kwi": 384,
"l": 385,
"la": 386,
"le": 387,
"li": 388,
"lo": 389,
"lu": 390,
"lwa": 391,
"lwe": 392,
"lwi": 393,
"m": 394,
"ma": 395,
"mba": 396,
"mbe": 397,
"mbi": 398,
"mbo": 399,
"mbu": 400,
"mbwa": 401,
"mbwe": 402,
"mbwi": 403,
"me": 404,
"mi": 405,
"mo": 406,
"mu": 407,
"mwa": 408,
"mwe": 409,
"mwi": 410,
"n": 411,
"na": 412,
"nda": 413,
"nde": 414,
"ndi": 415,
"ndo": 416,
"ndu": 417,
"ndwa": 418,
"ndwe": 419,
"ndwi": 420,
"ne": 421,
"nga": 422,
"nge": 423,
"ngi": 424,
"ngo": 425,
"ngu": 426,
"ngwa": 427,
"ngwe": 428,
"ngwi": 429,
"ni": 430,
"nja": 431,
"nje": 432,
"nji": 433,
"njo": 434,
"nju": 435,
"njwa": 436,
"njwe": 437,
"njwi": 438,
"no": 439,
"nu": 440,
"nya": 441,
"nye": 442,
"nyi": 443,
"nyo": 444,
"nyu": 445,
"nywa": 446,
"nywe": 447,
"nza": 448,
"nze": 449,
"nzi": 450,
"nzo": 451,
"nzu": 452,
"p": 453,
"pa": 454,
"pe": 455,
"pi": 456,
"po": 457,
"pu": 458,
"pwa": 459,
"pwe": 460,
"pwi": 461,
"pwo": 462,
"q": 463,
"r": 464,
"ra": 465,
"re": 466,
"ri": 467,
"ro": 468,
"ru": 469,
"s": 470,
"sa": 471,
"se": 472,
"sha": 473,
"she": 474,
"shi": 475,
"sho": 476,
"shu": 477,
"shwa": 478,
"shwe": 479,
"shwi": 480,
"si": 481,
"so": 482,
"su": 483,
"swa": 484,
"swe": 485,
"swi": 486,
"t": 487,
"ta": 488,
"te": 489,
"tha": 490,
"the": 491,
"thi": 492,
"tho": 493,
"thu": 494,
"ti": 495,
"to": 496,
"twa": 497,
"twe": 498,
"twi": 499,
"u": 500,
"v": 501,
"va": 502,
"ve": 503,
"vi": 504,
"vo": 505,
"vu": 506,
"vya": 507,
"vye": 508,
"vyo": 509,
"w": 510,
"wa": 511,
"we": 512,
"wi": 513,
"wo": 514,
"wu": 515,
"x": 516,
"y": 517,
"ya": 518,
"ye": 519,
"yi": 520,
"yo": 521,
"yu": 522,
"z": 523,
"za": 524,
"ze": 525,
"zi": 526,
"zo": 527,
"zu": 528,
"zwa": 529,
"zwe": 530,
"zwi": 531,
"{": 532,
"|": 533,
"}": 534,
"~": 535,
"": 536,
"€": 537,
"": 538,
"‚": 539,
"ƒ": 540,
"„": 541,
"†": 542,
"‡": 543,
"ˆ": 544,
"‰": 545,
"Š": 546,
"‹": 547,
"Œ": 548,
"": 549,
"Ž": 550,
"": 551,
"": 552,
"‘": 553,
"’": 554,
"“": 555,
"”": 556,
"•": 557,
"–": 558,
"—": 559,
"˜": 560,
"™": 561,
"š": 562,
"›": 563,
"œ": 564,
"": 565,
"ž": 566,
"Ÿ": 567,
"¡": 568,
"¢": 569,
"£": 570,
"¤": 571,
"¥": 572,
"¦": 573,
"§": 574,
"¨": 575,
"©": 576,
"ª": 577,
"«": 578,
"¬": 579,
"­": 580,
"®": 581,
"¯": 582,
"°": 583,
"±": 584,
"²": 585,
"³": 586,
"´": 587,
"µ": 588,
"¶": 589,
"·": 590,
"¸": 591,
"¹": 592,
"º": 593,
"»": 594,
"¼": 595,
"½": 596,
"¾": 597,
"¿": 598,
"À": 599,
"Á": 600,
"Â": 601,
"Ã": 602,
"Ä": 603,
"Å": 604,
"Æ": 605,
"Ç": 606,
"È": 607,
"É": 608,
"Ê": 609,
"Ë": 610,
"Ì": 611,
"Í": 612,
"Î": 613,
"Ï": 614,
"Ð": 615,
"Ñ": 616,
"Ò": 617,
"Ó": 618,
"Ô": 619,
"Õ": 620,
"Ö": 621,
"×": 622,
"Ø": 623,
"Ù": 624,
"Ú": 625,
"Û": 626,
"Ü": 627,
"Ý": 628,
"Þ": 629,
"ß": 630,
"à": 631,
"á": 632,
"â": 633,
"ã": 634,
"ä": 635,
"å": 636,
"æ": 637,
"ç": 638,
"è": 639,
"é": 640,
"ê": 641,
"ë": 642,
"ì": 643,
"í": 644,
"î": 645,
"ï": 646,
"ð": 647,
"ñ": 648,
"ò": 649,
"ó": 650,
"ô": 651,
"õ": 652,
"ö": 653,
"÷": 654,
"ø": 655,
"ù": 656,
"ú": 657,
"û": 658,
"ü": 659,
"ý": 660,
"þ": 661
},
"unk_token": "[UNK]"
}
}