my-new-shiny-tokenizer / tokenizer.json
Inioluwa's picture
Upload tokenizer
355d63f verified
{
"version": "1.0",
"truncation": null,
"padding": null,
"added_tokens": [
{
"id": 0,
"content": "<|endoftext|>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": true,
"special": true
}
],
"normalizer": null,
"pre_tokenizer": {
"type": "ByteLevel",
"add_prefix_space": false,
"trim_offsets": true,
"use_regex": true
},
"post_processor": {
"type": "ByteLevel",
"add_prefix_space": true,
"trim_offsets": false,
"use_regex": true
},
"decoder": {
"type": "ByteLevel",
"add_prefix_space": true,
"trim_offsets": true,
"use_regex": true
},
"model": {
"type": "BPE",
"dropout": null,
"unk_token": null,
"continuing_subword_prefix": "",
"end_of_word_suffix": "",
"fuse_unk": false,
"byte_fallback": false,
"ignore_merges": false,
"vocab": {
"<|endoftext|>": 0,
"!": 1,
"\"": 2,
"#": 3,
"$": 4,
"%": 5,
"&": 6,
"'": 7,
"(": 8,
")": 9,
"*": 10,
"+": 11,
",": 12,
"-": 13,
".": 14,
"/": 15,
"0": 16,
"1": 17,
"2": 18,
"3": 19,
"4": 20,
"5": 21,
"6": 22,
"7": 23,
"8": 24,
"9": 25,
":": 26,
";": 27,
"<": 28,
"=": 29,
">": 30,
"?": 31,
"@": 32,
"A": 33,
"B": 34,
"C": 35,
"D": 36,
"E": 37,
"F": 38,
"G": 39,
"H": 40,
"I": 41,
"J": 42,
"K": 43,
"L": 44,
"M": 45,
"N": 46,
"O": 47,
"P": 48,
"Q": 49,
"R": 50,
"S": 51,
"T": 52,
"U": 53,
"V": 54,
"W": 55,
"X": 56,
"Y": 57,
"Z": 58,
"[": 59,
"\\": 60,
"]": 61,
"^": 62,
"_": 63,
"`": 64,
"a": 65,
"b": 66,
"c": 67,
"d": 68,
"e": 69,
"f": 70,
"g": 71,
"h": 72,
"i": 73,
"j": 74,
"k": 75,
"l": 76,
"m": 77,
"n": 78,
"o": 79,
"p": 80,
"q": 81,
"r": 82,
"s": 83,
"t": 84,
"u": 85,
"v": 86,
"w": 87,
"x": 88,
"y": 89,
"z": 90,
"{": 91,
"|": 92,
"}": 93,
"~": 94,
"¡": 95,
"¢": 96,
"£": 97,
"¤": 98,
"¥": 99,
"¦": 100,
"§": 101,
"¨": 102,
"©": 103,
"ª": 104,
"«": 105,
"¬": 106,
"®": 107,
"¯": 108,
"°": 109,
"±": 110,
"²": 111,
"³": 112,
"´": 113,
"µ": 114,
"¶": 115,
"·": 116,
"¸": 117,
"¹": 118,
"º": 119,
"»": 120,
"¼": 121,
"½": 122,
"¾": 123,
"¿": 124,
"À": 125,
"Á": 126,
"Â": 127,
"Ã": 128,
"Ä": 129,
"Å": 130,
"Æ": 131,
"Ç": 132,
"È": 133,
"É": 134,
"Ê": 135,
"Ë": 136,
"Ì": 137,
"Í": 138,
"Î": 139,
"Ï": 140,
"Ð": 141,
"Ñ": 142,
"Ò": 143,
"Ó": 144,
"Ô": 145,
"Õ": 146,
"Ö": 147,
"×": 148,
"Ø": 149,
"Ù": 150,
"Ú": 151,
"Û": 152,
"Ü": 153,
"Ý": 154,
"Þ": 155,
"ß": 156,
"à": 157,
"á": 158,
"â": 159,
"ã": 160,
"ä": 161,
"å": 162,
"æ": 163,
"ç": 164,
"è": 165,
"é": 166,
"ê": 167,
"ë": 168,
"ì": 169,
"í": 170,
"î": 171,
"ï": 172,
"ð": 173,
"ñ": 174,
"ò": 175,
"ó": 176,
"ô": 177,
"õ": 178,
"ö": 179,
"÷": 180,
"ø": 181,
"ù": 182,
"ú": 183,
"û": 184,
"ü": 185,
"ý": 186,
"þ": 187,
"ÿ": 188,
"Ā": 189,
"ā": 190,
"Ă": 191,
"ă": 192,
"Ą": 193,
"ą": 194,
"Ć": 195,
"ć": 196,
"Ĉ": 197,
"ĉ": 198,
"Ċ": 199,
"ċ": 200,
"Č": 201,
"č": 202,
"Ď": 203,
"ď": 204,
"Đ": 205,
"đ": 206,
"Ē": 207,
"ē": 208,
"Ĕ": 209,
"ĕ": 210,
"Ė": 211,
"ė": 212,
"Ę": 213,
"ę": 214,
"Ě": 215,
"ě": 216,
"Ĝ": 217,
"ĝ": 218,
"Ğ": 219,
"ğ": 220,
"Ġ": 221,
"ġ": 222,
"Ģ": 223,
"ģ": 224,
"Ĥ": 225,
"ĥ": 226,
"Ħ": 227,
"ħ": 228,
"Ĩ": 229,
"ĩ": 230,
"Ī": 231,
"ī": 232,
"Ĭ": 233,
"ĭ": 234,
"Į": 235,
"į": 236,
"İ": 237,
"ı": 238,
"IJ": 239,
"ij": 240,
"Ĵ": 241,
"ĵ": 242,
"Ķ": 243,
"ķ": 244,
"ĸ": 245,
"Ĺ": 246,
"ĺ": 247,
"Ļ": 248,
"ļ": 249,
"Ľ": 250,
"ľ": 251,
"Ŀ": 252,
"ŀ": 253,
"Ł": 254,
"ł": 255,
"Ń": 256,
"in": 257,
"an": 258,
"Ġa": 259,
"Ġn": 260,
"Ġt": 261,
"Ġw": 262,
"Ġe": 263,
"Ġth": 264,
"ga": 265,
"Ġy": 266,
"bo": 267,
"un": 268,
"Ġna": 269,
"Ġthe": 270,
"do": 271,
"Ġk": 272,
"pe": 273,
"ba": 274,
"hi": 275,
"Ġm": 276,
"be": 277,
"gan": 278,
"ya": 279,
"zhi": 280,
"Ġf": 281,
"aa": 282,
"ci": 283,
"jin": 284,
"ll": 285,
"lo": 286,
"re": 287,
"Ġo": 288,
"Ġga": 289,
"Ġbe": 290,
"Ġkp": 291,
"as": 292,
"is": 293,
"ill": 294,
"wa": 295,
"ye": 296,
"ĠN": 297,
"Ġd": 298,
"Ġl": 299,
"Ġs": 300,
"Ġgan": 301,
"ing": 302,
"Ġndo": 303,
"Ġwo": 304,
"bolo": 305,
"ak": 306,
"am": 307,
"aye": 308,
"de": 309,
"es": 310,
"gba": 311,
"ndo": 312,
"on": 313,
"or": 314,
"upe": 315,
"yi": 316,
"za": 317,
"Ġc": 318,
"Ġzhi": 319,
"Ġjin": 320,
"Ġnya": 321,
"Ġta": 322,
"Ġwun": 323,
"Ġwill": 324,
"Ġthey": 325,
"Ġman": 326,
"ĠNupe": 327,
"Ġndondo": 328,
"Ġwunga": 329,
"Al": 330,
"Ta": 331,
"bi": 332,
"ea": 333,
"er": 334,
"fe": 335,
"ik": 336,
"ir": 337,
"it": 338,
"ka": 339,
"lÃ": 340,
"maa": 341,
"na": 342,
"nya": 343,
"se": 344,
"ti": 345,
"Ġg": 346,
"Ġr": 347,
"Ġz": 348,
"Ġin": 349,
"Ġci": 350,
"Ġis": 351,
"Ġbolo": 352,
"ang": 353,
"Ġaga": 354,
"Ġamaa": 355,
"Ġwr": 356,
"Ġebo": 357,
"Ġenya": 358,
"Ġeti": 359,
"Ġthan": 360,
"Ġya": 361,
"Ġye": 362,
"Ġyi": 363,
"Ġtheir": 364,
"Ġkam": 365,
"peak": 366,
"Ġmaa": 367,
"Ġfi": 368,
"Ġfu": 369,
"Ġof": 370,
"Ġkpaye": 371,
"Ġkpik": 372,
"ash": 373,
"wal": 374,
"Ġda": 375,
"Ġsh": 376,
"Ġspeak": 377,
"Ġgangan": 378,
"zaba": 379,
"Ġco": 380,
"Ġbolobolo": 381,
"Ġkamin": 382,
"Ġkpikpe": 383,
"An": 384,
"Hash": 385,
"Mu": 386,
"Oh": 387,
"So": 388,
"To": 389,
"Wor": 390,
"Yea": 391,
"Za": 392,
"Zan": 393,
"ab": 394,
"ac": 395,
"ag": 396,
"au": 397,
"abo": 398,
"ajin": 399,
"all": 400,
"bba": 401,
"cau": 402,
"dl": 403,
"dÃ": 404,
"ed": 405,
"ez": 406,
"eas": 407,
"fill": 408,
"gh": 409,
"ho": 410,
"ion": 411,
"ier": 412,
"igh": 413,
"jan": 414,
"kn": 415,
"ko": 416,
"kan": 417,
"le": 418,
"lea": 419,
"lfill": 420,
"mp": 421,
"min": 422,
"oo": 423,
"ou": 424,
"ore": 425,
"ose": 426,
"plea": 427,
"rd": 428,
"rde": 429,
"rkn": 430,
"st": 431,
"sÃ": 432,
"ska": 433,
"ta": 434,
"ts": 435,
"tun": 436,
"ubo": 437,
"ure": 438,
"uis": 439,
"uag": 440,
"utun": 441,
"ver": 442,
"wun": 443,
"zin": 444,
"¡lÃ": 445,
"¢n": 446,
"Ġu": 447,
"Ġan": 448,
"Ġbo": 449,
"Ġun": 450,
"Ġdo": 451,
"Ġlo": 452,
"Ġre": 453,
"ĠAl": 454,
"Ġbi": 455,
"Ġit": 456,
"Ġzaba": 457,
"ĠSo": 458,
"ĠTo": 459,
"ĠWor": 460,
"ĠYea": 461,
"ĠZa": 462,
"Ġplea": 463,
"łdÃ": 464,
"inmin": 465,
"inst": 466,
"anba": 467,
"anya": 468,
"anion": 469,
"Ġaa": 470,
"Ġaci": 471,
"Ġare": 472,
"Ġaye": 473,
"Ġabi": 474,
"Ġafe": 475,
"Ġazaba": 476,
"Ġnor": 477,
"Ġnyi": 478,
"Ġto": 479,
"Ġtis": 480,
"Ġtutun": 481,
"Ġwa": 482,
"Ġwang": 483,
"Ġwho": 484,
"Ġega": 485,
"Ġeas": 486,
"Ġegba": 487,
"Ġeza": 488,
"Ġthose": 489,
"Ġyin": 490,
"Ġyan": 491,
"Ġyaa": 492,
"Ġyes": 493,
"Ġyou": 494,
"uny": 495,
"unci": 496,
"ungba": 497,
"Ġthem": 498,
"Ġthere": 499,
"doing": 500,
"Ġkinmin": 501,
"Ġkanya": 502,
"pea": 503,
"Ġmeas": 504,
"Ġmore": 505,
"zhio": 506,
"Ġfor": 507,
"Ġfac": 508,
"Ġgajin": 509,
"Ġgayi": 510,
"Ġgaajin": 511,
"Ġgarde": 512,
"Ġbecau": 513,
"Ġkpin": 514,
"wazhi": 515,
"ward": 516,
"ĠNo": 517,
"Ġdash": 518,
"Ġdanba": 519,
"Ġling": 520,
"Ġlak": 521,
"Ġlaye": 522,
"Ġlang": 523,
"Ġndoci": 524,
"Ġwoa": 525,
"ame": 526,
"ess": 527,
"ong": 528,
"onyi": 529,
"Ġcin": 530,
"Ġjina": 531,
"Ġtabba": 532,
"Alka": 533,
"iting": 534,
"lÃłdÃ": 535,
"nazhio": 536,
"Ġgoo": 537,
"Ġgonyi": 538,
"Ġrigh": 539,
"Ġrungba": 540,
"Ġzun": 541,
"Ġzwal": 542,
"Ġcia": 543,
"Ġagainst": 544,
"Ġwrong": 545,
"Ġwriting": 546,
"Ġebona": 547,
"Ġetiabo": 548,
"Ġetiubo": 549,
"Ġyefe": 550,
"Ġmaade": 551,
"Ġfibo": 552,
"Ġfulfill": 553,
"Ġfuska": 554,
"wali": 555,
"Ġdarkn": 556,
"Ġshall": 557,
"Ġshame": 558,
"Ġspeaking": 559,
"Ġcomp": 560,
"Ġcover": 561,
"Ġkamina": 562,
"Ġkpikpewazhi": 563,
"And": 564,
"Muez": 565,
"Zana": 566,
"able": 567,
"dly": 568,
"jannazhio": 569,
"kanga": 570,
"sálÃ": 571,
"uists": 572,
"uage": 573,
"Ġand": 574,
"Ġboa": 575,
"Ġunable": 576,
"Ġreward": 577,
"ĠAljannazhio": 578,
"Ġbibe": 579,
"ĠSoko": 580,
"ĠWord": 581,
"ĠZawun": 582,
"anions": 583,
"Ġabide": 584,
"Ġtishi": 585,
"Ġwangi": 586,
"Ġegagba": 587,
"Ġeasier": 588,
"Ġezazhi": 589,
"Ġyesuny": 590,
"Ġtherein": 591,
"Ġmeasure": 592,
"Ġfaces": 593,
"Ġgarden": 594,
"Ġbecause": 595,
"Ġdashin": 596,
"Ġlinguists": 597,
"Ġlakpea": 598,
"Ġlanguage": 599,
"Ġtabbata": 600,
"Alkawali": 601,
"lÃłdân": 602,
"Ġgoodly": 603,
"Ġright": 604,
"Ġzwalunci": 605,
"Ġfulfilled": 606,
"Ġdarkness": 607,
"Ġcompanions": 608,
"Muezzin": 609,
"sálú": 610
},
"merges": [
"i n",
"a n",
"Ġ a",
"Ġ n",
"Ġ t",
"Ġ w",
"Ġ e",
"Ġt h",
"g a",
"Ġ y",
"b o",
"u n",
"Ġn a",
"Ġth e",
"d o",
"Ġ k",
"p e",
"b a",
"h i",
"Ġ m",
"b e",
"g an",
"y a",
"z hi",
"Ġ f",
"a a",
"c i",
"j in",
"l l",
"l o",
"r e",
"Ġ o",
"Ġ ga",
"Ġ be",
"Ġk p",
"a s",
"i s",
"i ll",
"w a",
"y e",
"Ġ N",
"Ġ d",
"Ġ l",
"Ġ s",
"Ġ gan",
"in g",
"Ġn do",
"Ġw o",
"bo lo",
"a k",
"a m",
"a ye",
"d e",
"e s",
"g ba",
"n do",
"o n",
"o r",
"u pe",
"y i",
"z a",
"Ġ c",
"Ġ zhi",
"Ġ jin",
"Ġn ya",
"Ġt a",
"Ġw un",
"Ġw ill",
"Ġthe y",
"Ġm an",
"ĠN upe",
"Ġndo ndo",
"Ġwun ga",
"A l",
"T a",
"b i",
"e a",
"e r",
"f e",
"i k",
"i r",
"i t",
"k a",
"l Ã",
"m aa",
"n a",
"n ya",
"s e",
"t i",
"Ġ g",
"Ġ r",
"Ġ z",
"Ġ in",
"Ġ ci",
"Ġ is",
"Ġ bolo",
"an g",
"Ġa ga",
"Ġa maa",
"Ġw r",
"Ġe bo",
"Ġe nya",
"Ġe ti",
"Ġth an",
"Ġy a",
"Ġy e",
"Ġy i",
"Ġthe ir",
"Ġk am",
"pe ak",
"Ġm aa",
"Ġf i",
"Ġf u",
"Ġo f",
"Ġkp aye",
"Ġkp ik",
"as h",
"wa l",
"Ġd a",
"Ġs h",
"Ġs peak",
"Ġgan gan",
"za ba",
"Ġc o",
"Ġbolo bolo",
"Ġkam in",
"Ġkpik pe",
"A n",
"H ash",
"M u",
"O h",
"S o",
"T o",
"W or",
"Y ea",
"Z a",
"Z an",
"a b",
"a c",
"a g",
"a u",
"a bo",
"a jin",
"a ll",
"b ba",
"c au",
"d l",
"d Ã",
"e d",
"e z",
"e as",
"f ill",
"g h",
"h o",
"i on",
"i er",
"i gh",
"j an",
"k n",
"k o",
"k an",
"l e",
"l ea",
"l fill",
"m p",
"m in",
"o o",
"o u",
"o re",
"o se",
"p lea",
"r d",
"r de",
"r kn",
"s t",
"s Ã",
"s ka",
"t a",
"t s",
"t un",
"u bo",
"u re",
"u is",
"u ag",
"u tun",
"v er",
"w un",
"z in",
"¡ lÃ",
"¢ n",
"Ġ u",
"Ġ an",
"Ġ bo",
"Ġ un",
"Ġ do",
"Ġ lo",
"Ġ re",
"Ġ Al",
"Ġ bi",
"Ġ it",
"Ġ zaba",
"Ġ So",
"Ġ To",
"Ġ Wor",
"Ġ Yea",
"Ġ Za",
"Ġ plea",
"ł dÃ",
"in min",
"in st",
"an ba",
"an ya",
"an ion",
"Ġa a",
"Ġa ci",
"Ġa re",
"Ġa ye",
"Ġa bi",
"Ġa fe",
"Ġa zaba",
"Ġn or",
"Ġn yi",
"Ġt o",
"Ġt is",
"Ġt utun",
"Ġw a",
"Ġw ang",
"Ġw ho",
"Ġe ga",
"Ġe as",
"Ġe gba",
"Ġe za",
"Ġth ose",
"Ġy in",
"Ġy an",
"Ġy aa",
"Ġy es",
"Ġy ou",
"un y",
"un ci",
"un gba",
"Ġthe m",
"Ġthe re",
"do ing",
"Ġk inmin",
"Ġk anya",
"pe a",
"Ġm eas",
"Ġm ore",
"zhi o",
"Ġf or",
"Ġf ac",
"Ġga jin",
"Ġga yi",
"Ġga ajin",
"Ġga rde",
"Ġbe cau",
"Ġkp in",
"wa zhi",
"wa rd",
"ĠN o",
"Ġd ash",
"Ġd anba",
"Ġl ing",
"Ġl ak",
"Ġl aye",
"Ġl ang",
"Ġndo ci",
"Ġwo a",
"am e",
"es s",
"on g",
"on yi",
"Ġc in",
"Ġjin a",
"Ġta bba",
"Al ka",
"it ing",
"là łdÃ",
"na zhio",
"Ġg oo",
"Ġg onyi",
"Ġr igh",
"Ġr ungba",
"Ġz un",
"Ġz wal",
"Ġci a",
"Ġaga inst",
"Ġwr ong",
"Ġwr iting",
"Ġebo na",
"Ġeti abo",
"Ġeti ubo",
"Ġye fe",
"Ġmaa de",
"Ġfi bo",
"Ġfu lfill",
"Ġfu ska",
"wal i",
"Ġda rkn",
"Ġsh all",
"Ġsh ame",
"Ġspeak ing",
"Ġco mp",
"Ġco ver",
"Ġkamin a",
"Ġkpikpe wazhi",
"An d",
"Mu ez",
"Zan a",
"ab le",
"dl y",
"jan nazhio",
"kan ga",
"sà ¡lÃ",
"uis ts",
"uag e",
"Ġan d",
"Ġbo a",
"Ġun able",
"Ġre ward",
"ĠAl jannazhio",
"Ġbi be",
"ĠSo ko",
"ĠWor d",
"ĠZa wun",
"anion s",
"Ġabi de",
"Ġtis hi",
"Ġwang i",
"Ġega gba",
"Ġeas ier",
"Ġeza zhi",
"Ġyes uny",
"Ġthere in",
"Ġmeas ure",
"Ġfac es",
"Ġgarde n",
"Ġbecau se",
"Ġdash in",
"Ġling uists",
"Ġlak pea",
"Ġlang uage",
"Ġtabba ta",
"Alka wali",
"lÃłdà ¢n",
"Ġgoo dly",
"Ġrigh t",
"Ġzwal unci",
"Ġfulfill ed",
"Ġdarkn ess",
"Ġcomp anions",
"Muez zin",
"sálà º"
]
}
}