YU-tokenizer / tokenizer.json
codegood's picture
Upload tokenizer
75912f7 verified
{
"version": "1.0",
"truncation": null,
"padding": null,
"added_tokens": [
{
"id": 0,
"content": "<|endoftext|>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": true,
"special": true
}
],
"normalizer": null,
"pre_tokenizer": {
"type": "ByteLevel",
"add_prefix_space": false,
"trim_offsets": true,
"use_regex": true
},
"post_processor": {
"type": "ByteLevel",
"add_prefix_space": true,
"trim_offsets": false,
"use_regex": true
},
"decoder": {
"type": "ByteLevel",
"add_prefix_space": true,
"trim_offsets": true,
"use_regex": true
},
"model": {
"type": "BPE",
"dropout": null,
"unk_token": null,
"continuing_subword_prefix": "",
"end_of_word_suffix": "",
"fuse_unk": false,
"byte_fallback": false,
"vocab": {
"<|endoftext|>": 0,
"!": 1,
"\"": 2,
"#": 3,
"$": 4,
"%": 5,
"&": 6,
"'": 7,
"(": 8,
")": 9,
"*": 10,
"+": 11,
",": 12,
"-": 13,
".": 14,
"/": 15,
"0": 16,
"1": 17,
"2": 18,
"3": 19,
"4": 20,
"5": 21,
"6": 22,
"7": 23,
"8": 24,
"9": 25,
":": 26,
";": 27,
"<": 28,
"=": 29,
">": 30,
"?": 31,
"@": 32,
"A": 33,
"B": 34,
"C": 35,
"D": 36,
"E": 37,
"F": 38,
"G": 39,
"H": 40,
"I": 41,
"J": 42,
"K": 43,
"L": 44,
"M": 45,
"N": 46,
"O": 47,
"P": 48,
"Q": 49,
"R": 50,
"S": 51,
"T": 52,
"U": 53,
"V": 54,
"W": 55,
"X": 56,
"Y": 57,
"Z": 58,
"[": 59,
"\\": 60,
"]": 61,
"^": 62,
"_": 63,
"`": 64,
"a": 65,
"b": 66,
"c": 67,
"d": 68,
"e": 69,
"f": 70,
"g": 71,
"h": 72,
"i": 73,
"j": 74,
"k": 75,
"l": 76,
"m": 77,
"n": 78,
"o": 79,
"p": 80,
"q": 81,
"r": 82,
"s": 83,
"t": 84,
"u": 85,
"v": 86,
"w": 87,
"x": 88,
"y": 89,
"z": 90,
"{": 91,
"|": 92,
"}": 93,
"~": 94,
"¡": 95,
"¢": 96,
"£": 97,
"¤": 98,
"¥": 99,
"¦": 100,
"§": 101,
"¨": 102,
"©": 103,
"ª": 104,
"«": 105,
"¬": 106,
"®": 107,
"¯": 108,
"°": 109,
"±": 110,
"²": 111,
"³": 112,
"´": 113,
"µ": 114,
"¶": 115,
"·": 116,
"¸": 117,
"¹": 118,
"º": 119,
"»": 120,
"¼": 121,
"½": 122,
"¾": 123,
"¿": 124,
"À": 125,
"Á": 126,
"Â": 127,
"Ã": 128,
"Ä": 129,
"Å": 130,
"Æ": 131,
"Ç": 132,
"È": 133,
"É": 134,
"Ê": 135,
"Ë": 136,
"Ì": 137,
"Í": 138,
"Î": 139,
"Ï": 140,
"Ð": 141,
"Ñ": 142,
"Ò": 143,
"Ó": 144,
"Ô": 145,
"Õ": 146,
"Ö": 147,
"×": 148,
"Ø": 149,
"Ù": 150,
"Ú": 151,
"Û": 152,
"Ü": 153,
"Ý": 154,
"Þ": 155,
"ß": 156,
"à": 157,
"á": 158,
"â": 159,
"ã": 160,
"ä": 161,
"å": 162,
"æ": 163,
"ç": 164,
"è": 165,
"é": 166,
"ê": 167,
"ë": 168,
"ì": 169,
"í": 170,
"î": 171,
"ï": 172,
"ð": 173,
"ñ": 174,
"ò": 175,
"ó": 176,
"ô": 177,
"õ": 178,
"ö": 179,
"÷": 180,
"ø": 181,
"ù": 182,
"ú": 183,
"û": 184,
"ü": 185,
"ý": 186,
"þ": 187,
"ÿ": 188,
"Ā": 189,
"ā": 190,
"Ă": 191,
"ă": 192,
"Ą": 193,
"ą": 194,
"Ć": 195,
"ć": 196,
"Ĉ": 197,
"ĉ": 198,
"Ċ": 199,
"ċ": 200,
"Č": 201,
"č": 202,
"Ď": 203,
"ď": 204,
"Đ": 205,
"đ": 206,
"Ē": 207,
"ē": 208,
"Ĕ": 209,
"ĕ": 210,
"Ė": 211,
"ė": 212,
"Ę": 213,
"ę": 214,
"Ě": 215,
"ě": 216,
"Ĝ": 217,
"ĝ": 218,
"Ğ": 219,
"ğ": 220,
"Ġ": 221,
"ġ": 222,
"Ģ": 223,
"ģ": 224,
"Ĥ": 225,
"ĥ": 226,
"Ħ": 227,
"ħ": 228,
"Ĩ": 229,
"ĩ": 230,
"Ī": 231,
"ī": 232,
"Ĭ": 233,
"ĭ": 234,
"Į": 235,
"į": 236,
"İ": 237,
"ı": 238,
"Ĳ": 239,
"ĳ": 240,
"Ĵ": 241,
"ĵ": 242,
"Ķ": 243,
"ķ": 244,
"ĸ": 245,
"Ĺ": 246,
"ĺ": 247,
"Ļ": 248,
"ļ": 249,
"Ľ": 250,
"ľ": 251,
"Ŀ": 252,
"ŀ": 253,
"Ł": 254,
"ł": 255,
"Ń": 256,
"in": 257,
"Ġt": 258,
"er": 259,
"Ġa": 260,
"he": 261,
"on": 262,
"ti": 263,
"en": 264,
"nd": 265,
"ing": 266,
"Ġthe": 267,
"tion": 268,
"Ġs": 269,
"at": 270,
"Ġf": 271,
"al": 272,
"as": 273,
"es": 274,
"of": 275,
"ro": 276,
"Ġc": 277,
"Ġd": 278,
"Ġof": 279,
"an": 280,
"ci": 281,
"le": 282,
"ns": 283,
"und": 284,
"Ġp": 285,
"ent": 286,
"ar": 287,
"ce": 288,
"ed": 289,
"el": 290,
"hi": 291,
"or": 292,
"ĠA": 293,
"Ġin": 294,
"Ġsci": 295,
"ata": 296,
"Ġdata": 297,
"Ġscien": 298,
"Qu": 299,
"gr": 300,
"ment": 301,
"re": 302,
"ter": 303,
"wer": 304,
"ĠB": 305,
"ĠP": 306,
"ĠW": 307,
"Ġw": 308,
"Ġto": 309,
"estion": 310,
"nswer": 311,
"Ġpro": 312,
"ĠAnswer": 313,
"Ġscience": 314,
"Question": 315,
"gra": 316,
".<": 317,
"ation": 318,
"ge": 319,
"ib": 320,
"is": 321,
"ition": 322,
"os": 323,
"par": 324,
"se": 325,
"ĠH": 326,
"ĠJ": 327,
"ĠS": 328,
"ĠT": 329,
"Ġhe": 330,
"ĠQuestion": 331,
"Ġand": 332,
"Ġco": 333,
"under": 334,
".</": 335,
"part": 336,
"De": 337,
"ded": 338,
"ef": 339,
"ex": 340,
"hat": 341,
"has": 342,
"ic": 343,
"il": 344,
"ies": 345,
"iel": 346,
"kil": 347,
"ls": 348,
"mm": 349,
"man": 350,
"ol": 351,
"ow": 352,
"ound": 353,
"pl": 354,
"per": 355,
"pos": 356,
"ppl": 357,
"rib": 358,
"st": 359,
"tel": 360,
"trib": 361,
"ut": 362,
"ĠE": 363,
"Ġb": 364,
"Ġg": 365,
"Ġon": 366,
"Ġhi": 367,
"Ġgra": 368,
"Ġis": 369,
"ĠDe": 370,
"Ġhas": 371,
"inan": 372,
"Ġth": 373,
"Ġat": 374,
"Ġare": 375,
"Ġappl": 376,
"ontrib": 377,
"tise": 378,
"Ġskil": 379,
"Ġfiel": 380,
"Ġfound": 381,
"Ġfinan": 382,
"ass": 383,
"Ġcontrib": 384,
"ana": 385,
"nshi": 386,
"ork": 387,
"ortel": 388,
"ternshi": 389,
"ĠPass": 390,
"ĠWunder": 391,
"ĠWhat": 392,
"Ġwork": 393,
"Ġpropos": 394,
"gramm": 395,
"itions": 396,
"ĠHow": 397,
"ĠJef": 398,
"partment": 399,
"pertise": 400,
"Ġhis": 401,
"Ġgraded": 402,
"Ġskills": 403,
"Ġfield": 404,
"Ġcontribut": 405,
"ortell": 406,
"ternship": 407,
"ĠWunderman": 408,
"Ġpropositions": 409,
"gramming": 410,
"ĠJeff": 411,
"Col": 412,
"ES": 413,
"HN": 414,
"In": 415,
"Mana": 416,
"Ne": 417,
"No": 418,
"ac": 419,
"ay": 420,
"ain": 421,
"and": 422,
"ble": 423,
"bre": 424,
"ct": 425,
"cat": 426,
"cal": 427,
"dd": 428,
"ding": 429,
"dge": 430,
"dies": 431,
"eer": 432,
"epartment": 433,
"eac": 434,
"ebre": 435,
"ect": 436,
"gh": 437,
"gin": 438,
"her": 439,
"hing": 440,
"hortell": 441,
"id": 442,
"im": 443,
"ir": 444,
"it": 445,
"iv": 446,
"iti": 447,
"kh": 448,
"ly": 449,
"lay": 450,
"mp": 451,
"ms": 452,
"nex": 453,
"oe": 454,
"op": 455,
"ot": 456,
"oar": 457,
"pir": 458,
"ral": 459,
"rse": 460,
"riti": 461,
"spir": 462,
"tu": 463,
"ucat": 464,
"ugh": 465,
"urse": 466,
"ve": 467,
"vel": 468,
"xpertise": 469,
"Ġen": 470,
"Ġro": 471,
"Ġed": 472,
"Ġunder": 473,
"Ġex": 474,
"ĠCol": 475,
"ĠIn": 476,
"ĠMana": 477,
"ĠNe": 478,
"Ġim": 479,
"Ġnex": 480,
"Ġteac": 481,
"eration": 482,
"Ġan": 483,
"Ġas": 484,
"tist": 485,
"eneration": 486,
"Ġsol": 487,
"Ġsect": 488,
"Ġfund": 489,
"Ġfos": 490,
"alent": 491,
"ally": 492,
"aster": 493,
"asis": 494,
"asic": 495,
"rogramming": 496,
"rough": 497,
"Ġcent": 498,
"Ġcriti": 499,
"Ġdepartment": 500,
"Ġdid": 501,
"cial": 502,
"lex": 503,
"lege": 504,
"Ġpiv": 505,
"Ġplay": 506,
"hile": 507,
"ĠAdd": 508,
"Ġinternship": 509,
"Ġinspir": 510,
"Ġscientist": 511,
"ree": 512,
"tering": 513,
"ĠBib": 514,
"ĠBHN": 515,
"ĠBoar": 516,
"ĠBasic": 517,
"ĠPN": 518,
"ĠProgramming": 519,
"Ġwher": 520,
"Ġwhile": 521,
"Ġprogramming": 522,
"Ġproble": 523,
"ations": 524,
"ational": 525,
"gement": 526,
"itionally": 527,
"ĠHebre": 528,
"ĠJoe": 529,
"ĠSortell": 530,
"ĠShortell": 531,
"ĠStu": 532,
"ĠThe": 533,
"ĠTana": 534,
"ĠTalent": 535,
"Ġhedge": 536,
"Ġcoding": 537,
"Ġcomp": 538,
"Ġcourse": 539,
"parting": 540,
"ications": 541,
"stand": 542,
"ĠExpertise": 543,
"ĠEaster": 544,
"Ġby": 545,
"Ġbasis": 546,
"Ġgain": 547,
"Ġgeneration": 548,
"ĠDepartment": 549,
"ĠDevel": 550,
"Ġthrough": 551,
"Ġthree": 552,
"Ġapplies": 553,
"Ġapplications": 554,
"Ġfounded": 555,
"Ġfoundational": 556,
"Ġfinance": 557,
"Ġfinancial": 558,
"ĠPassNo": 559,
"Ġworking": 560,
"Ġcontributes": 561,
"Ġcontributed": 562,
"ternships": 563,
"gineer": 564,
"ity": 565,
"opment": 566,
"otal": 567,
"rality": 568,
"ucating": 569,
"Ġengineer": 570,
"Ġrole": 571,
"Ġeducating": 572,
"Ġunderstand": 573,
"Ġexpertise": 574,
"ĠCollege": 575,
"ĠInternships": 576,
"ĠManagement": 577,
"ĠNear": 578,
"Ġimparting": 579,
"Ġnext": 580,
"Ġteaching": 581,
"Ġsolve": 582,
"Ġsector": 583,
"Ġfostering": 584,
"Ġcentrality": 585,
"Ġcritical": 586,
"Ġpivotal": 587,
"Ġplays": 588,
"ĠAdditionally": 589,
"Ġinspiring": 590,
"Ġscientists": 591,
"ĠBible": 592,
"ĠBHNES": 593,
"ĠBoard": 594,
"ĠBasics": 595,
"Ġwhere": 596,
"Ġproblems": 597,
"ĠHebrew": 598,
"ĠStudies": 599,
"ĠTanakh": 600,
"Ġcomplex": 601,
"ĠEastern": 602,
"ĠDevelopment": 603,
"Ġunderstanding": 604
},
"merges": [
"i n",
"Ġ t",
"e r",
"Ġ a",
"h e",
"o n",
"t i",
"e n",
"n d",
"in g",
"Ġt he",
"ti on",
"Ġ s",
"a t",
"Ġ f",
"a l",
"a s",
"e s",
"o f",
"r o",
"Ġ c",
"Ġ d",
"Ġ of",
"a n",
"c i",
"l e",
"n s",
"u nd",
"Ġ p",
"en t",
"a r",
"c e",
"e d",
"e l",
"h i",
"o r",
"Ġ A",
"Ġ in",
"Ġs ci",
"at a",
"Ġd ata",
"Ġsci en",
"Q u",
"g r",
"m ent",
"r e",
"t er",
"w er",
"Ġ B",
"Ġ P",
"Ġ W",
"Ġ w",
"Ġt o",
"es tion",
"ns wer",
"Ġp ro",
"ĠA nswer",
"Ġscien ce",
"Qu estion",
"gr a",
". <",
"a tion",
"g e",
"i b",
"i s",
"i tion",
"o s",
"p ar",
"s e",
"Ġ H",
"Ġ J",
"Ġ S",
"Ġ T",
"Ġ he",
"Ġ Question",
"Ġa nd",
"Ġc o",
"und er",
".< /",
"par t",
"D e",
"d ed",
"e f",
"e x",
"h at",
"h as",
"i c",
"i l",
"i es",
"i el",
"k il",
"l s",
"m m",
"m an",
"o l",
"o w",
"o und",
"p l",
"p er",
"p os",
"p pl",
"r ib",
"s t",
"t el",
"t rib",
"u t",
"Ġ E",
"Ġ b",
"Ġ g",
"Ġ on",
"Ġ hi",
"Ġ gra",
"Ġ is",
"Ġ De",
"Ġ has",
"in an",
"Ġt h",
"Ġa t",
"Ġa re",
"Ġa ppl",
"on trib",
"ti se",
"Ġs kil",
"Ġf iel",
"Ġf ound",
"Ġf inan",
"as s",
"Ġc ontrib",
"an a",
"ns hi",
"or k",
"or tel",
"ter nshi",
"ĠP ass",
"ĠW under",
"ĠW hat",
"Ġw ork",
"Ġpro pos",
"gra mm",
"ition s",
"ĠH ow",
"ĠJ ef",
"part ment",
"per tise",
"Ġhi s",
"Ġgra ded",
"Ġskil ls",
"Ġfiel d",
"Ġcontrib ut",
"ortel l",
"ternshi p",
"ĠWunder man",
"Ġpropos itions",
"gramm ing",
"ĠJef f",
"C ol",
"E S",
"H N",
"I n",
"M ana",
"N e",
"N o",
"a c",
"a y",
"a in",
"a nd",
"b le",
"b re",
"c t",
"c at",
"c al",
"d d",
"d ing",
"d ge",
"d ies",
"e er",
"e partment",
"e ac",
"e bre",
"e ct",
"g h",
"g in",
"h er",
"h ing",
"h ortell",
"i d",
"i m",
"i r",
"i t",
"i v",
"i ti",
"k h",
"l y",
"l ay",
"m p",
"m s",
"n ex",
"o e",
"o p",
"o t",
"o ar",
"p ir",
"r al",
"r se",
"r iti",
"s pir",
"t u",
"u cat",
"u gh",
"u rse",
"v e",
"v el",
"x pertise",
"Ġ en",
"Ġ ro",
"Ġ ed",
"Ġ under",
"Ġ ex",
"Ġ Col",
"Ġ In",
"Ġ Mana",
"Ġ Ne",
"Ġ im",
"Ġ nex",
"Ġt eac",
"er ation",
"Ġa n",
"Ġa s",
"ti st",
"en eration",
"Ġs ol",
"Ġs ect",
"Ġf und",
"Ġf os",
"al ent",
"al ly",
"as ter",
"as is",
"as ic",
"ro gramming",
"ro ugh",
"Ġc ent",
"Ġc riti",
"Ġd epartment",
"Ġd id",
"ci al",
"le x",
"le ge",
"Ġp iv",
"Ġp lay",
"hi le",
"ĠA dd",
"Ġin ternship",
"Ġin spir",
"Ġscien tist",
"re e",
"ter ing",
"ĠB ib",
"ĠB HN",
"ĠB oar",
"ĠB asic",
"ĠP N",
"ĠP rogramming",
"Ġw her",
"Ġw hile",
"Ġpro gramming",
"Ġpro ble",
"ation s",
"ation al",
"ge ment",
"ition ally",
"ĠH ebre",
"ĠJ oe",
"ĠS ortell",
"ĠS hortell",
"ĠS tu",
"ĠT he",
"ĠT ana",
"ĠT alent",
"Ġhe dge",
"Ġco ding",
"Ġco mp",
"Ġco urse",
"part ing",
"ic ations",
"st and",
"ĠE xpertise",
"ĠE aster",
"Ġb y",
"Ġb asis",
"Ġg ain",
"Ġg eneration",
"ĠDe partment",
"ĠDe vel",
"Ġth rough",
"Ġth ree",
"Ġappl ies",
"Ġappl ications",
"Ġfound ed",
"Ġfound ational",
"Ġfinan ce",
"Ġfinan cial",
"ĠPass No",
"Ġwork ing",
"Ġcontribut es",
"Ġcontribut ed",
"ternship s",
"gin eer",
"it y",
"op ment",
"ot al",
"ral ity",
"ucat ing",
"Ġen gineer",
"Ġro le",
"Ġed ucating",
"Ġunder stand",
"Ġex pertise",
"ĠCol lege",
"ĠIn ternships",
"ĠMana gement",
"ĠNe ar",
"Ġim parting",
"Ġnex t",
"Ġteac hing",
"Ġsol ve",
"Ġsect or",
"Ġfos tering",
"Ġcent rality",
"Ġcriti cal",
"Ġpiv otal",
"Ġplay s",
"ĠAdd itionally",
"Ġinspir ing",
"Ġscientist s",
"ĠBib le",
"ĠBHN ES",
"ĠBoar d",
"ĠBasic s",
"Ġwher e",
"Ġproble ms",
"ĠHebre w",
"ĠStu dies",
"ĠTana kh",
"Ġcomp lex",
"ĠEaster n",
"ĠDevel opment",
"Ġunderstand ing"
]
}
}