TinyStories-1024 / tokenizer.json
tdooms's picture
Upload tokenizer
9bdfd4e verified
{
"version": "1.0",
"truncation": null,
"padding": null,
"added_tokens": [
{
"id": 0,
"content": "[UNK]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 1,
"content": "[BOS]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 2,
"content": "[EOS]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
}
],
"normalizer": {
"type": "Sequence",
"normalizers": [
{
"type": "NFD"
},
{
"type": "StripAccents"
},
{
"type": "Lowercase"
}
]
},
"pre_tokenizer": {
"type": "Sequence",
"pretokenizers": [
{
"type": "Punctuation",
"behavior": "Isolated"
},
{
"type": "Whitespace"
},
{
"type": "Digits",
"individual_digits": true
}
]
},
"post_processor": {
"type": "TemplateProcessing",
"single": [
{
"SpecialToken": {
"id": "[BOS]",
"type_id": 0
}
},
{
"Sequence": {
"id": "A",
"type_id": 0
}
}
],
"pair": [
{
"Sequence": {
"id": "A",
"type_id": 0
}
},
{
"Sequence": {
"id": "B",
"type_id": 1
}
}
],
"special_tokens": {
"[BOS]": {
"id": "[BOS]",
"ids": [
1
],
"tokens": [
"[BOS]"
]
}
}
},
"decoder": null,
"model": {
"type": "WordPiece",
"unk_token": "[UNK]",
"continuing_subword_prefix": "##",
"max_input_chars_per_word": 100,
"vocab": {
"[UNK]": 0,
"[BOS]": 1,
"[EOS]": 2,
"!": 3,
"\"": 4,
"#": 5,
"$": 6,
"%": 7,
"&": 8,
"'": 9,
"(": 10,
")": 11,
"*": 12,
"+": 13,
",": 14,
"-": 15,
".": 16,
"/": 17,
"0": 18,
"1": 19,
"2": 20,
"3": 21,
"4": 22,
"5": 23,
"6": 24,
"7": 25,
"8": 26,
"9": 27,
":": 28,
";": 29,
"<": 30,
"=": 31,
">": 32,
"?": 33,
"@": 34,
"[": 35,
"\\": 36,
"]": 37,
"_": 38,
"`": 39,
"a": 40,
"b": 41,
"c": 42,
"d": 43,
"e": 44,
"f": 45,
"g": 46,
"h": 47,
"i": 48,
"j": 49,
"k": 50,
"l": 51,
"m": 52,
"n": 53,
"o": 54,
"p": 55,
"q": 56,
"r": 57,
"s": 58,
"t": 59,
"u": 60,
"v": 61,
"w": 62,
"x": 63,
"y": 64,
"z": 65,
"{": 66,
"|": 67,
"}": 68,
"~": 69,
"##a": 70,
"##l": 71,
"##n": 72,
"##c": 73,
"##i": 74,
"##x": 75,
"##m": 76,
"##t": 77,
"##o": 78,
"##s": 79,
"##e": 80,
"##r": 81,
"##v": 82,
"##g": 83,
"##y": 84,
"##k": 85,
"##d": 86,
"##b": 87,
"##p": 88,
"##h": 89,
"##u": 90,
"##w": 91,
"##f": 92,
"##j": 93,
"##z": 94,
"##q": 95,
"##he": 96,
"the": 97,
"##nd": 98,
"##ed": 99,
"and": 100,
"to": 101,
"##er": 102,
"wa": 103,
"##ou": 104,
"##in": 105,
"he": 106,
"##re": 107,
"was": 108,
"sa": 109,
"##ing": 110,
"##om": 111,
"she": 112,
"##ar": 113,
"##il": 114,
"##it": 115,
"##ay": 116,
"it": 117,
"##id": 118,
"##at": 119,
"they": 120,
"ha": 121,
"##en": 122,
"##is": 123,
"##an": 124,
"on": 125,
"th": 126,
"##or": 127,
"##im": 128,
"##on": 129,
"##ut": 130,
"her": 131,
"##ll": 132,
"##le": 133,
"##et": 134,
"##ot": 135,
"##ir": 136,
"##es": 137,
"in": 138,
"##ow": 139,
"you": 140,
"##ck": 141,
"##ld": 142,
"##oo": 143,
"said": 144,
"be": 145,
"##ily": 146,
"tim": 147,
"st": 148,
"##ig": 149,
"so": 150,
"##ce": 151,
"##pp": 152,
"his": 153,
"wit": 154,
"with": 155,
"mom": 156,
"##ve": 157,
"lily": 158,
"of": 159,
"fr": 160,
"that": 161,
"##ked": 162,
"##am": 163,
"pl": 164,
"##ery": 165,
"##ad": 166,
"##nt": 167,
"##ke": 168,
"but": 169,
"day": 170,
"up": 171,
"##ie": 172,
"play": 173,
"had": 174,
"##el": 175,
"wh": 176,
"for": 177,
"##my": 178,
"##st": 179,
"##un": 180,
"##ould": 181,
"##ent": 182,
"an": 183,
"li": 184,
"##ra": 185,
"##ch": 186,
"happ": 187,
"one": 188,
"##itt": 189,
"do": 190,
"sh": 191,
"want": 192,
"##her": 193,
"there": 194,
"##ly": 195,
"very": 196,
"##ome": 197,
"##se": 198,
"not": 199,
"##ound": 200,
"litt": 201,
"little": 202,
"as": 203,
"ba": 204,
"time": 205,
"ne": 206,
"##ht": 207,
"##al": 208,
"ma": 209,
"happy": 210,
"big": 211,
"sm": 212,
"is": 213,
"saw": 214,
"##iend": 215,
"friend": 216,
"loo": 217,
"##ry": 218,
"re": 219,
"bo": 220,
"##ur": 221,
"##ter": 222,
"##ved": 223,
"##ug": 224,
"once": 225,
"lo": 226,
"##ere": 227,
"were": 228,
"##ore": 229,
"se": 230,
"ev": 231,
"go": 232,
"sp": 233,
"him": 234,
"too": 235,
"##ide": 236,
"ca": 237,
"we": 238,
"at": 239,
"##irl": 240,
"tom": 241,
"are": 242,
"upon": 243,
"can": 244,
"whe": 245,
"girl": 246,
"wanted": 247,
"##ard": 248,
"##ec": 249,
"##ill": 250,
"out": 251,
"ben": 252,
"their": 253,
"them": 254,
"##fu": 255,
"##way": 256,
"##ys": 257,
"did": 258,
"##ind": 259,
"could": 260,
"smil": 261,
"##ri": 262,
"no": 263,
"have": 264,
"##ted": 265,
"##ver": 266,
"##ain": 267,
"ex": 268,
"all": 269,
"##hed": 270,
"went": 271,
"hel": 272,
"ar": 273,
"su": 274,
"when": 275,
"nam": 276,
"##ic": 277,
"help": 278,
"ta": 279,
"friends": 280,
"##ful": 281,
"##ood": 282,
"##hing": 283,
"##ight": 284,
"kn": 285,
"what": 286,
"le": 287,
"##um": 288,
"##ark": 289,
"back": 290,
"##one": 291,
"cl": 292,
"from": 293,
"timmy": 294,
"fun": 295,
"al": 296,
"then": 297,
"named": 298,
"##all": 299,
"ro": 300,
"every": 301,
"star": 302,
"sc": 303,
"loved": 304,
"smiled": 305,
"##oug": 306,
"##side": 307,
"asked": 308,
"##elt": 309,
"man": 310,
"some": 311,
"##ick": 312,
"see": 313,
"me": 314,
"like": 315,
"fe": 316,
"felt": 317,
"##op": 318,
"br": 319,
"looked": 320,
"around": 321,
"##ame": 322,
"##ep": 323,
"bir": 324,
"look": 325,
"##omet": 326,
"would": 327,
"##get": 328,
"somet": 329,
"boy": 330,
"fa": 331,
"bird": 332,
"##ong": 333,
"##ss": 334,
"pr": 335,
"mommy": 336,
"##dd": 337,
"##est": 338,
"##ings": 339,
"ag": 340,
"jo": 341,
"wor": 342,
"##ade": 343,
"car": 344,
"than": 345,
"make": 346,
"##gether": 347,
"together": 348,
"tre": 349,
"##own": 350,
"ran": 351,
"away": 352,
"la": 353,
"dad": 354,
"started": 355,
"##ice": 356,
"##oud": 357,
"##ared": 358,
"made": 359,
"says": 360,
"something": 361,
"co": 362,
"fl": 363,
"##ited": 364,
"park": 365,
"sad": 366,
"##ther": 367,
"good": 368,
"##ack": 369,
"exc": 370,
"new": 371,
"ch": 372,
"other": 373,
"put": 374,
"who": 375,
"##out": 376,
"let": 377,
"mu": 378,
"##ble": 379,
"again": 380,
"home": 381,
"hug": 382,
"found": 383,
"sam": 384,
"dec": 385,
"##ried": 386,
"wal": 387,
"##pped": 388,
"##ure": 389,
"get": 390,
"playing": 391,
"##ought": 392,
"##ach": 393,
"##pl": 394,
"gra": 395,
"sw": 396,
"things": 397,
"##ous": 398,
"excited": 399,
"##na": 400,
"got": 401,
"bl": 402,
"##ny": 403,
"##king": 404,
"##uck": 405,
"liked": 406,
"your": 407,
"##ge": 408,
"##ided": 409,
"decided": 410,
"came": 411,
"my": 412,
"bec": 413,
"dog": 414,
"scared": 415,
"##ust": 416,
"down": 417,
"this": 418,
"##ouse": 419,
"ab": 420,
"find": 421,
"care": 422,
"pa": 423,
"gr": 424,
"feel": 425,
"po": 426,
"will": 427,
"max": 428,
"sara": 429,
"##ell": 430,
"bu": 431,
"##ist": 432,
"##arn": 433,
"##ways": 434,
"##as": 435,
"##ave": 436,
"always": 437,
"anna": 438,
"##nder": 439,
"didn": 440,
"##ess": 441,
"mo": 442,
"about": 443,
"took": 444,
"kne": 445,
"lot": 446,
"toys": 447,
"outside": 448,
"##ers": 449,
"##ook": 450,
"tree": 451,
"##ally": 452,
"af": 453,
"##ant": 454,
"##ise": 455,
"##bb": 456,
"##ged": 457,
"how": 458,
"old": 459,
"##ite": 460,
"thought": 461,
"ball": 462,
"ho": 463,
"more": 464,
"##ma": 465,
"##eci": 466,
"##ened": 467,
"##ched": 468,
"sor": 469,
"learn": 470,
"##ret": 471,
"tw": 472,
"pu": 473,
"cat": 474,
"know": 475,
"##to": 476,
"take": 477,
"pe": 478,
"don": 479,
"laug": 480,
"knew": 481,
"speci": 482,
"sudd": 483,
"special": 484,
"sudden": 485,
"##ty": 486,
"mi": 487,
"inside": 488,
"##ive": 489,
"any": 490,
"toy": 491,
"jack": 492,
"suddenly": 493,
"##ro": 494,
"sorry": 495,
"after": 496,
"##ff": 497,
"just": 498,
"##ue": 499,
"if": 500,
"tr": 501,
"show": 502,
"##lly": 503,
"##ink": 504,
"much": 505,
"ra": 506,
"or": 507,
"run": 508,
"sl": 509,
"##ish": 510,
"hand": 511,
"house": 512,
"sun": 513,
"yes": 514,
"op": 515,
"sk": 516,
"clo": 517,
"tried": 518,
"fin": 519,
"told": 520,
"into": 521,
"en": 522,
"##ate": 523,
"water": 524,
"over": 525,
"##ea": 526,
"proud": 527,
"##ump": 528,
"##dy": 529,
"##use": 530,
"gave": 531,
"never": 532,
"each": 533,
"heard": 534,
"eat": 535,
"##by": 536,
"ok": 537,
"expl": 538,
"played": 539,
"couldn": 540,
"room": 541,
"thank": 542,
"##ause": 543,
"pick": 544,
"pret": 545,
"because": 546,
"##other": 547,
"qu": 548,
"gre": 549,
"##lled": 550,
"##ion": 551,
"come": 552,
"sha": 553,
"wat": 554,
"bear": 555,
"mia": 556,
"##ious": 557,
"off": 558,
"hugged": 559,
"now": 560,
"com": 561,
"##oth": 562,
"fo": 563,
"bet": 564,
"need": 565,
"nice": 566,
"##our": 567,
"box": 568,
"str": 569,
"##ile": 570,
"##fe": 571,
"many": 572,
"##ft": 573,
"small": 574,
"long": 575,
"##eep": 576,
"##ving": 577,
"##sed": 578,
"end": 579,
"anim": 580,
"animal": 581,
"##ough": 582,
"try": 583,
"unt": 584,
"##gry": 585,
"##cy": 586,
"##kes": 587,
"even": 588,
"##ort": 589,
"until": 590,
"##ild": 591,
"##urt": 592,
"##iz": 593,
"##elf": 594,
"learned": 595,
"soon": 596,
"kind": 597,
"bea": 598,
"everyone": 599,
"by": 600,
"better": 601,
"ad": 602,
"flow": 603,
"love": 604,
"spot": 605,
"##mp": 606,
"best": 607,
"##ine": 608,
"cle": 609,
"##ady": 610,
"##urp": 611,
"##ream": 612,
"##urn": 613,
"##ace": 614,
"fi": 615,
"say": 616,
"##ber": 617,
"gard": 618,
"garden": 619,
"##ves": 620,
"fast": 621,
"its": 622,
"careful": 623,
"beaut": 624,
"che": 625,
"##ies": 626,
"bra": 627,
"sky": 628,
"thanked": 629,
"laughed": 630,
"jump": 631,
"gl": 632,
"loud": 633,
"ow": 634,
"sn": 635,
"list": 636,
"##ct": 637,
"##iny": 638,
"##ear": 639,
"lots": 640,
"##lew": 641,
"beauti": 642,
"wo": 643,
"beautiful": 644,
"##sh": 645,
"hard": 646,
"fam": 647,
"still": 648,
"animals": 649,
"lu": 650,
"joh": 651,
"under": 652,
"john": 653,
"stay": 654,
"hurt": 655,
"##ning": 656,
"mum": 657,
"both": 658,
"dan": 659,
"##self": 660,
"rem": 661,
"way": 662,
"##ree": 663,
"safe": 664,
"two": 665,
"##ool": 666,
"bad": 667,
"col": 668,
"##hes": 669,
"imp": 670,
"di": 671,
"lived": 672,
"red": 673,
"tow": 674,
"##ople": 675,
"##be": 676,
"book": 677,
"##em": 678,
"people": 679,
"##ane": 680,
"walked": 681,
"okay": 682,
"lucy": 683,
"surp": 684,
"surpr": 685,
"brave": 686,
"family": 687,
"should": 688,
"##ase": 689,
"adv": 690,
"flew": 691,
"##ished": 692,
"##igh": 693,
"##ress": 694,
"##ock": 695,
"stor": 696,
"##ept": 697,
"called": 698,
"##eet": 699,
"##ip": 700,
"fore": 701,
"angry": 702,
"sure": 703,
"fly": 704,
"while": 705,
"kept": 706,
"##fore": 707,
"##led": 708,
"before": 709,
"##ect": 710,
"##xt": 711,
"##ger": 712,
"share": 713,
"##ised": 714,
"##art": 715,
"pic": 716,
"pretty": 717,
"keep": 718,
"going": 719,
"rock": 720,
"door": 721,
"##dded": 722,
"clean": 723,
"##ied": 724,
"next": 725,
"dra": 726,
"advent": 727,
"con": 728,
"why": 729,
"##ary": 730,
"un": 731,
"##illy": 732,
"far": 733,
"real": 734,
"id": 735,
"shiny": 736,
"give": 737,
"noise": 738,
"wind": 739,
"opened": 740,
"cry": 741,
"may": 742,
"grand": 743,
"##end": 744,
"sto": 745,
"doll": 746,
"ground": 747,
"##ner": 748,
"explore": 749,
"turn": 750,
"##so": 751,
"##les": 752,
"also": 753,
"ey": 754,
"idea": 755,
"color": 756,
"war": 757,
"feeling": 758,
"where": 759,
"##ap": 760,
"bob": 761,
"picked": 762,
"blue": 763,
"##imb": 764,
"nodded": 765,
"walking": 766,
"climb": 767,
"##thing": 768,
"clos": 769,
"##ting": 770,
"thr": 771,
"bed": 772,
"ple": 773,
"wait": 774,
"adventure": 775,
"being": 776,
"smile": 777,
"##oy": 778,
"finally": 779,
"##th": 780,
"##iced": 781,
"has": 782,
"looking": 783,
"da": 784,
"food": 785,
"##ture": 786,
"diff": 787,
"wr": 788,
"remem": 789,
"##and": 790,
"repl": 791,
"maybe": 792,
"picture": 793,
"##joy": 794,
"listen": 795,
"del": 796,
"tra": 797,
"bro": 798,
"great": 799,
"##ught": 800,
"truck": 801,
"think": 802,
"stopped": 803,
"eyes": 804,
"walk": 805,
"##qu": 806,
"gi": 807,
"remember": 808,
"ru": 809,
"bre": 810,
"enjoy": 811,
"sue": 812,
"##able": 813,
"here": 814,
"import": 815,
"vo": 816,
"year": 817,
"forest": 818,
"ever": 819,
"quick": 820,
"wonder": 821,
"ac": 822,
"##ized": 823,
"flowers": 824,
"##og": 825,
"hands": 826,
"##bbit": 827,
"##per": 828,
"app": 829,
"noticed": 830,
"near": 831,
"cur": 832,
"head": 833,
"important": 834,
"rabbit": 835,
"dis": 836,
"watch": 837,
"fish": 838,
"replied": 839,
"bun": 840,
"##irst": 841,
"##age": 842,
"rain": 843,
"ama": 844,
"##llow": 845,
"sound": 846,
"showed": 847,
"amaz": 848,
"mor": 849,
"us": 850,
"work": 851,
"slide": 852,
"tal": 853,
"follow": 854,
"##gan": 855,
"sarah": 856,
"stop": 857,
"right": 858,
"##ces": 859,
"mag": 860,
"tou": 861,
"mean": 862,
"differ": 863,
"goodby": 864,
"##bbed": 865,
"watched": 866,
"bright": 867,
"daddy": 868,
"##day": 869,
"ask": 870,
"goodbye": 871,
"strong": 872,
"our": 873,
"use": 874,
"please": 875,
"quickly": 876,
"hop": 877,
"am": 878,
"been": 879,
"stick": 880,
"voice": 881,
"became": 882,
"##ath": 883,
"yell": 884,
"different": 885,
"boat": 886,
"jane": 887,
"##co": 888,
"child": 889,
"store": 890,
"##che": 891,
"##llo": 892,
"high": 893,
"place": 894,
"hello": 895,
"first": 896,
"face": 897,
"##ange": 898,
"##ng": 899,
"##ummy": 900,
"warm": 901,
"##ak": 902,
"closer": 903,
"dress": 904,
"curious": 905,
"sand": 906,
"cook": 907,
"fav": 908,
"bel": 909,
"does": 910,
"forg": 911,
"em": 912,
"joe": 913,
"tell": 914,
"##ount": 915,
"three": 916,
"grandma": 917,
"##oon": 918,
"##leep": 919,
"bunny": 920,
"night": 921,
"butter": 922,
"open": 923,
"##more": 924,
"anymore": 925,
"pie": 926,
"mon": 927,
"cake": 928,
"##ila": 929,
"##ired": 930,
"lea": 931,
"##ull": 932,
"##iss": 933,
"sweet": 934,
"##ached": 935,
"block": 936,
"pain": 937,
"lila": 938,
"kid": 939,
"kit": 940,
"duck": 941,
"flo": 942,
"only": 943,
"fell": 944,
"cont": 945,
"grabbed": 946,
"##isy": 947,
"birds": 948,
"##ered": 949,
"helped": 950,
"##here": 951,
"jumped": 952,
"cra": 953,
"per": 954,
"fire": 955,
"pet": 956,
"bit": 957,
"glad": 958,
"##chen": 959,
"kitchen": 960,
"dr": 961,
"sing": 962,
"yummy": 963,
"squ": 964,
"prin": 965,
"##ul": 966,
"##outed": 967,
"happened": 968,
"hear": 969,
"grass": 970,
"story": 971,
"realized": 972,
"ready": 973,
"tommy": 974,
"tri": 975,
"##nts": 976,
"sees": 977,
"really": 978,
"beh": 979,
"brother": 980,
"favor": 981,
"shouted": 982,
"##ey": 983,
"draw": 984,
"cr": 985,
"favorite": 986,
"lady": 987,
"having": 988,
"reached": 989,
"through": 990,
"ate": 991,
"game": 992,
"cre": 993,
"##zy": 994,
"mess": 995,
"soft": 996,
"pare": 997,
"underst": 998,
"##ins": 999,
"hat": 1000,
"##imes": 1001,
"less": 1002,
"##ather": 1003,
"butterf": 1004,
"thing": 1005,
"##ket": 1006,
"magic": 1007,
"began": 1008,
"##where": 1009,
"world": 1010,
"cu": 1011,
"##ken": 1012,
"himself": 1013,
"rest": 1014,
"##fully": 1015,
"##ppy": 1016,
"sometimes": 1017,
"pretend": 1018,
"mouse": 1019,
"making": 1020,
"cut": 1021,
"ted": 1022,
"done": 1023
}
}
}