TinyStories-656K / tokenizer.json
raincandy-u's picture
Upload 6 files
a652083 verified
{
"version": "1.0",
"truncation": null,
"padding": null,
"added_tokens": [
{
"id": 0,
"content": "<unk>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": true,
"special": true
},
{
"id": 1,
"content": "<|start_story|>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": true,
"special": true
},
{
"id": 2,
"content": "<|end_story|>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": true,
"special": true
}
],
"normalizer": {
"type": "Sequence",
"normalizers": [
{
"type": "Prepend",
"prepend": "▁"
},
{
"type": "Replace",
"pattern": {
"String": " "
},
"content": "▁"
}
]
},
"pre_tokenizer": null,
"post_processor": {
"type": "TemplateProcessing",
"single": [
{
"SpecialToken": {
"id": "<|start_story|>",
"type_id": 0
}
},
{
"Sequence": {
"id": "A",
"type_id": 0
}
}
],
"pair": [
{
"SpecialToken": {
"id": "<|start_story|>",
"type_id": 0
}
},
{
"Sequence": {
"id": "A",
"type_id": 0
}
},
{
"SpecialToken": {
"id": "<|start_story|>",
"type_id": 1
}
},
{
"Sequence": {
"id": "B",
"type_id": 1
}
}
],
"special_tokens": {
"<|start_story|>": {
"id": "<|start_story|>",
"ids": [
1
],
"tokens": [
"<|start_story|>"
]
}
}
},
"decoder": {
"type": "Sequence",
"decoders": [
{
"type": "Replace",
"pattern": {
"String": "▁"
},
"content": " "
},
{
"type": "ByteFallback"
},
{
"type": "Fuse"
},
{
"type": "Strip",
"content": " ",
"start": 1,
"stop": 0
}
]
},
"model": {
"type": "BPE",
"dropout": null,
"unk_token": "<unk>",
"continuing_subword_prefix": null,
"end_of_word_suffix": null,
"fuse_unk": true,
"byte_fallback": true,
"ignore_merges": false,
"vocab": {
"<unk>": 0,
"<|start_story|>": 1,
"<|end_story|>": 2,
"\n": 3,
"!": 4,
"\"": 5,
"$": 6,
"'": 7,
",": 8,
"-": 9,
".": 10,
"0": 11,
"1": 12,
"2": 13,
"3": 14,
"4": 15,
"5": 16,
"6": 17,
"7": 18,
"8": 19,
"9": 20,
":": 21,
";": 22,
"<": 23,
">": 24,
"?": 25,
"A": 26,
"B": 27,
"C": 28,
"D": 29,
"E": 30,
"F": 31,
"G": 32,
"H": 33,
"I": 34,
"J": 35,
"K": 36,
"L": 37,
"M": 38,
"N": 39,
"O": 40,
"P": 41,
"Q": 42,
"R": 43,
"S": 44,
"T": 45,
"U": 46,
"V": 47,
"W": 48,
"X": 49,
"Y": 50,
"Z": 51,
"_": 52,
"a": 53,
"b": 54,
"c": 55,
"d": 56,
"e": 57,
"f": 58,
"g": 59,
"h": 60,
"i": 61,
"j": 62,
"k": 63,
"l": 64,
"m": 65,
"n": 66,
"o": 67,
"p": 68,
"q": 69,
"r": 70,
"s": 71,
"t": 72,
"u": 73,
"v": 74,
"w": 75,
"x": 76,
"y": 77,
"z": 78,
"|": 79,
"▁": 80,
"e▁": 81,
"d▁": 82,
"th": 83,
".▁": 84,
"▁a": 85,
"t▁": 86,
"y▁": 87,
"to": 88,
"s▁": 89,
"nd▁": 90,
"er": 91,
"ed▁": 92,
"the▁": 93,
",▁": 94,
"wa": 95,
"in": 96,
"he▁": 97,
"to▁": 98,
"ou": 99,
"▁and▁": 100,
"ar": 101,
"en": 102,
".▁T": 103,
"▁a▁": 104,
"ha": 105,
"om": 106,
"sa": 107,
"▁the▁": 108,
"he": 109,
"im": 110,
"on": 111,
"g▁": 112,
"ll": 113,
"st": 114,
"was▁": 115,
"an": 116,
"or": 117,
"ay": 118,
"it": 119,
"er▁": 120,
"id": 121,
".\n": 122,
"re": 123,
"is▁": 124,
"pl": 125,
"ir": 126,
"am": 127,
"ed▁to▁": 128,
"il": 129,
"wi": 130,
"the": 131,
"ri": 132,
"lo": 133,
"a▁": 134,
"and▁": 135,
".▁H": 136,
"hey▁": 137,
".▁S": 138,
"o▁": 139,
".▁He▁": 140,
"▁h": 141,
"no": 142,
"at▁": 143,
".▁They▁": 144,
"ere▁": 145,
"with": 146,
"On": 147,
"un": 148,
"ing▁": 149,
"ver": 150,
"pp": 151,
".▁The▁": 152,
"\"▁": 153,
"ck": 154,
"ry": 155,
"bi": 156,
"art": 157,
"li": 158,
"ll▁": 159,
",▁\"": 160,
"le": 161,
"sto": 162,
".▁She▁": 163,
"be": 164,
"day": 165,
"start": 166,
"it▁": 167,
"ut▁": 168,
"sh": 169,
"om▁": 170,
"ok": 171,
"w▁": 172,
"you": 173,
"said": 174,
"ma": 175,
"ing": 176,
"le▁": 177,
"se": 178,
"bo": 179,
"ld▁": 180,
"happ": 181,
"im▁": 182,
"end": 183,
"fri": 184,
"do": 185,
"gh": 186,
"of": 187,
"ba": 188,
"up": 189,
"ay▁": 190,
"ch": 191,
"ed": 192,
"very▁": 193,
"an▁": 194,
"ne": 195,
"for": 196,
"was": 197,
"had▁": 198,
"wan": 199,
"l▁": 200,
"ce▁": 201,
"big▁": 202,
"en▁": 203,
"said,▁\"": 204,
"story": 205,
"friend": 206,
"itt": 207,
"<|": 208,
"_story": 209,
"|>": 210,
"▁<|": 211,
"start_story": 212,
"▁<|start_story": 213,
"▁<|start_story|>": 214,
"e,▁": 215,
"Lil": 216,
"they▁": 217,
"ve▁": 218,
"ro": 219,
"play": 220,
"not▁": 221,
".▁I": 222,
"One▁": 223,
"ge": 224,
"ittle▁": 225,
"was▁a▁": 226,
"ke▁": 227,
"'s▁": 228,
"little▁": 229,
"tim": 230,
"his▁": 231,
"at": 232,
"es": 233,
"that▁": 234,
"One▁day": 235,