frpron / tokenizer.json
Marxav's picture
add tokenizer
d41cb5e
raw
history blame
5.68 kB
{
"version": "1.0",
"truncation": null,
"padding": null,
"added_tokens": [
{
"id": 0,
"content": "<|endoftext|>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 265,
"content": ">",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 266,
"content": "<",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 267,
"content": "#",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
}
],
"normalizer": null,
"pre_tokenizer": {
"type": "ByteLevel",
"add_prefix_space": false,
"trim_offsets": true
},
"post_processor": {
"type": "ByteLevel",
"add_prefix_space": true,
"trim_offsets": false
},
"decoder": {
"type": "ByteLevel",
"add_prefix_space": true,
"trim_offsets": true
},
"model": {
"type": "BPE",
"dropout": null,
"unk_token": null,
"continuing_subword_prefix": "",
"end_of_word_suffix": "",
"fuse_unk": false,
"vocab": {
"<|endoftext|>": 0,
"\n": 1,
" ": 2,
"!": 3,
"%": 4,
"&": 5,
"'": 6,
"(": 7,
")": 8,
"*": 9,
"+": 10,
",": 11,
"-": 12,
".": 13,
"/": 14,
"0": 15,
"1": 16,
"2": 17,
"3": 18,
"4": 19,
"5": 20,
"6": 21,
"7": 22,
"8": 23,
"9": 24,
":": 25,
";": 26,
"=": 27,
"?": 28,
"@": 29,
"A": 30,
"B": 31,
"C": 32,
"D": 33,
"E": 34,
"F": 35,
"G": 36,
"H": 37,
"I": 38,
"J": 39,
"K": 40,
"L": 41,
"M": 42,
"N": 43,
"O": 44,
"P": 45,
"Q": 46,
"R": 47,
"S": 48,
"T": 49,
"U": 50,
"V": 51,
"W": 52,
"X": 53,
"Y": 54,
"Z": 55,
"\\": 56,
"]": 57,
"a": 58,
"b": 59,
"c": 60,
"d": 61,
"e": 62,
"f": 63,
"g": 64,
"h": 65,
"i": 66,
"j": 67,
"k": 68,
"l": 69,
"m": 70,
"n": 71,
"o": 72,
"p": 73,
"q": 74,
"r": 75,
"s": 76,
"t": 77,
"u": 78,
"v": 79,
"w": 80,
"x": 81,
"y": 82,
"z": 83,
"}": 84,
"¡": 85,
"¢": 86,
"£": 87,
"¤": 88,
"¥": 89,
"¦": 90,
"§": 91,
"¨": 92,
"©": 93,
"ª": 94,
"«": 95,
"¬": 96,
"®": 97,
"¯": 98,
"°": 99,
"±": 100,
"²": 101,
"³": 102,
"´": 103,
"µ": 104,
"¶": 105,
"·": 106,
"¸": 107,
"¹": 108,
"º": 109,
"»": 110,
"¼": 111,
"½": 112,
"¾": 113,
"¿": 114,
"Á": 115,
"Â": 116,
"Ã": 117,
"Ä": 118,
"Å": 119,
"Æ": 120,
"Ç": 121,
"È": 122,
"É": 123,
"Ê": 124,
"Ë": 125,
"Ì": 126,
"Í": 127,
"Î": 128,
"Ï": 129,
"Ð": 130,
"Ô": 131,
"Ö": 132,
"×": 133,
"Ü": 134,
"ß": 135,
"à": 136,
"á": 137,
"â": 138,
"ã": 139,
"ä": 140,
"å": 141,
"æ": 142,
"ç": 143,
"è": 144,
"é": 145,
"ê": 146,
"ë": 147,
"ì": 148,
"í": 149,
"î": 150,
"ï": 151,
"ñ": 152,
"ò": 153,
"ó": 154,
"ô": 155,
"õ": 156,
"ö": 157,
"ø": 158,
"ù": 159,
"ú": 160,
"û": 161,
"ü": 162,
"ý": 163,
"ÿ": 164,
"ā": 165,
"ć": 166,
"č": 167,
"Ğ": 168,
"ğ": 169,
"Ġ": 170,
"Ģ": 171,
"ģ": 172,
"Ĥ": 173,
"ĥ": 174,
"Ħ": 175,
"ħ": 176,
"Ĩ": 177,
"ĩ": 178,
"Ī": 179,
"ī": 180,
"Ĭ": 181,
"ĭ": 182,
"Į": 183,
"į": 184,
"İ": 185,
"ı": 186,
"IJ": 187,
"ij": 188,
"Ĵ": 189,
"ĵ": 190,
"Ķ": 191,
"ĸ": 192,
"Ĺ": 193,
"ĺ": 194,
"Ļ": 195,
"ļ": 196,
"Ľ": 197,
"ľ": 198,
"Ŀ": 199,
"ŀ": 200,
"Ł": 201,
"ł": 202,
"Ń": 203,
"ŋ": 204,
"ō": 205,
"Œ": 206,
"œ": 207,
"Ś": 208,
"ŝ": 209,
"Ş": 210,
"ş": 211,
"š": 212,
"ū": 213,
"ŵ": 214,
"ź": 215,
"Ž": 216,
"ž": 217,
"ǘ": 218,
"ȝ": 219,
"ɑ": 220,
"ɔ": 221,
"ə": 222,
"ɛ": 223,
"ɡ": 224,
"ɣ": 225,
"ɥ": 226,
"ɨ": 227,
"ɩ": 228,
"ɲ": 229,
"ʁ": 230,
"ʃ": 231,
"ʋ": 232,
"ʒ": 233,
"ʔ": 234,
"ʻ": 235,
"ʼ": 236,
"ʾ": 237,
"ʿ": 238,
"́": 239,
"̃": 240,
"̐": 241,
"̓": 242,
"͠": 243,
"Δ": 244,
"α": 245,
"β": 246,
"μ": 247,
"σ": 248,
"а": 249,
"е": 250,
"ḷ": 251,
"ṃ": 252,
"ṇ": 253,
"ṣ": 254,
"ṭ": 255,
"–": 256,
"‘": 257,
"’": 258,
"′": 259,
"‿": 260,
"₂": 261,
"€": 262,
"−": 263,
"∴": 264
},
"merges": []
}
}