frpron / tokenizer.json
Marxav's picture
add tokenizer
56a7094
raw
history blame
5.5 kB
{
"version": "1.0",
"truncation": null,
"padding": null,
"added_tokens": [
{
"id": 0,
"content": "<|endoftext|>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 254,
"content": ">",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 255,
"content": "<",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 256,
"content": "#",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
}
],
"normalizer": null,
"pre_tokenizer": {
"type": "ByteLevel",
"add_prefix_space": false,
"trim_offsets": true
},
"post_processor": {
"type": "ByteLevel",
"add_prefix_space": true,
"trim_offsets": false
},
"decoder": {
"type": "ByteLevel",
"add_prefix_space": true,
"trim_offsets": true
},
"model": {
"type": "BPE",
"dropout": null,
"unk_token": null,
"continuing_subword_prefix": "",
"end_of_word_suffix": "",
"fuse_unk": false,
"vocab": {
"<|endoftext|>": 0,
"\n": 1,
" ": 2,
"!": 3,
"&": 4,
"'": 5,
"(": 6,
")": 7,
"*": 8,
"+": 9,
"-": 10,
".": 11,
"/": 12,
"0": 13,
"1": 14,
"2": 15,
"3": 16,
"4": 17,
"5": 18,
"6": 19,
"7": 20,
"8": 21,
"9": 22,
":": 23,
";": 24,
"@": 25,
"A": 26,
"B": 27,
"C": 28,
"D": 29,
"E": 30,
"F": 31,
"G": 32,
"H": 33,
"I": 34,
"J": 35,
"K": 36,
"L": 37,
"M": 38,
"N": 39,
"O": 40,
"P": 41,
"Q": 42,
"R": 43,
"S": 44,
"T": 45,
"U": 46,
"V": 47,
"W": 48,
"X": 49,
"Y": 50,
"Z": 51,
"a": 52,
"b": 53,
"c": 54,
"d": 55,
"e": 56,
"f": 57,
"g": 58,
"h": 59,
"i": 60,
"j": 61,
"k": 62,
"l": 63,
"m": 64,
"n": 65,
"o": 66,
"p": 67,
"q": 68,
"r": 69,
"s": 70,
"t": 71,
"u": 72,
"v": 73,
"w": 74,
"x": 75,
"y": 76,
"z": 77,
"¡": 78,
"¢": 79,
"£": 80,
"¤": 81,
"¥": 82,
"¦": 83,
"§": 84,
"¨": 85,
"©": 86,
"ª": 87,
"«": 88,
"¬": 89,
"®": 90,
"¯": 91,
"°": 92,
"±": 93,
"²": 94,
"³": 95,
"´": 96,
"µ": 97,
"¶": 98,
"·": 99,
"¸": 100,
"¹": 101,
"º": 102,
"»": 103,
"¼": 104,
"½": 105,
"¾": 106,
"¿": 107,
"Á": 108,
"Â": 109,
"Ã": 110,
"Ä": 111,
"Å": 112,
"Æ": 113,
"Ç": 114,
"È": 115,
"É": 116,
"Ê": 117,
"Ì": 118,
"Í": 119,
"Î": 120,
"Ï": 121,
"Ð": 122,
"Ô": 123,
"Ö": 124,
"×": 125,
"Ü": 126,
"ß": 127,
"à": 128,
"á": 129,
"â": 130,
"ã": 131,
"ä": 132,
"å": 133,
"æ": 134,
"ç": 135,
"è": 136,
"é": 137,
"ê": 138,
"ë": 139,
"ì": 140,
"í": 141,
"î": 142,
"ï": 143,
"ñ": 144,
"ò": 145,
"ó": 146,
"ô": 147,
"õ": 148,
"ö": 149,
"ø": 150,
"ù": 151,
"ú": 152,
"û": 153,
"ü": 154,
"ý": 155,
"ÿ": 156,
"ā": 157,
"ć": 158,
"č": 159,
"Ğ": 160,
"ğ": 161,
"Ġ": 162,
"Ģ": 163,
"ģ": 164,
"Ĥ": 165,
"ĥ": 166,
"Ĩ": 167,
"ĩ": 168,
"Ī": 169,
"ī": 170,
"Ĭ": 171,
"ĭ": 172,
"į": 173,
"İ": 174,
"ı": 175,
"IJ": 176,
"ij": 177,
"Ĵ": 178,
"ĵ": 179,
"Ķ": 180,
"ĸ": 181,
"Ĺ": 182,
"ĺ": 183,
"Ļ": 184,
"ļ": 185,
"Ľ": 186,
"ľ": 187,
"Ŀ": 188,
"ŀ": 189,
"Ł": 190,
"ł": 191,
"Ń": 192,
"ŋ": 193,
"ō": 194,
"Œ": 195,
"œ": 196,
"Ś": 197,
"ŝ": 198,
"Ş": 199,
"ş": 200,
"š": 201,
"ū": 202,
"ŵ": 203,
"ź": 204,
"Ž": 205,
"ž": 206,
"ǘ": 207,
"ȝ": 208,
"ɑ": 209,
"ɔ": 210,
"ə": 211,
"ɛ": 212,
"ɡ": 213,
"ɣ": 214,
"ɥ": 215,
"ɨ": 216,
"ɩ": 217,
"ɲ": 218,
"ʁ": 219,
"ʃ": 220,
"ʋ": 221,
"ʒ": 222,
"ʔ": 223,
"ʻ": 224,
"ʼ": 225,
"ʾ": 226,
"ʿ": 227,
"́": 228,
"̃": 229,
"̐": 230,
"̓": 231,
"͠": 232,
"Δ": 233,
"α": 234,
"β": 235,
"μ": 236,
"σ": 237,
"а": 238,
"е": 239,
"ḷ": 240,
"ṃ": 241,
"ṇ": 242,
"ṣ": 243,
"ṭ": 244,
"–": 245,
"‘": 246,
"’": 247,
"′": 248,
"‿": 249,
"₂": 250,
"€": 251,
"−": 252,
"∴": 253
},
"merges": []
}
}