diallomama's picture
tokenizer and vocab (from up)
2e00fbe
{
"!": 1,
"\"": 2,
"$": 3,
"%": 4,
"&": 5,
"'": 6,
"(": 7,
")": 8,
"*": 9,
",": 10,
"-": 11,
".": 12,
"/": 13,
":": 14,
";": 15,
"=": 16,
"?": 17,
"[PAD]": 314,
"[UNK]": 313,
"^": 18,
"_": 19,
"`": 20,
"a": 21,
"b": 22,
"c": 23,
"d": 24,
"e": 25,
"f": 26,
"g": 27,
"h": 28,
"i": 29,
"j": 30,
"k": 31,
"l": 32,
"m": 33,
"n": 34,
"o": 35,
"p": 36,
"q": 37,
"r": 38,
"s": 39,
"t": 40,
"u": 41,
"v": 42,
"w": 43,
"x": 44,
"y": 45,
"z": 46,
"{": 47,
"|": 0,
"}": 49,
"~": 50,
"£": 51,
"§": 52,
"«": 53,
"®": 54,
"°": 55,
"±": 56,
"´": 57,
"µ": 58,
"·": 59,
"º": 60,
"»": 61,
"½": 62,
"×": 63,
"ß": 64,
"à": 65,
"á": 66,
"â": 67,
"ã": 68,
"ä": 69,
"å": 70,
"æ": 71,
"ç": 72,
"è": 73,
"é": 74,
"ê": 75,
"ë": 76,
"ì": 77,
"í": 78,
"î": 79,
"ï": 80,
"ð": 81,
"ñ": 82,
"ò": 83,
"ó": 84,
"ô": 85,
"õ": 86,
"ö": 87,
"ø": 88,
"ù": 89,
"ú": 90,
"û": 91,
"ü": 92,
"ý": 93,
"þ": 94,
"ÿ": 95,
"ā": 96,
"ă": 97,
"ą": 98,
"ć": 99,
"ċ": 100,
"č": 101,
"ď": 102,
"đ": 103,
"ē": 104,
"ė": 105,
"ę": 106,
"ě": 107,
"ğ": 108,
"ġ": 109,
"ħ": 110,
"ĩ": 111,
"ī": 112,
"ı": 113,
"ķ": 114,
"ĺ": 115,
"ļ": 116,
"ľ": 117,
"ł": 118,
"ń": 119,
"ņ": 120,
"ň": 121,
"ō": 122,
"ŏ": 123,
"ő": 124,
"œ": 125,
"ř": 126,
"ś": 127,
"ş": 128,
"š": 129,
"ţ": 130,
"ť": 131,
"ũ": 132,
"ū": 133,
"ů": 134,
"ų": 135,
"ź": 136,
"ż": 137,
"ž": 138,
"ơ": 139,
"ư": 140,
"ǀ": 141,
"ǃ": 142,
"ǎ": 143,
"ǔ": 144,
"ǫ": 145,
"ǹ": 146,
"ș": 147,
"ț": 148,
"ə": 149,
"ɨ": 150,
"ʉ": 151,
"ʔ": 152,
"ʻ": 153,
"ʼ": 154,
"ʽ": 155,
"ʾ": 156,
"ʿ": 157,
"ː": 158,
"ˢ": 159,
"̀": 160,
"́": 161,
"̂": 162,
"̃": 163,
"̇": 164,
"̈": 165,
"̐": 166,
"̠": 167,
"̧": 168,
"̱": 169,
"̲": 170,
"α": 171,
"β": 172,
"γ": 173,
"δ": 174,
"ε": 175,
"ζ": 176,
"η": 177,
"θ": 178,
"ι": 179,
"κ": 180,
"μ": 181,
"ν": 182,
"ο": 183,
"π": 184,
"ρ": 185,
"ς": 186,
"σ": 187,
"τ": 188,
"υ": 189,
"ψ": 190,
"ω": 191,
"ό": 192,
"а": 193,
"г": 194,
"е": 195,
"з": 196,
"и": 197,
"к": 198,
"м": 199,
"н": 200,
"п": 201,
"р": 202,
"э": 203,
"я": 204,
"і": 205,
"ј": 206,
"ҫ": 207,
"գ": 208,
"զ": 209,
"ا": 210,
"ب": 211,
"ة": 212,
"د": 213,
"ر": 214,
"ل": 215,
"م": 216,
"ن": 217,
"و": 218,
"ي": 219,
"ቀ": 220,
"ወ": 221,
"ደ": 222,
"ጠ": 223,
"ḍ": 224,
"ṅ": 225,
"ṇ": 226,
"ṣ": 227,
"ṭ": 228,
"ṯ": 229,
"ạ": 230,
"ả": 231,
"ầ": 232,
"ậ": 233,
"ắ": 234,
"ẵ": 235,
"ề": 236,
"ễ": 237,
"ệ": 238,
"ị": 239,
"ồ": 240,
"ổ": 241,
"ộ": 242,
"ờ": 243,
"ợ": 244,
"ủ": 245,
"ử": 246,
"ỳ": 247,
"‐": 248,
"–": 249,
"—": 250,
"―": 251,
"‘": 252,
"’": 253,
"“": 254,
"”": 255,
"„": 256,
"†": 257,
"…": 258,
"′": 259,
"‹": 260,
"›": 261,
"€": 262,
"₽": 263,
"ℂ": 264,
"ℕ": 265,
"ℝ": 266,
"ℤ": 267,
"ℰ": 268,
"ℵ": 269,
"→": 270,
"∅": 271,
"∆": 272,
"∈": 273,
"−": 274,
"∞": 275,
"∨": 276,
"∼": 277,
"≥": 278,
"─": 279,
"☉": 280,
"い": 281,
"う": 282,
"た": 283,
"つ": 284,
"ぬ": 285,
"の": 286,
"ひ": 287,
"へ": 288,
"ま": 289,
"め": 290,
"や": 291,
"ゔ": 292,
"乃": 293,
"京": 294,
"北": 295,
"扬": 296,
"文": 297,
"星": 298,
"术": 299,
"杜": 300,
"甌": 301,
"美": 302,
"西": 303,
"貴": 304,
"青": 305,
"馆": 306,
"ꝑ": 307,
"고": 308,
"기": 309,
"먹": 310,
"삼": 311,
"생": 312,
"집": 313
}