jean2's picture
add tokenizer
d6cabad
{"$": 1, "&": 2, "(": 3, ")": 4, "*": 5, "/": 6, "=": 7, "^": 8, "_": 9, "`": 10, "a": 11, "b": 12, "c": 13, "d": 14, "e": 15, "f": 16, "g": 17, "h": 18, "i": 19, "j": 20, "k": 21, "l": 22, "m": 23, "n": 24, "o": 25, "p": 26, "q": 27, "r": 28, "s": 29, "t": 30, "u": 31, "v": 32, "w": 33, "x": 34, "y": 35, "z": 36, "{": 37, "|": 0, "}": 39, "~": 40, "£": 41, "§": 42, "«": 43, "®": 44, "°": 45, "±": 46, "´": 47, "µ": 48, "·": 49, "º": 50, "»": 51, "½": 52, "×": 53, "ß": 54, "à": 55, "á": 56, "ã": 57, "ä": 58, "å": 59, "æ": 60, "ç": 61, "è": 62, "é": 63, "ê": 64, "ë": 65, "ì": 66, "í": 67, "ï": 68, "ð": 69, "ñ": 70, "ò": 71, "ó": 72, "õ": 73, "ö": 74, "ø": 75, "ù": 76, "ú": 77, "ü": 78, "ý": 79, "þ": 80, "ÿ": 81, "ā": 82, "ă": 83, "ą": 84, "ć": 85, "ċ": 86, "č": 87, "ď": 88, "đ": 89, "ē": 90, "ė": 91, "ę": 92, "ě": 93, "ğ": 94, "ġ": 95, "ħ": 96, "ĩ": 97, "ī": 98, "ı": 99, "ķ": 100, "ĺ": 101, "ļ": 102, "ľ": 103, "ł": 104, "ń": 105, "ņ": 106, "ň": 107, "ō": 108, "ŏ": 109, "ő": 110, "œ": 111, "ř": 112, "ś": 113, "ş": 114, "š": 115, "ţ": 116, "ť": 117, "ũ": 118, "ū": 119, "ů": 120, "ų": 121, "ź": 122, "ż": 123, "ž": 124, "ơ": 125, "ư": 126, "ǀ": 127, "ǃ": 128, "ǎ": 129, "ǔ": 130, "ǫ": 131, "ǹ": 132, "ș": 133, "ț": 134, "ə": 135, "ɨ": 136, "ʉ": 137, "ʔ": 138, "ʻ": 139, "ʼ": 140, "ʽ": 141, "ʾ": 142, "ʿ": 143, "ː": 144, "ˢ": 145, "̀": 146, "́": 147, "̂": 148, "̃": 149, "̇": 150, "̈": 151, "̐": 152, "̠": 153, "̧": 154, "̱": 155, "̲": 156, "α": 157, "β": 158, "γ": 159, "δ": 160, "ε": 161, "ζ": 162, "η": 163, "θ": 164, "ι": 165, "κ": 166, "μ": 167, "ν": 168, "ο": 169, "π": 170, "ρ": 171, "ς": 172, "σ": 173, "τ": 174, "υ": 175, "ψ": 176, "ω": 177, "ό": 178, "а": 179, "г": 180, "е": 181, "з": 182, "и": 183, "к": 184, "м": 185, "н": 186, "п": 187, "р": 188, "э": 189, "я": 190, "і": 191, "ј": 192, "ҫ": 193, "գ": 194, "զ": 195, "ا": 196, "ب": 197, "ة": 198, "د": 199, "ر": 200, "ل": 201, "م": 202, "ن": 203, "و": 204, "ي": 205, "ቀ": 206, "ወ": 207, "ደ": 208, "ጠ": 209, "ḍ": 210, "ṅ": 
211, "ṇ": 212, "ṣ": 213, "ṭ": 214, "ṯ": 215, "ạ": 216, "ả": 217, "ầ": 218, "ậ": 219, "ắ": 220, "ẵ": 221, "ề": 222, "ễ": 223, "ệ": 224, "ị": 225, "ồ": 226, "ổ": 227, "ộ": 228, "ờ": 229, "ợ": 230, "ủ": 231, "ử": 232, "ỳ": 233, "‐": 234, "–": 235, "—": 236, "―": 237, "’": 238, "„": 239, "†": 240, "…": 241, "′": 242, "‹": 243, "›": 244, "€": 245, "₽": 246, "ℂ": 247, "ℕ": 248, "ℝ": 249, "ℤ": 250, "ℰ": 251, "ℵ": 252, "→": 253, "∅": 254, "∆": 255, "∈": 256, "−": 257, "∞": 258, "∨": 259, "∼": 260, "≥": 261, "─": 262, "☉": 263, "い": 264, "う": 265, "た": 266, "つ": 267, "ぬ": 268, "の": 269, "ひ": 270, "へ": 271, "ま": 272, "め": 273, "や": 274, "ゔ": 275, "乃": 276, "京": 277, "北": 278, "扬": 279, "文": 280, "星": 281, "术": 282, "杜": 283, "甌": 284, "美": 285, "西": 286, "貴": 287, "青": 288, "馆": 289, "ꝑ": 290, "고": 291, "기": 292, "먹": 293, "삼": 294, "생": 295, "집": 296, "[UNK]": 38, "[PAD]": 297}