mirari committed on
Commit
051cf87
1 Parent(s): bea9dea

add tokenizer

Browse files
added_tokens.json ADDED
@@ -0,0 +1 @@
 
1
+ {"<s>": 229, "</s>": 230}
special_tokens_map.json ADDED
@@ -0,0 +1 @@
 
1
+ {"bos_token": "<s>", "eos_token": "</s>", "unk_token": "[UNK]", "pad_token": "[PAD]", "additional_special_tokens": [{"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}]}
tokenizer_config.json ADDED
@@ -0,0 +1 @@
 
1
+ {"unk_token": "[UNK]", "bos_token": "<s>", "eos_token": "</s>", "pad_token": "[PAD]", "do_lower_case": false, "word_delimiter_token": "|", "special_tokens_map_file": null, "tokenizer_file": null, "name_or_path": "./", "tokenizer_class": "Wav2Vec2CTCTokenizer"}
vocab.json ADDED
@@ -0,0 +1 @@
 
1
+ {"$": 1, "&": 2, "=": 3, "_": 4, "`": 5, "a": 6, "b": 7, "c": 8, "d": 9, "e": 10, "f": 11, "g": 12, "h": 13, "i": 14, "j": 15, "k": 16, "l": 17, "m": 18, "n": 19, "o": 20, "p": 21, "q": 22, "r": 23, "s": 24, "t": 25, "u": 26, "v": 27, "w": 28, "x": 29, "y": 30, "z": 31, "{": 32, "|": 0, "}": 34, "~": 35, "¨": 36, "ª": 37, "«": 38, "®": 39, "°": 40, "´": 41, "·": 42, "»": 43, "ß": 44, "à": 45, "á": 46, "â": 47, "ã": 48, "ä": 49, "å": 50, "æ": 51, "ç": 52, "é": 53, "ê": 54, "ë": 55, "ì": 56, "í": 57, "î": 58, "ï": 59, "ð": 60, "ñ": 61, "ò": 62, "ó": 63, "ô": 64, "õ": 65, "ö": 66, "ø": 67, "ù": 68, "ú": 69, "û": 70, "ü": 71, "ý": 72, "þ": 73, "ā": 74, "ă": 75, "ć": 76, "č": 77, "đ": 78, "ė": 79, "ę": 80, "ě": 81, "ğ": 82, "ī": 83, "ı": 84, "ł": 85, "ń": 86, "ō": 87, "ŏ": 88, "ő": 89, "œ": 90, "ř": 91, "ś": 92, "ş": 93, "š": 94, "ū": 95, "ź": 96, "ż": 97, "ž": 98, "ș": 99, "ț": 100, "ə": 101, "ʷ": 102, "ʻ": 103, "ʽ": 104, "ʿ": 105, "ː": 106, "́": 107, "̇": 108, "ϙ": 109, "а": 110, "б": 111, "в": 112, "г": 113, "д": 114, "е": 115, "и": 116, "й": 117, "к": 118, "л": 119, "н": 120, "о": 121, "п": 122, "р": 123, "с": 124, "т": 125, "ч": 126, "ш": 127, "ы": 128, "ь": 129, "ю": 130, "я": 131, "ё": 132, "ү": 133, "ө": 134, "ְ": 135, "ִ": 136, "ֵ": 137, "ָ": 138, "ֹ": 139, "ּ": 140, "ב": 141, "ה": 142, "ו": 143, "י": 144, "כ": 145, "ל": 146, "ם": 147, "מ": 148, "נ": 149, "ס": 150, "ק": 151, "ר": 152, "ש": 153, "ת": 154, "ا": 155, "ب": 156, "ة": 157, "د": 158, "ذ": 159, "ر": 160, "ل": 161, "م": 162, "ه": 163, "و": 164, "ي": 165, "ਆ": 166, "ਘ": 167, "ਤ": 168, "ਨ": 169, "ਮ": 170, "ਸ": 171, "ਾ": 172, "ਿ": 173, "ੰ": 174, "ṁ": 175, "ṃ": 176, "ṇ": 177, "ồ": 178, "‐": 179, "‑": 180, "–": 181, "—": 182, "―": 183, "’": 184, "„": 185, "…": 186, "‧": 187, "‹": 188, "›": 189, "→": 190, "≪": 191, "≫": 192, "し": 193, "の": 194, "ひ": 195, "ら": 196, "ゴ": 197, "ヒ": 198, "ミ": 199, "ム": 200, "ラ": 201, "㓁": 202, "口": 203, "周": 204, "夷": 205, "山": 206, "戌": 207, "日": 208, "本": 209, "比": 210, "毵": 
211, "消": 212, "生": 213, "申": 214, "真": 215, "箱": 216, "网": 217, "罒": 218, "罓": 219, "肋": 220, "肌": 221, "背": 222, "良": 223, "蝦": 224, "鮓": 225, "鮨": 226, "fi": 227, "": 228, "[UNK]": 228, "[PAD]": 229}