tomascufaro committed on
Commit
bcfe9c5
1 Parent(s): 2d35c9c

add tokenizer

Browse files
added_tokens.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"<s>": 227, "</s>": 228}
special_tokens_map.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"bos_token": "<s>", "eos_token": "</s>", "unk_token": "[UNK]", "pad_token": "[PAD]", "additional_special_tokens": [{"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}]}
tokenizer_config.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"unk_token": "[UNK]", "bos_token": "<s>", "eos_token": "</s>", "pad_token": "[PAD]", "do_lower_case": false, "word_delimiter_token": "|", "special_tokens_map_file": null, "tokenizer_file": null, "name_or_path": "./", "tokenizer_class": "Wav2Vec2CTCTokenizer"}
vocab.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"$": 1, "&": 2, "=": 3, "_": 4, "`": 5, "a": 6, "b": 7, "c": 8, "d": 9, "e": 10, "f": 11, "g": 12, "h": 13, "i": 14, "j": 15, "k": 16, "l": 17, "m": 18, "n": 19, "o": 20, "p": 21, "q": 22, "r": 23, "s": 24, "t": 25, "u": 26, "v": 27, "w": 28, "x": 29, "y": 30, "z": 31, "{": 32, "|": 0, "}": 34, "~": 35, "¡": 36, "¨": 37, "ª": 38, "«": 39, "®": 40, "°": 41, "´": 42, "·": 43, "»": 44, "¿": 45, "ß": 46, "à": 47, "á": 48, "ã": 49, "ä": 50, "å": 51, "æ": 52, "ç": 53, "é": 54, "ê": 55, "ë": 56, "ì": 57, "í": 58, "ï": 59, "ð": 60, "ñ": 61, "ò": 62, "ó": 63, "õ": 64, "ö": 65, "ø": 66, "ù": 67, "ú": 68, "ü": 69, "ý": 70, "þ": 71, "ā": 72, "ă": 73, "ć": 74, "č": 75, "đ": 76, "ė": 77, "ę": 78, "ě": 79, "ğ": 80, "ī": 81, "ı": 82, "ł": 83, "ń": 84, "ō": 85, "ŏ": 86, "ő": 87, "œ": 88, "ř": 89, "ś": 90, "ş": 91, "š": 92, "ū": 93, "ź": 94, "ż": 95, "ž": 96, "ș": 97, "ț": 98, "ə": 99, "ʷ": 100, "ʻ": 101, "ʽ": 102, "ʿ": 103, "ː": 104, "́": 105, "̇": 106, "ϙ": 107, "а": 108, "б": 109, "в": 110, "г": 111, "д": 112, "е": 113, "и": 114, "й": 115, "к": 116, "л": 117, "н": 118, "о": 119, "п": 120, "р": 121, "с": 122, "т": 123, "ч": 124, "ш": 125, "ы": 126, "ь": 127, "ю": 128, "я": 129, "ё": 130, "ү": 131, "ө": 132, "ְ": 133, "ִ": 134, "ֵ": 135, "ָ": 136, "ֹ": 137, "ּ": 138, "ב": 139, "ה": 140, "ו": 141, "י": 142, "כ": 143, "ל": 144, "ם": 145, "מ": 146, "נ": 147, "ס": 148, "ק": 149, "ר": 150, "ש": 151, "ת": 152, "ا": 153, "ب": 154, "ة": 155, "د": 156, "ذ": 157, "ر": 158, "ل": 159, "م": 160, "ه": 161, "و": 162, "ي": 163, "ਆ": 164, "ਘ": 165, "ਤ": 166, "ਨ": 167, "ਮ": 168, "ਸ": 169, "ਾ": 170, "ਿ": 171, "ੰ": 172, "ṁ": 173, "ṃ": 174, "ṇ": 175, "ồ": 176, "‐": 177, "‑": 178, "–": 179, "—": 180, "―": 181, "’": 182, "„": 183, "…": 184, "‧": 185, "‹": 186, "›": 187, "→": 188, "≪": 189, "≫": 190, "し": 191, "の": 192, "ひ": 193, "ら": 194, "ゴ": 195, "ヒ": 196, "ミ": 197, "ム": 198, "ラ": 199, "㓁": 200, "口": 201, "周": 202, "夷": 203, "山": 204, "戌": 205, "日": 206, "本": 207, "比": 208, "毵": 209, "消": 210, "生": 
211, "申": 212, "真": 213, "箱": 214, "网": 215, "罒": 216, "罓": 217, "肋": 218, "肌": 219, "背": 220, "良": 221, "蝦": 222, "鮓": 223, "鮨": 224, "fi": 225, "": 226, "[UNK]": 226, "[PAD]": 227}