g9rant committed on
Commit
d722d34
1 Parent(s): 11a44a1

add tokenizer

Browse files
added_tokens.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"<s>": 106, "</s>": 107}
special_tokens_map.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"bos_token": "<s>", "eos_token": "</s>", "unk_token": "[UNK]", "pad_token": "[PAD]", "additional_special_tokens": [{"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}]}
tokenizer_config.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"unk_token": "[UNK]", "bos_token": "<s>", "eos_token": "</s>", "pad_token": "[PAD]", "do_lower_case": false, "word_delimiter_token": "|", "special_tokens_map_file": null, "tokenizer_file": null, "name_or_path": "./", "tokenizer_class": "Wav2Vec2CTCTokenizer"}
vocab.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"&": 1, "(": 2, ")": 3, "+": 4, "/": 5, "=": 6, "[": 7, "]": 8, "_": 9, "`": 10, "a": 11, "b": 12, "c": 13, "d": 14, "e": 15, "f": 16, "g": 17, "h": 18, "i": 19, "j": 20, "k": 21, "l": 22, "m": 23, "n": 24, "o": 25, "p": 26, "q": 27, "r": 28, "s": 29, "t": 30, "u": 31, "v": 32, "w": 33, "x": 34, "y": 35, "z": 36, "¡": 37, "ß": 38, "à": 39, "á": 40, "ã": 41, "ä": 42, "å": 43, "æ": 44, "ç": 45, "è": 46, "é": 47, "ê": 48, "ë": 49, "í": 50, "ð": 51, "ñ": 52, "ó": 53, "ö": 54, "ø": 55, "ú": 56, "ü": 57, "þ": 58, "ā": 59, "ă": 60, "ć": 61, "č": 62, "ę": 63, "ě": 64, "ğ": 65, "ī": 66, "ł": 67, "ń": 68, "ō": 69, "ő": 70, "œ": 71, "ř": 72, "š": 73, "ū": 74, "ž": 75, "ʻ": 76, "α": 77, "π": 78, "χ": 79, "в": 80, "е": 81, "з": 82, "и": 83, "й": 84, "к": 85, "л": 86, "н": 87, "ь": 88, "я": 89, "ṃ": 90, "ạ": 91, "ụ": 92, "–": 93, "—": 94, "’": 95, "…": 96, "→": 97, "≡": 98, "京": 99, "大": 100, "都": 101, "阪": 102, "fl": 103, "|": 0, "[UNK]": 104, "[PAD]": 105}