leviethoang committed on
Commit
467675c
1 Parent(s): 5dec88c

add tokenizer

Browse files
added_tokens.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"<s>": 99, "</s>": 100, "<pad>": 101}
special_tokens_map.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"bos_token": "<s>", "eos_token": "</s>", "unk_token": "<unk>", "pad_token": "<pad>", "additional_special_tokens": [{"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<pad>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}]}
tokenizer_config.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"unk_token": "<unk>", "bos_token": "<s>", "eos_token": "</s>", "pad_token": "<pad>", "do_lower_case": false, "word_delimiter_token": "|", "special_tokens_map_file": null, "tokenizer_file": null, "name_or_path": "ai-project/wav2vec2-large-xls-r-300m-vi-25p", "tokenizer_class": "Wav2Vec2CTCTokenizer"}
vocab.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"'": 1, "<": 2, ">": 3, "a": 4, "b": 5, "c": 6, "d": 7, "e": 8, "f": 9, "g": 10, "h": 11, "i": 12, "j": 13, "k": 14, "l": 15, "m": 16, "n": 17, "o": 18, "p": 19, "q": 20, "r": 21, "s": 22, "t": 23, "u": 24, "v": 25, "w": 26, "x": 27, "y": 28, "z": 29, "à": 30, "á": 31, "â": 32, "ã": 33, "è": 34, "é": 35, "ê": 36, "ì": 37, "í": 38, "ò": 39, "ó": 40, "ô": 41, "õ": 42, "ù": 43, "ú": 44, "ý": 45, "ă": 46, "đ": 47, "ĩ": 48, "ũ": 49, "ơ": 50, "ư": 51, "ạ": 52, "ả": 53, "ấ": 54, "ầ": 55, "ẩ": 56, "ẫ": 57, "ậ": 58, "ắ": 59, "ằ": 60, "ẳ": 61, "ẵ": 62, "ặ": 63, "ẹ": 64, "ẻ": 65, "ẽ": 66, "ế": 67, "ề": 68, "ể": 69, "ễ": 70, "ệ": 71, "ỉ": 72, "ị": 73, "ọ": 74, "ỏ": 75, "ố": 76, "ồ": 77, "ổ": 78, "ỗ": 79, "ộ": 80, "ớ": 81, "ờ": 82, "ở": 83, "ỡ": 84, "ợ": 85, "ụ": 86, "ủ": 87, "ứ": 88, "ừ": 89, "ử": 90, "ữ": 91, "ự": 92, "ỳ": 93, "ỵ": 94, "ỷ": 95, "ỹ": 96, "|": 0, "<unk>": 97, "[<pad>": 98}