thocheat committed on
Commit
8cf8291
1 Parent(s): 80563f4

add tokenizer

Browse files
Files changed (3) hide show
  1. special_tokens_map.json +1 -0
  2. tokenizer_config.json +1 -0
  3. vocab.json +1 -0
special_tokens_map.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"bos_token": "<s>", "eos_token": "</s>", "unk_token": "[UNK]", "pad_token": "[PAD]"}
tokenizer_config.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"unk_token": "[UNK]", "bos_token": "<s>", "eos_token": "</s>", "pad_token": "[PAD]", "do_lower_case": false, "word_delimiter_token": "|", "replace_word_delimiter_char": " ", "tokenizer_class": "Wav2Vec2CTCTokenizer"}
vocab.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"ậ": 0, "k": 1, "ả": 2, "p": 3, "ị": 4, "ễ": 5, "v": 7, "á": 8, "ỗ": 9, "ô": 10, "ă": 11, "ò": 12, "ở": 13, "o": 14, "ẩ": 15, "t": 16, "ơ": 17, "ỳ": 18, "ứ": 19, "ý": 20, "ỉ": 21, "ệ": 22, "ổ": 23, "q": 24, "ã": 25, "ầ": 26, "ũ": 27, "ờ": 28, "d": 29, "l": 30, "ù": 31, "ẹ": 32, "ạ": 33, "ĩ": 34, "g": 35, "r": 36, "è": 37, "ọ": 38, "ê": 39, "ợ": 40, "ụ": 41, "ể": 42, "ó": 43, "s": 44, "ỵ": 45, "ộ": 46, "ì": 47, "h": 48, "ằ": 49, "ú": 50, "ẵ": 51, "ữ": 52, "õ": 53, "ẳ": 54, "ố": 55, "m": 56, "ế": 57, "ề": 58, "ẫ": 59, "ự": 60, "đ": 61, "í": 62, "ớ": 63, "e": 64, "a": 65, "ỷ": 66, "ủ": 67, "ắ": 68, "ỹ": 69, "ồ": 70, "x": 71, "ỏ": 72, "ử": 73, "u": 74, "ư": 75, "ỡ": 76, "n": 77, "ẽ": 78, "à": 79, "y": 80, "b": 81, "é": 82, "c": 83, "i": 84, "ừ": 85, "â": 86, "4": 87, "ẻ": 88, "ấ": 89, "ặ": 90, "|": 6, "[UNK]": 91, "[PAD]": 92}