thocheat committed on
Commit
d5b6614
1 Parent(s): 18732ce

add tokenizer

Browse files
Files changed (3) hide show
  1. special_tokens_map.json +1 -0
  2. tokenizer_config.json +1 -0
  3. vocab.json +1 -0
special_tokens_map.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"bos_token": "<s>", "eos_token": "</s>", "unk_token": "[UNK]", "pad_token": "[PAD]"}
tokenizer_config.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"unk_token": "[UNK]", "bos_token": "<s>", "eos_token": "</s>", "pad_token": "[PAD]", "do_lower_case": false, "word_delimiter_token": "|", "replace_word_delimiter_char": " ", "tokenizer_class": "Wav2Vec2CTCTokenizer"}
vocab.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"Ỏ": 0, "Ẵ": 1, "Ẳ": 2, "A": 3, "Ể": 4, "Ó": 5, "Ổ": 6, "O": 7, "Ỹ": 8, "Ò": 9, "Ẫ": 10, "Ỉ": 11, "Â": 12, "Ế": 13, "Đ": 14, "4": 15, "B": 16, "X": 17, "Ơ": 18, "Ả": 19, "Ố": 20, "Ẽ": 21, "Ỳ": 22, "Ủ": 23, "Ặ": 24, "Ừ": 25, "Ô": 26, "Ụ": 27, "E": 28, "Ư": 29, "U": 30, "Ã": 31, "K": 32, "Í": 33, "Y": 34, "P": 35, "V": 36, "Ă": 37, "G": 38, "É": 39, "L": 40, "Ỵ": 41, "Ắ": 42, "Ệ": 43, "Ẹ": 44, "Ú": 45, "Ợ": 46, "Ớ": 47, "T": 48, "Ồ": 49, "S": 50, "Ự": 51, "Ạ": 52, "Ý": 53, "Ậ": 54, "Q": 55, "Ì": 56, "Ữ": 57, ":": 58, "Ỷ": 59, "Ỗ": 60, "Ẻ": 61, "Õ": 62, "R": 63, "Ề": 64, "Ễ": 65, "Ù": 66, "Ẩ": 67, "Ê": 68, "Ộ": 69, "Ị": 70, "Ầ": 72, "Ỡ": 73, "Ũ": 74, "Ấ": 75, "Ử": 76, "Ờ": 77, "I": 78, "Ở": 79, "Ĩ": 80, "È": 81, "M": 82, "À": 83, "Ứ": 84, "C": 85, "Á": 86, "Ọ": 87, "Ằ": 88, "D": 89, "H": 90, "N": 91, "|": 71, "[UNK]": 92, "[PAD]": 93}