StephennFernandes committed on
Commit
b77c3b2
1 Parent(s): 5914caf

add tokenizer

Browse files
added_tokens.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"<s>": 101, "</s>": 102}
special_tokens_map.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"bos_token": "<s>", "eos_token": "</s>", "unk_token": "[UNK]", "pad_token": "[PAD]", "additional_special_tokens": [{"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}]}
tokenizer_config.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"unk_token": "[UNK]", "bos_token": "<s>", "eos_token": "</s>", "pad_token": "[PAD]", "do_lower_case": false, "word_delimiter_token": "|", "special_tokens_map_file": "XLS-R-assamese/special_tokens_map.json", "tokenizer_file": null, "name_or_path": "XLS-R-assamese-LM", "processor_class": "Wav2Vec2ProcessorWithLM", "tokenizer_class": "Wav2Vec2CTCTokenizer"}
vocab.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"ঙ": 0, "্": 1, "৷": 2, "ৌ": 3, "(": 4, "ৰ": 5, "ম": 6, "প": 7, "ু": 8, "ৎ": 9, "+": 10, "ঢ়": 11, "ূ": 12, "ঠ": 13, "১": 14, "ৈ": 15, "৯": 16, "য়": 17, "'": 18, "অ": 19, "থ": 20, "ঊ": 21, "‍": 22, "ঝ": 23, "ঐ": 24, "ি": 25, "ট": 26, "ণ": 27, "ঞ": 28, "গ": 29, "৪": 30, "ঁ": 31, "‌": 32, "আ": 33, "এ": 34, "৩": 35, "ষ": 36, "২": 37, "ভ": 38, "শ": 39, "ঢ": 40, "d": 41, "ছ": 42, "ো": 43, "৭": 44, "৬": 45, "’": 46, "9": 47, "ী": 48, "ঘ": 49, "°": 50, "ব": 51, "ড": 52, "ও": 53, "ঃ": 54, "ধ": 55, "৫": 56, "*": 57, "।": 58, "দ": 59, "ন": 60, "০": 61, "হ": 62, "স": 63, "e": 64, "র": 65, "ঔ": 66, "ে": 67, "উ": 68, "p": 69, "s": 70, "8": 71, "n": 72, "w": 73, "`": 74, "ড়": 75, "ৃ": 76, "ঋ": 77, "়": 78, "ৱ": 79, "৺": 80, "জ": 81, "b": 82, "ই": 83, "ফ": 84, "ল": 85, "ত": 86, "f": 87, "চ": 88, "য": 89, "ং": 90, "ক": 91, ")": 92, "া": 94, "৮": 95, "ঈ": 96, "খ": 97, "1": 98, "|": 93, "[UNK]": 99, "[PAD]": 100}