feat: add best model iteration from second run and language models
Browse files- .gitattributes +1 -0
- README.md +63 -0
- alphabet.json +1 -0
- config.json +106 -0
- language_model/attrs.json +1 -0
- language_model/commonvoice-bn.5.arpa +3 -0
- language_model/unigrams.txt +0 -0
- preprocessor_config.json +10 -0
- pytorch_model.bin +3 -0
- special_tokens_map.json +6 -0
- tokenizer_config.json +13 -0
- vocab.json +114 -0
.gitattributes
CHANGED
@@ -30,3 +30,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
30 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
31 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
32 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
30 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
31 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
32 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
33 |
+
*.arpa filter=lfs diff=lfs merge=lfs -text
|
README.md
CHANGED
@@ -1,3 +1,66 @@
|
|
1 |
---
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2 |
license: apache-2.0
|
3 |
---
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
---
|
2 |
+
language:
|
3 |
+
- bn
|
4 |
+
language_bcp47:
|
5 |
+
- bn-BD
|
6 |
+
tags:
|
7 |
+
- automatic-speech-recognition
|
8 |
+
- bn
|
9 |
+
- common_voice_9_0
|
10 |
+
- openslr_SLR53
|
11 |
+
datasets:
|
12 |
+
- common_voice_bn
|
13 |
+
- openSLR53
|
14 |
+
- multilingual_librispeech
|
15 |
+
metrics:
|
16 |
+
- wer
|
17 |
+
- cer
|
18 |
+
model-index:
|
19 |
+
- name: shahruk10/wav2vec2-xls-r-300m-bengali-commonvoice
|
20 |
+
results:
|
21 |
+
- task:
|
22 |
+
type: automatic-speech-recognition
|
23 |
+
name: Speech Recognition
|
24 |
+
dataset:
|
25 |
+
type: common_voice_9_0
|
26 |
+
name: Common Voice (Bengali)
|
27 |
+
args: common_voice_bn
|
28 |
+
metrics:
|
29 |
+
- type: wer
|
30 |
+
value: 0.01793038418929547
|
31 |
+
name: Validation WER with 5-gram LM
|
32 |
+
- type: cer
|
33 |
+
value: 0.08078964599673999
|
34 |
+
name: Validation CER with 5-gram LM
|
35 |
license: apache-2.0
|
36 |
---
|
37 |
+
|
38 |
+
# Wav2Vec2-XLS-R-300M-Bengali-CommonVoice
|
39 |
+
|
40 |
+
- This model is a fine-tuned version of [arijitx/wav2vec2-xls-r-300m-bengali](https://huggingface.co/arijitx/wav2vec2-xls-r-300m-bengali) on the Common Voice 9.0 Bengali dataset. In total, the model was trained on ~300 hours of Bengali (Bangladesh accent) 16 kHz audio data.
|
41 |
+
|
42 |
+
- The training and validation partitions used were provided by the organizers of the [BUET CSE Fest 2022 DL Sprint Competition on Kaggle](https://www.kaggle.com/competitions/dlsprint).
|
43 |
+
|
44 |
+
- The model placed first on both the public and private leaderboards.
|
45 |
+
|
46 |
+
- A 5-gram language model generated from the training split was used with the model.
|
47 |
+
|
48 |
+
## Metrics
|
49 |
+
|
50 |
+
- The model was evaluated using Word Error Rate (WER) and Character Error Rate (CER) on the validation set. At the time, the test set labels were not made available by the organizers of the Kaggle competition, which provided the data splits for training.
|
51 |
+
|
52 |
+
|
53 |
+
| Model | Split | CER | WER |
|
54 |
+
|:-------:|:-----:|:-----:|:------:|
|
55 |
+
| With 5-gram LM | Validation | 0.08079 | 0.017930 |
|
56 |
+
|
57 |
+
|
58 |
+
## Training
|
59 |
+
|
60 |
+
- The training notebook for this model can be found on Kaggle [here](https://www.kaggle.com/code/shahruk10/training-notebook-wav2vec2).
|
61 |
+
|
62 |
+
- The inference notebook for this model can be found on Kaggle [here](https://www.kaggle.com/code/shahruk10/inference-notebook-wav2vec2).
|
63 |
+
|
64 |
+
- The model was first trained for 15 epochs on the training split (with on-the-fly augmentation). Dropouts were enabled and a cosine decay learning rate schedule starting from 3e-5 was used.
|
65 |
+
|
66 |
+
- The best iteration from the first run was further fine-tuned for 5 epochs at a constant learning rate of 1e-7 with dropouts disabled.
|
alphabet.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"labels": [" ", "_", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "r", "s", "t", "u", "v", "w", "x", "y", "z", "\u0093", "\u0094", "\u0153", "\u0964", "\u0981", "\u0982", "\u0983", "\u0985", "\u0986", "\u0987", "\u0988", "\u0989", "\u098a", "\u098b", "\u098f", "\u0990", "\u0993", "\u0994", "\u0995", "\u0996", "\u0997", "\u0998", "\u0999", "\u099a", "\u099b", "\u099c", "\u099d", "\u099e", "\u099f", "\u09a0", "\u09a1", "\u09a2", "\u09a3", "\u09a4", "\u09a5", "\u09a6", "\u09a7", "\u09a8", "\u09aa", "\u09ab", "\u09ac", "\u09ad", "\u09ae", "\u09af", "\u09b0", "\u09b2", "\u09b6", "\u09b7", "\u09b8", "\u09b9", "\u09bc", "\u09be", "\u09bf", "\u09c0", "\u09c1", "\u09c2", "\u09c3", "\u09c7", "\u09c8", "\u09cb", "\u09cc", "\u09cd", "\u09ce", "\u09d7", "\u09dc", "\u09dd", "\u09df", "\u09e6", "\u09e7", "\u09e8", "\u09e9", "\u09ea", "\u09eb", "\u09ec", "\u09ed", "\u09ee", "\u09ef", "\u09f0", "\u200c", "\u200d", "\u200e", "\u2047", "", "<s>", "</s>"], "is_bpe": false}
|
config.json
ADDED
@@ -0,0 +1,106 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"activation_dropout": 0.1,
|
3 |
+
"adapter_kernel_size": 3,
|
4 |
+
"adapter_stride": 2,
|
5 |
+
"add_adapter": false,
|
6 |
+
"apply_spec_augment": true,
|
7 |
+
"architectures": [
|
8 |
+
"Wav2Vec2ForCTC"
|
9 |
+
],
|
10 |
+
"attention_dropout": 0,
|
11 |
+
"bos_token_id": 1,
|
12 |
+
"classifier_proj_size": 256,
|
13 |
+
"codevector_dim": 768,
|
14 |
+
"contrastive_logits_temperature": 0.1,
|
15 |
+
"conv_bias": true,
|
16 |
+
"conv_dim": [
|
17 |
+
512,
|
18 |
+
512,
|
19 |
+
512,
|
20 |
+
512,
|
21 |
+
512,
|
22 |
+
512,
|
23 |
+
512
|
24 |
+
],
|
25 |
+
"conv_kernel": [
|
26 |
+
10,
|
27 |
+
3,
|
28 |
+
3,
|
29 |
+
3,
|
30 |
+
3,
|
31 |
+
2,
|
32 |
+
2
|
33 |
+
],
|
34 |
+
"conv_stride": [
|
35 |
+
5,
|
36 |
+
2,
|
37 |
+
2,
|
38 |
+
2,
|
39 |
+
2,
|
40 |
+
2,
|
41 |
+
2
|
42 |
+
],
|
43 |
+
"ctc_loss_reduction": "mean",
|
44 |
+
"ctc_zero_infinity": false,
|
45 |
+
"diversity_loss_weight": 0.1,
|
46 |
+
"do_stable_layer_norm": true,
|
47 |
+
"eos_token_id": 2,
|
48 |
+
"feat_extract_activation": "gelu",
|
49 |
+
"feat_extract_dropout": 0.0,
|
50 |
+
"feat_extract_norm": "layer",
|
51 |
+
"feat_proj_dropout": 0,
|
52 |
+
"feat_quantizer_dropout": 0.0,
|
53 |
+
"final_dropout": 0.0,
|
54 |
+
"hidden_act": "gelu",
|
55 |
+
"hidden_dropout": 0,
|
56 |
+
"hidden_size": 1024,
|
57 |
+
"initializer_range": 0.02,
|
58 |
+
"intermediate_size": 4096,
|
59 |
+
"layer_norm_eps": 1e-05,
|
60 |
+
"layerdrop": 0,
|
61 |
+
"mask_feature_length": 64,
|
62 |
+
"mask_feature_min_masks": 0,
|
63 |
+
"mask_feature_prob": 0.25,
|
64 |
+
"mask_time_length": 10,
|
65 |
+
"mask_time_min_masks": 2,
|
66 |
+
"mask_time_prob": 0,
|
67 |
+
"model_type": "wav2vec2",
|
68 |
+
"num_adapter_layers": 3,
|
69 |
+
"num_attention_heads": 16,
|
70 |
+
"num_codevector_groups": 2,
|
71 |
+
"num_codevectors_per_group": 320,
|
72 |
+
"num_conv_pos_embedding_groups": 16,
|
73 |
+
"num_conv_pos_embeddings": 128,
|
74 |
+
"num_feat_extract_layers": 7,
|
75 |
+
"num_hidden_layers": 24,
|
76 |
+
"num_negatives": 100,
|
77 |
+
"output_hidden_size": 1024,
|
78 |
+
"pad_token_id": 109,
|
79 |
+
"proj_codevector_dim": 768,
|
80 |
+
"tdnn_dilation": [
|
81 |
+
1,
|
82 |
+
2,
|
83 |
+
3,
|
84 |
+
1,
|
85 |
+
1
|
86 |
+
],
|
87 |
+
"tdnn_dim": [
|
88 |
+
512,
|
89 |
+
512,
|
90 |
+
512,
|
91 |
+
512,
|
92 |
+
1500
|
93 |
+
],
|
94 |
+
"tdnn_kernel": [
|
95 |
+
5,
|
96 |
+
3,
|
97 |
+
3,
|
98 |
+
1,
|
99 |
+
1
|
100 |
+
],
|
101 |
+
"torch_dtype": "float32",
|
102 |
+
"transformers_version": "4.20.1",
|
103 |
+
"use_weighted_layer_sum": false,
|
104 |
+
"vocab_size": 112,
|
105 |
+
"xvector_output_dim": 512
|
106 |
+
}
|
language_model/attrs.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"alpha": 0.6, "beta": 0.25, "unk_score_offset": -10.0, "score_boundary": true}
|
language_model/commonvoice-bn.5.arpa
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:c6c7a0d8e40c3cc0ef49c8bcd7ec63d3c5b05a6b6b124f801b14c5634d95424a
|
3 |
+
size 301084756
|
language_model/unigrams.txt
ADDED
The diff for this file is too large to render.
See raw diff
|
|
preprocessor_config.json
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"do_normalize": true,
|
3 |
+
"feature_extractor_type": "Wav2Vec2FeatureExtractor",
|
4 |
+
"feature_size": 1,
|
5 |
+
"padding_side": "right",
|
6 |
+
"padding_value": 0.0,
|
7 |
+
"processor_class": "Wav2Vec2ProcessorWithLM",
|
8 |
+
"return_attention_mask": true,
|
9 |
+
"sampling_rate": 16000
|
10 |
+
}
|
pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:60a0e2307c2524b1facd5f4a9e6ad585b656246da8186f35948d92d2c377ffd3
|
3 |
+
size 1262357937
|
special_tokens_map.json
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"bos_token": "<s>",
|
3 |
+
"eos_token": "</s>",
|
4 |
+
"pad_token": "[PAD]",
|
5 |
+
"unk_token": "[UNK]"
|
6 |
+
}
|
tokenizer_config.json
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"bos_token": "<s>",
|
3 |
+
"do_lower_case": false,
|
4 |
+
"eos_token": "</s>",
|
5 |
+
"name_or_path": "./",
|
6 |
+
"pad_token": "[PAD]",
|
7 |
+
"processor_class": "Wav2Vec2ProcessorWithLM",
|
8 |
+
"replace_word_delimiter_char": " ",
|
9 |
+
"special_tokens_map_file": null,
|
10 |
+
"tokenizer_class": "Wav2Vec2CTCTokenizer",
|
11 |
+
"unk_token": "[UNK]",
|
12 |
+
"word_delimiter_token": "|"
|
13 |
+
}
|
vocab.json
ADDED
@@ -0,0 +1,114 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"</s>": 111,
|
3 |
+
"<s>": 110,
|
4 |
+
"[PAD]": 109,
|
5 |
+
"[UNK]": 108,
|
6 |
+
"_": 1,
|
7 |
+
"a": 2,
|
8 |
+
"b": 3,
|
9 |
+
"c": 4,
|
10 |
+
"d": 5,
|
11 |
+
"e": 6,
|
12 |
+
"f": 7,
|
13 |
+
"g": 8,
|
14 |
+
"h": 9,
|
15 |
+
"i": 10,
|
16 |
+
"j": 11,
|
17 |
+
"k": 12,
|
18 |
+
"l": 13,
|
19 |
+
"m": 14,
|
20 |
+
"n": 15,
|
21 |
+
"o": 16,
|
22 |
+
"p": 17,
|
23 |
+
"r": 18,
|
24 |
+
"s": 19,
|
25 |
+
"t": 20,
|
26 |
+
"u": 21,
|
27 |
+
"v": 22,
|
28 |
+
"w": 23,
|
29 |
+
"x": 24,
|
30 |
+
"y": 25,
|
31 |
+
"z": 26,
|
32 |
+
"|": 0,
|
33 |
+
"": 27,
|
34 |
+
"": 28,
|
35 |
+
"œ": 29,
|
36 |
+
"।": 30,
|
37 |
+
"ঁ": 31,
|
38 |
+
"ং": 32,
|
39 |
+
"ঃ": 33,
|
40 |
+
"অ": 34,
|
41 |
+
"আ": 35,
|
42 |
+
"ই": 36,
|
43 |
+
"ঈ": 37,
|
44 |
+
"উ": 38,
|
45 |
+
"ঊ": 39,
|
46 |
+
"ঋ": 40,
|
47 |
+
"এ": 41,
|
48 |
+
"ঐ": 42,
|
49 |
+
"ও": 43,
|
50 |
+
"ঔ": 44,
|
51 |
+
"ক": 45,
|
52 |
+
"খ": 46,
|
53 |
+
"গ": 47,
|
54 |
+
"ঘ": 48,
|
55 |
+
"ঙ": 49,
|
56 |
+
"চ": 50,
|
57 |
+
"ছ": 51,
|
58 |
+
"জ": 52,
|
59 |
+
"ঝ": 53,
|
60 |
+
"ঞ": 54,
|
61 |
+
"ট": 55,
|
62 |
+
"ঠ": 56,
|
63 |
+
"ড": 57,
|
64 |
+
"ঢ": 58,
|
65 |
+
"ণ": 59,
|
66 |
+
"ত": 60,
|
67 |
+
"থ": 61,
|
68 |
+
"দ": 62,
|
69 |
+
"ধ": 63,
|
70 |
+
"ন": 64,
|
71 |
+
"প": 65,
|
72 |
+
"ফ": 66,
|
73 |
+
"ব": 67,
|
74 |
+
"ভ": 68,
|
75 |
+
"ম": 69,
|
76 |
+
"য": 70,
|
77 |
+
"র": 71,
|
78 |
+
"ল": 72,
|
79 |
+
"শ": 73,
|
80 |
+
"ষ": 74,
|
81 |
+
"স": 75,
|
82 |
+
"হ": 76,
|
83 |
+
"়": 77,
|
84 |
+
"া": 78,
|
85 |
+
"ি": 79,
|
86 |
+
"ী": 80,
|
87 |
+
"ু": 81,
|
88 |
+
"ূ": 82,
|
89 |
+
"ৃ": 83,
|
90 |
+
"ে": 84,
|
91 |
+
"ৈ": 85,
|
92 |
+
"ো": 86,
|
93 |
+
"ৌ": 87,
|
94 |
+
"্": 88,
|
95 |
+
"ৎ": 89,
|
96 |
+
"ৗ": 90,
|
97 |
+
"ড়": 91,
|
98 |
+
"ঢ়": 92,
|
99 |
+
"য়": 93,
|
100 |
+
"০": 94,
|
101 |
+
"১": 95,
|
102 |
+
"২": 96,
|
103 |
+
"৩": 97,
|
104 |
+
"৪": 98,
|
105 |
+
"৫": 99,
|
106 |
+
"৬": 100,
|
107 |
+
"৭": 101,
|
108 |
+
"৮": 102,
|
109 |
+
"৯": 103,
|
110 |
+
"ৰ": 104,
|
111 |
+
"": 105,
|
112 |
+
"": 106,
|
113 |
+
"": 107
|
114 |
+
}
|