shahruk10 committed
Commit 62b7482
1 Parent(s): fed092a

feat: add best model iteration from second run and language models
.gitattributes CHANGED
@@ -30,3 +30,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ *.arpa filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,3 +1,66 @@
  ---
+ language:
+ - bn
+ language_bcp47:
+ - bn-BD
+ tags:
+ - automatic-speech-recognition
+ - bn
+ - common_voice_9_0
+ - openslr_SLR53
+ datasets:
+ - common_voice_bn
+ - openSLR53
+ - multilingual_librispeech
+ metrics:
+ - wer
+ - cer
+ model-index:
+ - name: shahruk10/wav2vec2-xls-r-300m-bengali-commonvoice
+   results:
+   - task:
+       type: automatic-speech-recognition
+       name: Speech Recognition
+     dataset:
+       type: common_voice_9_0
+       name: Common Voice (Bengali)
+       args: common_voice_bn
+     metrics:
+     - type: wer
+       value: 0.01793038418929547
+       name: Validation WER with 5-gram LM
+     - type: cer
+       value: 0.08078964599673999
+       name: Validation CER with 5-gram LM
  license: apache-2.0
  ---
+
+ # Wav2Vec2-XLS-R-300M-Bengali-CommonVoice
+
+ - This model is a fine-tuned version of [arijitx/wav2vec2-xls-r-300m-bengali](https://huggingface.co/arijitx/wav2vec2-xls-r-300m-bengali) on the Common Voice 9.0 Bengali dataset. In total, the model was trained on ~300 hours of Bengali (Bangladesh accent) 16 kHz audio data.
+
+ - The training and validation partitions used were provided by the organizers of the [BUET CSE Fest 2022 DL Sprint Competition on Kaggle](https://www.kaggle.com/competitions/dlsprint).
+
+ - The model placed first on both the public and private leaderboards.
+
+ - A 5-gram language model generated from the training split was used with the model during decoding (a minimal usage sketch follows this diff).
+
+ ## Metrics
+
+ - The model was evaluated using Word Error Rate (WER) and Character Error Rate (CER) on the validation set. At the time, the test-set labels had not been made available by the organizers of the Kaggle competition that provided the data splits.
+
+
+ | Model | Split | CER | WER |
+ |:-------:|:-----:|:-----:|:------:|
+ | With 5-gram LM | Validation | 0.08079 | 0.01793 |
+
+
+ ## Training
+
+ - The training notebook for this model can be found on Kaggle [here](https://www.kaggle.com/code/shahruk10/training-notebook-wav2vec2).
+
+ - The inference notebook for this model can be found on Kaggle [here](https://www.kaggle.com/code/shahruk10/inference-notebook-wav2vec2).
+
+ - The model was first trained for 15 epochs on the training split (with on-the-fly augmentation). Dropouts were enabled and a cosine-decay learning rate schedule starting from 3e-5 was used.
+
+ - The best iteration from the first run was further fine-tuned for 5 epochs at a constant learning rate of 1e-7 with dropouts disabled.
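
The README above references decoding with the shipped 5-gram LM. The following is a minimal, hedged usage sketch (not part of this commit): it uses the repo id from the model-index metadata and assumes a hypothetical local 16 kHz mono file `sample_bn.wav`.

```python
# Hedged usage sketch: load the checkpoint plus its LM-backed processor and transcribe one file.
import torch
import soundfile as sf
from transformers import Wav2Vec2ForCTC, Wav2Vec2ProcessorWithLM

MODEL_ID = "shahruk10/wav2vec2-xls-r-300m-bengali-commonvoice"

# The processor bundles the tokenizer, the feature extractor, and the pyctcdecode
# beam-search decoder built from the language_model/ files added in this commit.
processor = Wav2Vec2ProcessorWithLM.from_pretrained(MODEL_ID)
model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID)
model.eval()

speech, sr = sf.read("sample_bn.wav")  # hypothetical input; must already be 16 kHz mono
inputs = processor(speech, sampling_rate=sr, return_tensors="pt")

with torch.no_grad():
    logits = model(**inputs).logits

# batch_decode on Wav2Vec2ProcessorWithLM runs beam search with the 5-gram ARPA LM.
print(processor.batch_decode(logits.numpy()).text[0])
```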
alphabet.json ADDED
@@ -0,0 +1 @@
+ {"labels": [" ", "_", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "r", "s", "t", "u", "v", "w", "x", "y", "z", "\u0093", "\u0094", "\u0153", "\u0964", "\u0981", "\u0982", "\u0983", "\u0985", "\u0986", "\u0987", "\u0988", "\u0989", "\u098a", "\u098b", "\u098f", "\u0990", "\u0993", "\u0994", "\u0995", "\u0996", "\u0997", "\u0998", "\u0999", "\u099a", "\u099b", "\u099c", "\u099d", "\u099e", "\u099f", "\u09a0", "\u09a1", "\u09a2", "\u09a3", "\u09a4", "\u09a5", "\u09a6", "\u09a7", "\u09a8", "\u09aa", "\u09ab", "\u09ac", "\u09ad", "\u09ae", "\u09af", "\u09b0", "\u09b2", "\u09b6", "\u09b7", "\u09b8", "\u09b9", "\u09bc", "\u09be", "\u09bf", "\u09c0", "\u09c1", "\u09c2", "\u09c3", "\u09c7", "\u09c8", "\u09cb", "\u09cc", "\u09cd", "\u09ce", "\u09d7", "\u09dc", "\u09dd", "\u09df", "\u09e6", "\u09e7", "\u09e8", "\u09e9", "\u09ea", "\u09eb", "\u09ec", "\u09ed", "\u09ee", "\u09ef", "\u09f0", "\u200c", "\u200d", "\u200e", "\u2047", "", "<s>", "</s>"], "is_bpe": false}
config.json ADDED
@@ -0,0 +1,106 @@
+ {
+ "activation_dropout": 0.1,
+ "adapter_kernel_size": 3,
+ "adapter_stride": 2,
+ "add_adapter": false,
+ "apply_spec_augment": true,
+ "architectures": [
+ "Wav2Vec2ForCTC"
+ ],
+ "attention_dropout": 0,
+ "bos_token_id": 1,
+ "classifier_proj_size": 256,
+ "codevector_dim": 768,
+ "contrastive_logits_temperature": 0.1,
+ "conv_bias": true,
+ "conv_dim": [
+ 512,
+ 512,
+ 512,
+ 512,
+ 512,
+ 512,
+ 512
+ ],
+ "conv_kernel": [
+ 10,
+ 3,
+ 3,
+ 3,
+ 3,
+ 2,
+ 2
+ ],
+ "conv_stride": [
+ 5,
+ 2,
+ 2,
+ 2,
+ 2,
+ 2,
+ 2
+ ],
+ "ctc_loss_reduction": "mean",
+ "ctc_zero_infinity": false,
+ "diversity_loss_weight": 0.1,
+ "do_stable_layer_norm": true,
+ "eos_token_id": 2,
+ "feat_extract_activation": "gelu",
+ "feat_extract_dropout": 0.0,
+ "feat_extract_norm": "layer",
+ "feat_proj_dropout": 0,
+ "feat_quantizer_dropout": 0.0,
+ "final_dropout": 0.0,
+ "hidden_act": "gelu",
+ "hidden_dropout": 0,
+ "hidden_size": 1024,
+ "initializer_range": 0.02,
+ "intermediate_size": 4096,
+ "layer_norm_eps": 1e-05,
+ "layerdrop": 0,
+ "mask_feature_length": 64,
+ "mask_feature_min_masks": 0,
+ "mask_feature_prob": 0.25,
+ "mask_time_length": 10,
+ "mask_time_min_masks": 2,
+ "mask_time_prob": 0,
+ "model_type": "wav2vec2",
+ "num_adapter_layers": 3,
+ "num_attention_heads": 16,
+ "num_codevector_groups": 2,
+ "num_codevectors_per_group": 320,
+ "num_conv_pos_embedding_groups": 16,
+ "num_conv_pos_embeddings": 128,
+ "num_feat_extract_layers": 7,
+ "num_hidden_layers": 24,
+ "num_negatives": 100,
+ "output_hidden_size": 1024,
+ "pad_token_id": 109,
+ "proj_codevector_dim": 768,
+ "tdnn_dilation": [
+ 1,
+ 2,
+ 3,
+ 1,
+ 1
+ ],
+ "tdnn_dim": [
+ 512,
+ 512,
+ 512,
+ 512,
+ 1500
+ ],
+ "tdnn_kernel": [
+ 5,
+ 3,
+ 3,
+ 1,
+ 1
+ ],
+ "torch_dtype": "float32",
+ "transformers_version": "4.20.1",
+ "use_weighted_layer_sum": false,
+ "vocab_size": 112,
+ "xvector_output_dim": 512
+ }
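
For orientation, the config above describes a 24-layer XLS-R encoder with a 112-entry CTC vocabulary. The sketch below is illustrative only (not added by the commit) and assumes it is run from a checkout of the repository root:

```python
# Sketch: load config.json from the repository root and sanity-check a few fields.
from transformers import Wav2Vec2Config, Wav2Vec2ForCTC

config = Wav2Vec2Config.from_pretrained(".")  # reads ./config.json

print(config.num_hidden_layers)  # 24 transformer layers
print(config.hidden_size)        # 1024
print(config.vocab_size)         # 112, matching vocab.json below
print(config.pad_token_id)       # 109 -> "[PAD]" serves as the CTC blank

# Building the model from the config alone gives random weights; the trained
# weights live in pytorch_model.bin and load via Wav2Vec2ForCTC.from_pretrained(".").
model = Wav2Vec2ForCTC(config)
```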
language_model/attrs.json ADDED
@@ -0,0 +1 @@
+ {"alpha": 0.6, "beta": 0.25, "unk_score_offset": -10.0, "score_boundary": true}
language_model/commonvoice-bn.5.arpa ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c6c7a0d8e40c3cc0ef49c8bcd7ec63d3c5b05a6b6b124f801b14c5634d95424a
+ size 301084756
language_model/unigrams.txt ADDED
The diff for this file is too large to render.
preprocessor_config.json ADDED
@@ -0,0 +1,10 @@
+ {
+ "do_normalize": true,
+ "feature_extractor_type": "Wav2Vec2FeatureExtractor",
+ "feature_size": 1,
+ "padding_side": "right",
+ "padding_value": 0.0,
+ "processor_class": "Wav2Vec2ProcessorWithLM",
+ "return_attention_mask": true,
+ "sampling_rate": 16000
+ }
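
A small illustrative sketch (assumed, not part of the commit) of what these settings imply for inputs: audio must be mono 16 kHz, and the feature extractor normalizes it before the model sees it. The input file name is hypothetical.

```python
# Sketch: prepare audio according to preprocessor_config.json.
import torchaudio
from transformers import Wav2Vec2FeatureExtractor

feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(".")  # reads ./preprocessor_config.json

waveform, sr = torchaudio.load("sample_bn.wav")   # hypothetical input file
if sr != feature_extractor.sampling_rate:          # resample to 16 kHz if needed
    waveform = torchaudio.functional.resample(waveform, sr, feature_extractor.sampling_rate)

inputs = feature_extractor(
    waveform.squeeze(0).numpy(),
    sampling_rate=feature_extractor.sampling_rate,
    return_tensors="pt",
)
print({k: v.shape for k, v in inputs.items()})
```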
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:60a0e2307c2524b1facd5f4a9e6ad585b656246da8186f35948d92d2c377ffd3
+ size 1262357937
special_tokens_map.json ADDED
@@ -0,0 +1,6 @@
+ {
+ "bos_token": "<s>",
+ "eos_token": "</s>",
+ "pad_token": "[PAD]",
+ "unk_token": "[UNK]"
+ }
tokenizer_config.json ADDED
@@ -0,0 +1,13 @@
+ {
+ "bos_token": "<s>",
+ "do_lower_case": false,
+ "eos_token": "</s>",
+ "name_or_path": "./",
+ "pad_token": "[PAD]",
+ "processor_class": "Wav2Vec2ProcessorWithLM",
+ "replace_word_delimiter_char": " ",
+ "special_tokens_map_file": null,
+ "tokenizer_class": "Wav2Vec2CTCTokenizer",
+ "unk_token": "[UNK]",
+ "word_delimiter_token": "|"
+ }
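
To illustrate what the CTC tokenizer settings mean in practice (a hedged sketch; the example ids are hypothetical and taken from vocab.json below): repeated ids are collapsed, "[PAD]" acts as the CTC blank, and the "|" word delimiter is rendered as a space.

```python
# Sketch: decode a toy CTC id sequence with the settings from tokenizer_config.json.
from transformers import Wav2Vec2CTCTokenizer

tokenizer = Wav2Vec2CTCTokenizer(
    "vocab.json",               # vocabulary added in this commit
    unk_token="[UNK]",
    pad_token="[PAD]",
    word_delimiter_token="|",
)

# 45 -> "ক", 109 -> "[PAD]" (blank), 0 -> "|" (word boundary), 60 -> "ত"
ids = [45, 45, 109, 0, 109, 60, 60]
print(tokenizer.decode(ids))    # "ক ত": repeats collapsed, blanks dropped, "|" -> space
```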
vocab.json ADDED
@@ -0,0 +1,114 @@
+ {
+ "</s>": 111,
+ "<s>": 110,
+ "[PAD]": 109,
+ "[UNK]": 108,
+ "_": 1,
+ "a": 2,
+ "b": 3,
+ "c": 4,
+ "d": 5,
+ "e": 6,
+ "f": 7,
+ "g": 8,
+ "h": 9,
+ "i": 10,
+ "j": 11,
+ "k": 12,
+ "l": 13,
+ "m": 14,
+ "n": 15,
+ "o": 16,
+ "p": 17,
+ "r": 18,
+ "s": 19,
+ "t": 20,
+ "u": 21,
+ "v": 22,
+ "w": 23,
+ "x": 24,
+ "y": 25,
+ "z": 26,
+ "|": 0,
+ "“": 27,
+ "”": 28,
+ "œ": 29,
+ "।": 30,
+ "ঁ": 31,
+ "ং": 32,
+ "ঃ": 33,
+ "অ": 34,
+ "আ": 35,
+ "ই": 36,
+ "ঈ": 37,
+ "উ": 38,
+ "ঊ": 39,
+ "ঋ": 40,
+ "এ": 41,
+ "ঐ": 42,
+ "ও": 43,
+ "ঔ": 44,
+ "ক": 45,
+ "খ": 46,
+ "গ": 47,
+ "ঘ": 48,
+ "ঙ": 49,
+ "চ": 50,
+ "ছ": 51,
+ "জ": 52,
+ "ঝ": 53,
+ "ঞ": 54,
+ "ট": 55,
+ "ঠ": 56,
+ "ড": 57,
+ "ঢ": 58,
+ "ণ": 59,
+ "ত": 60,
+ "থ": 61,
+ "দ": 62,
+ "ধ": 63,
+ "ন": 64,
+ "প": 65,
+ "ফ": 66,
+ "ব": 67,
+ "ভ": 68,
+ "ম": 69,
+ "য": 70,
+ "র": 71,
+ "ল": 72,
+ "শ": 73,
+ "ষ": 74,
+ "স": 75,
+ "হ": 76,
+ "়": 77,
+ "া": 78,
+ "ি": 79,
+ "ী": 80,
+ "ু": 81,
+ "ূ": 82,
+ "ৃ": 83,
+ "ে": 84,
+ "ৈ": 85,
+ "ো": 86,
+ "ৌ": 87,
+ "্": 88,
+ "ৎ": 89,
+ "ৗ": 90,
+ "ড়": 91,
+ "ঢ়": 92,
+ "য়": 93,
+ "০": 94,
+ "১": 95,
+ "২": 96,
+ "৩": 97,
+ "৪": 98,
+ "৫": 99,
+ "৬": 100,
+ "৭": 101,
+ "৮": 102,
+ "৯": 103,
+ "ৰ": 104,
+ "‌": 105,
+ "‍": 106,
+ "‎": 107
+ }