5roop commited on
Commit
a8941f4
1 Parent(s): 49c98fd

First commit

Browse files
README.md ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language: sr
3
+ datasets:
4
+ - juznevesti-sr
5
+ tags:
6
+ - audio
7
+ - automatic-speech-recognition
8
+ widget:
9
+ - example_title: Croatian example 1
10
+ src: https://huggingface.co/classla/wav2vec2-xls-r-parlaspeech-hr/raw/main/1800.m4a
11
+ - example_title: Croatian example 2
12
+ src: https://huggingface.co/classla/wav2vec2-xls-r-parlaspeech-hr/raw/main/00020578b.flac.wav
13
+ - example_title: Croatian example 3
14
+ src: https://huggingface.co/classla/wav2vec2-xls-r-parlaspeech-hr/raw/main/00020570a.flac.wav
15
+ ---
16
+
17
+ # wav2vec2-large-juznevesti
18
+
19
+ This model for Serbian ASR is based on the [facebook/wav2vec2-large-slavic-voxpopuli-v2 model](https://huggingface.co/facebook/wav2vec2-large-slavic-voxpopuli-v2) and was fine-tuned with 58 hours of audio and transcripts from [Južne vesti](https://www.juznevesti.com/), programme '15 minuta'.
20
+
21
+
22
+
23
+ ## Metrics
24
+
25
+ Evaluation is performed on the dev and test portions of the JuzneVesti dataset:
26
+
27
+ | | dev | test |
28
+ |:----|---------:|---------:|
29
+ | WER | 0.295206 | 0.290094 |
30
+ | CER | 0.140766 | 0.137642 |
31
+
32
+
33
+
34
+ ## Usage in `transformers`
35
+
36
+ Tested with `transformers==4.18.0`, `torch==1.11.0`, and `SoundFile==0.10.3.post1`.
37
+
38
+ ```python
39
+ from transformers import Wav2Vec2ProcessorWithLM, Wav2Vec2ForCTC
40
+ import soundfile as sf
41
+ import torch
42
+ import os
43
+ device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
44
+ # load model and tokenizer
45
+ processor = Wav2Vec2ProcessorWithLM.from_pretrained(
46
+ "classla/wav2vec2-large-slavic-parlaspeech-hr-lm")
47
+ model = Wav2Vec2ForCTC.from_pretrained("classla/wav2vec2-large-slavic-parlaspeech-hr-lm")
48
+ # download the example wav file:
49
+ os.system("wget https://huggingface.co/classla/wav2vec2-large-slavic-parlaspeech-hr-lm/raw/main/00020570a.flac.wav")
50
+ # read the wav file
51
+ speech, sample_rate = sf.read("00020570a.flac.wav")
52
+ input_values = processor(speech, sampling_rate=sample_rate, return_tensors="pt").input_values.to(device)
53
+ inputs = processor(speech, sampling_rate=sample_rate, return_tensors="pt")
54
+ with torch.no_grad():
55
+     logits = model(**inputs).logits
56
+ transcription = processor.batch_decode(logits.numpy()).text[0]
57
+
58
+ # remove the raw wav file
59
+ os.system("rm 00020570a.flac.wav")
60
+
61
+ transcription # 'velik broj poslovnih subjekata poslao je sa minusom velik dio'
62
+ ```
63
+
64
+
65
+
66
+ ## Training hyperparameters
67
+
68
+ In fine-tuning, the following arguments were used:
69
+
70
+ | arg | value |
71
+ |-------------------------------|-------|
72
+ | `per_device_train_batch_size` | 16 |
73
+ | `gradient_accumulation_steps` | 4 |
74
+ | `num_train_epochs` | 8 |
75
+ | `learning_rate` | 3e-4 |
76
+ | `warmup_steps` | 500 |
config.json ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "facebook/wav2vec2-xls-r-300m",
3
+ "activation_dropout": 0.0,
4
+ "adapter_kernel_size": 3,
5
+ "adapter_stride": 2,
6
+ "add_adapter": false,
7
+ "apply_spec_augment": true,
8
+ "architectures": [
9
+ "Wav2Vec2ForCTC"
10
+ ],
11
+ "attention_dropout": 0.0,
12
+ "bos_token_id": 1,
13
+ "classifier_proj_size": 256,
14
+ "codevector_dim": 768,
15
+ "contrastive_logits_temperature": 0.1,
16
+ "conv_bias": true,
17
+ "conv_dim": [
18
+ 512,
19
+ 512,
20
+ 512,
21
+ 512,
22
+ 512,
23
+ 512,
24
+ 512
25
+ ],
26
+ "conv_kernel": [
27
+ 10,
28
+ 3,
29
+ 3,
30
+ 3,
31
+ 3,
32
+ 2,
33
+ 2
34
+ ],
35
+ "conv_stride": [
36
+ 5,
37
+ 2,
38
+ 2,
39
+ 2,
40
+ 2,
41
+ 2,
42
+ 2
43
+ ],
44
+ "ctc_loss_reduction": "mean",
45
+ "ctc_zero_infinity": false,
46
+ "diversity_loss_weight": 0.1,
47
+ "do_stable_layer_norm": true,
48
+ "eos_token_id": 2,
49
+ "feat_extract_activation": "gelu",
50
+ "feat_extract_dropout": 0.0,
51
+ "feat_extract_norm": "layer",
52
+ "feat_proj_dropout": 0.0,
53
+ "feat_quantizer_dropout": 0.0,
54
+ "final_dropout": 0.0,
55
+ "gradient_checkpointing": false,
56
+ "hidden_act": "gelu",
57
+ "hidden_dropout": 0.0,
58
+ "hidden_size": 1024,
59
+ "initializer_range": 0.02,
60
+ "intermediate_size": 4096,
61
+ "layer_norm_eps": 1e-05,
62
+ "layerdrop": 0.0,
63
+ "mask_feature_length": 10,
64
+ "mask_feature_min_masks": 0,
65
+ "mask_feature_prob": 0.0,
66
+ "mask_time_length": 10,
67
+ "mask_time_min_masks": 2,
68
+ "mask_time_prob": 0.05,
69
+ "model_type": "wav2vec2",
70
+ "num_adapter_layers": 3,
71
+ "num_attention_heads": 16,
72
+ "num_codevector_groups": 2,
73
+ "num_codevectors_per_group": 320,
74
+ "num_conv_pos_embedding_groups": 16,
75
+ "num_conv_pos_embeddings": 128,
76
+ "num_feat_extract_layers": 7,
77
+ "num_hidden_layers": 24,
78
+ "num_negatives": 100,
79
+ "output_hidden_size": 1024,
80
+ "pad_token_id": 36,
81
+ "proj_codevector_dim": 768,
82
+ "tdnn_dilation": [
83
+ 1,
84
+ 2,
85
+ 3,
86
+ 1,
87
+ 1
88
+ ],
89
+ "tdnn_dim": [
90
+ 512,
91
+ 512,
92
+ 512,
93
+ 512,
94
+ 1500
95
+ ],
96
+ "tdnn_kernel": [
97
+ 5,
98
+ 3,
99
+ 3,
100
+ 1,
101
+ 1
102
+ ],
103
+ "torch_dtype": "float32",
104
+ "transformers_version": "4.19.2",
105
+ "use_weighted_layer_sum": false,
106
+ "vocab_size": 50,
107
+ "xvector_output_dim": 512
108
+ }
optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ec0bdc557b477b255b60b979cb7cebc52527b6d1ae19b47bbf47d575b1d156a3
3
+ size 2524166561
preprocessor_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_normalize": true,
3
+ "feature_extractor_type": "Wav2Vec2FeatureExtractor",
4
+ "feature_size": 1,
5
+ "padding_side": "right",
6
+ "padding_value": 0.0,
7
+ "return_attention_mask": true,
8
+ "sampling_rate": 16000
9
+ }
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:63181247e08f99317f361d16404e1fd9cce5ca286467749f0bbc4bd937ce1440
3
+ size 1262103729
rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:866eea7832e32cfe26dacb917016d5f3cfe06609ea47b04d86e102af89841be7
3
+ size 14503
scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:73c500e222819b1d5e70ed939b749462553fc9c25e6517cab848faaa304b73f5
3
+ size 559
scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bb65d63b6c71452bb9d6a3e8d5989e111749c30774ebd9f6101ef190364e0fe3
3
+ size 623
trainer_state.json ADDED
@@ -0,0 +1,252 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 19.99591836734694,
5
+ "global_step": 2440,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 1.0,
12
+ "eval_cer": 1.0,
13
+ "eval_loss": 3.4127440452575684,
14
+ "eval_runtime": 162.9527,
15
+ "eval_samples_per_second": 6.64,
16
+ "eval_steps_per_second": 0.835,
17
+ "eval_wer": 1.0,
18
+ "step": 122
19
+ },
20
+ {
21
+ "epoch": 2.0,
22
+ "eval_cer": 1.0,
23
+ "eval_loss": 2.9441540241241455,
24
+ "eval_runtime": 163.1192,
25
+ "eval_samples_per_second": 6.633,
26
+ "eval_steps_per_second": 0.834,
27
+ "eval_wer": 1.0,
28
+ "step": 244
29
+ },
30
+ {
31
+ "epoch": 3.0,
32
+ "eval_cer": 0.3244493212689078,
33
+ "eval_loss": 1.2275432348251343,
34
+ "eval_runtime": 159.0312,
35
+ "eval_samples_per_second": 6.804,
36
+ "eval_steps_per_second": 0.855,
37
+ "eval_wer": 0.8635921622015851,
38
+ "step": 366
39
+ },
40
+ {
41
+ "epoch": 3.28,
42
+ "learning_rate": 0.0002388,
43
+ "loss": 3.9307,
44
+ "step": 400
45
+ },
46
+ {
47
+ "epoch": 4.0,
48
+ "eval_cer": 0.2023591638341556,
49
+ "eval_loss": 0.7168374061584473,
50
+ "eval_runtime": 158.5869,
51
+ "eval_samples_per_second": 6.823,
52
+ "eval_steps_per_second": 0.858,
53
+ "eval_wer": 0.501920552341129,
54
+ "step": 488
55
+ },
56
+ {
57
+ "epoch": 5.0,
58
+ "eval_cer": 0.17312679219727137,
59
+ "eval_loss": 0.617878794670105,
60
+ "eval_runtime": 158.2056,
61
+ "eval_samples_per_second": 6.839,
62
+ "eval_steps_per_second": 0.86,
63
+ "eval_wer": 0.41668692565760684,
64
+ "step": 610
65
+ },
66
+ {
67
+ "epoch": 6.0,
68
+ "eval_cer": 0.16569868368602203,
69
+ "eval_loss": 0.6016563177108765,
70
+ "eval_runtime": 157.9983,
71
+ "eval_samples_per_second": 6.848,
72
+ "eval_steps_per_second": 0.861,
73
+ "eval_wer": 0.37708465016774445,
74
+ "step": 732
75
+ },
76
+ {
77
+ "epoch": 6.56,
78
+ "learning_rate": 0.00025391752577319586,
79
+ "loss": 0.6036,
80
+ "step": 800
81
+ },
82
+ {
83
+ "epoch": 7.0,
84
+ "eval_cer": 0.16388987711259606,
85
+ "eval_loss": 0.6307795643806458,
86
+ "eval_runtime": 159.8432,
87
+ "eval_samples_per_second": 6.769,
88
+ "eval_steps_per_second": 0.851,
89
+ "eval_wer": 0.3708610881509214,
90
+ "step": 854
91
+ },
92
+ {
93
+ "epoch": 8.0,
94
+ "eval_cer": 0.1604865775407946,
95
+ "eval_loss": 0.6786409616470337,
96
+ "eval_runtime": 160.9659,
97
+ "eval_samples_per_second": 6.722,
98
+ "eval_steps_per_second": 0.845,
99
+ "eval_wer": 0.35467010259153015,
100
+ "step": 976
101
+ },
102
+ {
103
+ "epoch": 9.0,
104
+ "eval_cer": 0.15351281380865225,
105
+ "eval_loss": 0.696066677570343,
106
+ "eval_runtime": 165.3737,
107
+ "eval_samples_per_second": 6.543,
108
+ "eval_steps_per_second": 0.822,
109
+ "eval_wer": 0.3348324986629066,
110
+ "step": 1098
111
+ },
112
+ {
113
+ "epoch": 9.83,
114
+ "learning_rate": 0.00019206185567010307,
115
+ "loss": 0.303,
116
+ "step": 1200
117
+ },
118
+ {
119
+ "epoch": 10.0,
120
+ "eval_cer": 0.15007951033634373,
121
+ "eval_loss": 0.713607132434845,
122
+ "eval_runtime": 160.5614,
123
+ "eval_samples_per_second": 6.739,
124
+ "eval_steps_per_second": 0.847,
125
+ "eval_wer": 0.3272961540331599,
126
+ "step": 1220
127
+ },
128
+ {
129
+ "epoch": 11.0,
130
+ "eval_cer": 0.1534270883786321,
131
+ "eval_loss": 0.724854052066803,
132
+ "eval_runtime": 158.4702,
133
+ "eval_samples_per_second": 6.828,
134
+ "eval_steps_per_second": 0.858,
135
+ "eval_wer": 0.3330821218456751,
136
+ "step": 1342
137
+ },
138
+ {
139
+ "epoch": 12.0,
140
+ "eval_cer": 0.15021667102437603,
141
+ "eval_loss": 0.7961975932121277,
142
+ "eval_runtime": 158.6964,
143
+ "eval_samples_per_second": 6.818,
144
+ "eval_steps_per_second": 0.857,
145
+ "eval_wer": 0.3294598142655711,
146
+ "step": 1464
147
+ },
148
+ {
149
+ "epoch": 13.0,
150
+ "eval_cer": 0.15035811798390933,
151
+ "eval_loss": 0.8382342457771301,
152
+ "eval_runtime": 159.3031,
153
+ "eval_samples_per_second": 6.792,
154
+ "eval_steps_per_second": 0.854,
155
+ "eval_wer": 0.3209996596489522,
156
+ "step": 1586
157
+ },
158
+ {
159
+ "epoch": 13.11,
160
+ "learning_rate": 0.0001302061855670103,
161
+ "loss": 0.1685,
162
+ "step": 1600
163
+ },
164
+ {
165
+ "epoch": 14.0,
166
+ "eval_cer": 0.1506152942739699,
167
+ "eval_loss": 0.8463586568832397,
168
+ "eval_runtime": 159.0294,
169
+ "eval_samples_per_second": 6.804,
170
+ "eval_steps_per_second": 0.855,
171
+ "eval_wer": 0.3242329946030048,
172
+ "step": 1708
173
+ },
174
+ {
175
+ "epoch": 15.0,
176
+ "eval_cer": 0.14896507974608128,
177
+ "eval_loss": 0.9096932411193848,
178
+ "eval_runtime": 162.4549,
179
+ "eval_samples_per_second": 6.66,
180
+ "eval_steps_per_second": 0.837,
181
+ "eval_wer": 0.3218019156901833,
182
+ "step": 1830
183
+ },
184
+ {
185
+ "epoch": 16.0,
186
+ "eval_cer": 0.14725485741717853,
187
+ "eval_loss": 0.963500440120697,
188
+ "eval_runtime": 158.9025,
189
+ "eval_samples_per_second": 6.809,
190
+ "eval_steps_per_second": 0.856,
191
+ "eval_wer": 0.31263674818884624,
192
+ "step": 1952
193
+ },
194
+ {
195
+ "epoch": 16.39,
196
+ "learning_rate": 6.835051546391752e-05,
197
+ "loss": 0.1031,
198
+ "step": 2000
199
+ },
200
+ {
201
+ "epoch": 17.0,
202
+ "eval_cer": 0.1464233207459827,
203
+ "eval_loss": 0.9706696271896362,
204
+ "eval_runtime": 158.7074,
205
+ "eval_samples_per_second": 6.818,
206
+ "eval_steps_per_second": 0.857,
207
+ "eval_wer": 0.3110565468955122,
208
+ "step": 2074
209
+ },
210
+ {
211
+ "epoch": 18.0,
212
+ "eval_cer": 0.14608041902590194,
213
+ "eval_loss": 1.0005509853363037,
214
+ "eval_runtime": 158.0719,
215
+ "eval_samples_per_second": 6.845,
216
+ "eval_steps_per_second": 0.86,
217
+ "eval_wer": 0.31331745028443625,
218
+ "step": 2196
219
+ },
220
+ {
221
+ "epoch": 19.0,
222
+ "eval_cer": 0.14561750170379292,
223
+ "eval_loss": 1.0291253328323364,
224
+ "eval_runtime": 159.3838,
225
+ "eval_samples_per_second": 6.789,
226
+ "eval_steps_per_second": 0.853,
227
+ "eval_wer": 0.31030291243253755,
228
+ "step": 2318
229
+ },
230
+ {
231
+ "epoch": 19.67,
232
+ "learning_rate": 6.494845360824742e-06,
233
+ "loss": 0.0629,
234
+ "step": 2400
235
+ },
236
+ {
237
+ "epoch": 20.0,
238
+ "eval_cer": 0.1464233207459827,
239
+ "eval_loss": 1.0326457023620605,
240
+ "eval_runtime": 158.8998,
241
+ "eval_samples_per_second": 6.809,
242
+ "eval_steps_per_second": 0.856,
243
+ "eval_wer": 0.31287985608012836,
244
+ "step": 2440
245
+ }
246
+ ],
247
+ "max_steps": 2440,
248
+ "num_train_epochs": 20,
249
+ "total_flos": 7.698957470096574e+19,
250
+ "trial_name": null,
251
+ "trial_params": null
252
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2ce0a84736069ab60f654939fcd8d5b72989b8933abcf298adb585ea9ed83703
3
+ size 3247
vocab.json ADDED
@@ -0,0 +1 @@
 
1
+ {"?": 1, "a": 2, "b": 3, "c": 4, "d": 5, "e": 6, "f": 7, "g": 8, "h": 9, "i": 10, "j": 11, "k": 12, "l": 13, "m": 14, "n": 15, "o": 16, "p": 17, "q": 18, "r": 19, "s": 20, "t": 21, "u": 22, "v": 23, "w": 24, "x": 25, "y": 26, "z": 27, "\u00e4": 28, "\u00fc": 29, "\u0107": 30, "\u010d": 31, "\u0111": 32, "\u0161": 33, "\u017e": 34, "|": 0, "[UNK]": 35, "[PAD]": 36, " ": 37, "1": 38, "2": 39, "3": 40, "4": 41, "5": 42, "6": 43, "7": 44, "8": 45, "9": 46, "0": 47}