voidful nazneen commited on
Commit
5eb0674
1 Parent(s): 8aa4511

model documentation (#3)

Browse files

- model documentation (d7847b3dd5b31b0f9850a79b8fe319b528db8ee5)


Co-authored-by: Nazneen Rajani <nazneen@users.noreply.huggingface.co>

Files changed (1) hide show
  1. README.md +243 -92
README.md CHANGED
@@ -1,3 +1,4 @@
 
1
  ---
2
  language:
3
  - multilingual
@@ -47,7 +48,17 @@ language:
47
  - tt
48
  - uk
49
  - vi
50
- language_bcp47:
 
 
 
 
 
 
 
 
 
 
51
  - fy-NL
52
  - ga-IE
53
  - pa-IN
@@ -57,40 +68,232 @@ language_bcp47:
57
  - zh-CN
58
  - zh-HK
59
  - zh-TW
60
- datasets:
61
- - common_voice
62
- tags:
63
- - audio
64
- - automatic-speech-recognition
65
- - hf-asr-leaderboard
66
- - robust-speech-event
67
- - speech
68
- - xlsr-fine-tuning-week
69
- license: apache-2.0
70
  model-index:
71
  - name: XLSR Wav2Vec2 for 56 languages by Voidful
72
  results:
73
  - task:
74
- name: Speech Recognition
75
  type: automatic-speech-recognition
 
76
  dataset:
77
  name: Common Voice
78
  type: common_voice
79
  metrics:
80
- - name: Test CER
81
- type: cer
82
  value: 23.21
 
83
  ---
84
 
85
- # wav2vec2-xlsr-multilingual-56
86
-
87
- *56 language, 1 model Multilingual ASR*
88
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
89
  Fine-tuned [facebook/wav2vec2-large-xlsr-53](https://huggingface.co/facebook/wav2vec2-large-xlsr-53) on 56 languages using the [Common Voice](https://huggingface.co/datasets/common_voice) dataset.
 
 
 
 
 
 
 
 
 
 
 
90
  When using this model, make sure that your speech input is sampled at 16kHz.
91
-
92
- For more detail: [https://github.com/voidful/wav2vec2-xlsr-multilingual-56](https://github.com/voidful/wav2vec2-xlsr-multilingual-56)
93
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
94
  ## Env setup:
95
  ```
96
  !pip install torchaudio
@@ -98,8 +301,9 @@ For more detail: [https://github.com/voidful/wav2vec2-xlsr-multilingual-56](http
98
  !pip install asrp
99
  !wget -O lang_ids.pk https://huggingface.co/voidful/wav2vec2-xlsr-multilingual-56/raw/main/lang_ids.pk
100
  ```
101
-
102
  ## Usage
 
103
  ```
104
  import torchaudio
105
  from datasets import load_dataset, load_metric
@@ -116,16 +320,16 @@ import soundfile as sf
116
  model_name = "voidful/wav2vec2-xlsr-multilingual-56"
117
  device = "cuda"
118
  processor_name = "voidful/wav2vec2-xlsr-multilingual-56"
119
-
120
  import pickle
121
  with open("lang_ids.pk", 'rb') as output:
122
  lang_ids = pickle.load(output)
123
 
124
  model = Wav2Vec2ForCTC.from_pretrained(model_name).to(device)
125
  processor = Wav2Vec2Processor.from_pretrained(processor_name)
126
-
127
  model.eval()
128
-
129
  def load_file_to_data(file,sampling_rate=16_000):
130
  batch = {}
131
  speech, _ = torchaudio.load(file)
@@ -137,8 +341,8 @@ def load_file_to_data(file,sampling_rate=16_000):
137
  batch["speech"] = speech.squeeze(0).numpy()
138
  batch["sampling_rate"] = '16000'
139
  return batch
140
-
141
-
142
  def predict(data):
143
  features = processor(data["speech"], sampling_rate=data["sampling_rate"], padding=True, return_tensors="pt")
144
  input_values = features.input_values.to(device)
@@ -153,9 +357,9 @@ def predict(data):
153
  voice_prob = torch.nn.functional.softmax((torch.masked_select(logit, mask).view(-1,vocab_size)),dim=-1)
154
  comb_pred_ids = torch.argmax(voice_prob, dim=-1)
155
  decoded_results.append(processor.decode(comb_pred_ids))
156
-
157
  return decoded_results
158
-
159
  def predict_lang_specific(data,lang_code):
160
  features = processor(data["speech"], sampling_rate=data["sampling_rate"], padding=True, return_tensors="pt")
161
  input_values = features.input_values.to(device)
@@ -180,69 +384,16 @@ def predict_lang_specific(data,lang_code):
180
  decoded_results.append(processor.decode(comb_pred_ids))
181
 
182
  return decoded_results
183
-
184
-
185
  predict(load_file_to_data('audio file path',sampling_rate=16_000)) # beware of the audio file sampling rate
186
-
187
  predict_lang_specific(load_file_to_data('audio file path',sampling_rate=16_000),'en') # beware of the audio file sampling rate
188
-
189
  ```
190
- ## Result
191
- | Common Voice Languages | Num. of data | Hour | WER | CER |
192
- |------------------------|--------------|--------|--------|-------|
193
- | ar | 21744 | 81.5 | 75.29 | 31.23 |
194
- | as | 394 | 1.1 | 95.37 | 46.05 |
195
- | br | 4777 | 7.4 | 93.79 | 41.16 |
196
- | ca | 301308 | 692.8 | 24.80 | 10.39 |
197
- | cnh | 1563 | 2.4 | 68.11 | 23.10 |
198
- | cs | 9773 | 39.5 | 67.86 | 12.57 |
199
- | cv | 1749 | 5.9 | 95.43 | 34.03 |
200
- | cy | 11615 | 106.7 | 67.03 | 23.97 |
201
- | de | 262113 | 822.8 | 27.03 | 6.50 |
202
- | dv | 4757 | 18.6 | 92.16 | 30.15 |
203
- | el | 3717 | 11.1 | 94.48 | 58.67 |
204
- | en | 580501 | 1763.6 | 34.87 | 14.84 |
205
- | eo | 28574 | 162.3 | 37.77 | 6.23 |
206
- | es | 176902 | 337.7 | 19.63 | 5.41 |
207
- | et | 5473 | 35.9 | 86.87 | 20.79 |
208
- | eu | 12677 | 90.2 | 44.80 | 7.32 |
209
- | fa | 12806 | 290.6 | 53.81 | 15.09 |
210
- | fi | 875 | 2.6 | 93.78 | 27.57 |
211
- | fr | 314745 | 664.1 | 33.16 | 13.94 |
212
- | fy-NL | 6717 | 27.2 | 72.54 | 26.58 |
213
- | ga-IE | 1038 | 3.5 | 92.57 | 51.02 |
214
- | hi | 292 | 2.0 | 90.95 | 57.43 |
215
- | hsb | 980 | 2.3 | 89.44 | 27.19 |
216
- | hu | 4782 | 9.3 | 97.15 | 36.75 |
217
- | ia | 5078 | 10.4 | 52.00 | 11.35 |
218
- | id | 3965 | 9.9 | 82.50 | 22.82 |
219
- | it | 70943 | 178.0 | 39.09 | 8.72 |
220
- | ja | 1308 | 8.2 | 99.21 | 62.06 |
221
- | ka | 1585 | 4.0 | 90.53 | 18.57 |
222
- | ky | 3466 | 12.2 | 76.53 | 19.80 |
223
- | lg | 1634 | 17.1 | 98.95 | 43.84 |
224
- | lt | 1175 | 3.9 | 92.61 | 26.81 |
225
- | lv | 4554 | 6.3 | 90.34 | 30.81 |
226
- | mn | 4020 | 11.6 | 82.68 | 30.14 |
227
- | mt | 3552 | 7.8 | 84.18 | 22.96 |
228
- | nl | 14398 | 71.8 | 57.18 | 19.01 |
229
- | or | 517 | 0.9 | 90.93 | 27.34 |
230
- | pa-IN | 255 | 0.8 | 87.95 | 42.03 |
231
- | pl | 12621 | 112.0 | 56.14 | 12.06 |
232
- | pt | 11106 | 61.3 | 53.24 | 16.32 |
233
- | rm-sursilv | 2589 | 5.9 | 78.17 | 23.31 |
234
- | rm-vallader | 931 | 2.3 | 73.67 | 21.76 |
235
- | ro | 4257 | 8.7 | 83.84 | 21.95 |
236
- | ru | 23444 | 119.1 | 61.83 | 15.18 |
237
- | sah | 1847 | 4.4 | 94.38 | 38.46 |
238
- | sl | 2594 | 6.7 | 84.21 | 20.54 |
239
- | sv-SE | 4350 | 20.8 | 83.68 | 30.79 |
240
- | ta | 3788 | 18.4 | 84.19 | 21.60 |
241
- | th | 4839 | 11.7 | 141.87 | 37.16 |
242
- | tr | 3478 | 22.3 | 66.77 | 15.55 |
243
- | tt | 13338 | 26.7 | 86.80 | 33.57 |
244
- | uk | 7271 | 39.4 | 70.23 | 14.34 |
245
- | vi | 421 | 1.7 | 96.06 | 66.25 |
246
- | zh-CN | 27284 | 58.7 | 89.67 | 23.96 |
247
- | zh-HK | 12678 | 92.1 | 81.77 | 18.82 |
248
- | zh-TW | 6402 | 56.6 | 85.08 | 29.07 |
1
+
2
  ---
3
  language:
4
  - multilingual
48
  - tt
49
  - uk
50
  - vi
51
+ license: apache-2.0
52
+ tags:
53
+ - audio
54
+ - automatic-speech-recognition
55
+ - hf-asr-leaderboard
56
+ - robust-speech-event
57
+ - speech
58
+ - xlsr-fine-tuning-week
59
+ datasets:
60
+ - common_voice
61
+ language_bcp47:
62
  - fy-NL
63
  - ga-IE
64
  - pa-IN
68
  - zh-CN
69
  - zh-HK
70
  - zh-TW
 
 
 
 
 
 
 
 
 
 
71
  model-index:
72
  - name: XLSR Wav2Vec2 for 56 languages by Voidful
73
  results:
74
  - task:
 
75
  type: automatic-speech-recognition
76
+ name: Speech Recognition
77
  dataset:
78
  name: Common Voice
79
  type: common_voice
80
  metrics:
81
+ - type: cer
 
82
  value: 23.21
83
+ name: Test CER
84
  ---
85
 
86
+ # Model Card for wav2vec2-xlsr-multilingual-56
87
+
88
+
89
+ # Model Details
90
+
91
+ ## Model Description
92
+
93
+ - **Developed by:** voidful
94
+ - **Shared by [Optional]:** Hugging Face
95
+ - **Model type:** automatic-speech-recognition
96
+ - **Language(s) (NLP):** multilingual (*56 languages, 1 model Multilingual ASR*)
97
+ - **License:** Apache-2.0
98
+ - **Related Models:**
99
+ - **Parent Model:** wav2vec
100
+ - **Resources for more information:**
101
+ - [GitHub Repo](https://github.com/voidful/wav2vec2-xlsr-multilingual-56)
102
+ - [Model Space](https://huggingface.co/spaces/Kamtera/Persian_Automatic_Speech_Recognition_and-more)
103
+
104
+
105
+ # Uses
106
+
107
+
108
+ ## Direct Use
109
+
110
+ This model can be used for the task of automatic speech recognition.
111
+
112
+ ## Downstream Use [Optional]
113
+
114
+ More information needed
115
+
116
+ ## Out-of-Scope Use
117
+
118
+ The model should not be used to intentionally create hostile or alienating environments for people.
119
+
120
+ # Bias, Risks, and Limitations
121
+
122
+ Significant research has explored bias and fairness issues with language models (see, e.g., [Sheng et al. (2021)](https://aclanthology.org/2021.acl-long.330.pdf) and [Bender et al. (2021)](https://dl.acm.org/doi/pdf/10.1145/3442188.3445922)). Predictions generated by the model may include disturbing and harmful stereotypes across protected classes; identity characteristics; and sensitive, social, and occupational groups.
123
+
124
+
125
+ ## Recommendations
126
+
127
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
128
+
129
+
130
+ # Training Details
131
+
132
+ ## Training Data
133
+
134
+ See the [common_voice dataset card](https://huggingface.co/datasets/common_voice)
135
  Fine-tuned [facebook/wav2vec2-large-xlsr-53](https://huggingface.co/facebook/wav2vec2-large-xlsr-53) on 56 languages using the [Common Voice](https://huggingface.co/datasets/common_voice) dataset.
136
+
137
+ ## Training Procedure
138
+
139
+
140
+ ### Preprocessing
141
+
142
+ More information needed
143
+
144
+ ### Speeds, Sizes, Times
145
+
146
+
147
  When using this model, make sure that your speech input is sampled at 16kHz.
148
+
149
+
150
+ # Evaluation
151
+
152
+
153
+ ## Testing Data, Factors & Metrics
154
+
155
+ ### Testing Data
156
+
157
+ More information needed
158
+
159
+ ### Factors
160
+
161
+
162
+ ### Metrics
163
+
164
+ More information needed
165
+ ## Results
166
+ <details>
167
+ <summary> Click to expand </summary>
168
+
169
+ | Common Voice Languages | Num. of data | Hour | WER | CER |
170
+ |------------------------|--------------|--------|--------|-------|
171
+ | ar | 21744 | 81.5 | 75.29 | 31.23 |
172
+ | as | 394 | 1.1 | 95.37 | 46.05 |
173
+ | br | 4777 | 7.4 | 93.79 | 41.16 |
174
+ | ca | 301308 | 692.8 | 24.80 | 10.39 |
175
+ | cnh | 1563 | 2.4 | 68.11 | 23.10 |
176
+ | cs | 9773 | 39.5 | 67.86 | 12.57 |
177
+ | cv | 1749 | 5.9 | 95.43 | 34.03 |
178
+ | cy | 11615 | 106.7 | 67.03 | 23.97 |
179
+ | de | 262113 | 822.8 | 27.03 | 6.50 |
180
+ | dv | 4757 | 18.6 | 92.16 | 30.15 |
181
+ | el | 3717 | 11.1 | 94.48 | 58.67 |
182
+ | en | 580501 | 1763.6 | 34.87 | 14.84 |
183
+ | eo | 28574 | 162.3 | 37.77 | 6.23 |
184
+ | es | 176902 | 337.7 | 19.63 | 5.41 |
185
+ | et | 5473 | 35.9 | 86.87 | 20.79 |
186
+ | eu | 12677 | 90.2 | 44.80 | 7.32 |
187
+ | fa | 12806 | 290.6 | 53.81 | 15.09 |
188
+ | fi | 875 | 2.6 | 93.78 | 27.57 |
189
+ | fr | 314745 | 664.1 | 33.16 | 13.94 |
190
+ | fy-NL | 6717 | 27.2 | 72.54 | 26.58 |
191
+ | ga-IE | 1038 | 3.5 | 92.57 | 51.02 |
192
+ | hi | 292 | 2.0 | 90.95 | 57.43 |
193
+ | hsb | 980 | 2.3 | 89.44 | 27.19 |
194
+ | hu | 4782 | 9.3 | 97.15 | 36.75 |
195
+ | ia | 5078 | 10.4 | 52.00 | 11.35 |
196
+ | id | 3965 | 9.9 | 82.50 | 22.82 |
197
+ | it | 70943 | 178.0 | 39.09 | 8.72 |
198
+ | ja | 1308 | 8.2 | 99.21 | 62.06 |
199
+ | ka | 1585 | 4.0 | 90.53 | 18.57 |
200
+ | ky | 3466 | 12.2 | 76.53 | 19.80 |
201
+ | lg | 1634 | 17.1 | 98.95 | 43.84 |
202
+ | lt | 1175 | 3.9 | 92.61 | 26.81 |
203
+ | lv | 4554 | 6.3 | 90.34 | 30.81 |
204
+ | mn | 4020 | 11.6 | 82.68 | 30.14 |
205
+ | mt | 3552 | 7.8 | 84.18 | 22.96 |
206
+ | nl | 14398 | 71.8 | 57.18 | 19.01 |
207
+ | or | 517 | 0.9 | 90.93 | 27.34 |
208
+ | pa-IN | 255 | 0.8 | 87.95 | 42.03 |
209
+ | pl | 12621 | 112.0 | 56.14 | 12.06 |
210
+ | pt | 11106 | 61.3 | 53.24 | 16.32 |
211
+ | rm-sursilv | 2589 | 5.9 | 78.17 | 23.31 |
212
+ | rm-vallader | 931 | 2.3 | 73.67 | 21.76 |
213
+ | ro | 4257 | 8.7 | 83.84 | 21.95 |
214
+ | ru | 23444 | 119.1 | 61.83 | 15.18 |
215
+ | sah | 1847 | 4.4 | 94.38 | 38.46 |
216
+ | sl | 2594 | 6.7 | 84.21 | 20.54 |
217
+ | sv-SE | 4350 | 20.8 | 83.68 | 30.79 |
218
+ | ta | 3788 | 18.4 | 84.19 | 21.60 |
219
+ | th | 4839 | 11.7 | 141.87 | 37.16 |
220
+ | tr | 3478 | 22.3 | 66.77 | 15.55 |
221
+ | tt | 13338 | 26.7 | 86.80 | 33.57 |
222
+ | uk | 7271 | 39.4 | 70.23 | 14.34 |
223
+ | vi | 421 | 1.7 | 96.06 | 66.25 |
224
+ | zh-CN | 27284 | 58.7 | 89.67 | 23.96 |
225
+ | zh-HK | 12678 | 92.1 | 81.77 | 18.82 |
226
+ | zh-TW | 6402 | 56.6 | 85.08 | 29.07 |
227
+
228
+ </details>
229
+ # Model Examination
230
+
231
+ More information needed
232
+
233
+ # Environmental Impact
234
+
235
+
236
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
237
+
238
+ - **Hardware Type:** More information needed
239
+ - **Hours used:** More information needed
240
+ - **Cloud Provider:** More information needed
241
+ - **Compute Region:** More information needed
242
+ - **Carbon Emitted:** More information needed
243
+
244
+ # Technical Specifications [optional]
245
+
246
+ ## Model Architecture and Objective
247
+
248
+ More information needed
249
+
250
+ ## Compute Infrastructure
251
+
252
+ More information needed
253
+
254
+ ### Hardware
255
+
256
+ More information needed
257
+
258
+ ### Software
259
+ More information needed
260
+
261
+ # Citation
262
+
263
+
264
+ **BibTeX:**
265
+ ```
266
+ More information needed
267
+ ```
268
+
269
+ **APA:**
270
+ ```
271
+ More information needed
272
+ ```
273
+
274
+ # Glossary [optional]
275
+ More information needed
276
+
277
+ # More Information [optional]
278
+
279
+ More information needed
280
+
281
+ # Model Card Authors [optional]
282
+
283
+ voidful in collaboration with Ezi Ozoani and the Hugging Face team
284
+
285
+ # Model Card Contact
286
+
287
+ More information needed
288
+
289
+ # How to Get Started with the Model
290
+
291
+ Use the code below to get started with the model.
292
+
293
+ <details>
294
+ <summary> Click to expand </summary>
295
+
296
+
297
  ## Env setup:
298
  ```
299
  !pip install torchaudio
301
  !pip install asrp
302
  !wget -O lang_ids.pk https://huggingface.co/voidful/wav2vec2-xlsr-multilingual-56/raw/main/lang_ids.pk
303
  ```
304
+
305
  ## Usage
306
+
307
  ```
308
  import torchaudio
309
  from datasets import load_dataset, load_metric
320
  model_name = "voidful/wav2vec2-xlsr-multilingual-56"
321
  device = "cuda"
322
  processor_name = "voidful/wav2vec2-xlsr-multilingual-56"
323
+
324
  import pickle
325
  with open("lang_ids.pk", 'rb') as output:
326
  lang_ids = pickle.load(output)
327
 
328
  model = Wav2Vec2ForCTC.from_pretrained(model_name).to(device)
329
  processor = Wav2Vec2Processor.from_pretrained(processor_name)
330
+
331
  model.eval()
332
+
333
  def load_file_to_data(file,sampling_rate=16_000):
334
  batch = {}
335
  speech, _ = torchaudio.load(file)
341
  batch["speech"] = speech.squeeze(0).numpy()
342
  batch["sampling_rate"] = '16000'
343
  return batch
344
+
345
+
346
  def predict(data):
347
  features = processor(data["speech"], sampling_rate=data["sampling_rate"], padding=True, return_tensors="pt")
348
  input_values = features.input_values.to(device)
357
  voice_prob = torch.nn.functional.softmax((torch.masked_select(logit, mask).view(-1,vocab_size)),dim=-1)
358
  comb_pred_ids = torch.argmax(voice_prob, dim=-1)
359
  decoded_results.append(processor.decode(comb_pred_ids))
360
+
361
  return decoded_results
362
+
363
  def predict_lang_specific(data,lang_code):
364
  features = processor(data["speech"], sampling_rate=data["sampling_rate"], padding=True, return_tensors="pt")
365
  input_values = features.input_values.to(device)
384
  decoded_results.append(processor.decode(comb_pred_ids))
385
 
386
  return decoded_results
387
+
388
+
389
  predict(load_file_to_data('audio file path',sampling_rate=16_000)) # beware of the audio file sampling rate
390
+
391
  predict_lang_specific(load_file_to_data('audio file path',sampling_rate=16_000),'en') # beware of the audio file sampling rate
392
+
393
  ```
394
+
395
+ ```python
396
+ More information needed
397
+ ```
398
+ </details>
399
+