model documentation (#3)
Browse files- model documentation (d7847b3dd5b31b0f9850a79b8fe319b528db8ee5)
Co-authored-by: Nazneen Rajani <nazneen@users.noreply.huggingface.co>
README.md
CHANGED
@@ -1,3 +1,4 @@
|
|
|
|
1 |
---
|
2 |
language:
|
3 |
- multilingual
|
@@ -47,7 +48,17 @@ language:
|
|
47 |
- tt
|
48 |
- uk
|
49 |
- vi
|
50 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
51 |
- fy-NL
|
52 |
- ga-IE
|
53 |
- pa-IN
|
@@ -57,40 +68,232 @@ language_bcp47:
|
|
57 |
- zh-CN
|
58 |
- zh-HK
|
59 |
- zh-TW
|
60 |
-
datasets:
|
61 |
-
- common_voice
|
62 |
-
tags:
|
63 |
-
- audio
|
64 |
-
- automatic-speech-recognition
|
65 |
-
- hf-asr-leaderboard
|
66 |
-
- robust-speech-event
|
67 |
-
- speech
|
68 |
-
- xlsr-fine-tuning-week
|
69 |
-
license: apache-2.0
|
70 |
model-index:
|
71 |
- name: XLSR Wav2Vec2 for 56 language by Voidful
|
72 |
results:
|
73 |
- task:
|
74 |
-
name: Speech Recognition
|
75 |
type: automatic-speech-recognition
|
|
|
76 |
dataset:
|
77 |
name: Common Voice
|
78 |
type: common_voice
|
79 |
metrics:
|
80 |
-
-
|
81 |
-
type: cer
|
82 |
value: 23.21
|
|
|
83 |
---
|
84 |
|
85 |
-
# wav2vec2-xlsr-multilingual-56
|
86 |
-
|
87 |
-
|
88 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
89 |
Fine-tuned [facebook/wav2vec2-large-xlsr-53](https://huggingface.co/facebook/wav2vec2-large-xlsr-53) on 56 languages using the [Common Voice](https://huggingface.co/datasets/common_voice) dataset.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
90 |
When using this model, make sure that your speech input is sampled at 16kHz.
|
91 |
-
|
92 |
-
|
93 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
94 |
## Env setup:
|
95 |
```
|
96 |
!pip install torchaudio
|
@@ -98,8 +301,9 @@ For more detail: [https://github.com/voidful/wav2vec2-xlsr-multilingual-56](http
|
|
98 |
!pip install asrp
|
99 |
!wget -O lang_ids.pk https://huggingface.co/voidful/wav2vec2-xlsr-multilingual-56/raw/main/lang_ids.pk
|
100 |
```
|
101 |
-
|
102 |
## Usage
|
|
|
103 |
```
|
104 |
import torchaudio
|
105 |
from datasets import load_dataset, load_metric
|
@@ -116,16 +320,16 @@ import soundfile as sf
|
|
116 |
model_name = "voidful/wav2vec2-xlsr-multilingual-56"
|
117 |
device = "cuda"
|
118 |
processor_name = "voidful/wav2vec2-xlsr-multilingual-56"
|
119 |
-
|
120 |
import pickle
|
121 |
with open("lang_ids.pk", 'rb') as output:
|
122 |
lang_ids = pickle.load(output)
|
123 |
|
124 |
model = Wav2Vec2ForCTC.from_pretrained(model_name).to(device)
|
125 |
processor = Wav2Vec2Processor.from_pretrained(processor_name)
|
126 |
-
|
127 |
model.eval()
|
128 |
-
|
129 |
def load_file_to_data(file,sampling_rate=16_000):
|
130 |
batch = {}
|
131 |
speech, _ = torchaudio.load(file)
|
@@ -137,8 +341,8 @@ def load_file_to_data(file,sampling_rate=16_000):
|
|
137 |
batch["speech"] = speech.squeeze(0).numpy()
|
138 |
batch["sampling_rate"] = '16000'
|
139 |
return batch
|
140 |
-
|
141 |
-
|
142 |
def predict(data):
|
143 |
features = processor(data["speech"], sampling_rate=data["sampling_rate"], padding=True, return_tensors="pt")
|
144 |
input_values = features.input_values.to(device)
|
@@ -153,9 +357,9 @@ def predict(data):
|
|
153 |
voice_prob = torch.nn.functional.softmax((torch.masked_select(logit, mask).view(-1,vocab_size)),dim=-1)
|
154 |
comb_pred_ids = torch.argmax(voice_prob, dim=-1)
|
155 |
decoded_results.append(processor.decode(comb_pred_ids))
|
156 |
-
|
157 |
return decoded_results
|
158 |
-
|
159 |
def predict_lang_specific(data,lang_code):
|
160 |
features = processor(data["speech"], sampling_rate=data["sampling_rate"], padding=True, return_tensors="pt")
|
161 |
input_values = features.input_values.to(device)
|
@@ -180,69 +384,16 @@ def predict_lang_specific(data,lang_code):
|
|
180 |
decoded_results.append(processor.decode(comb_pred_ids))
|
181 |
|
182 |
return decoded_results
|
183 |
-
|
184 |
-
|
185 |
predict(load_file_to_data('audio file path',sampling_rate=16_000)) # beware of the audio file sampling rate
|
186 |
-
|
187 |
predict_lang_specific(load_file_to_data('audio file path',sampling_rate=16_000),'en') # beware of the audio file sampling rate
|
188 |
-
|
189 |
```
|
190 |
-
|
191 |
-
|
192 |
-
|
193 |
-
|
194 |
-
|
195 |
-
|
196 |
-
| ca | 301308 | 692.8 | 24.80 | 10.39 |
|
197 |
-
| cnh | 1563 | 2.4 | 68.11 | 23.10 |
|
198 |
-
| cs | 9773 | 39.5 | 67.86 | 12.57 |
|
199 |
-
| cv | 1749 | 5.9 | 95.43 | 34.03 |
|
200 |
-
| cy | 11615 | 106.7 | 67.03 | 23.97 |
|
201 |
-
| de | 262113 | 822.8 | 27.03 | 6.50 |
|
202 |
-
| dv | 4757 | 18.6 | 92.16 | 30.15 |
|
203 |
-
| el | 3717 | 11.1 | 94.48 | 58.67 |
|
204 |
-
| en | 580501 | 1763.6 | 34.87 | 14.84 |
|
205 |
-
| eo | 28574 | 162.3 | 37.77 | 6.23 |
|
206 |
-
| es | 176902 | 337.7 | 19.63 | 5.41 |
|
207 |
-
| et | 5473 | 35.9 | 86.87 | 20.79 |
|
208 |
-
| eu | 12677 | 90.2 | 44.80 | 7.32 |
|
209 |
-
| fa | 12806 | 290.6 | 53.81 | 15.09 |
|
210 |
-
| fi | 875 | 2.6 | 93.78 | 27.57 |
|
211 |
-
| fr | 314745 | 664.1 | 33.16 | 13.94 |
|
212 |
-
| fy-NL | 6717 | 27.2 | 72.54 | 26.58 |
|
213 |
-
| ga-IE | 1038 | 3.5 | 92.57 | 51.02 |
|
214 |
-
| hi | 292 | 2.0 | 90.95 | 57.43 |
|
215 |
-
| hsb | 980 | 2.3 | 89.44 | 27.19 |
|
216 |
-
| hu | 4782 | 9.3 | 97.15 | 36.75 |
|
217 |
-
| ia | 5078 | 10.4 | 52.00 | 11.35 |
|
218 |
-
| id | 3965 | 9.9 | 82.50 | 22.82 |
|
219 |
-
| it | 70943 | 178.0 | 39.09 | 8.72 |
|
220 |
-
| ja | 1308 | 8.2 | 99.21 | 62.06 |
|
221 |
-
| ka | 1585 | 4.0 | 90.53 | 18.57 |
|
222 |
-
| ky | 3466 | 12.2 | 76.53 | 19.80 |
|
223 |
-
| lg | 1634 | 17.1 | 98.95 | 43.84 |
|
224 |
-
| lt | 1175 | 3.9 | 92.61 | 26.81 |
|
225 |
-
| lv | 4554 | 6.3 | 90.34 | 30.81 |
|
226 |
-
| mn | 4020 | 11.6 | 82.68 | 30.14 |
|
227 |
-
| mt | 3552 | 7.8 | 84.18 | 22.96 |
|
228 |
-
| nl | 14398 | 71.8 | 57.18 | 19.01 |
|
229 |
-
| or | 517 | 0.9 | 90.93 | 27.34 |
|
230 |
-
| pa-IN | 255 | 0.8 | 87.95 | 42.03 |
|
231 |
-
| pl | 12621 | 112.0 | 56.14 | 12.06 |
|
232 |
-
| pt | 11106 | 61.3 | 53.24 | 16.32 |
|
233 |
-
| rm-sursilv | 2589 | 5.9 | 78.17 | 23.31 |
|
234 |
-
| rm-vallader | 931 | 2.3 | 73.67 | 21.76 |
|
235 |
-
| ro | 4257 | 8.7 | 83.84 | 21.95 |
|
236 |
-
| ru | 23444 | 119.1 | 61.83 | 15.18 |
|
237 |
-
| sah | 1847 | 4.4 | 94.38 | 38.46 |
|
238 |
-
| sl | 2594 | 6.7 | 84.21 | 20.54 |
|
239 |
-
| sv-SE | 4350 | 20.8 | 83.68 | 30.79 |
|
240 |
-
| ta | 3788 | 18.4 | 84.19 | 21.60 |
|
241 |
-
| th | 4839 | 11.7 | 141.87 | 37.16 |
|
242 |
-
| tr | 3478 | 22.3 | 66.77 | 15.55 |
|
243 |
-
| tt | 13338 | 26.7 | 86.80 | 33.57 |
|
244 |
-
| uk | 7271 | 39.4 | 70.23 | 14.34 |
|
245 |
-
| vi | 421 | 1.7 | 96.06 | 66.25 |
|
246 |
-
| zh-CN | 27284 | 58.7 | 89.67 | 23.96 |
|
247 |
-
| zh-HK | 12678 | 92.1 | 81.77 | 18.82 |
|
248 |
-
| zh-TW | 6402 | 56.6 | 85.08 | 29.07 |
|
1 |
+
|
2 |
---
|
3 |
language:
|
4 |
- multilingual
|
48 |
- tt
|
49 |
- uk
|
50 |
- vi
|
51 |
+
license: apache-2.0
|
52 |
+
tags:
|
53 |
+
- audio
|
54 |
+
- automatic-speech-recognition
|
55 |
+
- hf-asr-leaderboard
|
56 |
+
- robust-speech-event
|
57 |
+
- speech
|
58 |
+
- xlsr-fine-tuning-week
|
59 |
+
datasets:
|
60 |
+
- common_voice
|
61 |
+
language_bcp47:
|
62 |
- fy-NL
|
63 |
- ga-IE
|
64 |
- pa-IN
|
68 |
- zh-CN
|
69 |
- zh-HK
|
70 |
- zh-TW
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
71 |
model-index:
|
72 |
- name: XLSR Wav2Vec2 for 56 language by Voidful
|
73 |
results:
|
74 |
- task:
|
|
|
75 |
type: automatic-speech-recognition
|
76 |
+
name: Speech Recognition
|
77 |
dataset:
|
78 |
name: Common Voice
|
79 |
type: common_voice
|
80 |
metrics:
|
81 |
+
- type: cer
|
|
|
82 |
value: 23.21
|
83 |
+
name: Test CER
|
84 |
---
|
85 |
|
86 |
+
# Model Card for wav2vec2-xlsr-multilingual-56
|
87 |
+
|
88 |
+
|
89 |
+
# Model Details
|
90 |
+
|
91 |
+
## Model Description
|
92 |
+
|
93 |
+
- **Developed by:** voidful
|
94 |
+
- **Shared by [Optional]:** Hugging Face
|
95 |
+
- **Model type:** automatic-speech-recognition
|
96 |
+
- **Language(s) (NLP):** multilingual (*56 languages, 1 model Multilingual ASR*)
|
97 |
+
- **License:** Apache-2.0
|
98 |
+
- **Related Models:**
|
99 |
+
- **Parent Model:** [facebook/wav2vec2-large-xlsr-53](https://huggingface.co/facebook/wav2vec2-large-xlsr-53)
|
100 |
+
- **Resources for more information:**
|
101 |
+
- [GitHub Repo](https://github.com/voidful/wav2vec2-xlsr-multilingual-56)
|
102 |
+
- [Model Space](https://huggingface.co/spaces/Kamtera/Persian_Automatic_Speech_Recognition_and-more)
|
103 |
+
|
104 |
+
|
105 |
+
# Uses
|
106 |
+
|
107 |
+
|
108 |
+
## Direct Use
|
109 |
+
|
110 |
+
This model can be used for the task of automatic speech recognition.
|
111 |
+
|
112 |
+
## Downstream Use [Optional]
|
113 |
+
|
114 |
+
More information needed
|
115 |
+
|
116 |
+
## Out-of-Scope Use
|
117 |
+
|
118 |
+
The model should not be used to intentionally create hostile or alienating environments for people.
|
119 |
+
|
120 |
+
# Bias, Risks, and Limitations
|
121 |
+
|
122 |
+
Significant research has explored bias and fairness issues with language models (see, e.g., [Sheng et al. (2021)](https://aclanthology.org/2021.acl-long.330.pdf) and [Bender et al. (2021)](https://dl.acm.org/doi/pdf/10.1145/3442188.3445922)). Predictions generated by the model may include disturbing and harmful stereotypes across protected classes; identity characteristics; and sensitive, social, and occupational groups.
|
123 |
+
|
124 |
+
|
125 |
+
## Recommendations
|
126 |
+
|
127 |
+
Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
|
128 |
+
|
129 |
+
|
130 |
+
# Training Details
|
131 |
+
|
132 |
+
## Training Data
|
133 |
+
|
134 |
+
See the [common_voice dataset card](https://huggingface.co/datasets/common_voice)
|
135 |
Fine-tuned [facebook/wav2vec2-large-xlsr-53](https://huggingface.co/facebook/wav2vec2-large-xlsr-53) on 56 languages using the [Common Voice](https://huggingface.co/datasets/common_voice) dataset.
|
136 |
+
|
137 |
+
## Training Procedure
|
138 |
+
|
139 |
+
|
140 |
+
### Preprocessing
|
141 |
+
|
142 |
+
More information needed
|
143 |
+
|
144 |
+
### Speeds, Sizes, Times
|
145 |
+
|
146 |
+
|
147 |
When using this model, make sure that your speech input is sampled at 16kHz.
|
148 |
+
|
149 |
+
|
150 |
+
# Evaluation
|
151 |
+
|
152 |
+
|
153 |
+
## Testing Data, Factors & Metrics
|
154 |
+
|
155 |
+
### Testing Data
|
156 |
+
|
157 |
+
More information needed
|
158 |
+
|
159 |
+
### Factors
|
160 |
+
|
161 |
+
|
162 |
+
### Metrics
|
163 |
+
|
164 |
+
More information needed
|
165 |
+
## Results
|
166 |
+
<details>
|
167 |
+
<summary> Click to expand </summary>
|
168 |
+
|
169 |
+
| Common Voice Languages | Num. of data | Hour | WER | CER |
|
170 |
+
|------------------------|--------------|--------|--------|-------|
|
171 |
+
| ar | 21744 | 81.5 | 75.29 | 31.23 |
|
172 |
+
| as | 394 | 1.1 | 95.37 | 46.05 |
|
173 |
+
| br | 4777 | 7.4 | 93.79 | 41.16 |
|
174 |
+
| ca | 301308 | 692.8 | 24.80 | 10.39 |
|
175 |
+
| cnh | 1563 | 2.4 | 68.11 | 23.10 |
|
176 |
+
| cs | 9773 | 39.5 | 67.86 | 12.57 |
|
177 |
+
| cv | 1749 | 5.9 | 95.43 | 34.03 |
|
178 |
+
| cy | 11615 | 106.7 | 67.03 | 23.97 |
|
179 |
+
| de | 262113 | 822.8 | 27.03 | 6.50 |
|
180 |
+
| dv | 4757 | 18.6 | 92.16 | 30.15 |
|
181 |
+
| el | 3717 | 11.1 | 94.48 | 58.67 |
|
182 |
+
| en | 580501 | 1763.6 | 34.87 | 14.84 |
|
183 |
+
| eo | 28574 | 162.3 | 37.77 | 6.23 |
|
184 |
+
| es | 176902 | 337.7 | 19.63 | 5.41 |
|
185 |
+
| et | 5473 | 35.9 | 86.87 | 20.79 |
|
186 |
+
| eu | 12677 | 90.2 | 44.80 | 7.32 |
|
187 |
+
| fa | 12806 | 290.6 | 53.81 | 15.09 |
|
188 |
+
| fi | 875 | 2.6 | 93.78 | 27.57 |
|
189 |
+
| fr | 314745 | 664.1 | 33.16 | 13.94 |
|
190 |
+
| fy-NL | 6717 | 27.2 | 72.54 | 26.58 |
|
191 |
+
| ga-IE | 1038 | 3.5 | 92.57 | 51.02 |
|
192 |
+
| hi | 292 | 2.0 | 90.95 | 57.43 |
|
193 |
+
| hsb | 980 | 2.3 | 89.44 | 27.19 |
|
194 |
+
| hu | 4782 | 9.3 | 97.15 | 36.75 |
|
195 |
+
| ia | 5078 | 10.4 | 52.00 | 11.35 |
|
196 |
+
| id | 3965 | 9.9 | 82.50 | 22.82 |
|
197 |
+
| it | 70943 | 178.0 | 39.09 | 8.72 |
|
198 |
+
| ja | 1308 | 8.2 | 99.21 | 62.06 |
|
199 |
+
| ka | 1585 | 4.0 | 90.53 | 18.57 |
|
200 |
+
| ky | 3466 | 12.2 | 76.53 | 19.80 |
|
201 |
+
| lg | 1634 | 17.1 | 98.95 | 43.84 |
|
202 |
+
| lt | 1175 | 3.9 | 92.61 | 26.81 |
|
203 |
+
| lv | 4554 | 6.3 | 90.34 | 30.81 |
|
204 |
+
| mn | 4020 | 11.6 | 82.68 | 30.14 |
|
205 |
+
| mt | 3552 | 7.8 | 84.18 | 22.96 |
|
206 |
+
| nl | 14398 | 71.8 | 57.18 | 19.01 |
|
207 |
+
| or | 517 | 0.9 | 90.93 | 27.34 |
|
208 |
+
| pa-IN | 255 | 0.8 | 87.95 | 42.03 |
|
209 |
+
| pl | 12621 | 112.0 | 56.14 | 12.06 |
|
210 |
+
| pt | 11106 | 61.3 | 53.24 | 16.32 |
|
211 |
+
| rm-sursilv | 2589 | 5.9 | 78.17 | 23.31 |
|
212 |
+
| rm-vallader | 931 | 2.3 | 73.67 | 21.76 |
|
213 |
+
| ro | 4257 | 8.7 | 83.84 | 21.95 |
|
214 |
+
| ru | 23444 | 119.1 | 61.83 | 15.18 |
|
215 |
+
| sah | 1847 | 4.4 | 94.38 | 38.46 |
|
216 |
+
| sl | 2594 | 6.7 | 84.21 | 20.54 |
|
217 |
+
| sv-SE | 4350 | 20.8 | 83.68 | 30.79 |
|
218 |
+
| ta | 3788 | 18.4 | 84.19 | 21.60 |
|
219 |
+
| th | 4839 | 11.7 | 141.87 | 37.16 |
|
220 |
+
| tr | 3478 | 22.3 | 66.77 | 15.55 |
|
221 |
+
| tt | 13338 | 26.7 | 86.80 | 33.57 |
|
222 |
+
| uk | 7271 | 39.4 | 70.23 | 14.34 |
|
223 |
+
| vi | 421 | 1.7 | 96.06 | 66.25 |
|
224 |
+
| zh-CN | 27284 | 58.7 | 89.67 | 23.96 |
|
225 |
+
| zh-HK | 12678 | 92.1 | 81.77 | 18.82 |
|
226 |
+
| zh-TW | 6402 | 56.6 | 85.08 | 29.07 |
|
227 |
+
|
228 |
+
</details>
|
229 |
+
# Model Examination
|
230 |
+
|
231 |
+
More information needed
|
232 |
+
|
233 |
+
# Environmental Impact
|
234 |
+
|
235 |
+
|
236 |
+
Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
|
237 |
+
|
238 |
+
- **Hardware Type:** More information needed
|
239 |
+
- **Hours used:** More information needed
|
240 |
+
- **Cloud Provider:** More information needed
|
241 |
+
- **Compute Region:** More information needed
|
242 |
+
- **Carbon Emitted:** More information needed
|
243 |
+
|
244 |
+
# Technical Specifications [optional]
|
245 |
+
|
246 |
+
## Model Architecture and Objective
|
247 |
+
|
248 |
+
More information needed
|
249 |
+
|
250 |
+
## Compute Infrastructure
|
251 |
+
|
252 |
+
More information needed
|
253 |
+
|
254 |
+
### Hardware
|
255 |
+
|
256 |
+
More information needed
|
257 |
+
|
258 |
+
### Software
|
259 |
+
More information needed
|
260 |
+
|
261 |
+
# Citation
|
262 |
+
|
263 |
+
|
264 |
+
**BibTeX:**
|
265 |
+
```
|
266 |
+
More information needed
|
267 |
+
```
|
268 |
+
|
269 |
+
**APA:**
|
270 |
+
```
|
271 |
+
More information needed
|
272 |
+
```
|
273 |
+
|
274 |
+
# Glossary [optional]
|
275 |
+
More information needed
|
276 |
+
|
277 |
+
# More Information [optional]
|
278 |
+
|
279 |
+
More information needed
|
280 |
+
|
281 |
+
# Model Card Authors [optional]
|
282 |
+
|
283 |
+
voidful in collaboration with Ezi Ozoani and the Hugging Face team
|
284 |
+
|
285 |
+
# Model Card Contact
|
286 |
+
|
287 |
+
More information needed
|
288 |
+
|
289 |
+
# How to Get Started with the Model
|
290 |
+
|
291 |
+
Use the code below to get started with the model.
|
292 |
+
|
293 |
+
<details>
|
294 |
+
<summary> Click to expand </summary>
|
295 |
+
|
296 |
+
|
297 |
## Env setup:
|
298 |
```
|
299 |
!pip install torchaudio
|
301 |
!pip install asrp
|
302 |
!wget -O lang_ids.pk https://huggingface.co/voidful/wav2vec2-xlsr-multilingual-56/raw/main/lang_ids.pk
|
303 |
```
|
304 |
+
|
305 |
## Usage
|
306 |
+
|
307 |
```
|
308 |
import torchaudio
|
309 |
from datasets import load_dataset, load_metric
|
320 |
model_name = "voidful/wav2vec2-xlsr-multilingual-56"
|
321 |
device = "cuda"
|
322 |
processor_name = "voidful/wav2vec2-xlsr-multilingual-56"
|
323 |
+
|
324 |
import pickle
|
325 |
with open("lang_ids.pk", 'rb') as output:
|
326 |
lang_ids = pickle.load(output)
|
327 |
|
328 |
model = Wav2Vec2ForCTC.from_pretrained(model_name).to(device)
|
329 |
processor = Wav2Vec2Processor.from_pretrained(processor_name)
|
330 |
+
|
331 |
model.eval()
|
332 |
+
|
333 |
def load_file_to_data(file,sampling_rate=16_000):
|
334 |
batch = {}
|
335 |
speech, _ = torchaudio.load(file)
|
341 |
batch["speech"] = speech.squeeze(0).numpy()
|
342 |
batch["sampling_rate"] = '16000'
|
343 |
return batch
|
344 |
+
|
345 |
+
|
346 |
def predict(data):
|
347 |
features = processor(data["speech"], sampling_rate=data["sampling_rate"], padding=True, return_tensors="pt")
|
348 |
input_values = features.input_values.to(device)
|
357 |
voice_prob = torch.nn.functional.softmax((torch.masked_select(logit, mask).view(-1,vocab_size)),dim=-1)
|
358 |
comb_pred_ids = torch.argmax(voice_prob, dim=-1)
|
359 |
decoded_results.append(processor.decode(comb_pred_ids))
|
360 |
+
|
361 |
return decoded_results
|
362 |
+
|
363 |
def predict_lang_specific(data,lang_code):
|
364 |
features = processor(data["speech"], sampling_rate=data["sampling_rate"], padding=True, return_tensors="pt")
|
365 |
input_values = features.input_values.to(device)
|
384 |
decoded_results.append(processor.decode(comb_pred_ids))
|
385 |
|
386 |
return decoded_results
|
387 |
+
|
388 |
+
|
389 |
predict(load_file_to_data('audio file path',sampling_rate=16_000)) # beware of the audio file sampling rate
|
390 |
+
|
391 |
predict_lang_specific(load_file_to_data('audio file path',sampling_rate=16_000),'en') # beware of the audio file sampling rate
|
392 |
+
|
393 |
```
|
394 |
+
|
395 |
+
```python
|
396 |
+
More information needed
|
397 |
+
```
|
398 |
+
</details>
|
399 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|