bofenghuang
committed on
Commit
•
fe8883a
1
Parent(s):
9f3e1fa
updt README.md
Browse files
README.md
CHANGED
@@ -60,110 +60,73 @@ model-index:
|
|
60 |
# Fine-tuned Wav2Vec2 XLS-R 1B model for ASR in French
|
61 |
|
62 |
This model is a fine-tuned version of [facebook/wav2vec2-xls-r-1b](https://huggingface.co/facebook/wav2vec2-xls-r-1b) on the MOZILLA-FOUNDATION/COMMON_VOICE_9_0 - FR dataset.
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
-
|
96 |
-
|
97 |
-
|
98 |
-
|
99 |
-
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
|
113 |
-
|
114 |
-
|
115 |
-
|
116 |
-
|
117 |
-
|
118 |
-
|
119 |
-
|
120 |
-
|
121 |
-
|
122 |
-
|
123 |
-
|
124 |
-
|
125 |
-
|
126 |
-
|
127 |
-
|
128 |
-
|
129 |
-
|
130 |
-
| 0.1645 | 5.96 | 21500 | 0.1744 | 0.1527 |
|
131 |
-
| 0.1551 | 6.1 | 22000 | 0.1778 | 0.1543 |
|
132 |
-
| 0.1505 | 6.24 | 22500 | 0.1754 | 0.1528 |
|
133 |
-
| 0.1499 | 6.38 | 23000 | 0.1743 | 0.1500 |
|
134 |
-
| 0.1491 | 6.52 | 23500 | 0.1684 | 0.1473 |
|
135 |
-
| 0.1477 | 6.66 | 24000 | 0.1661 | 0.1472 |
|
136 |
-
| 0.1456 | 6.79 | 24500 | 0.1654 | 0.1440 |
|
137 |
-
| 0.1415 | 6.93 | 25000 | 0.1654 | 0.1448 |
|
138 |
-
| 0.136 | 7.07 | 25500 | 0.1616 | 0.1407 |
|
139 |
-
| 0.132 | 7.21 | 26000 | 0.1625 | 0.1410 |
|
140 |
-
| 0.1323 | 7.35 | 26500 | 0.1604 | 0.1404 |
|
141 |
-
| 0.1338 | 7.49 | 27000 | 0.1574 | 0.1386 |
|
142 |
-
| 0.13 | 7.63 | 27500 | 0.1576 | 0.1384 |
|
143 |
-
| 0.1291 | 7.76 | 28000 | 0.1551 | 0.1366 |
|
144 |
-
| 0.1277 | 7.9 | 28500 | 0.1542 | 0.1356 |
|
145 |
-
| 0.1241 | 8.04 | 29000 | 0.1545 | 0.1350 |
|
146 |
-
| 0.1198 | 8.18 | 29500 | 0.1536 | 0.1322 |
|
147 |
-
| 0.1204 | 8.32 | 30000 | 0.1547 | 0.1337 |
|
148 |
-
| 0.1195 | 8.46 | 30500 | 0.1494 | 0.1309 |
|
149 |
-
| 0.1169 | 8.6 | 31000 | 0.1490 | 0.1300 |
|
150 |
-
| 0.1159 | 8.74 | 31500 | 0.1485 | 0.1305 |
|
151 |
-
| 0.1142 | 8.87 | 32000 | 0.1479 | 0.1292 |
|
152 |
-
| 0.1087 | 9.01 | 32500 | 0.1471 | 0.1284 |
|
153 |
-
| 0.1076 | 9.15 | 33000 | 0.1467 | 0.1270 |
|
154 |
-
| 0.1078 | 9.29 | 33500 | 0.1467 | 0.1270 |
|
155 |
-
| 0.1073 | 9.43 | 34000 | 0.1447 | 0.1256 |
|
156 |
-
| 0.108 | 9.57 | 34500 | 0.1447 | 0.1257 |
|
157 |
-
| 0.106 | 9.71 | 35000 | 0.1438 | 0.1255 |
|
158 |
-
| 0.1052 | 9.84 | 35500 | 0.1428 | 0.1247 |
|
159 |
-
| 0.1044 | 9.98 | 36000 | 0.1430 | 0.1245 |
|
160 |
-
|
161 |
-
### Framework versions
|
162 |
-
|
163 |
-
- Transformers 4.22.0.dev0
|
164 |
-
- Pytorch 1.12.0+cu113
|
165 |
-
- Datasets 2.4.0
|
166 |
-
- Tokenizers 0.12.1
|
167 |
|
168 |
|
169 |
## Evaluation
|
|
|
60 |
# Fine-tuned Wav2Vec2 XLS-R 1B model for ASR in French
|
61 |
|
62 |
This model is a fine-tuned version of [facebook/wav2vec2-xls-r-1b](https://huggingface.co/facebook/wav2vec2-xls-r-1b) on the MOZILLA-FOUNDATION/COMMON_VOICE_9_0 - FR dataset.
|
63 |
+
|
64 |
+
|
65 |
+
## Usage
|
66 |
+
|
67 |
+
1. To use on a local audio file without the language model
|
68 |
+
|
69 |
+
```python
|
70 |
+
import torch
|
71 |
+
import torchaudio
|
72 |
+
|
73 |
+
from transformers import AutoModelForCTC, Wav2Vec2Processor
|
74 |
+
|
75 |
+
processor = Wav2Vec2Processor.from_pretrained("bhuang/wav2vec2-xls-r-1b-cv9-fr")
|
76 |
+
model = AutoModelForCTC.from_pretrained("bhuang/wav2vec2-xls-r-1b-cv9-fr").cuda()
|
77 |
+
|
78 |
+
# path to your audio file
|
79 |
+
wav_path = "/projects/bhuang/corpus/speech/multilingual-tedx/fr-fr/flac/09UU0I9gLNc_0.flac"
|
80 |
+
waveform, sample_rate = torchaudio.load(wav_path)
|
81 |
+
waveform = waveform.squeeze(axis=0) # mono
|
82 |
+
|
83 |
+
# resample
|
84 |
+
if sample_rate != 16_000:
|
85 |
+
resampler = torchaudio.transforms.Resample(sample_rate, 16_000)
|
86 |
+
waveform = resampler(waveform)
|
87 |
+
|
88 |
+
# normalize
|
89 |
+
input_dict = processor(waveform, sampling_rate=16_000, return_tensors="pt")
|
90 |
+
|
91 |
+
with torch.inference_mode():
|
92 |
+
logits = model(input_dict.input_values.to("cuda")).logits
|
93 |
+
|
94 |
+
# decode
|
95 |
+
predicted_ids = torch.argmax(logits, dim=-1)
|
96 |
+
predicted_sentence = processor.batch_decode(predicted_ids)[0]
|
97 |
+
```
|
98 |
+
|
99 |
+
2. To use on a local audio file with the language model
|
100 |
+
|
101 |
+
```python
|
102 |
+
import torch
|
103 |
+
import torchaudio
|
104 |
+
|
105 |
+
from transformers import AutoModelForCTC, Wav2Vec2ProcessorWithLM
|
106 |
+
|
107 |
+
processor_with_lm = Wav2Vec2ProcessorWithLM.from_pretrained("bhuang/wav2vec2-xls-r-1b-cv9-fr")
|
108 |
+
model = AutoModelForCTC.from_pretrained("bhuang/wav2vec2-xls-r-1b-cv9-fr").cuda()
|
109 |
+
|
110 |
+
model_sampling_rate = processor_with_lm.feature_extractor.sampling_rate
|
111 |
+
|
112 |
+
# path to your audio file
|
113 |
+
wav_path = "/projects/bhuang/corpus/speech/multilingual-tedx/fr-fr/flac/09UU0I9gLNc_0.flac"
|
114 |
+
waveform, sample_rate = torchaudio.load(wav_path)
|
115 |
+
waveform = waveform.squeeze(axis=0) # mono
|
116 |
+
|
117 |
+
# resample
|
118 |
+
if sample_rate != 16_000:
|
119 |
+
resampler = torchaudio.transforms.Resample(sample_rate, 16_000)
|
120 |
+
waveform = resampler(waveform)
|
121 |
+
|
122 |
+
# normalize
|
123 |
+
input_dict = processor_with_lm(waveform, sampling_rate=16_000, return_tensors="pt")
|
124 |
+
|
125 |
+
with torch.inference_mode():
|
126 |
+
logits = model(input_dict.input_values.to("cuda")).logits
|
127 |
+
|
128 |
+
predicted_sentence = processor_with_lm.batch_decode(logits.cpu().numpy()).text[0]
|
129 |
+
```
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
130 |
|
131 |
|
132 |
## Evaluation
|