File size: 12,085 Bytes
bf21a79 aba15e5 bf21a79 65b4bd2 bf21a79 65b4bd2 bf21a79 65b4bd2 720a1a6 6742ee9 65b4bd2 720a1a6 65b4bd2 7aa1354 65b4bd2 7aa1354 65b4bd2 bf21a79 65b4bd2 bf21a79 65b4bd2 bf21a79 65b4bd2 bf21a79 65b4bd2 bf21a79 65b4bd2 bf21a79 65b4bd2 bf21a79 65b4bd2 bf21a79 657acba bf21a79 720a1a6 bf21a79 657acba bf21a79 657acba bf21a79 65b4bd2 657acba |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 |
---
language: ar
datasets:
- common_voice
metrics:
- wer
tags:
- audio
- automatic-speech-recognition
- speech
- xlsr-fine-tuning-week
license: apache-2.0
model-index:
- name: Sinai Voice Arabic Speech Recognition Model
results:
- task:
name: Speech Recognition
type: automatic-speech-recognition
dataset:
name: Common Voice ar
type: common_voice
args: ar
metrics:
- name: Test WER
type: wer
value: 23.80
---
# Sinai Voice Arabic Speech Recognition Model
# نموذج **صوت سيناء** للتعرف على الأصوات العربية الفصحى و تحويلها إلى نصوص
Fine-tuned [facebook/wav2vec2-large-xlsr-53](https://huggingface.co/facebook/wav2vec2-large-xlsr-53)
on Arabic using the [Common Voice](https://huggingface.co/datasets/common_voice)
Most of evaluation codes in this documentation are INSPIRED by [elgeish/wav2vec2-large-xlsr-53-arabic](https://huggingface.co/elgeish/wav2vec2-large-xlsr-53-arabic)
Please install:
- [PyTorch](https://pytorch.org/)
- `$ pip3 install jiwer lang_trans torchaudio datasets transformers pandas tqdm`
## Benchmark
We evaluated the model against different Arabic-STT Wav2Vec models.
| | Model | [using transliteration](https://pypi.org/project/lang-trans/) | WER | Training Datasets |
|---:|:--------------------------------------|:---------------------|---------:|---------:|
| 1 | bakrianoo/sinai-voice-ar-stt | True | 0.238001 |Common Voice 6|
| 2 | elgeish/wav2vec2-large-xlsr-53-arabic | True | 0.266527 |Common Voice 6 + Arabic Speech Corpus|
| 3 | othrif/wav2vec2-large-xlsr-arabic | True | 0.298122 |Common Voice 6|
| 4 | bakrianoo/sinai-voice-ar-stt | False | 0.448987 |Common Voice 6|
| 5 | othrif/wav2vec2-large-xlsr-arabic | False | 0.464004 |Common Voice 6|
| 6 | anas/wav2vec2-large-xlsr-arabic | True | 0.506191 |Common Voice 4|
| 7 | anas/wav2vec2-large-xlsr-arabic | False | 0.622288 |Common Voice 4|
<details>
<summary>We used the following <b>CODE</b> to generate the above results</summary>
```python
import jiwer
import torch
from tqdm.auto import tqdm
import torchaudio
from datasets import load_dataset
from lang_trans.arabic import buckwalter
from transformers import set_seed, Wav2Vec2ForCTC, Wav2Vec2Processor
import pandas as pd
# load test dataset
set_seed(42)
test_split = load_dataset("common_voice", "ar", split="test")
# init sample rate resamplers
resamplers = { # all three sampling rates exist in test split
48000: torchaudio.transforms.Resample(48000, 16000),
44100: torchaudio.transforms.Resample(44100, 16000),
32000: torchaudio.transforms.Resample(32000, 16000),
}
# WER composer
transformation = jiwer.Compose([
# normalize some diacritics, remove punctuation, and replace Persian letters with Arabic ones
jiwer.SubstituteRegexes({
r'[auiFNKo\\\\\\\\\\\\\\\\~_،؟»\\\\\\\\\\\\\\\\?;:\\\\\\\\\\\\\\\\-,\\\\\\\\\\\\\\\\.؛«!"]': "", "\\\\\\\\\\\\\\\\u06D6": "",
r"[\\\\\\\\\\\\\\\\|\\\\\\\\\\\\\\\\{]": "A", "p": "h", "ک": "k", "ی": "y"}),
# default transformation below
jiwer.RemoveMultipleSpaces(),
jiwer.Strip(),
jiwer.SentencesToListOfWords(),
jiwer.RemoveEmptyStrings(),
])
def prepare_example(example):
speech, sampling_rate = torchaudio.load(example["path"])
if sampling_rate in resamplers:
example["speech"] = resamplers[sampling_rate](speech).squeeze().numpy()
else:
example["speech"] = resamplers[4800](speech).squeeze().numpy()
return example
def predict(batch):
inputs = processor(batch["speech"], sampling_rate=16000, return_tensors="pt", padding=True)
with torch.no_grad():
predicted = torch.argmax(model(inputs.input_values.to("cuda")).logits, dim=-1)
predicted[predicted == -100] = processor.tokenizer.pad_token_id # see fine-tuning script
batch["predicted"] = processor.batch_decode(predicted)
return batch
# prepare the test dataset
test_split = test_split.map(prepare_example)
stt_models = [
"elgeish/wav2vec2-large-xlsr-53-arabic",
"othrif/wav2vec2-large-xlsr-arabic",
"anas/wav2vec2-large-xlsr-arabic",
"bakrianoo/sinai-voice-ar-stt"
]
stt_results = []
for model_path in tqdm(stt_models):
processor = Wav2Vec2Processor.from_pretrained(model_path)
model = Wav2Vec2ForCTC.from_pretrained(model_path).to("cuda").eval()
test_split_preds = test_split.map(predict, batched=True, batch_size=56, remove_columns=["speech"])
orig_metrics = jiwer.compute_measures(
truth=[s for s in test_split_preds["sentence"]],
hypothesis=[s for s in test_split_preds["predicted"]],
truth_transform=transformation,
hypothesis_transform=transformation,
)
trans_metrics = jiwer.compute_measures(
truth=[buckwalter.trans(s) for s in test_split_preds["sentence"]], # Buckwalter transliteration
hypothesis=[buckwalter.trans(s) for s in test_split_preds["predicted"]], # Buckwalter transliteration
truth_transform=transformation,
hypothesis_transform=transformation,
)
stt_results.append({
"model": model_path,
"using_transliation": True,
"WER": trans_metrics["wer"]
})
stt_results.append({
"model": model_path,
"using_transliation": False,
"WER": orig_metrics["wer"]
})
del model
del processor
stt_results_df = pd.DataFrame(stt_results)
stt_results_df = stt_results_df.sort_values('WER', axis=0, ascending=True)
stt_results_df.head(n=50)
```
</details>
## Usage
The model can be used directly (without a language model) as follows:
```python
import torch
import torchaudio
from datasets import load_dataset
from lang_trans.arabic import buckwalter
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
dataset = load_dataset("common_voice", "ar", split="test[:10]")
resamplers = { # all three sampling rates exist in test split
48000: torchaudio.transforms.Resample(48000, 16000),
44100: torchaudio.transforms.Resample(44100, 16000),
32000: torchaudio.transforms.Resample(32000, 16000),
}
def prepare_example(example):
speech, sampling_rate = torchaudio.load(example["path"])
if sampling_rate in resamplers:
example["speech"] = resamplers[sampling_rate](speech).squeeze().numpy()
else:
example["speech"] = resamplers[4800](speech).squeeze().numpy()
return example
dataset = dataset.map(prepare_example)
processor = Wav2Vec2Processor.from_pretrained("bakrianoo/sinai-voice-ar-stt")
model = Wav2Vec2ForCTC.from_pretrained("bakrianoo/sinai-voice-ar-stt").eval()
def predict(batch):
inputs = processor(batch["speech"], sampling_rate=16000, return_tensors="pt", padding=True)
with torch.no_grad():
predicted = torch.argmax(model(inputs.input_values).logits, dim=-1)
predicted[predicted == -100] = processor.tokenizer.pad_token_id # see fine-tuning script
batch["predicted"] = processor.tokenizer.batch_decode(predicted)
return batch
dataset = dataset.map(predict, batched=True, batch_size=1, remove_columns=["speech"])
for reference, predicted in zip(dataset["sentence"], dataset["predicted"]):
print("reference:", reference)
print("predicted:", predicted)
print("--")
```
Here's the output:
```
reference: ألديك قلم ؟
predicted: ألديك قلم
--
reference: ليست هناك مسافة على هذه الأرض أبعد من يوم أمس.
predicted: ليست نارك مسافة على هذه الأرض أبعد من يوم أمس
--
reference: إنك تكبر المشكلة.
predicted: إنك تكبر المشكلة
--
reference: يرغب أن يلتقي بك.
predicted: يرغب أن يلتقي بك
--
reference: إنهم لا يعرفون لماذا حتى.
predicted: إنهم لا يعرفون لماذا حتى
--
reference: سيسعدني مساعدتك أي وقت تحب.
predicted: سيسعدن مساعثتك أي وقد تحب
--
reference: أَحَبُّ نظريّة علمية إليّ هي أن حلقات زحل مكونة بالكامل من الأمتعة المفقودة.
predicted: أحب نظرية علمية إلي هي أن أحلقتز حلم كوينا بالكامل من الأمت عن المفقودة
--
reference: سأشتري له قلماً.
predicted: سأشتري له قلما
--
reference: أين المشكلة ؟
predicted: أين المشكل
--
reference: وَلِلَّهِ يَسْجُدُ مَا فِي السَّمَاوَاتِ وَمَا فِي الْأَرْضِ مِنْ دَابَّةٍ وَالْمَلَائِكَةُ وَهُمْ لَا يَسْتَكْبِرُونَ
predicted: ولله يسجد ما في السماوات وما في الأرض من دابة والملائكة وهم لا يستكبرون
```
## Evaluation
The model can be evaluated as follows on the Arabic test data of Common Voice:
```python
import jiwer
import torch
import torchaudio
from datasets import load_dataset
from lang_trans.arabic import buckwalter
from transformers import set_seed, Wav2Vec2ForCTC, Wav2Vec2Processor
set_seed(42)
test_split = load_dataset("common_voice", "ar", split="test")
resamplers = { # all three sampling rates exist in test split
48000: torchaudio.transforms.Resample(48000, 16000),
44100: torchaudio.transforms.Resample(44100, 16000),
32000: torchaudio.transforms.Resample(32000, 16000),
}
def prepare_example(example):
speech, sampling_rate = torchaudio.load(example["path"])
if sampling_rate in resamplers:
example["speech"] = resamplers[sampling_rate](speech).squeeze().numpy()
else:
example["speech"] = resamplers[4800](speech).squeeze().numpy()
return example
test_split = test_split.map(prepare_example)
processor = Wav2Vec2Processor.from_pretrained("bakrianoo/sinai-voice-ar-stt")
model = Wav2Vec2ForCTC.from_pretrained("bakrianoo/sinai-voice-ar-stt").to("cuda").eval()
def predict(batch):
inputs = processor(batch["speech"], sampling_rate=16000, return_tensors="pt", padding=True)
with torch.no_grad():
predicted = torch.argmax(model(inputs.input_values.to("cuda")).logits, dim=-1)
predicted[predicted == -100] = processor.tokenizer.pad_token_id # see fine-tuning script
batch["predicted"] = processor.batch_decode(predicted)
return batch
test_split = test_split.map(predict, batched=True, batch_size=16, remove_columns=["speech"])
transformation = jiwer.Compose([
# normalize some diacritics, remove punctuation, and replace Persian letters with Arabic ones
jiwer.SubstituteRegexes({
r'[auiFNKo\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\~_،؟»\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\?;:\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\-,\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\.؛«!"]': "", "\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\u06D6": "",
r"[\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\|\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\{]": "A", "p": "h", "ک": "k", "ی": "y"}),
# default transformation below
jiwer.RemoveMultipleSpaces(),
jiwer.Strip(),
jiwer.SentencesToListOfWords(),
jiwer.RemoveEmptyStrings(),
])
metrics = jiwer.compute_measures(
truth=[buckwalter.trans(s) for s in test_split["sentence"]], # Buckwalter transliteration
hypothesis=[buckwalter.trans(s) for s in test_split["predicted"]],
truth_transform=transformation,
hypothesis_transform=transformation,
)
print(f"WER: {metrics['wer']:.2%}")
```
**Test Result**: 23.80%
## Other Arabic Voice recognition Models
الكلمات لا تكفى لشكر أولئك الذين يؤمنون أن هنالك أمل, و يسعون من أجله
- [elgeish/wav2vec2-large-xlsr-53-arabic](https://huggingface.co/elgeish/wav2vec2-large-xlsr-53-arabic)
- [othrif/wav2vec2-large-xlsr-arabic](https://huggingface.co/othrif/wav2vec2-large-xlsr-arabic)
- [anas/wav2vec2-large-xlsr-arabic](https://huggingface.co/anas/wav2vec2-large-xlsr-arabic)
|