othrif committed on
Commit
9909c05
•
1 Parent(s): cacc687

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +28 -10
README.md CHANGED
@@ -44,7 +44,7 @@ import torchaudio
44
  from datasets import load_dataset
45
  from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
46
 
47
- test_dataset = load_dataset("", split="test[:2%]")
48
 
49
  processor = Wav2Vec2Processor.from_pretrained("othrif/wav2vec2-large-xlsr-arabic")
50
  model = Wav2Vec2ForCTC.from_pretrained("othrif/wav2vec2-large-xlsr-arabic")
@@ -83,20 +83,38 @@ from datasets import load_dataset, load_metric
83
  from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
84
  import re
85
 
86
- test_dataset = load_dataset("common_voice", "ar", split="test")
87
  wer = load_metric("wer")
88
 
89
- processor = Wav2Vec2Processor.from_pretrained("othrif/wav2vec2-large-xlsr-arabic")
90
- model = Wav2Vec2ForCTC.from_pretrained("othrif/wav2vec2-large-xlsr-arabic")
91
  model.to("cuda")
92
 
93
- chars_to_ignore_regex = '[\\\\\\\\ุ›\\\\\\\\โ€”\\\\\\\\_get\\\\\\\\ยซ\\\\\\\\ยป\\\\\\\\ู€\\\\\\\\ู€\\\\\\\\,\\\\\\\\?\\\\\\\\.\\\\\\\\!\\\\\\\\-\\\\\\\\;\\\\\\\\:\\\\\\\\"\\\\\\\\โ€œ\\\\\\\\%\\\\\\\\โ€˜\\\\\\\\โ€\\\\\\\\๏ฟฝ\\\\\\\\#\\\\\\\\ุŒ\\\\\\\\โ˜ญ,\\\\\\\\ุŸ]'
94
- resampler = torchaudio.transforms.Resample(48_000, 16_000)
95
 
96
  # Preprocessing the datasets.
97
  # We need to read the audio files as arrays
98
  def speech_file_to_array_fn(batch):
99
- batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
100
  speech_array, sampling_rate = torchaudio.load(batch["path"])
101
  batch["speech"] = resampler(speech_array).squeeze().numpy()
102
  return batch
@@ -117,7 +135,7 @@ def evaluate(batch):
117
 
118
  result = test_dataset.map(evaluate, batched=True, batch_size=8)
119
 
120
- print("WER: {:2f}".format(100 * wer.compute(predictions=result["pred_strings"], references=result["sentence"])))
121
  ```
122
 
123
  **Test Result**: 44.51
@@ -125,6 +143,6 @@ print("WER: {:2f}".format(100 * wer.compute(predictions=result["pred_strings"],
125
 
126
  ## Training
127
 
128
- The Common Voice `train`, `validation` datasets were used for training.
129
 
130
- The script used for training can be found [here](https://huggingface.co/othrif/wav2vec2-large-xlsr-arabic/tree/main)
 
44
  from datasets import load_dataset
45
  from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
46
 
47
+ test_dataset = load_dataset("ma_speech_corpus", split="test")
48
 
49
  processor = Wav2Vec2Processor.from_pretrained("othrif/wav2vec2-large-xlsr-arabic")
50
  model = Wav2Vec2ForCTC.from_pretrained("othrif/wav2vec2-large-xlsr-arabic")
 
83
  from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
84
  import re
85
 
86
+ test_dataset = load_dataset("ma_speech_corpus", split="test")
87
  wer = load_metric("wer")
88
 
89
+ processor = Wav2Vec2Processor.from_pretrained("othrif/wav2vec2-large-xlsr-moroccan")
90
+ model = Wav2Vec2ForCTC.from_pretrained("othrif/wav2vec2-large-xlsr-moroccan")
91
  model.to("cuda")
92
 
93
+ chars_to_ignore_regex = '[0\,\?\.\!\-\;\:\"\โ€œ\%\โ€˜\โ€\๏ฟฝ\n\@\ู€\ุŸ\*\ \#\'\ \โ€ฆ\\u2003]'
94
+ #resampler = torchaudio.transforms.Resample(48_000, 16_000)
95
 
96
  # Preprocessing the datasets.
97
  # We need to read the audio files as arrays
98
  def speech_file_to_array_fn(batch):
99
+ batch["text"] = re.sub(chars_to_ignore_regex, '', batch["text"]).lower()
100
+ batch["text"] = re.sub('[a-zA-z]', '', batch["text"]).lower() + " "
101
+ batch["text"] = re.sub('[ูููŽู‹ููŒ~]', '', batch["text"]).lower() + " "
102
+
103
+ # batch["text"] = re.sub('\\n','', batch["text"])
104
+ batch["text"] = re.sub("[إأٱآا]", "ا", batch["text"])
105
+ batch["text"] = re.sub("ڸ", "ل", batch["text"])
106
+ noise = re.compile(""" ّ | # Tashdid
107
+ َ | # Fatha
108
+ ً | # Tanwin Fath
109
+ ُ | # Damma
110
+ ٌ | # Tanwin Damm
111
+ ِ | # Kasra
112
+ ٍ | # Tanwin Kasr
113
+ ْ | # Sukun
114
+ ـ # Tatwil/Kashida
115
+ """, re.VERBOSE)
116
+ batch["text"] = re.sub(noise, '', batch["text"])
117
+ batch["text"] = re.sub('ٖ', '', batch["text"]).lower() + " "
118
  speech_array, sampling_rate = torchaudio.load(batch["path"])
119
  batch["speech"] = resampler(speech_array).squeeze().numpy()
120
  return batch
 
135
 
136
  result = test_dataset.map(evaluate, batched=True, batch_size=8)
137
 
138
+ print("WER: {:2f}".format(100 * wer.compute(predictions=result["pred_strings"], references=result["text"])))
139
  ```
140
 
141
  **Test Result**: 44.51
 
143
 
144
  ## Training
145
 
146
+ The [MGB5](http://www.islrn.org/resources/938-639-614-524-5/) `train` and `validation` datasets were used for training.
147
 
148
+ The script used for training can be found [here](https://github.com/othrif/xlsr-wav2vec2)