othrif committed
Commit c9a2c45 • 1 Parent(s): 8daeec6

Update README.md

Files changed (1)
  1. README.md +78 -43
README.md CHANGED
@@ -39,35 +39,63 @@ When using this model, make sure that your speech input is sampled at 16kHz.
The model can be used directly (without a language model) as follows:

```python
import torch
import torchaudio
- from datasets import load_dataset
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

- test_dataset = load_dataset("ma_speech_corpus", split="test")

- processor = Wav2Vec2Processor.from_pretrained("othrif/wav2vec2-large-xlsr-arabic")
- model = Wav2Vec2ForCTC.from_pretrained("othrif/wav2vec2-large-xlsr-arabic")

- resampler = torchaudio.transforms.Resample(48_000, 16_000)

- # Preprocessing the datasets.
- # We need to read the audio files as arrays
def speech_file_to_array_fn(batch):
    speech_array, sampling_rate = torchaudio.load(batch["path"])
-     batch["speech"] = resampler(speech_array).squeeze().numpy()
    return batch

- test_dataset = test_dataset.map(speech_file_to_array_fn)
- inputs = processor(test_dataset["speech"][:2], sampling_rate=16_000, return_tensors="pt", padding=True)

- with torch.no_grad():
-     logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits

- predicted_ids = torch.argmax(logits, dim=-1)

- print("Prediction:", processor.batch_decode(predicted_ids))
- print("Reference:", test_dataset["sentence"][:2])
```

@@ -77,60 +105,67 @@ The model can be evaluated as follows on the Arabic test data of Common Voice.

```python
- import re
import torch
- import librosa
import torchaudio
from datasets import load_dataset, load_metric
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
- import soundfile as sf

- eval_dataset = load_dataset("ma_speech_corpus", split="test")
wer = load_metric("wer")

- processor = Wav2Vec2Processor.from_pretrained("othrif/wav2vec2-large-xlsr-moroccan")
model = Wav2Vec2ForCTC.from_pretrained("othrif/wav2vec2-large-xlsr-moroccan")
model.to("cuda")

- chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"\“\'\�]'
-
- def remove_special_characters(batch):
-     batch["text"] = re.sub(chars_to_ignore_regex, "", batch["sentence"]).lower() + " "
-     return batch
-
- eval_dataset = eval_dataset.map(remove_special_characters, remove_columns=["sentence"])
- #eval_dataset = eval_dataset.select(range(100))

def speech_file_to_array_fn(batch):
-     start, stop = batch['segment'].split('_')
    speech_array, sampling_rate = torchaudio.load(batch["path"])
-     speech_array, sampling_rate = sf.read(batch["path"], start=int(float(start) * sampling_rate),
-                                           stop=int(float(stop) * sampling_rate))
-     batch["speech"] = librosa.resample(speech_array, sampling_rate, 16_000)
-     batch["sampling_rate"] = 16_000
-     batch["target_text"] = batch["text"]
    return batch

- eval_dataset = eval_dataset.map(
-     speech_file_to_array_fn,
-     remove_columns=eval_dataset.column_names
- )
-
def evaluate(batch):
    inputs = processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)

    with torch.no_grad():
-         logits = model(inputs.input_values.to("cuda"), attention_mask=inputs.attention_mask.to("cuda")).logits

    pred_ids = torch.argmax(logits, dim=-1)
    batch["pred_strings"] = processor.batch_decode(pred_ids)
    return batch

- result = eval_dataset.map(evaluate, batched=True, batch_size=32)

- print("WER: {:2f}".format(100 * wer.compute(predictions=result["pred_strings"], references=result["target_text"])))
```

**Test Result**: 66.45
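
The 16 kHz requirement noted above is worth spelling out: the usage snippet hard-codes a 48 kHz → 16 kHz resampler, but audio at any other rate needs the same treatment. A minimal sketch with torchaudio (the file name is a placeholder) that derives the source rate from the file itself:

```python
import torchaudio

# Placeholder path; any input rate works, but the model itself needs 16 kHz audio.
speech_array, sampling_rate = torchaudio.load("audio.wav")

if sampling_rate != 16_000:
    resampler = torchaudio.transforms.Resample(orig_freq=sampling_rate, new_freq=16_000)
    speech_array = resampler(speech_array)

speech = speech_array.squeeze().numpy()  # 1-D numpy array at 16 kHz
```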
 
The model can be used directly (without a language model) as follows:

```python
+ import re
import torch
+ import librosa
import torchaudio
+ from datasets import load_dataset, load_metric
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
+ import soundfile as sf

+ dataset = load_dataset("ma_speech_corpus", split="test")

+ processor = Wav2Vec2Processor.from_pretrained("othrif/wav2vec2-large-xlsr-moroccan")
+ model = Wav2Vec2ForCTC.from_pretrained("othrif/wav2vec2-large-xlsr-moroccan")
+ model.to("cuda")
+
+ chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"\“\'\�]'
+
+ def remove_special_characters(batch):
+     batch["text"] = re.sub(chars_to_ignore_regex, "", batch["sentence"]).lower() + " "
+     return batch
+
+ dataset = dataset.map(remove_special_characters)
+ dataset = dataset.select(range(10))

def speech_file_to_array_fn(batch):
+     # "segment" holds the utterance boundaries as "<start>_<stop>" in seconds
+     start, stop = batch['segment'].split('_')
    speech_array, sampling_rate = torchaudio.load(batch["path"])
+     speech_array, sampling_rate = sf.read(batch["path"], start=int(float(start) * sampling_rate),
+                                           stop=int(float(stop) * sampling_rate))
+     batch["speech"] = librosa.resample(speech_array, sampling_rate, 16_000)
+     batch["sampling_rate"] = 16_000
+     batch["target_text"] = batch["text"]
    return batch

+ dataset = dataset.map(speech_file_to_array_fn)
+
+ def predict(batch):
+     inputs = processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)
+
+     with torch.no_grad():
+         logits = model(inputs.input_values.to("cuda"), attention_mask=inputs.attention_mask.to("cuda")).logits
+
+     pred_ids = torch.argmax(logits, dim=-1)
+     batch["predicted"] = processor.batch_decode(pred_ids)
+     return batch

+ dataset = dataset.map(predict, batched=True, batch_size=32)

+ for reference, predicted in zip(dataset["sentence"], dataset["predicted"]):
+     print("reference:", reference)
+     print("predicted:", predicted)
+     print("--")
```
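
The slicing logic in `speech_file_to_array_fn` relies on each example's `segment` field, which encodes the utterance boundaries as `"<start>_<stop>"` in seconds; multiplying by the file's sampling rate turns them into the frame offsets that `sf.read` expects. A small sketch with a made-up segment value:

```python
# Hypothetical segment value: the utterance runs from 3.25 s to 7.5 s.
segment = "3.25_7.5"
sampling_rate = 16_000  # as reported by torchaudio.load for the file

start, stop = segment.split("_")
start_frame = int(float(start) * sampling_rate)  # 52000
stop_frame = int(float(stop) * sampling_rate)    # 120000

# sf.read(path, start=start_frame, stop=stop_frame) would then return
# only the 68000 samples (4.25 s) belonging to this utterance.
print(start_frame, stop_frame)
```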
 
 
The model can be evaluated as follows on the Arabic test data of Common Voice.

```python
import torch
import torchaudio
from datasets import load_dataset, load_metric
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
+ import re

+ test_dataset = load_dataset("ma_speech_corpus", split="test")
wer = load_metric("wer")

+ processor = Wav2Vec2Processor.from_pretrained("othrif/wav2vec2-large-xlsr-moroccan")
model = Wav2Vec2ForCTC.from_pretrained("othrif/wav2vec2-large-xlsr-moroccan")
model.to("cuda")

+ chars_to_ignore_regex = '[0\,\?\.\!\-\;\:\"\“\%\‘\”\�\@\ـ\؟\*\ \#\'\ \…\\u2003]'
+ resampler = torchaudio.transforms.Resample(48_000, 16_000)

+ # Preprocessing the datasets.
+ # We need to read the audio files as arrays
def speech_file_to_array_fn(batch):
+     batch["text"] = re.sub(chars_to_ignore_regex, '', batch["text"]).lower()
+     batch["text"] = re.sub('[a-zA-Z]', '', batch["text"]).lower() + " "
+     batch["text"] = re.sub('[ًٌٍَُِ~]', '', batch["text"]).lower() + " "
+     # batch["text"] = re.sub('\\n', '', batch["text"])
+     batch["text"] = re.sub("[إأٱآا]", "ا", batch["text"])
+     batch["text"] = re.sub("ڸ", "ل", batch["text"])
+     noise = re.compile(""" ّ    | # Tashdid
+                            َ    | # Fatha
+                            ً    | # Tanwin Fath
+                            ُ    | # Damma
+                            ٌ    | # Tanwin Damm
+                            ِ    | # Kasra
+                            ٍ    | # Tanwin Kasr
+                            ْ    | # Sukun
+                            ـ     # Tatwil/Kashida
+                        """, re.VERBOSE)
+     batch["text"] = re.sub(noise, '', batch["text"])
+     batch["text"] = re.sub('ٖ', '', batch["text"]).lower() + " "
    speech_array, sampling_rate = torchaudio.load(batch["path"])
+     batch["speech"] = resampler(speech_array).squeeze().numpy()
    return batch

+ test_dataset = test_dataset.map(speech_file_to_array_fn)

def evaluate(batch):
    inputs = processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)

    with torch.no_grad():
+         logits = model(inputs.input_values.to("cuda"), attention_mask=inputs.attention_mask.to("cuda")).logits

    pred_ids = torch.argmax(logits, dim=-1)
    batch["pred_strings"] = processor.batch_decode(pred_ids)
    return batch

+ result = test_dataset.map(evaluate, batched=True, batch_size=8)

+ print("WER: {:.2f}".format(100 * wer.compute(predictions=result["pred_strings"], references=result["text"])))
```

**Test Result**: 66.45
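
The reported number is the word error rate in percent, computed by the `wer` metric loaded above. A toy example (made-up strings) showing what `wer.compute` returns:

```python
from datasets import load_metric

wer = load_metric("wer")  # requires the jiwer package

# One substitution out of three reference words -> WER = 1/3.
score = wer.compute(
    predictions=["the cat sat"],
    references=["the cat sit"],
)
print("WER: {:.2f}".format(100 * score))  # 33.33
```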