m3hrdadfi committed
Commit 045d59f
1 Parent(s): bc5ac14

Update readme for more info

Files changed (1): README.md (+102 -54)
README.md CHANGED
@@ -26,21 +26,21 @@ model-index:
   metrics:
   - name: Test WER
     type: wer
-   value: 32.09
+   value: 32.18
   - name: Test CER
     type: cer
-   value: 8.23
+   value: 8.27
 
 ---
 
- # Wav2Vec2-Large-XLSR-53 Persian
+ # Wav2Vec2-Large-XLSR-53-Persian
 
 Fine-tuned [facebook/wav2vec2-large-xlsr-53](https://huggingface.co/facebook/wav2vec2-large-xlsr-53) in Persian (Farsi) using [Common Voice](https://huggingface.co/datasets/common_voice). When using this model, make sure that your speech input is sampled at 16kHz.
 
- ## How To Use
+ ## Usage
 The model can be used directly (without a language model) as follows:
 
- ### Requirements
+ **Requirements**
 ```bash
 # requirement packages
 !pip install git+https://github.com/huggingface/datasets.git
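The 16 kHz note in the hunk above applies to any custom audio as well; as a minimal sketch (an editorial aside, not a line of the diff), a file recorded at another sampling rate can be resampled with torchaudio before it is handed to the processor. The file name `sample.wav` is hypothetical.

```python
# Editorial sketch: resample arbitrary input audio to the 16 kHz the model expects.
# "sample.wav" is a hypothetical local file; torchaudio is already used by the README's code.
import torchaudio

speech, sampling_rate = torchaudio.load("sample.wav")
if sampling_rate != 16_000:
    resampler = torchaudio.transforms.Resample(orig_freq=sampling_rate, new_freq=16_000)
    speech = resampler(speech)
speech = speech.squeeze().numpy()  # float array at 16 kHz, ready for Wav2Vec2Processor
```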
@@ -51,23 +51,28 @@ The model can be used directly (without a language model) as follows:
 !pip install hazm
 ```
 
- ### Preprocessing
 
+ **Prediction**
 ```python
- # preprocessing the datasets.
- # normalizing the texts
+ import librosa
+ import torch
+ import torchaudio
+ from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
+ from datasets import load_dataset
 
+ import numpy as np
 import hazm
 import re
 import string
 
+ import IPython.display as ipd
 
 _normalizer = hazm.Normalizer()
- chars_to_ignore = list(set([
+ chars_to_ignore = [
     ",", "?", ".", "!", "-", ";", ":", '""', "%", "'", '"', "�",
     "#", "!", "؟", "?", "«", "»", "ء", "،", "(", ")", "؛", "'ٔ", "٬",'ٔ', ",", "?",
     ".", "!", "-", ";", ":",'"',"“", "%", "‘", "”", "�", "–", "…", "_", "”", '“', '„'
- ]))
+ ]
 
 # In case of farsi
 chars_to_ignore = chars_to_ignore + list(string.ascii_lowercase + string.digits)
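A small usage sketch for the normalization setup above, assuming the `normalizer` helper and `chars_to_mapping` table defined in the unchanged lines that follow (they appear in full in the evaluation snippet later in this diff); this is an illustration, not part of the README.

```python
# Editorial sketch: the README's normalizer strips the ignored punctuation and maps
# Arabic code points (e.g. ك, ي) to their Persian forms (ک, ی).
sample = {"sentence": "ك «سلام»، دنيا!"}
sample = normalizer(sample, chars_to_ignore, chars_to_mapping)
print(sample["sentence"])  # punctuation removed, characters mapped to standard Persian forms
```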
@@ -100,37 +105,6 @@ def normalizer(batch, chars_to_ignore, chars_to_mapping):
 
     batch["sentence"] = text
     return batch
- ```
-
- ### Loading The Data
-
- ```python
- from datasets import load_dataset
-
- dataset = load_dataset("common_voice", "fa", split="test[:1%]")
- print(dataset)
- ```
-
- **Output:**
- ```text
- >>>
- Dataset({
-     features: ['client_id', 'path', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment'],
-     num_rows: 52
- })
- ```
-
- ### Model
-
- ```python
- import librosa
- import torch
- import torchaudio
- from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
-
- device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
- processor = Wav2Vec2Processor.from_pretrained("m3hrdadfi/wav2vec2-large-xlsr-persian")
- model = Wav2Vec2ForCTC.from_pretrained("m3hrdadfi/wav2vec2-large-xlsr-persian").to(device)
 
 
 def speech_file_to_array_fn(batch):
@@ -155,13 +129,11 @@ def predict(batch):
 
     batch["predicted"] = processor.batch_decode(pred_ids)[0]
     return batch
- ```
 
- ## Prediction
-
- ```python
- import IPython.display as ipd
 
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ processor = Wav2Vec2Processor.from_pretrained("m3hrdadfi/wav2vec2-large-xlsr-persian")
+ model = Wav2Vec2ForCTC.from_pretrained("m3hrdadfi/wav2vec2-large-xlsr-persian").to(device)
 
 dataset = load_dataset("common_voice", "fa", split="test[:1%]")
 dataset = dataset.map(
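The unchanged lines elided after this hunk presumably map `normalizer`, `speech_file_to_array_fn`, and `predict` over the 1% test split and print reference/predicted pairs (the next hunk header quotes one such line). A minimal sketch, assuming those mapped columns exist and using the `IPython.display` and `numpy` imports added above; it is not part of the diff.

```python
# Editorial sketch: inspect one example after the README's map calls have added
# the "speech" and "predicted" columns to the 1% test split.
sample = dataset[0]
ipd.display(ipd.Audio(data=np.asarray(sample["speech"]), rate=16_000))
print("reference:", sample["sentence"])
print("predicted:", sample["predicted"])
```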
@@ -246,13 +218,91 @@ predicted: من سفر کردم را دوست دارم
 
 ## Evaluation
 
+ The model can be evaluated as follows on the Persian (Farsi) test data of Common Voice.
+
 ```bash
- mkdir cer
- wget -O cer/cer.py https://huggingface.co/ctl/wav2vec2-large-xlsr-cantonese/raw/main/cer.py
+ !mkdir cer
+ !wget -O cer/cer.py https://huggingface.co/ctl/wav2vec2-large-xlsr-cantonese/raw/main/cer.py
 ```
 
 ```python
- from datasets import load_metric
+ import librosa
+ import torch
+ import torchaudio
+ from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
+ from datasets import load_dataset, load_metric
+
+ import hazm
+ import re
+ import string
+
+ _normalizer = hazm.Normalizer()
+ chars_to_ignore = [
+     ",", "?", ".", "!", "-", ";", ":", '""', "%", "'", '"', "�",
+     "#", "!", "؟", "?", "«", "»", "ء", "،", "(", ")", "؛", "'ٔ", "٬",'ٔ', ",", "?",
+     ".", "!", "-", ";", ":",'"',"“", "%", "‘", "”", "�", "–", "…", "_", "”", '“', '„'
+ ]
+
+ # In case of farsi
+ # chars_to_ignore = chars_to_ignore + list(string.ascii_lowercase + string.digits)
+
+ chars_to_mapping = {
+     'ك': 'ک', 'دِ': 'د', 'بِ': 'ب', 'زِ': 'ز', 'ذِ': 'ذ', 'شِ': 'ش', 'سِ': 'س', 'ى': 'ی',
+     'ي': 'ی', 'أ': 'ا', 'ؤ': 'و', "ے": "ی", "ۀ": "ه", "ﭘ": "پ", "ﮐ": "ک", "ﯽ": "ی",
+     "ﺎ": "ا", "ﺑ": "ب", "ﺘ": "ت", "ﺧ": "خ", "ﺩ": "د", "ﺱ": "س", "ﻀ": "ض", "ﻌ": "ع",
+     "ﻟ": "ل", "ﻡ": "م", "ﻢ": "م", "ﻪ": "ه", "ﻮ": "و", "ئ": "ی", 'ﺍ': "ا", 'ة': "ه",
+     'ﯾ': "ی", 'ﯿ': "ی", 'ﺒ': "ب", 'ﺖ': "ت", 'ﺪ': "د", 'ﺮ': "ر", 'ﺴ': "س", 'ﺷ': "ش",
+     'ﺸ': "ش", 'ﻋ': "ع", 'ﻤ': "م", 'ﻥ': "ن", 'ﻧ': "ن", 'ﻭ': "و", 'ﺭ': "ر", "ﮔ": "گ",
+     "\u200c": " ", "\u200d": " ", "\u200e": " ", "\u200f": " ", "\ufeff": " ",
+ }
+
+ def multiple_replace(text, chars_to_mapping):
+     pattern = "|".join(map(re.escape, chars_to_mapping.keys()))
+     return re.sub(pattern, lambda m: chars_to_mapping[m.group()], str(text))
+
+ def remove_special_characters(text, chars_to_ignore_regex):
+     text = re.sub(chars_to_ignore_regex, '', text).lower() + " "
+     return text
+
+ def normalizer(batch, chars_to_ignore, chars_to_mapping):
+     chars_to_ignore_regex = f"""[{"".join(chars_to_ignore)}]"""
+     text = batch["sentence"].lower().strip()
+
+     text = _normalizer.normalize(text)
+     text = multiple_replace(text, chars_to_mapping)
+     text = remove_special_characters(text, chars_to_ignore_regex)
+
+     batch["sentence"] = text
+     return batch
+
+
+ def speech_file_to_array_fn(batch):
+     speech_array, sampling_rate = torchaudio.load(batch["path"])
+     speech_array = speech_array.squeeze().numpy()
+     speech_array = librosa.resample(np.asarray(speech_array), sampling_rate, 16_000)
+
+     batch["speech"] = speech_array
+     return batch
+
+
+ def predict(batch):
+     features = processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)
+
+     input_values = features.input_values.to(device)
+     attention_mask = features.attention_mask.to(device)
+
+     with torch.no_grad():
+         logits = model(input_values, attention_mask=attention_mask).logits
+
+     pred_ids = torch.argmax(logits, dim=-1)
+
+     batch["predicted"] = processor.batch_decode(pred_ids)[0]
+     return batch
+
+
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ processor = Wav2Vec2Processor.from_pretrained("m3hrdadfi/wav2vec2-large-xlsr-persian")
+ model = Wav2Vec2ForCTC.from_pretrained("m3hrdadfi/wav2vec2-large-xlsr-persian").to(device)
 
 dataset = load_dataset("common_voice", "fa", split="test")
 dataset = dataset.map(
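The unchanged lines elided between this hunk and the next presumably create the `wer` and `cer` metric objects used in the prints below. A minimal sketch of how that might look with the legacy `datasets.load_metric` API; the `./cer/cer.py` path is an assumption based on the `wget` destination above, not a line shown in the diff.

```python
# Editorial sketch (assumed, not shown in the diff): loading the metrics used below.
from datasets import load_metric

wer = load_metric("wer")           # built-in word error rate metric (legacy datasets API)
cer = load_metric("./cer/cer.py")  # assumed path to the CER script downloaded above
```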
@@ -270,11 +320,9 @@ print("WER: {:.2f}".format(100 * wer.compute(predictions=result["predicted"], re
 print("CER: {:.2f}".format(100 * cer.compute(predictions=result["predicted"], references=result["sentence"])))
 ```
 
- **Output:**
- ```text
- WER: 32.09%
- CER: 8.23%
- ```
+ **Test Result:**
+ - WER: 32.18%
+ - CER: 8.27%
 
 
 ## Training