m3hrdadfi committed on
Commit
2f546f1
•
1 Parent(s): acdefb8

Update readme for more info

Browse files
Files changed (1) hide show
  1. README.md +123 -34
README.md CHANGED
@@ -30,14 +30,16 @@ model-index:
30
 
31
  ---
32
 
33
- # Wav2Vec2-Large-XLSR-53 Georgian
34
 
35
  Fine-tuned [facebook/wav2vec2-large-xlsr-53](https://huggingface.co/facebook/wav2vec2-large-xlsr-53) in Georgian using [Common Voice](https://huggingface.co/datasets/common_voice). When using this model, make sure that your speech input is sampled at 16kHz.
36
 
37
  ## Usage
38
  The model can be used directly (without a language model) as follows:
39
 
 
40
  ```bash
 
41
  !pip install git+https://github.com/huggingface/datasets.git
42
  !pip install git+https://github.com/huggingface/transformers.git
43
  !pip install torchaudio
@@ -45,52 +47,49 @@ The model can be used directly (without a language model) as follows:
45
  !pip install jiwer
46
  ```
47
 
 
 
48
  ```python
 
49
  import torch
50
  import torchaudio
51
- from datasets import load_dataset, load_metric
52
  from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
 
53
 
54
- import librosa
55
-
56
- import pandas as pd
57
  import numpy as np
58
-
59
- import random
60
- import os
61
- import string
62
- import six
63
  import re
 
64
 
65
  import IPython.display as ipd
66
 
67
- # Loading the datasets
68
- dataset = load_dataset("common_voice", "ka", split="test")
69
- print(dataset)
70
-
71
- device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
72
- processor = Wav2Vec2Processor.from_pretrained("m3hrdadfi/wav2vec2-large-xlsr-georgian")
73
- model = Wav2Vec2ForCTC.from_pretrained("m3hrdadfi/wav2vec2-large-xlsr-georgian").to(device)
74
-
75
-
76
- # Preprocessing the datasets.
77
- chars_to_ignore_regex = f"""[{"".join([
78
  ",", "?", ".", "!", "-", ";", ":", '""', "%", "'", '"', "οΏ½",
79
  "#", "!", "?", "Β«", "Β»", "(", ")", "Ψ›", ",", "?", ".", "!", "-", ";", ":", '"',
80
  "β€œ", "%", "β€˜", "οΏ½", "–", "…", "_", "”", 'β€œ', 'β€ž'
81
- ])}]"""
 
 
 
82
 
83
- def remove_special_characters(text, chars_to_ignore):
84
- text = re.sub(chars_to_ignore, '', text).lower() + " "
 
 
 
 
85
  return text
86
 
87
- def normalizer(batch, chars_to_ignore):
88
- text = batch["sentence"]
89
- text = remove_special_characters(text, chars_to_ignore)
 
 
 
 
90
  batch["sentence"] = text
91
  return batch
92
 
93
- # We need to read the audio files as arrays
94
  def speech_file_to_array_fn(batch):
95
  speech_array, sampling_rate = torchaudio.load(batch["path"])
96
  speech_array = speech_array.squeeze().numpy()
@@ -99,6 +98,7 @@ def speech_file_to_array_fn(batch):
99
  batch["speech"] = speech_array
100
  return batch
101
 
 
102
  def predict(batch):
103
  features = processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)
104
 
@@ -113,14 +113,21 @@ def predict(batch):
113
  batch["predicted"] = processor.batch_decode(pred_ids)[0]
114
  return batch
115
 
116
- dataset = dataset.map(normalizer, fn_kwargs={"chars_to_ignore": chars_to_ignore_regex})
117
- dataset = dataset.map(speech_file_to_array_fn, remove_columns=list(set(dataset.column_names) - set(['sentence', 'path'])))
118
- result = dataset.map(predict)
119
- ```
120
 
121
- ## Prediction
 
 
 
 
 
 
 
 
 
 
 
 
122
 
123
- ```python
124
  max_items = np.random.randint(0, len(result), 20).tolist()
125
  for i in max_items:
126
  reference, predicted = result["sentence"][i], result["predicted"][i]
@@ -129,6 +136,7 @@ for i in max_items:
129
  print('---')
130
  ```
131
 
 
132
  ```text
133
  reference: ადმინისტრაციული ცენტრი ქალაქი იმიშლი
134
  predicted: ადმინისტრაციული ცენტრი ქალაქი იმიშლი
@@ -162,14 +170,95 @@ predicted: იგი მდებარეობბ ქალაქის ჩრ
162
  ---
163
  ```
164
 
 
165
  ## Evaluation
166
 
 
 
167
  ```python
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
168
  wer = load_metric("wer")
169
 
170
  print("WER: {:.2f}".format(100 * wer.compute(predictions=result["predicted"], references=result["sentence"])))
171
  ```
172
 
 
173
  **Test Result**:
174
  - WER: 54.00%
175
 
30
 
31
  ---
32
 
33
+ # Wav2Vec2-Large-XLSR-53-Georgian
34
 
35
  Fine-tuned [facebook/wav2vec2-large-xlsr-53](https://huggingface.co/facebook/wav2vec2-large-xlsr-53) in Georgian using [Common Voice](https://huggingface.co/datasets/common_voice). When using this model, make sure that your speech input is sampled at 16kHz.
36
 
37
  ## Usage
38
  The model can be used directly (without a language model) as follows:
39
 
40
+ **Requirements**
41
  ```bash
42
+ # required packages
43
  !pip install git+https://github.com/huggingface/datasets.git
44
  !pip install git+https://github.com/huggingface/transformers.git
45
  !pip install torchaudio
47
  !pip install jiwer
48
  ```
49
 
50
+
51
+ **Prediction**
52
  ```python
53
+ import librosa
54
  import torch
55
  import torchaudio
 
56
  from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
57
+ from datasets import load_dataset
58
 
 
 
 
59
  import numpy as np
 
 
 
 
 
60
  import re
61
+ import string
62
 
63
  import IPython.display as ipd
64
 
65
+ chars_to_ignore = [
 
 
 
 
 
 
 
 
 
 
66
  ",", "?", ".", "!", "-", ";", ":", '""', "%", "'", '"', "οΏ½",
67
  "#", "!", "?", "Β«", "Β»", "(", ")", "Ψ›", ",", "?", ".", "!", "-", ";", ":", '"',
68
  "β€œ", "%", "β€˜", "οΏ½", "–", "…", "_", "”", 'β€œ', 'β€ž'
69
+ ]
70
+ chars_to_mapping = {
71
+ "\u200c": " ", "\u200d": " ", "\u200e": " ", "\u200f": " ", "\ufeff": " ",
72
+ }
73
 
74
+ def multiple_replace(text, chars_to_mapping):
75
+ pattern = "|".join(map(re.escape, chars_to_mapping.keys()))
76
+ return re.sub(pattern, lambda m: chars_to_mapping[m.group()], str(text))
77
+
78
+ def remove_special_characters(text, chars_to_ignore_regex):
79
+ text = re.sub(chars_to_ignore_regex, '', text).lower() + " "
80
  return text
81
 
82
+ def normalizer(batch, chars_to_ignore, chars_to_mapping):
83
+ chars_to_ignore_regex = f"""[{"".join(chars_to_ignore)}]"""
84
+ text = batch["sentence"].lower().strip()
85
+
86
+ text = multiple_replace(text, chars_to_mapping)
87
+ text = remove_special_characters(text, chars_to_ignore_regex)
88
+
89
  batch["sentence"] = text
90
  return batch
91
 
92
+
93
  def speech_file_to_array_fn(batch):
94
  speech_array, sampling_rate = torchaudio.load(batch["path"])
95
  speech_array = speech_array.squeeze().numpy()
98
  batch["speech"] = speech_array
99
  return batch
100
 
101
+
102
  def predict(batch):
103
  features = processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)
104
 
113
  batch["predicted"] = processor.batch_decode(pred_ids)[0]
114
  return batch
115
 
 
 
 
 
116
 
117
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
118
+ processor = Wav2Vec2Processor.from_pretrained("m3hrdadfi/wav2vec2-large-xlsr-georgian")
119
+ model = Wav2Vec2ForCTC.from_pretrained("m3hrdadfi/wav2vec2-large-xlsr-georgian").to(device)
120
+
121
+ dataset = load_dataset("common_voice", "ka", split="test[:1%]")
122
+ dataset = dataset.map(
123
+ normalizer,
124
+ fn_kwargs={"chars_to_ignore": chars_to_ignore, "chars_to_mapping": chars_to_mapping},
125
+ remove_columns=list(set(dataset.column_names) - set(['sentence', 'path']))
126
+ )
127
+
128
+ dataset = dataset.map(speech_file_to_array_fn)
129
+ result = dataset.map(predict)
130
 
 
131
  max_items = np.random.randint(0, len(result), 20).tolist()
132
  for i in max_items:
133
  reference, predicted = result["sentence"][i], result["predicted"][i]
136
  print('---')
137
  ```
138
 
139
+ **Output:**
140
  ```text
141
  reference: ადმინისტრაციული ცენტრი ქალაქი იმიშლი
142
  predicted: ადმინისტრაციული ცენტრი ქალაქი იმიშლი
170
  ---
171
  ```
172
 
173
+
174
  ## Evaluation
175
 
176
+ The model can be evaluated as follows on the Georgian test data of Common Voice.
177
+
178
  ```python
179
+ import librosa
180
+ import torch
181
+ import torchaudio
182
+ from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
183
+ from datasets import load_dataset, load_metric
184
+
185
+ import numpy as np
186
+ import re
187
+ import string
188
+
189
+
190
+ chars_to_ignore = [
191
+ ",", "?", ".", "!", "-", ";", ":", '""', "%", "'", '"', "οΏ½",
192
+ "#", "!", "?", "Β«", "Β»", "(", ")", "Ψ›", ",", "?", ".", "!", "-", ";", ":", '"',
193
+ "β€œ", "%", "β€˜", "οΏ½", "–", "…", "_", "”", 'β€œ', 'β€ž'
194
+ ]
195
+ chars_to_mapping = {
196
+ "\u200c": " ", "\u200d": " ", "\u200e": " ", "\u200f": " ", "\ufeff": " ",
197
+ }
198
+
199
+ def multiple_replace(text, chars_to_mapping):
200
+ pattern = "|".join(map(re.escape, chars_to_mapping.keys()))
201
+ return re.sub(pattern, lambda m: chars_to_mapping[m.group()], str(text))
202
+
203
+ def remove_special_characters(text, chars_to_ignore_regex):
204
+ text = re.sub(chars_to_ignore_regex, '', text).lower() + " "
205
+ return text
206
+
207
+ def normalizer(batch, chars_to_ignore, chars_to_mapping):
208
+ chars_to_ignore_regex = f"""[{"".join(chars_to_ignore)}]"""
209
+ text = batch["sentence"].lower().strip()
210
+
211
+ text = multiple_replace(text, chars_to_mapping)
212
+ text = remove_special_characters(text, chars_to_ignore_regex)
213
+
214
+ batch["sentence"] = text
215
+ return batch
216
+
217
+
218
+ def speech_file_to_array_fn(batch):
219
+ speech_array, sampling_rate = torchaudio.load(batch["path"])
220
+ speech_array = speech_array.squeeze().numpy()
221
+ speech_array = librosa.resample(np.asarray(speech_array), sampling_rate, 16_000)
222
+
223
+ batch["speech"] = speech_array
224
+ return batch
225
+
226
+
227
+ def predict(batch):
228
+ features = processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)
229
+
230
+ input_values = features.input_values.to(device)
231
+ attention_mask = features.attention_mask.to(device)
232
+
233
+ with torch.no_grad():
234
+ logits = model(input_values, attention_mask=attention_mask).logits
235
+
236
+ pred_ids = torch.argmax(logits, dim=-1)
237
+
238
+ batch["predicted"] = processor.batch_decode(pred_ids)[0]
239
+ return batch
240
+
241
+
242
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
243
+ processor = Wav2Vec2Processor.from_pretrained("m3hrdadfi/wav2vec2-large-xlsr-georgian")
244
+ model = Wav2Vec2ForCTC.from_pretrained("m3hrdadfi/wav2vec2-large-xlsr-georgian").to(device)
245
+
246
+ dataset = load_dataset("common_voice", "ka", split="test[:1%]")
247
+ dataset = dataset.map(
248
+ normalizer,
249
+ fn_kwargs={"chars_to_ignore": chars_to_ignore, "chars_to_mapping": chars_to_mapping},
250
+ remove_columns=list(set(dataset.column_names) - set(['sentence', 'path']))
251
+ )
252
+
253
+ dataset = dataset.map(speech_file_to_array_fn)
254
+ result = dataset.map(predict)
255
+
256
  wer = load_metric("wer")
257
 
258
  print("WER: {:.2f}".format(100 * wer.compute(predictions=result["predicted"], references=result["sentence"])))
259
  ```
260
 
261
+
262
  **Test Result**:
263
  - WER: 54.00%
264