Nhut DOANNGUYEN committed on
Commit c038370
1 Parent(s): cdffa6b

Version 2.27

Files changed (7)
  1. .DS_Store +0 -0
  2. README.md +223 -14
  3. added_tokens.json +0 -1
  4. config.json +5 -5
  5. pytorch_model.bin +2 -2
  6. tokenizer_config.json +1 -1
  7. vocab.json +1 -1
.DS_Store CHANGED
Binary files a/.DS_Store and b/.DS_Store differ
 
README.md CHANGED
@@ -42,6 +42,115 @@ import torchaudio
 from datasets import load_dataset
 from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
 
+ENCODER = {
+    "ia ": "iê ",
+    "ìa ": "iề ",
+    "ía ": "iế ",
+    "ỉa ": "iể ",
+    "ĩa ": "iễ ",
+    "ịa ": "iệ ",
+    "ya ": "yê ",
+    "ỳa ": "yề ",
+    "ýa ": "yế ",
+    "ỷa ": "yể ",
+    "ỹa ": "yễ ",
+    "ỵa ": "yệ ",
+    "ua ": "uô ",
+    "ùa ": "uồ ",
+    "úa ": "uố ",
+    "ủa ": "uổ ",
+    "ũa ": "uỗ ",
+    "ụa ": "uộ ",
+    "ưa ": "ươ ",
+    "ừa ": "ườ ",
+    "ứa ": "ướ ",
+    "ửa ": "ưở ",
+    "ữa ": "ưỡ ",
+    "ựa ": "ượ ",
+    "ke": "ce",
+    "kè": "cè",
+    "ké": "cé",
+    "kẻ": "cẻ",
+    "kẽ": "cẽ",
+    "kẹ": "cẹ",
+    "kê": "cê",
+    "kề": "cề",
+    "kế": "cế",
+    "kể": "cể",
+    "kễ": "cễ",
+    "kệ": "cệ",
+    "ki": "ci",
+    "kì": "cì",
+    "kí": "cí",
+    "kỉ": "cỉ",
+    "kĩ": "cĩ",
+    "kị": "cị",
+    "ky": "cy",
+    "kỳ": "cỳ",
+    "ký": "cý",
+    "kỷ": "cỷ",
+    "kỹ": "cỹ",
+    "kỵ": "cỵ",
+    "ghe": "ge",
+    "ghè": "gè",
+    "ghé": "gé",
+    "ghẻ": "gẻ",
+    "ghẽ": "gẽ",
+    "ghẹ": "gẹ",
+    "ghê": "gê",
+    "ghề": "gề",
+    "ghế": "gế",
+    "ghể": "gể",
+    "ghễ": "gễ",
+    "ghệ": "gệ",
+    "ngh": "\x80",
+    "uyê": "\x96",
+    "uyề": "\x97",
+    "uyế": "\x98",
+    "uyể": "\x99",
+    "uyễ": "\x9a",
+    "uyệ": "\x9b",
+    "ng": "\x81",
+    "ch": "\x82",
+    "gh": "\x83",
+    "nh": "\x84",
+    "gi": "\x85",
+    "ph": "\x86",
+    "kh": "\x87",
+    "th": "\x88",
+    "tr": "\x89",
+    "uy": "\x8a",
+    "uỳ": "\x8b",
+    "uý": "\x8c",
+    "uỷ": "\x8d",
+    "uỹ": "\x8e",
+    "uỵ": "\x8f",
+    "iê": "\x90",
+    "iề": "\x91",
+    "iế": "\x92",
+    "iể": "\x93",
+    "iễ": "\x94",
+    "iệ": "\x95",
+    "uô": "\x9c",
+    "uồ": "\x9d",
+    "uố": "\x9e",
+    "uổ": "\x9f",
+    "uỗ": "\xa0",
+    "uộ": "\xa1",
+    "ươ": "\xa2",
+    "ườ": "\xa3",
+    "ướ": "\xa4",
+    "ưở": "\xa5",
+    "ưỡ": "\xa6",
+    "ượ": "\xa7",
+}
+
+def decode_string(x):
+    for k, v in list(reversed(list(ENCODER.items()))):
+        x = x.replace(v, k)
+    return x
+
+
 test_dataset = load_dataset("common_voice", "vi", split="test[:2%]")
 
 processor = Wav2Vec2Processor.from_pretrained("Nhut/wav2vec2-large-xlsr-vietnamese")
@@ -63,7 +172,7 @@ with torch.no_grad():
 
 predicted_ids = torch.argmax(logits, dim=-1)
 
-print("Prediction:", processor.batch_decode(predicted_ids))
+print("Prediction:", decode_string(processor.batch_decode(predicted_ids)))
 print("Reference:", test_dataset["sentence"][:2])
 ```
 
@@ -80,26 +189,125 @@ from datasets import load_dataset, load_metric
 from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
 import re
 
+ENCODER = {
+    "ia ": "iê ",
+    "ìa ": "iề ",
+    "ía ": "iế ",
+    "ỉa ": "iể ",
+    "ĩa ": "iễ ",
+    "ịa ": "iệ ",
+    "ya ": "yê ",
+    "ỳa ": "yề ",
+    "ýa ": "yế ",
+    "ỷa ": "yể ",
+    "ỹa ": "yễ ",
+    "ỵa ": "yệ ",
+    "ua ": "uô ",
+    "ùa ": "uồ ",
+    "úa ": "uố ",
+    "ủa ": "uổ ",
+    "ũa ": "uỗ ",
+    "ụa ": "uộ ",
+    "ưa ": "ươ ",
+    "ừa ": "ườ ",
+    "ứa ": "ướ ",
+    "ửa ": "ưở ",
+    "ữa ": "ưỡ ",
+    "ựa ": "ượ ",
+    "ke": "ce",
+    "kè": "cè",
+    "ké": "cé",
+    "kẻ": "cẻ",
+    "kẽ": "cẽ",
+    "kẹ": "cẹ",
+    "kê": "cê",
+    "kề": "cề",
+    "kế": "cế",
+    "kể": "cể",
+    "kễ": "cễ",
+    "kệ": "cệ",
+    "ki": "ci",
+    "kì": "cì",
+    "kí": "cí",
+    "kỉ": "cỉ",
+    "kĩ": "cĩ",
+    "kị": "cị",
+    "ky": "cy",
+    "kỳ": "cỳ",
+    "ký": "cý",
+    "kỷ": "cỷ",
+    "kỹ": "cỹ",
+    "kỵ": "cỵ",
+    "ghe": "ge",
+    "ghè": "gè",
+    "ghé": "gé",
+    "ghẻ": "gẻ",
+    "ghẽ": "gẽ",
+    "ghẹ": "gẹ",
+    "ghê": "gê",
+    "ghề": "gề",
+    "ghế": "gế",
+    "ghể": "gể",
+    "ghễ": "gễ",
+    "ghệ": "gệ",
+    "ngh": "\x80",
+    "uyê": "\x96",
+    "uyề": "\x97",
+    "uyế": "\x98",
+    "uyể": "\x99",
+    "uyễ": "\x9a",
+    "uyệ": "\x9b",
+    "ng": "\x81",
+    "ch": "\x82",
+    "gh": "\x83",
+    "nh": "\x84",
+    "gi": "\x85",
+    "ph": "\x86",
+    "kh": "\x87",
+    "th": "\x88",
+    "tr": "\x89",
+    "uy": "\x8a",
+    "uỳ": "\x8b",
+    "uý": "\x8c",
+    "uỷ": "\x8d",
+    "uỹ": "\x8e",
+    "uỵ": "\x8f",
+    "iê": "\x90",
+    "iề": "\x91",
+    "iế": "\x92",
+    "iể": "\x93",
+    "iễ": "\x94",
+    "iệ": "\x95",
+    "uô": "\x9c",
+    "uồ": "\x9d",
+    "uố": "\x9e",
+    "uổ": "\x9f",
+    "uỗ": "\xa0",
+    "uộ": "\xa1",
+    "ươ": "\xa2",
+    "ườ": "\xa3",
+    "ướ": "\xa4",
+    "ưở": "\xa5",
+    "ưỡ": "\xa6",
+    "ượ": "\xa7",
+}
+
+def decode_string(x):
+    for k, v in list(reversed(list(ENCODER.items()))):
+        x = x.replace(v, k)
+    return x
+
+
 test_dataset = load_dataset("common_voice", "vi", split="test")
 wer = load_metric("wer")
 
-processor = Wav2Vec2Processor.from_pretrained("Nhut/wav2vec2-large-xlsr-vietnamese")
-model = Wav2Vec2ForCTC.from_pretrained("Nhut/wav2vec2-large-xlsr-vietnamese")
+processor = Wav2Vec2Processor.from_pretrained(MODEL)
+model = Wav2Vec2ForCTC.from_pretrained(MODEL)
 model.to("cuda")
 
 chars_to_ignore_regex = '[\\\+\@\ǀ\,\?\.\!\-\;\:\"\“\%\‘\”\�]'
 resampler = torchaudio.transforms.Resample(48_000, 16_000)
 
-# Preprocessing the datasets.
-# We need to read the aduio files as arrays
-def speech_file_to_array_fn(batch):
-    batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower()
-    speech_array, sampling_rate = torchaudio.load(batch["path"])
-    batch["speech"] = resampler(speech_array).squeeze().numpy()
-    return batch
-
-test_dataset = test_dataset.map(speech_file_to_array_fn)
-
 # Preprocessing the datasets.
 # We need to read the aduio files as arrays
 def evaluate(batch):
@@ -110,10 +318,11 @@ def evaluate(batch):
 
     pred_ids = torch.argmax(logits, dim=-1)
    batch["pred_strings"] = processor.batch_decode(pred_ids)
+    # decode_string: We replace the encoded letter with the initial letters
+    batch["pred_strings"] = [decode_string(x) for x in batch["pred_strings"]]
    return batch
 
 result = test_dataset.map(evaluate, batched=True, batch_size=8)
-
 print("WER: {:2f}".format(100 * wer.compute(predictions=result["pred_strings"], references=result["sentence"])))
 ```
 
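Note: the new `decode_string` helper expands single-byte placeholders back into the Vietnamese digraphs and diphthongs that `ENCODER` maps away. A minimal sketch of the round trip, assuming the `ENCODER` table and `decode_string` added above:

```python
# In ENCODER, "\x80" stands for "ngh", "\x90" for "iê" and "\x81" for "ng",
# so an encoded CTC output "\x80\x90\x81" expands back to "nghiêng".
print(decode_string("\x80\x90\x81"))  # -> nghiêng

# processor.batch_decode returns a list of strings, so in batched code the
# helper is applied per item, as the evaluation snippet does:
# predictions = [decode_string(x) for x in processor.batch_decode(pred_ids)]
```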
added_tokens.json DELETED
@@ -1 +0,0 @@
-{"<s>": 91, "</s>": 92}
config.json CHANGED
@@ -1,5 +1,5 @@
 {
-  "_name_or_path": "/content/gdrive/MyDrive/Colab\\ Notebooks/XLSR_V54/wav2vec-large-xlsr-vietnamese-demo",
+  "_name_or_path": "/content/gdrive/MyDrive/Colab\\ Notebooks/XLSR_V2_26/wav2vec-large-xlsr-vietnamese-demo",
   "activation_dropout": 0.0,
   "apply_spec_augment": true,
   "architectures": [
@@ -51,7 +51,7 @@
   "initializer_range": 0.02,
   "intermediate_size": 4096,
   "layer_norm_eps": 1e-05,
-  "layerdrop": 0.05,
+  "layerdrop": 0.1,
   "mask_channel_length": 10,
   "mask_channel_min_space": 1,
   "mask_channel_other": 0.0,
@@ -62,7 +62,7 @@
   "mask_time_length": 10,
   "mask_time_min_space": 1,
   "mask_time_other": 0.0,
-  "mask_time_prob": 0.06,
+  "mask_time_prob": 0.05,
   "mask_time_selection": "static",
   "model_type": "wav2vec2",
   "num_attention_heads": 16,
@@ -70,7 +70,7 @@
   "num_conv_pos_embeddings": 128,
   "num_feat_extract_layers": 7,
   "num_hidden_layers": 24,
-  "pad_token_id": 90,
+  "pad_token_id": 135,
   "transformers_version": "4.4.0",
-  "vocab_size": 93
+  "vocab_size": 136
 }
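The `pad_token_id` and `vocab_size` bumps (90 → 135, 93 → 136) track the rewritten vocab.json below, where `[PAD]` now sits at index 135. A quick consistency check, assuming the commit is checked out locally (file paths are illustrative):

```python
import json

# Illustrative: confirm config.json agrees with the new vocab.json.
with open("vocab.json", encoding="utf-8") as f:
    vocab = json.load(f)
with open("config.json", encoding="utf-8") as f:
    config = json.load(f)

assert config["vocab_size"] == len(vocab)        # 136 tokens, ids 0..135
assert config["pad_token_id"] == vocab["[PAD]"]  # 135
```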
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:c1c863c1992ca852a12be2ff3944828de77e69478a28613f9f050ffe3a6aa0c7
-size 1262315159
+oid sha256:27f8edf2f10fc71c73bf8fb234cd46e66a7ddb59dd0778094aa6c70b750c3e4b
+size 1262491415
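Only the Git LFS pointer is versioned here; the `oid` is the SHA-256 of the new weights, so a downloaded checkpoint can be verified against it. A minimal sketch, assuming pytorch_model.bin has been fetched into the working directory:

```python
import hashlib

# Illustrative: hash the downloaded pytorch_model.bin and compare it with
# the LFS oid recorded in this commit.
sha = hashlib.sha256()
with open("pytorch_model.bin", "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):
        sha.update(chunk)
print(sha.hexdigest() == "27f8edf2f10fc71c73bf8fb234cd46e66a7ddb59dd0778094aa6c70b750c3e4b")
```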
tokenizer_config.json CHANGED
@@ -1 +1 @@
-{"unk_token": "[UNK]", "bos_token": "<s>", "eos_token": "</s>", "pad_token": "[PAD]", "do_lower_case": false, "word_delimiter_token": "|", "special_tokens_map_file": "/content/gdrive/MyDrive/Colab\\ Notebooks/XLSR_24_1938-0.71+0/wav2vec-large-xlsr-vietnamese-demo/special_tokens_map.json", "tokenizer_file": null, "name_or_path": "/content/gdrive/MyDrive/Colab\\ Notebooks/XLSR_V54/wav2vec-large-xlsr-vietnamese-demo"}
+{"unk_token": "[UNK]", "bos_token": "<s>", "eos_token": "</s>", "pad_token": "[PAD]", "do_lower_case": false, "word_delimiter_token": "|"}
vocab.json CHANGED
@@ -1 +1 @@
-{"|": 0, "ồ": 1, "": 2, "": 3, "": 4, "ê": 5, "": 6, "s": 7, "ơ": 8, "h": 9, "": 10, "ế": 11, "": 12, "": 13, "": 14, "ý": 15, "": 16, "ò": 17, "": 18, "": 19, "g": 20, "": 21, "e": 22, "d": 23, "": 24, "": 25, "": 26, "": 27, "ù": 28, "i": 29, "": 30, "v": 31, "p": 32, "": 33, "x": 34, "a": 35, "r": 36, "m": 37, "": 38, "": 39, "y": 40, "": 41, "ú": 42, "": 43, "n": 44, "": 45, "": 46, "": 47, "ó": 48, "": 49, "ì": 50, "à": 51, "ũ": 52, "đ": 53, "": 54, "": 55, "é": 56, "": 57, "": 58, "": 59, "": 60, "õ": 61, "t": 62, "ã": 63, "â": 64, "í": 65, "ô": 66, "ư": 67, "": 68, "": 70, "u": 71, "o": 72, "k": 73, "": 74, "": 75, "ă": 76, "á": 77, "": 78, "ĩ": 79, "": 80, "": 81, "": 82, "c": 83, "b": 84, "q": 85, "": 86, "l": 87, "è": 88, "ạ": 69, "[UNK]": 89, "[PAD]": 90}
+{"a": 1, "b": 2, "c": 3, "d": 4, "e": 5, "f": 6, "g": 7, "h": 8, "i": 9, "j": 10, "k": 11, "l": 12, "m": 13, "n": 14, "o": 15, "p": 16, "q": 17, "r": 18, "s": 19, "t": 20, "u": 21, "v": 22, "w": 23, "x": 24, "y": 25, "z": 26, "€": 27, "": 28, "‚": 29, "ƒ": 30, "„": 31, "…": 32, "†": 33, "‡": 34, "ˆ": 35, "‰": 36, "Š": 37, "‹": 38, "Œ": 39, "": 40, "Ž": 41, "": 42, "": 43, "‘": 44, "’": 45, "“": 46, "”": 47, "•": 48, "–": 49, "—": 50, "˜": 51, "™": 52, "š": 53, "›": 54, "œ": 55, "": 56, "ž": 57, "Ÿ": 58, " ": 59, "¡": 60, "¢": 61, "£": 62, "¤": 63, "¥": 64, "¦": 65, "§": 66, "à": 67, "á": 68, "â": 69, "ã": 70, "è": 71, "é": 72, "ê": 73, "ì": 74, "í": 75, "ò": 76, "ó": 77, "ô": 78, "õ": 79, "ù": 80, "ú": 81, "ý": 82, "ă": 83, "đ": 84, "ĩ": 85, "ũ": 86, "ơ": 87, "ư": 88, "ạ": 89, "ả": 90, "ấ": 91, "ầ": 92, "ẩ": 93, "ẫ": 94, "ậ": 95, "ắ": 96, "ằ": 97, "ẳ": 98, "ẵ": 99, "ặ": 100, "ẹ": 101, "ẻ": 102, "ẽ": 103, "ế": 104, "ề": 105, "ể": 106, "ễ": 107, "ệ": 108, "ỉ": 109, "ị": 110, "ọ": 111, "ỏ": 112, "ố": 113, "ồ": 114, "ổ": 115, "ỗ": 116, "ộ": 117, "ớ": 118, "ờ": 119, "ở": 120, "ỡ": 121, "ợ": 122, "ụ": 123, "ủ": 124, "ứ": 125, "ừ": 126, "ử": 127, "ữ": 128, "ự": 129, "ỳ": 130, "ỵ": 131, "ỷ": 132, "ỹ": 133, "|": 0, "[UNK]": 134, "[PAD]": 135}