mrm8488 committed on
Commit
6d4841e
1 Parent(s): e8cc4d8

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +55 -22
README.md CHANGED
@@ -33,92 +33,125 @@ When using this model, make sure that your speech input is sampled at 16kHz.
33
  The model can be used directly (without a language model) as follows:
34
 
35
  ```python
 
36
  import torch
 
37
  import torchaudio
 
38
  from datasets import load_dataset
 
39
  from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
40
 
41
- test_dataset = load_dataset("common_voice", "es, split="test[:2%]").
42
 
43
  processor = Wav2Vec2Processor.from_pretrained("mrm8488/wav2vec2-large-xlsr-53-spanish")
 
44
  model = Wav2Vec2ForCTC.from_pretrained("mrm8488/wav2vec2-large-xlsr-53-spanish")
45
 
46
  resampler = torchaudio.transforms.Resample(48_000, 16_000)
47
 
48
  # Preprocessing the datasets.
 
49
  # We need to read the audio files as arrays
 
50
  def speech_file_to_array_fn(batch):
51
- speech_array, sampling_rate = torchaudio.load(batch["path"])
52
- batch["speech"] = resampler(speech_array).squeeze().numpy()
53
- return batch
 
 
 
54
 
55
  test_dataset = test_dataset.map(speech_file_to_array_fn)
 
56
  inputs = processor(test_dataset["speech"][:2], sampling_rate=16_000, return_tensors="pt", padding=True)
57
 
58
  with torch.no_grad():
59
- logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits
 
60
 
61
  predicted_ids = torch.argmax(logits, dim=-1)
62
 
63
  print("Prediction:", processor.batch_decode(predicted_ids))
 
64
  print("Reference:", test_dataset["sentence"][:2])
65
- ```
66
 
 
67
 
68
  ## Evaluation
69
 
70
- The model can be evaluated as follows on the Spanish test data of Common Voice.
71
-
72
 
73
  ```python
 
74
  import torch
 
75
  import torchaudio
 
76
  from datasets import load_dataset, load_metric
 
77
  from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
 
78
  import re
79
 
80
  test_dataset = load_dataset("common_voice", "es", split="test")
 
81
  wer = load_metric("wer")
82
 
83
  processor = Wav2Vec2Processor.from_pretrained("mrm8488/wav2vec2-large-xlsr-53-spanish")
 
84
  model = Wav2Vec2ForCTC.from_pretrained("mrm8488/wav2vec2-large-xlsr-53-spanish")
 
85
  model.to("cuda")
86
 
87
- chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"\“]'
 
88
  resampler = torchaudio.transforms.Resample(48_000, 16_000)
89
 
90
  # Preprocessing the datasets.
 
91
  # We need to read the audio files as arrays
 
92
  def speech_file_to_array_fn(batch):
93
- batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower()
94
- speech_array, sampling_rate = torchaudio.load(batch["path"])
95
- batch["speech"] = resampler(speech_array).squeeze().numpy()
96
- return batch
 
 
 
 
97
 
98
  test_dataset = test_dataset.map(speech_file_to_array_fn)
99
 
100
  # Preprocessing the datasets.
 
101
  # We need to read the audio files as arrays
 
102
  def evaluate(batch):
103
- inputs = processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)
104
 
105
- with torch.no_grad():
106
- logits = model(inputs.input_values.to("cuda"), attention_mask=inputs.attention_mask.to("cuda")).logits
 
 
 
107
 
108
- pred_ids = torch.argmax(logits, dim=-1)
109
- batch["pred_strings"] = processor.batch_decode(pred_ids)
110
- return batch
 
 
111
 
112
  result = test_dataset.map(evaluate, batched=True, batch_size=8)
113
 
114
  print("WER: {:2f}".format(100 * wer.compute(predictions=result["pred_strings"], references=result["sentence"])))
115
- ```
116
 
117
- **Test Result**: ??? %
118
 
 
119
 
120
  ## Training
121
 
122
  The Common Voice `train`, `validation` datasets were used for training.
123
 
124
- The script used for training can be found ???
 
33
  The model can be used directly (without a language model) as follows:
34
 
35
  ```python
36
+
37
  import torch
38
+
39
  import torchaudio
40
+
41
  from datasets import load_dataset
42
+
43
  from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
44
 
45
+ test_dataset = load_dataset("common_voice", "es", split="test[:2%]").
46
 
47
  processor = Wav2Vec2Processor.from_pretrained("mrm8488/wav2vec2-large-xlsr-53-spanish")
48
+
49
  model = Wav2Vec2ForCTC.from_pretrained("mrm8488/wav2vec2-large-xlsr-53-spanish")
50
 
51
  resampler = torchaudio.transforms.Resample(48_000, 16_000)
52
 
53
  # Preprocessing the datasets.
54
+
55
  # We need to read the audio files as arrays
56
+
57
  def speech_file_to_array_fn(batch):
58
+
59
+ speech_array, sampling_rate = torchaudio.load(batch["path"])
60
+
61
+ batch["speech"] = resampler(speech_array).squeeze().numpy()
62
+
63
+ return batch
64
 
65
  test_dataset = test_dataset.map(speech_file_to_array_fn)
66
+
67
  inputs = processor(test_dataset["speech"][:2], sampling_rate=16_000, return_tensors="pt", padding=True)
68
 
69
  with torch.no_grad():
70
+
71
+ logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits
72
 
73
  predicted_ids = torch.argmax(logits, dim=-1)
74
 
75
  print("Prediction:", processor.batch_decode(predicted_ids))
76
+
77
  print("Reference:", test_dataset["sentence"][:2])
 
78
 
79
+ ```
80
 
81
  ## Evaluation
82
 
83
+ The model can be evaluated as follows on the Spanish test data of Common Voice.
 
84
 
85
  ```python
86
+
87
  import torch
88
+
89
  import torchaudio
90
+
91
  from datasets import load_dataset, load_metric
92
+
93
  from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
94
+
95
  import re
96
 
97
  test_dataset = load_dataset("common_voice", "es", split="test")
98
+
99
  wer = load_metric("wer")
100
 
101
  processor = Wav2Vec2Processor.from_pretrained("mrm8488/wav2vec2-large-xlsr-53-spanish")
102
+
103
  model = Wav2Vec2ForCTC.from_pretrained("mrm8488/wav2vec2-large-xlsr-53-spanish")
104
+
105
  model.to("cuda")
106
 
107
+ chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"\“\%\‘\”\�]'
108
+
109
  resampler = torchaudio.transforms.Resample(48_000, 16_000)
110
 
111
  # Preprocessing the datasets.
112
+
113
  # We need to read the audio files as arrays
114
+
115
  def speech_file_to_array_fn(batch):
116
+
117
+ batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower()
118
+
119
+ speech_array, sampling_rate = torchaudio.load(batch["path"])
120
+
121
+ batch["speech"] = resampler(speech_array).squeeze().numpy()
122
+
123
+ return batch
124
 
125
  test_dataset = test_dataset.map(speech_file_to_array_fn)
126
 
127
  # Preprocessing the datasets.
128
+
129
  # We need to read the audio files as arrays
130
+
131
  def evaluate(batch):
 
132
 
133
+ inputs = processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)
134
+
135
+ with torch.no_grad():
136
+
137
+ logits = model(inputs.input_values.to("cuda"), attention_mask=inputs.attention_mask.to("cuda")).logits
138
 
139
+ pred_ids = torch.argmax(logits, dim=-1)
140
+
141
+ batch["pred_strings"] = processor.batch_decode(pred_ids)
142
+
143
+ return batch
144
 
145
  result = test_dataset.map(evaluate, batched=True, batch_size=8)
146
 
147
  print("WER: {:2f}".format(100 * wer.compute(predictions=result["pred_strings"], references=result["sentence"])))
 
148
 
149
+ ```
150
 
151
+ **Test Result**: %
152
 
153
  ## Training
154
 
155
  The Common Voice `train`, `validation` datasets were used for training.
156
 
157
+ The script used for training can be found ???