MehdiHosseiniMoghadam committed on
Commit
dd9b9fc
1 Parent(s): 6c37025

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +81 -24
README.md CHANGED
@@ -1,32 +1,57 @@
1
- language: {sv-SE}
 
 
 
2
  datasets:
3
- - common_voice
4
- metrics:
5
- - wer
6
  tags:
 
7
  - audio
 
8
  - automatic-speech-recognition
 
9
  - speech
 
10
  - xlsr-fine-tuning-week
 
11
  license: apache-2.0
 
12
  model-index:
13
- - name: {MehdiHosseiniMoghadam/wav2vec2-large-xlsr-53-Swedish}
 
 
14
  results:
 
15
  - task:
 
16
  name: Speech Recognition
 
17
  type: automatic-speech-recognition
 
18
  dataset:
19
- name: Common Voice {sv-SE}
 
 
20
  type: common_voice
21
- args: {sv-SE}
 
 
22
  metrics:
 
23
  - name: Test WER
 
24
  type: wer
25
- value: {41.388337}
 
 
26
  ---
27
 
 
 
 
28
 
29
- Fine-tuned [facebook/wav2vec2-large-xlsr-53](https://huggingface.co/facebook/wav2vec2-large-xlsr-53) on {Swedish} using the [Common Voice](https://huggingface.co/datasets/common_voice)
30
  When using this model, make sure that your speech input is sampled at 16kHz.
31
 
32
  ## Usage
@@ -34,91 +59,123 @@ When using this model, make sure that your speech input is sampled at 16kHz.
34
  The model can be used directly (without a language model) as follows:
35
 
36
  ```python
 
37
  import torch
 
38
  import torchaudio
 
39
  from datasets import load_dataset
 
40
  from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
41
 
42
- test_dataset = load_dataset("common_voice", "{sv-SE}", split="test[:2%]")
 
 
43
 
44
- processor = Wav2Vec2Processor.from_pretrained("{MehdiHosseiniMoghadam/wav2vec2-large-xlsr-53-Swedish}")
45
- model = Wav2Vec2ForCTC.from_pretrained("{MehdiHosseiniMoghadam/wav2vec2-large-xlsr-53-Swedish}")
46
 
47
  resampler = torchaudio.transforms.Resample(48_000, 16_000)
48
 
49
  # Preprocessing the datasets.
 
50
  # We need to read the audio files as arrays
 
51
  def speech_file_to_array_fn(batch):
 
52
  speech_array, sampling_rate = torchaudio.load(batch["path"])
 
53
  batch["speech"] = resampler(speech_array).squeeze().numpy()
 
54
  return batch
55
 
56
  test_dataset = test_dataset.map(speech_file_to_array_fn)
 
57
  inputs = processor(test_dataset["speech"][:2], sampling_rate=16_000, return_tensors="pt", padding=True)
58
 
59
  with torch.no_grad():
 
60
  logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits
61
 
62
  predicted_ids = torch.argmax(logits, dim=-1)
63
 
64
  print("Prediction:", processor.batch_decode(predicted_ids))
 
65
  print("Reference:", test_dataset["sentence"][:2])
66
- ```
67
 
 
68
 
69
  ## Evaluation
70
 
71
- The model can be evaluated as follows on the {Swedish} test data of Common Voice.
72
-
73
 
74
  ```python
 
75
  import torch
 
76
  import torchaudio
 
77
  from datasets import load_dataset, load_metric
 
78
  from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
 
79
  import re
80
 
81
- test_dataset = load_dataset("common_voice", "{sv-SE}", split="test")
 
82
  wer = load_metric("wer")
83
 
84
- processor = Wav2Vec2Processor.from_pretrained("{MehdiHosseiniMoghadam/wav2vec2-large-xlsr-53-Swedish}")
85
- model = Wav2Vec2ForCTC.from_pretrained("{MehdiHosseiniMoghadam/wav2vec2-large-xlsr-53-Swedish}")
 
 
86
  model.to("cuda")
87
 
88
- chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"\“]'
 
89
  resampler = torchaudio.transforms.Resample(48_000, 16_000)
90
 
91
  # Preprocessing the datasets.
 
92
  # We need to read the audio files as arrays
 
93
  def speech_file_to_array_fn(batch):
 
94
  batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower()
 
95
  speech_array, sampling_rate = torchaudio.load(batch["path"])
 
96
  batch["speech"] = resampler(speech_array).squeeze().numpy()
 
97
  return batch
98
 
99
  test_dataset = test_dataset.map(speech_file_to_array_fn)
100
 
101
  # Preprocessing the datasets.
 
102
  # We need to read the audio files as arrays
 
103
  def evaluate(batch):
 
104
  inputs = processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)
105
 
106
  with torch.no_grad():
 
107
  logits = model(inputs.input_values.to("cuda"), attention_mask=inputs.attention_mask.to("cuda")).logits
108
 
109
- pred_ids = torch.argmax(logits, dim=-1)
 
110
  batch["pred_strings"] = processor.batch_decode(pred_ids)
 
111
  return batch
112
 
113
  result = test_dataset.map(evaluate, batched=True, batch_size=8)
114
 
115
  print("WER: {:2f}".format(100 * wer.compute(predictions=result["pred_strings"], references=result["sentence"])))
116
- ```
117
 
118
- **Test Result**: 41.388337 %
119
 
 
120
 
121
  ## Training
122
 
123
- The Common Voice `train`, `validation`
124
-
1
+ ---
2
+
3
+ language: sv-SE
4
+
5
  datasets:
6
+
7
+ - common_voice
8
+
9
  tags:
10
+
11
  - audio
12
+
13
  - automatic-speech-recognition
14
+
15
  - speech
16
+
17
  - xlsr-fine-tuning-week
18
+
19
  license: apache-2.0
20
+
21
  model-index:
22
+
23
+ - name: wav2vec2-large-xlsr-53-Swedish by Mehdi Hosseini Moghadam
24
+
25
  results:
26
+
27
  - task:
28
+
29
  name: Speech Recognition
30
+
31
  type: automatic-speech-recognition
32
+
33
  dataset:
34
+
35
+ name: Common Voice sv-SE
36
+
37
  type: common_voice
38
+
39
+ args: sv-SE
40
+
41
  metrics:
42
+
43
  - name: Test WER
44
+
45
  type: wer
46
+
47
+ value: 41.388337
48
+
49
  ---
50
 
51
+ # wav2vec2-large-xlsr-53-Swedish
52
+
53
+ Fine-tuned [facebook/wav2vec2-large-xlsr-53](https://huggingface.co/facebook/wav2vec2-large-xlsr-53) on Swedish using the [Common Voice](https://huggingface.co/datasets/common_voice) dataset.
54
 
 
55
  When using this model, make sure that your speech input is sampled at 16kHz.
56
 
57
  ## Usage
59
  The model can be used directly (without a language model) as follows:
60
 
61
  ```python
62
+
63
  import torch
64
+
65
  import torchaudio
66
+
67
  from datasets import load_dataset
68
+
69
  from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
70
 
71
+ test_dataset = load_dataset("common_voice", "sv-SE", split="test[:2%]")
72
+
73
+ processor = Wav2Vec2Processor.from_pretrained("MehdiHosseiniMoghadam/wav2vec2-large-xlsr-53-Swedish")
74
 
75
+ model = Wav2Vec2ForCTC.from_pretrained("MehdiHosseiniMoghadam/wav2vec2-large-xlsr-53-Swedish")
 
76
 
77
  resampler = torchaudio.transforms.Resample(48_000, 16_000)
78
 
79
  # Preprocessing the datasets.
80
+
81
  # We need to read the audio files as arrays
82
+
83
  def speech_file_to_array_fn(batch):
84
+
85
  speech_array, sampling_rate = torchaudio.load(batch["path"])
86
+
87
  batch["speech"] = resampler(speech_array).squeeze().numpy()
88
+
89
  return batch
90
 
91
  test_dataset = test_dataset.map(speech_file_to_array_fn)
92
+
93
  inputs = processor(test_dataset["speech"][:2], sampling_rate=16_000, return_tensors="pt", padding=True)
94
 
95
  with torch.no_grad():
96
+
97
  logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits
98
 
99
  predicted_ids = torch.argmax(logits, dim=-1)
100
 
101
  print("Prediction:", processor.batch_decode(predicted_ids))
102
+
103
  print("Reference:", test_dataset["sentence"][:2])
 
104
 
105
+ ```
106
 
107
  ## Evaluation
108
 
109
+ The model can be evaluated as follows on the Swedish test data of Common Voice.
 
110
 
111
  ```python
112
+
113
  import torch
114
+
115
  import torchaudio
116
+
117
  from datasets import load_dataset, load_metric
118
+
119
  from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
120
+
121
  import re
122
 
123
+ test_dataset = load_dataset("common_voice", "sv-SE", split="test")
124
+
125
  wer = load_metric("wer")
126
 
127
+ processor = Wav2Vec2Processor.from_pretrained("MehdiHosseiniMoghadam/wav2vec2-large-xlsr-53-Swedish")
128
+
129
+ model = Wav2Vec2ForCTC.from_pretrained("MehdiHosseiniMoghadam/wav2vec2-large-xlsr-53-Swedish")
130
+
131
  model.to("cuda")
132
 
133
+ chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"\“]'
134
+
135
  resampler = torchaudio.transforms.Resample(48_000, 16_000)
136
 
137
  # Preprocessing the datasets.
138
+
139
  # We need to read the audio files as arrays
140
+
141
  def speech_file_to_array_fn(batch):
142
+
143
  batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower()
144
+
145
  speech_array, sampling_rate = torchaudio.load(batch["path"])
146
+
147
  batch["speech"] = resampler(speech_array).squeeze().numpy()
148
+
149
  return batch
150
 
151
  test_dataset = test_dataset.map(speech_file_to_array_fn)
152
 
153
  # Preprocessing the datasets.
154
+
155
  # We need to read the audio files as arrays
156
+
157
  def evaluate(batch):
158
+
159
  inputs = processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)
160
 
161
  with torch.no_grad():
162
+
163
  logits = model(inputs.input_values.to("cuda"), attention_mask=inputs.attention_mask.to("cuda")).logits
164
 
165
+ pred_ids = torch.argmax(logits, dim=-1)
166
+
167
  batch["pred_strings"] = processor.batch_decode(pred_ids)
168
+
169
  return batch
170
 
171
  result = test_dataset.map(evaluate, batched=True, batch_size=8)
172
 
173
  print("WER: {:2f}".format(100 * wer.compute(predictions=result["pred_strings"], references=result["sentence"])))
 
174
 
175
+ ```
176
 
177
+ **Test Result**: 41.388337 %
178
 
179
  ## Training
180
 
181
+ The Common Voice `train` and `validation` splits were used for training.