skylord committed on
Commit
9150426
1 Parent(s): 7272e79

Updated WER for eval

Browse files
.ipynb_checkpoints/README-checkpoint.md CHANGED
@@ -1,5 +1,5 @@
1
  ---
2
- language: {el}
3
  datasets:
4
  - common_voice
5
  metrics:
@@ -11,24 +11,24 @@ tags:
11
  - xlsr-fine-tuning-week
12
  license: apache-2.0
13
  model-index:
14
- - name: {Greek XLSR Wav2Vec2 Large 53}
15
  results:
16
  - task:
17
  name: Speech Recognition
18
  type: automatic-speech-recognition
19
  dataset:
20
- name: Common Voice {el}
21
  type: common_voice
22
- args: {el}
23
  metrics:
24
  - name: Test WER
25
  type: wer
26
- value: {1.00}
27
  ---
28
 
29
- # Wav2Vec2-Large-XLSR-53-{Greek}
30
 
31
- Fine-tuned [facebook/wav2vec2-large-xlsr-53](https://huggingface.co/facebook/wav2vec2-large-xlsr-53) on {Greek} using the [Common Voice](https://huggingface.co/datasets/common_voice), ... and ... dataset{s}. #TODO: replace {language} with your language, *e.g.* French and eventually add more datasets that were used and eventually remove common voice if model was not trained on common voice
32
  When using this model, make sure that your speech input is sampled at 16kHz.
33
 
34
  ## Usage
@@ -40,20 +40,26 @@ import torch
40
  import torchaudio
41
  from datasets import load_dataset
42
  from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
43
- test_dataset = load_dataset("common_voice", "{el}", split="test[:2%]") #TODO: replace {lang_id} in your language code here. Make sure the code is one of the *ISO codes* of [this](https://huggingface.co/languages) site.
44
- processor = Wav2Vec2Processor.from_pretrained("{skylord/greek_lsr_1}") #TODO: replace {model_id} with your model id. The model id consists of {your_username}/{your_modelname}, *e.g.* `elgeish/wav2vec2-large-xlsr-53-arabic`
45
- model = Wav2Vec2ForCTC.from_pretrained("{skylord/greek_lsr_1}") #TODO: replace {model_id} with your model id. The model id consists of {your_username}/{your_modelname}, *e.g.* `elgeish/wav2vec2-large-xlsr-53-arabic`
 
 
46
  resampler = torchaudio.transforms.Resample(48_000, 16_000)
 
47
  # Preprocessing the datasets.
48
  # We need to read the audio files as arrays
49
  def speech_file_to_array_fn(batch):
50
- speech_array, sampling_rate = torchaudio.load(batch["path"])
51
- batch["speech"] = resampler(speech_array).squeeze().numpy()
52
- return batch
 
53
  test_dataset = test_dataset.map(speech_file_to_array_fn)
54
  inputs = processor(test_dataset["speech"][:2], sampling_rate=16_000, return_tensors="pt", padding=True)
 
55
  with torch.no_grad():
56
- logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits
 
57
  predicted_ids = torch.argmax(logits, dim=-1)
58
  print("Prediction:", processor.batch_decode(predicted_ids))
59
  print("Reference:", test_dataset["sentence"][:2])
@@ -62,7 +68,7 @@ print("Reference:", test_dataset["sentence"][:2])
62
 
63
  ## Evaluation
64
 
65
- The model can be evaluated as follows on the {Greek} test data of Common Voice. # TODO: replace #TODO: replace language with your {language}, *e.g.* French
66
 
67
 
68
  ```python
@@ -71,35 +77,44 @@ import torchaudio
71
  from datasets import load_dataset, load_metric
72
  from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
73
  import re
74
- test_dataset = load_dataset("common_voice", "{el}", split="test") #TODO: replace {lang_id} in your language code here. Make sure the code is one of the *ISO codes* of [this](https://huggingface.co/languages) site.
 
75
  wer = load_metric("wer")
76
- processor = Wav2Vec2Processor.from_pretrained("{skylord/greek_lsr_1}") #TODO: replace {model_id} with your model id. The model id consists of {your_username}/{your_modelname}, *e.g.* `elgeish/wav2vec2-large-xlsr-53-arabic`
77
- model = Wav2Vec2ForCTC.from_pretrained("{skylord/greek_lsr_1}") #TODO: replace {model_id} with your model id. The model id consists of {your_username}/{your_modelname}, *e.g.* `elgeish/wav2vec2-large-xlsr-53-arabic`
 
78
  model.to("cuda")
79
- chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"\“]' # TODO: adapt this list to include all special characters you removed from the data
 
80
  resampler = torchaudio.transforms.Resample(48_000, 16_000)
 
81
  # Preprocessing the datasets.
82
  # We need to read the audio files as arrays
 
83
  def speech_file_to_array_fn(batch):
84
- batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower()
85
- speech_array, sampling_rate = torchaudio.load(batch["path"])
86
- batch["speech"] = resampler(speech_array).squeeze().numpy()
87
- return batch
 
88
  test_dataset = test_dataset.map(speech_file_to_array_fn)
 
89
  # Preprocessing the datasets.
90
  # We need to read the audio files as arrays
 
91
  def evaluate(batch):
92
- inputs = processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)
93
- with torch.no_grad():
94
- logits = model(inputs.input_values.to("cuda"), attention_mask=inputs.attention_mask.to("cuda")).logits
95
- pred_ids = torch.argmax(logits, dim=-1)
96
- batch["pred_strings"] = processor.batch_decode(pred_ids)
97
- return batch
 
98
  result = test_dataset.map(evaluate, batched=True, batch_size=8)
99
  print("WER: {:2f}".format(100 * wer.compute(predictions=result["pred_strings"], references=result["sentence"])))
100
  ```
101
 
102
- **Test Result**: 1.00 % # TODO: write output of print here. IMPORTANT: Please remember to also replace {wer_result_on_test} at the top of with this value here. tags.
103
 
104
 
105
  ## Training
 
1
  ---
2
+ language: el
3
  datasets:
4
  - common_voice
5
  metrics:
 
11
  - xlsr-fine-tuning-week
12
  license: apache-2.0
13
  model-index:
14
+ - name: Greek XLSR Wav2Vec2 Large 53
15
  results:
16
  - task:
17
  name: Speech Recognition
18
  type: automatic-speech-recognition
19
  dataset:
20
+ name: Common Voice el
21
  type: common_voice
22
+ args: el
23
  metrics:
24
  - name: Test WER
25
  type: wer
26
+ value: 56.253154
27
  ---
28
 
29
+ # Wav2Vec2-Large-XLSR-53-Greek
30
 
31
+ Fine-tuned [facebook/wav2vec2-large-xlsr-53](https://huggingface.co/facebook/wav2vec2-large-xlsr-53) on Greek using the [Common Voice](https://huggingface.co/datasets/common_voice) dataset.
32
  When using this model, make sure that your speech input is sampled at 16kHz.
33
 
34
  ## Usage
 
40
  import torchaudio
41
  from datasets import load_dataset
42
  from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
43
+ test_dataset = load_dataset("common_voice", "el", split="test[:2%]")
44
+
45
+ processor = Wav2Vec2Processor.from_pretrained("skylord/greek_lsr_1")
46
+ model = Wav2Vec2ForCTC.from_pretrained("skylord/greek_lsr_1")
47
+
48
  resampler = torchaudio.transforms.Resample(48_000, 16_000)
49
+
50
  # Preprocessing the datasets.
51
  # We need to read the audio files as arrays
52
  def speech_file_to_array_fn(batch):
53
+ speech_array, sampling_rate = torchaudio.load(batch["path"])
54
+ batch["speech"] = resampler(speech_array).squeeze().numpy()
55
+ return batch
56
+
57
  test_dataset = test_dataset.map(speech_file_to_array_fn)
58
  inputs = processor(test_dataset["speech"][:2], sampling_rate=16_000, return_tensors="pt", padding=True)
59
+
60
  with torch.no_grad():
61
+ logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits
62
+
63
  predicted_ids = torch.argmax(logits, dim=-1)
64
  print("Prediction:", processor.batch_decode(predicted_ids))
65
  print("Reference:", test_dataset["sentence"][:2])
 
68
 
69
  ## Evaluation
70
 
71
+ The model can be evaluated as follows on the Greek test data of Common Voice.
72
 
73
 
74
  ```python
 
77
  from datasets import load_dataset, load_metric
78
  from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
79
  import re
80
+
81
+ test_dataset = load_dataset("common_voice", "el", split="test")
82
  wer = load_metric("wer")
83
+
84
+ processor = Wav2Vec2Processor.from_pretrained("skylord/greek_lsr_1")
85
+ model = Wav2Vec2ForCTC.from_pretrained("skylord/greek_lsr_1")
86
  model.to("cuda")
87
+
88
+ chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"\“]'
89
  resampler = torchaudio.transforms.Resample(48_000, 16_000)
90
+
91
  # Preprocessing the datasets.
92
  # We need to read the audio files as arrays
93
+
94
  def speech_file_to_array_fn(batch):
95
+ batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower()
96
+ speech_array, sampling_rate = torchaudio.load(batch["path"])
97
+ batch["speech"] = resampler(speech_array).squeeze().numpy()
98
+ return batch
99
+
100
  test_dataset = test_dataset.map(speech_file_to_array_fn)
101
+
102
  # Preprocessing the datasets.
103
  # We need to read the audio files as arrays
104
+
105
  def evaluate(batch):
106
+ inputs = processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)
107
+ with torch.no_grad():
108
+ logits = model(inputs.input_values.to("cuda"), attention_mask=inputs.attention_mask.to("cuda")).logits
109
+ pred_ids = torch.argmax(logits, dim=-1)
110
+ batch["pred_strings"] = processor.batch_decode(pred_ids)
111
+ return batch
112
+
113
  result = test_dataset.map(evaluate, batched=True, batch_size=8)
114
  print("WER: {:2f}".format(100 * wer.compute(predictions=result["pred_strings"], references=result["sentence"])))
115
  ```
116
 
117
+ **Test Result**: 56.253154 %
118
 
119
 
120
  ## Training
README.md CHANGED
@@ -23,7 +23,7 @@ results:
23
  metrics:
24
  - name: Test WER
25
  type: wer
26
- value: ???
27
  ---
28
 
29
  # Wav2Vec2-Large-XLSR-53-Greek
@@ -114,7 +114,7 @@ result = test_dataset.map(evaluate, batched=True, batch_size=8)
114
  print("WER: {:2f}".format(100 * wer.compute(predictions=result["pred_strings"], references=result["sentence"])))
115
  ```
116
 
117
- **Test Result**: ??? %
118
 
119
 
120
  ## Training
 
23
  metrics:
24
  - name: Test WER
25
  type: wer
26
+ value: 56.253154
27
  ---
28
 
29
  # Wav2Vec2-Large-XLSR-53-Greek
 
114
  print("WER: {:2f}".format(100 * wer.compute(predictions=result["pred_strings"], references=result["sentence"])))
115
  ```
116
 
117
+ **Test Result**: 56.253154 %
118
 
119
 
120
  ## Training