patrickvonplaten committed on
Commit
ff45bd7
1 Parent(s): 0d1b78a

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +28 -28
README.md CHANGED
@@ -1,5 +1,5 @@
1
  ---
2
- language: {el}
3
  datasets:
4
  - common_voice
5
  metrics:
@@ -11,24 +11,24 @@ tags:
11
  - xlsr-fine-tuning-week
12
  license: apache-2.0
13
  model-index:
14
- - name: {Greek XLSR Wav2Vec2 Large 53}
15
  results:
16
  - task:
17
  name: Speech Recognition
18
  type: automatic-speech-recognition
19
  dataset:
20
- name: Common Voice {el}
21
  type: common_voice
22
- args: {el}
23
  metrics:
24
  - name: Test WER
25
  type: wer
26
- value: {1.00}
27
  ---
28
 
29
  # Wav2Vec2-Large-XLSR-53-Greek
30
 
31
- Fine-tuned [facebook/wav2vec2-large-xlsr-53](https://huggingface.co/facebook/wav2vec2-large-xlsr-53) on {Greek} using the [Common Voice](https://huggingface.co/datasets/common_voice), ... and ... dataset{s}. #TODO: replace {language} with your language, *e.g.* French and eventually add more datasets that were used and eventually remove common voice if model was not trained on common voice
32
  When using this model, make sure that your speech input is sampled at 16kHz.
33
 
34
  ## Usage
@@ -41,19 +41,19 @@ import torchaudio
41
  from datasets import load_dataset
42
  from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
43
  test_dataset = load_dataset("common_voice", "el", split="test[:2%]")
44
- processor = Wav2Vec2Processor.from_pretrained("{skylord/greek_lsr_1}") #TODO: replace {model_id} with your model id. The model id consists of {your_username}/{your_modelname}, *e.g.* `elgeish/wav2vec2-large-xlsr-53-arabic`
45
- model = Wav2Vec2ForCTC.from_pretrained("{skylord/greek_lsr_1}") #TODO: replace {model_id} with your model id. The model id consists of {your_username}/{your_modelname}, *e.g.* `elgeish/wav2vec2-large-xlsr-53-arabic`
46
  resampler = torchaudio.transforms.Resample(48_000, 16_000)
47
  # Preprocessing the datasets.
48
  # We need to read the audio files as arrays
49
  def speech_file_to_array_fn(batch):
50
- speech_array, sampling_rate = torchaudio.load(batch["path"])
51
- batch["speech"] = resampler(speech_array).squeeze().numpy()
52
- return batch
53
  test_dataset = test_dataset.map(speech_file_to_array_fn)
54
  inputs = processor(test_dataset["speech"][:2], sampling_rate=16_000, return_tensors="pt", padding=True)
55
  with torch.no_grad():
56
- logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits
57
  predicted_ids = torch.argmax(logits, dim=-1)
58
  print("Prediction:", processor.batch_decode(predicted_ids))
59
  print("Reference:", test_dataset["sentence"][:2])
@@ -62,7 +62,7 @@ print("Reference:", test_dataset["sentence"][:2])
62
 
63
  ## Evaluation
64
 
65
- The model can be evaluated as follows on the {Greek} test data of Common Voice. # TODO: replace #TODO: replace language with your {language}, *e.g.* French
66
 
67
 
68
  ```python
@@ -71,35 +71,35 @@ import torchaudio
71
  from datasets import load_dataset, load_metric
72
  from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
73
  import re
74
- test_dataset = load_dataset("common_voice", "{el}", split="test") #TODO: replace {lang_id} in your language code here. Make sure the code is one of the *ISO codes* of [this](https://huggingface.co/languages) site.
75
  wer = load_metric("wer")
76
- processor = Wav2Vec2Processor.from_pretrained("{skylord/greek_lsr_1}") #TODO: replace {model_id} with your model id. The model id consists of {your_username}/{your_modelname}, *e.g.* `elgeish/wav2vec2-large-xlsr-53-arabic`
77
- model = Wav2Vec2ForCTC.from_pretrained("{skylord/greek_lsr_1}") #TODO: replace {model_id} with your model id. The model id consists of {your_username}/{your_modelname}, *e.g.* `elgeish/wav2vec2-large-xlsr-53-arabic`
78
  model.to("cuda")
79
- chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"\“]' # TODO: adapt this list to include all special characters you removed from the data
80
  resampler = torchaudio.transforms.Resample(48_000, 16_000)
81
  # Preprocessing the datasets.
82
  # We need to read the audio files as arrays
83
  def speech_file_to_array_fn(batch):
84
- batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower()
85
- speech_array, sampling_rate = torchaudio.load(batch["path"])
86
- batch["speech"] = resampler(speech_array).squeeze().numpy()
87
- return batch
88
  test_dataset = test_dataset.map(speech_file_to_array_fn)
89
  # Preprocessing the datasets.
90
  # We need to read the audio files as arrays
91
  def evaluate(batch):
92
- inputs = processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)
93
- with torch.no_grad():
94
- logits = model(inputs.input_values.to("cuda"), attention_mask=inputs.attention_mask.to("cuda")).logits
95
- pred_ids = torch.argmax(logits, dim=-1)
96
- batch["pred_strings"] = processor.batch_decode(pred_ids)
97
- return batch
98
  result = test_dataset.map(evaluate, batched=True, batch_size=8)
99
  print("WER: {:2f}".format(100 * wer.compute(predictions=result["pred_strings"], references=result["sentence"])))
100
  ```
101
 
102
- **Test Result**: 1.00 % # TODO: write output of print here. IMPORTANT: Please remember to also replace {wer_result_on_test} at the top of with this value here. tags.
103
 
104
 
105
  ## Training
 
1
  ---
2
+ language: el
3
  datasets:
4
  - common_voice
5
  metrics:
 
11
  - xlsr-fine-tuning-week
12
  license: apache-2.0
13
  model-index:
14
+ - name: Greek XLSR Wav2Vec2 Large 53
15
  results:
16
  - task:
17
  name: Speech Recognition
18
  type: automatic-speech-recognition
19
  dataset:
20
+ name: Common Voice el
21
  type: common_voice
22
+ args: el
23
  metrics:
24
  - name: Test WER
25
  type: wer
26
+ value: ???
27
  ---
28
 
29
  # Wav2Vec2-Large-XLSR-53-Greek
30
 
31
+ Fine-tuned [facebook/wav2vec2-large-xlsr-53](https://huggingface.co/facebook/wav2vec2-large-xlsr-53) on Greek using the [Common Voice](https://huggingface.co/datasets/common_voice) dataset.
32
  When using this model, make sure that your speech input is sampled at 16kHz.
33
 
34
  ## Usage
 
41
  from datasets import load_dataset
42
  from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
43
  test_dataset = load_dataset("common_voice", "el", split="test[:2%]")
44
+ processor = Wav2Vec2Processor.from_pretrained("skylord/greek_lsr_1")
45
+ model = Wav2Vec2ForCTC.from_pretrained("skylord/greek_lsr_1")
46
  resampler = torchaudio.transforms.Resample(48_000, 16_000)
47
  # Preprocessing the datasets.
48
  # We need to read the audio files as arrays
49
  def speech_file_to_array_fn(batch):
50
+ \tspeech_array, sampling_rate = torchaudio.load(batch["path"])
51
+ \tbatch["speech"] = resampler(speech_array).squeeze().numpy()
52
+ \treturn batch
53
  test_dataset = test_dataset.map(speech_file_to_array_fn)
54
  inputs = processor(test_dataset["speech"][:2], sampling_rate=16_000, return_tensors="pt", padding=True)
55
  with torch.no_grad():
56
+ \tlogits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits
57
  predicted_ids = torch.argmax(logits, dim=-1)
58
  print("Prediction:", processor.batch_decode(predicted_ids))
59
  print("Reference:", test_dataset["sentence"][:2])
 
62
 
63
  ## Evaluation
64
 
65
+ The model can be evaluated as follows on the Greek test data of Common Voice.
66
 
67
 
68
  ```python
 
71
  from datasets import load_dataset, load_metric
72
  from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
73
  import re
74
+ test_dataset = load_dataset("common_voice", "el", split="test")
75
  wer = load_metric("wer")
76
+ processor = Wav2Vec2Processor.from_pretrained("skylord/greek_lsr_1")
77
+ model = Wav2Vec2ForCTC.from_pretrained("skylord/greek_lsr_1")
78
  model.to("cuda")
79
+ chars_to_ignore_regex = '[\\,\\?\\.\\!\\-\\;\\:\\"\\“]' # TODO: adapt this list to include all special characters you removed from the data
80
  resampler = torchaudio.transforms.Resample(48_000, 16_000)
81
  # Preprocessing the datasets.
82
  # We need to read the audio files as arrays
83
  def speech_file_to_array_fn(batch):
84
+ \tbatch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower()
85
+ \tspeech_array, sampling_rate = torchaudio.load(batch["path"])
86
+ \tbatch["speech"] = resampler(speech_array).squeeze().numpy()
87
+ \treturn batch
88
  test_dataset = test_dataset.map(speech_file_to_array_fn)
89
  # Preprocessing the datasets.
90
  # We need to read the audio files as arrays
91
  def evaluate(batch):
92
+ \tinputs = processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)
93
+ \twith torch.no_grad():
94
+ \t\tlogits = model(inputs.input_values.to("cuda"), attention_mask=inputs.attention_mask.to("cuda")).logits
95
+ \tpred_ids = torch.argmax(logits, dim=-1)
96
+ \tbatch["pred_strings"] = processor.batch_decode(pred_ids)
97
+ \treturn batch
98
  result = test_dataset.map(evaluate, batched=True, batch_size=8)
99
  print("WER: {:2f}".format(100 * wer.compute(predictions=result["pred_strings"], references=result["sentence"])))
100
  ```
101
 
102
+ **Test Result**: 1.00 %
103
 
104
 
105
  ## Training