patrickvonplaten committed
Commit 775745d
1 Parent(s): ff45bd7

Update README.md

Files changed (1):
  1. README.md +33 -18
README.md CHANGED
@@ -26,7 +26,7 @@ results:
   value: ???
 ---
 
- # Wav2Vec2-Large-XLSR-53-{Greek}
+ # Wav2Vec2-Large-XLSR-53-Greek
 
 Fine-tuned [facebook/wav2vec2-large-xlsr-53](https://huggingface.co/facebook/wav2vec2-large-xlsr-53) on Greek using the [Common Voice](https://huggingface.co/datasets/common_voice), ... and ... dataset{s}. #TODO: replace {language} with your language, *e.g.* French and eventually add more datasets that were used and eventually remove common voice if model was not trained on common voice
 When using this model, make sure that your speech input is sampled at 16kHz.
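The 16 kHz note above is the one hard input constraint: the XLSR-53 checkpoints are pretrained on 16 kHz speech. As a minimal sketch (the file path is hypothetical), resampling an arbitrary-rate clip before inference looks like this:

```python
import torchaudio

# Load a clip at whatever rate it was recorded with (hypothetical path).
speech_array, sampling_rate = torchaudio.load("my_clip.wav")

# Resample to the 16 kHz the model expects, if necessary.
if sampling_rate != 16_000:
    resampler = torchaudio.transforms.Resample(sampling_rate, 16_000)
    speech_array = resampler(speech_array)
```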
@@ -40,20 +40,26 @@ import torch
 import torchaudio
 from datasets import load_dataset
 from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
- test_dataset = load_dataset("common_voice", "{el}", split="test[:2%]") #TODO: replace {lang_id} in your language code here. Make sure the code is one of the *ISO codes* of [this](https://huggingface.co/languages) site.
+ test_dataset = load_dataset("common_voice", "el", split="test[:2%]")
+
 processor = Wav2Vec2Processor.from_pretrained("skylord/greek_lsr_1")
 model = Wav2Vec2ForCTC.from_pretrained("skylord/greek_lsr_1")
+
 resampler = torchaudio.transforms.Resample(48_000, 16_000)
+
 # Preprocessing the datasets.
 # We need to read the audio files as arrays
 def speech_file_to_array_fn(batch):
- speech_array, sampling_rate = torchaudio.load(batch["path"])
- \tbatch["speech"] = resampler(speech_array).squeeze().numpy()
- \treturn batch
+     speech_array, sampling_rate = torchaudio.load(batch["path"])
+     batch["speech"] = resampler(speech_array).squeeze().numpy()
+     return batch
+
 test_dataset = test_dataset.map(speech_file_to_array_fn)
 inputs = processor(test_dataset["speech"][:2], sampling_rate=16_000, return_tensors="pt", padding=True)
+
 with torch.no_grad():
- \tlogits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits
+     logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits
+
 predicted_ids = torch.argmax(logits, dim=-1)
 print("Prediction:", processor.batch_decode(predicted_ids))
 print("Reference:", test_dataset["sentence"][:2])
@@ -71,30 +77,39 @@ import torchaudio
 from datasets import load_dataset, load_metric
 from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
 import re
+
 test_dataset = load_dataset("common_voice", "el", split="test")
 wer = load_metric("wer")
- processor = Wav2Vec2Processor.from_pretrained("skylord/greek_lsr_}")
+
+ processor = Wav2Vec2Processor.from_pretrained("skylord/greek_lsr_1")
 model = Wav2Vec2ForCTC.from_pretrained("skylord/greek_lsr_1")
 model.to("cuda")
- chars_to_ignore_regex = '[\\,\\?\\.\\!\\-\\;\\:\\"\\“]' # TODO: adapt this list to include all special characters you removed from the data
+
+ chars_to_ignore_regex = '[\\\\,\\\\?\\\\.\\\\!\\\\-\\\\;\\\\:\\\\"\\\\“]'
 resampler = torchaudio.transforms.Resample(48_000, 16_000)
+
 # Preprocessing the datasets.
 # We need to read the audio files as arrays
+
 def speech_file_to_array_fn(batch):
- \tbatch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower()
- \tspeech_array, sampling_rate = torchaudio.load(batch["path"])
- \tbatch["speech"] = resampler(speech_array).squeeze().numpy()
- \treturn batch
+     batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower()
+     speech_array, sampling_rate = torchaudio.load(batch["path"])
+     batch["speech"] = resampler(speech_array).squeeze().numpy()
+     return batch
+
 test_dataset = test_dataset.map(speech_file_to_array_fn)
+
 # Preprocessing the datasets.
 # We need to read the audio files as arrays
+
 def evaluate(batch):
- \tinputs = processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)
- \twith torch.no_grad():
- \t\tlogits = model(inputs.input_values.to("cuda"), attention_mask=inputs.attention_mask.to("cuda")).logits
- \tpred_ids = torch.argmax(logits, dim=-1)
- \tbatch["pred_strings"] = processor.batch_decode(pred_ids)
- \treturn batch
+     inputs = processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)
+     with torch.no_grad():
+         logits = model(inputs.input_values.to("cuda"), attention_mask=inputs.attention_mask.to("cuda")).logits
+     pred_ids = torch.argmax(logits, dim=-1)
+     batch["pred_strings"] = processor.batch_decode(pred_ids)
+     return batch
+
 result = test_dataset.map(evaluate, batched=True, batch_size=8)
 print("WER: {:.2f}".format(100 * wer.compute(predictions=result["pred_strings"], references=result["sentence"])))
 ```
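The metric printed at the end is word error rate: the word-level edit distance (substitutions + deletions + insertions) divided by the number of reference words. A self-contained sketch of that computation, independent of `load_metric("wer")`:

```python
def word_error_rate(reference: str, hypothesis: str) -> float:
    """Word-level Levenshtein distance divided by the reference length."""
    ref, hyp = reference.split(), hypothesis.split()
    # dp[i][j] = edits needed to turn ref[:i] into hyp[:j]
    dp = [[0] * (len(hyp) + 1) for _ in range(len(ref) + 1)]
    for i in range(len(ref) + 1):
        dp[i][0] = i
    for j in range(len(hyp) + 1):
        dp[0][j] = j
    for i in range(1, len(ref) + 1):
        for j in range(1, len(hyp) + 1):
            cost = 0 if ref[i - 1] == hyp[j - 1] else 1
            dp[i][j] = min(dp[i - 1][j] + 1,         # deletion
                           dp[i][j - 1] + 1,         # insertion
                           dp[i - 1][j - 1] + cost)  # substitution
    return dp[-1][-1] / len(ref)

print(word_error_rate("καλημέρα σας", "καλησπέρα σας"))  # 0.5: one substitution over two reference words
```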
 