aapot committed
Commit 7c0c2a7
1 Parent(s): 4f0c069

Update README.md

Files changed (1)
  1. README.md +16 -16
README.md CHANGED
@@ -51,15 +51,15 @@ resampler = torchaudio.transforms.Resample(48_000, 16_000)
  # Preprocessing the datasets.
  # We need to read the aduio files as arrays
  def speech_file_to_array_fn(batch):
- speech_array, sampling_rate = torchaudio.load(batch["path"])
- batch["speech"] = resampler(speech_array).squeeze().numpy()
- return batch
+ \tspeech_array, sampling_rate = torchaudio.load(batch["path"])
+ \tbatch["speech"] = resampler(speech_array).squeeze().numpy()
+ \treturn batch

  test_dataset = test_dataset.map(speech_file_to_array_fn)
  inputs = processor(test_dataset["speech"][:2], sampling_rate=16_000, return_tensors="pt", padding=True)

  with torch.no_grad():
- logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits
+ \tlogits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits

  predicted_ids = torch.argmax(logits, dim=-1)

@@ -87,30 +87,30 @@ processor = Wav2Vec2Processor.from_pretrained("aapot/wav2vec2-large-xlsr-53-finnish")
  model = Wav2Vec2ForCTC.from_pretrained("aapot/wav2vec2-large-xlsr-53-finnish")
  model.to("cuda")

- chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"\“\%\‘\”\�\'\...\…\–\é]' # TODO: adapt this list to include all special characters you removed from the data
+ chars_to_ignore_regex = '[\\,\\?\\.\\!\\-\\;\\:\\"\\“\\%\\‘\\”\\�\\'\\...\\…\\–\\é]'
  resampler = torchaudio.transforms.Resample(48_000, 16_000)

  # Preprocessing the datasets.
  # We need to read the aduio files as arrays
  def speech_file_to_array_fn(batch):
- batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower()
- speech_array, sampling_rate = torchaudio.load(batch["path"])
- batch["speech"] = resampler(speech_array).squeeze().numpy()
- return batch
+ \tbatch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower()
+ \tspeech_array, sampling_rate = torchaudio.load(batch["path"])
+ \tbatch["speech"] = resampler(speech_array).squeeze().numpy()
+ \treturn batch

  test_dataset = test_dataset.map(speech_file_to_array_fn)

  # Preprocessing the datasets.
  # We need to read the aduio files as arrays
  def evaluate(batch):
- inputs = processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)
+ \tinputs = processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)

- with torch.no_grad():
- logits = model(inputs.input_values.to("cuda"), attention_mask=inputs.attention_mask.to("cuda")).logits
+ \twith torch.no_grad():
+ \t\tlogits = model(inputs.input_values.to("cuda"), attention_mask=inputs.attention_mask.to("cuda")).logits

- pred_ids = torch.argmax(logits, dim=-1)
- batch["pred_strings"] = processor.batch_decode(pred_ids)
- return batch
+ \tpred_ids = torch.argmax(logits, dim=-1)
+ \tbatch["pred_strings"] = processor.batch_decode(pred_ids)
+ \treturn batch

  result = test_dataset.map(evaluate, batched=True, batch_size=8)

@@ -124,4 +124,4 @@ print("WER: {:2f}".format(100 * wer.compute(predictions=result["pred_strings"],

  The Common Voice `train` and `validation` datasets were used for training as well as CSS10 Finnish and Finnish parliament session 2 datasets.

- The script used for training COMING SOON # TODO: fill in a link to your training script here. If you trained your model in a colab, simply fill in the link here. If you trained the model locally, it would be great if you could upload the training script on github and paste the link here.
+ The script used for training COMING SOON
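
The main functional change in this commit is the properly escaped `chars_to_ignore_regex`; the surrounding evaluation code strips those characters from each transcript and lower-cases it before scoring. As a rough illustration of that cleaning step (using a simplified character class, not the exact pattern from the README), it behaves like this:

```python
import re

# Simplified stand-in for the README's chars_to_ignore_regex (the real pattern
# also covers a few extra symbols such as the Unicode replacement character).
chars_to_ignore_regex = r"""[,?.!;:"'“”‘’%…–-]"""

def normalize_sentence(sentence: str) -> str:
    # Mirrors the README's preprocessing of batch["sentence"]:
    # drop the ignored characters, then lower-case.
    return re.sub(chars_to_ignore_regex, "", sentence).lower()

print(normalize_sentence("Hyvää huomenta, Suomi!"))  # -> hyvää huomenta suomi
```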
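The final hunk's context line reports the word error rate via `wer.compute`. The `wer` object itself is not shown in this diff; assuming it is the 🤗 Datasets WER metric (built with `load_metric("wer")`, as in the XLSR evaluation template of that time), the scoring step reduces to roughly the following sketch, with toy strings standing in for `result["pred_strings"]` and `result["sentence"]`:

```python
from datasets import load_metric  # assumption: the README builds `wer` this way

wer = load_metric("wer")  # requires the `jiwer` package

# Toy predictions/references standing in for the columns produced by evaluate().
predictions = ["hyvää huomenta suomi", "kiitos paljon"]
references = ["hyvää huomenta suomi", "kiitos oikein paljon"]

# One deleted word out of six reference words -> about 16.67 for this toy pair.
print("WER: {:.2f}".format(100 * wer.compute(predictions=predictions, references=references)))
```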