tanmaylaud committed
Commit 6caf0bf
1 Parent(s): 8629923

Updated README with correct eval script

Files changed (1): README.md (+18 -15)
README.md CHANGED
@@ -67,32 +67,34 @@ Evaluation
 The model can be evaluated as follows on 10% of the Marathi data on OpenSLR.
 ```
 ```
-import torch
 import torchaudio
+from datasets import load_metric
+from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
+import torch
 import librosa
-from datasets import load_dataset, load_metric
-from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
+import numpy as np
 import re
 
-# test_data = #TODO: WRITE YOUR CODE TO LOAD THE TEST DATASET. For sample see the Colab link in Training Section.
-
 wer = load_metric("wer")
-processor = Wav2Vec2Processor.from_pretrained("tanmaylaud/wav2vec2-large-xlsr-hindi-marathi")
-model = Wav2Vec2ForCTC.from_pretrained("tanmaylaud/wav2vec2-large-xlsr-hindi-marathi")
+processor = Wav2Vec2Processor.from_pretrained(output_models_dir)
+model = Wav2Vec2ForCTC.from_pretrained(output_models_dir + '/' + checkpoint)
 model.to("cuda")
 
-chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"\“\%\‘\”\�\–\…]'
-
+chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"\“\%\‘\”\�\।]'
 
 # Preprocessing the datasets.
 # We need to read the audio files as arrays
 def speech_file_to_array_fn(batch):
-    batch["text"] = re.sub(chars_to_ignore_regex, '', batch["text"]).lower()
-    speech_array, sampling_rate = torchaudio.load(batch["audio_path"])
-    batch["speech"] = librosa.resample(speech_array[0].numpy(), sampling_rate, 16_000)
+    batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"])
+    speech_array, sampling_rate = torchaudio.load(batch["path"])
+    batch["speech"] = speech_array[0].numpy()
+    batch["sampling_rate"] = sampling_rate
+    batch["target_text"] = batch["sentence"]
+    batch["speech"] = librosa.resample(np.asarray(batch["speech"]), 8_000, 16_000)
+    batch["sampling_rate"] = 16_000
     return batch
 
-test_data = test_data.map(speech_file_to_array_fn)
+test = test.map(speech_file_to_array_fn)
 
 # Preprocessing the datasets.
 # We need to read the audio files as arrays
@@ -101,9 +103,10 @@ def evaluate(batch):
     with torch.no_grad():
         logits = model(inputs.input_values.to("cuda"), attention_mask=inputs.attention_mask.to("cuda")).logits
     pred_ids = torch.argmax(logits, dim=-1)
-    batch["pred_strings"] = processor.batch_decode(pred_ids)
+    batch["pred_strings"] = processor.batch_decode(pred_ids, group_tokens=False)
+    # we do not want to group tokens when computing the metrics
     return batch
 
-result = test_data.map(evaluate, batched=True, batch_size=8)
+result = test.map(evaluate, batched=True, batch_size=32)
 print("WER: {:2f}".format(100 * wer.compute(predictions=result["pred_strings"], references=result["text"])))
 ```
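
The updated script leaves `test`, `output_models_dir`, and `checkpoint` undefined (the removed TODO line deferred dataset loading to the Colab notebook referenced in the Training section). Below is a minimal sketch of that missing setup, assuming the OpenSLR SLR64 Marathi config hosted on the Hugging Face Hub and a held-out 10% split; `output_models_dir` and `checkpoint` are hypothetical placeholders for a local training run, and the published `tanmaylaud/wav2vec2-large-xlsr-hindi-marathi` checkpoint can be loaded in their place:

```python
from datasets import load_dataset

# Assumption: OpenSLR SLR64 (Marathi) as hosted on the Hugging Face Hub;
# it exposes the "path" and "sentence" columns the script above expects.
marathi = load_dataset("openslr", "SLR64", split="train")

# "10% of the Marathi data": hold out a tenth of it for evaluation.
test = marathi.train_test_split(test_size=0.1, seed=42)["test"]

# Hypothetical placeholders for a local training run ...
output_models_dir = "./wav2vec2-large-xlsr-hindi-marathi"
checkpoint = "checkpoint-3000"
# ... or skip them and load the published model directly:
# processor = Wav2Vec2Processor.from_pretrained("tanmaylaud/wav2vec2-large-xlsr-hindi-marathi")
# model = Wav2Vec2ForCTC.from_pretrained("tanmaylaud/wav2vec2-large-xlsr-hindi-marathi")
```

Note also that the new `speech_file_to_array_fn` hardcodes an 8 kHz source rate in `librosa.resample`; if the audio is not uniformly 8 kHz, passing the `sampling_rate` returned by `torchaudio.load` is the safer choice, and librosa >= 0.10 requires the keyword form, `librosa.resample(y, orig_sr=sampling_rate, target_sr=16_000)`.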
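
The second hunk shows only the body of `evaluate`; its opening line falls outside the diff context. Under the usual XLSR evaluation pattern, the full function plausibly reads as below; the `inputs = processor(...)` line is an assumption, the rest is verbatim from the diff:

```python
def evaluate(batch):
    # Assumed line (outside the hunk): collate the resampled audio into padded model inputs.
    inputs = processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)
    with torch.no_grad():
        logits = model(inputs.input_values.to("cuda"), attention_mask=inputs.attention_mask.to("cuda")).logits
    pred_ids = torch.argmax(logits, dim=-1)
    batch["pred_strings"] = processor.batch_decode(pred_ids, group_tokens=False)
    # we do not want to group tokens when computing the metrics
    return batch
```

One caveat when running the result: the final `wer.compute(...)` call reads references from `result["text"]`, but the updated preprocessing stores them under `sentence` and `target_text`, so the references key likely needs to change to one of those.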