tanmaylaud committed on
Commit
16efafc
1 Parent(s): cbeb18d

Update README.md

Browse files

Added eval for common voice hindi

Files changed (1) hide show
  1. README.md +76 -18
README.md CHANGED
@@ -31,8 +31,23 @@ model-index:
31
  # Wav2Vec2-Large-XLSR-53-Hindi-Marathi
32
  Fine-tuned facebook/wav2vec2-large-xlsr-53 on Hindi and Marathi using the OpenSLR SLR64 datasets. When using this model, make sure that your speech input is sampled at 16kHz.
33
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
  ## Usage
35
- The model can be used directly (without a language model) as follows, assuming you have a dataset with Marathi text and audio_path fields:
 
 
36
 
37
  ```python
38
  import torch
@@ -41,16 +56,23 @@ import librosa
41
  from datasets import load_dataset
42
  from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
43
 
44
- # test_data = #TODO: WRITE YOUR CODE TO LOAD THE TEST DATASET. For sample see the Colab link in Training Section.
 
45
 
46
- processor = Wav2Vec2Processor.from_pretrained("tanmaylaud/wav2vec2-large-xlsr-hindi-marathi")
47
- model = Wav2Vec2ForCTC.from_pretrained("tanmaylaud/wav2vec2-large-xlsr-hindi-marathi")
 
48
 
49
  # Preprocessing the datasets.
50
  # We need to read the audio files as arrays
51
  def speech_file_to_array_fn(batch):
52
- speech_array, sampling_rate = torchaudio.load(batch["audio_path"])
53
- batch["speech"] = librosa.resample(speech_array[0].numpy(), sampling_rate, 16_000) # sampling_rate can vary
 
 
 
 
 
54
  return batch
55
 
56
  test_data= test_data.map(speech_file_to_array_fn)
@@ -63,25 +85,20 @@ predicted_ids = torch.argmax(logits, dim=-1)
63
 
64
  print("Prediction:", processor.batch_decode(predicted_ids))
65
  print("Reference:", test_data["text"][:2])
66
- Evaluation
67
- The model can be evaluated as follows on 10% of the Marathi data on OpenSLR.
68
 
 
69
  ```python
70
  import torchaudio
71
- from datasets import load_metric
72
- from transformers import Wav2Vec2Processor,Wav2Vec2ForCTC
73
  import torch
74
  import librosa
75
  import numpy as np
76
  import re
77
 
78
- wer = load_metric("wer")
79
- processor = Wav2Vec2Processor.from_pretrained("tanmaylaud/wav2vec2-large-xlsr-hindi-marathi")
80
- model = Wav2Vec2ForCTC.from_pretrained("tanmaylaud/wav2vec2-large-xlsr-hindi-marathi")
81
 
82
- model.to("cuda")
83
 
84
- chars_to_ignore_regex = '[\\\\\\\\,\\\\\\\\?\\\\\\\\.\\\\\\\\!\\\\\\\\-\\\\\\\\;\\\\\\\\:\\\\\\\\"\\\\\\\\“\\\\\\\\%\\\\\\\\‘\\\\\\\\”\\\\\\\\�\\\\\\\\।]'
85
 
86
  # Preprocessing the datasets.
87
  # We need to read the audio files as arrays
@@ -104,12 +121,53 @@ def evaluate(batch):
104
  with torch.no_grad():
105
  logits = model(inputs.input_values.to("cuda"), attention_mask=inputs.attention_mask.to("cuda")).logits
106
  pred_ids = torch.argmax(logits, dim=-1)
107
- batch["pred_strings"] = processor.batch_decode(pred_ids, group_tokens=False)
108
  # we do not want to group tokens when computing the metrics
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
109
  return batch
110
 
111
- result = test.map(evaluate, batched=True, batch_size=32)
112
- print("WER: {:2f}".format(100 * wer.compute(predictions=result["pred_strings"], references=result["text"])))
 
 
 
 
113
  ```
114
 
115
  Link to eval notebook : https://colab.research.google.com/drive/1nZRTgKfxCD9cvy90wikTHkg2il3zgcqW#scrollTo=cXWFbhb0d7DT
31
  # Wav2Vec2-Large-XLSR-53-Hindi-Marathi
32
  Fine-tuned facebook/wav2vec2-large-xlsr-53 on Hindi and Marathi using the OpenSLR SLR64 datasets. When using this model, make sure that your speech input is sampled at 16kHz.
33
 
34
+ ## Installation
35
+ pip install git+https://github.com/huggingface/transformers.git datasets librosa torch==1.7.0 torchaudio==0.7.0 jiwer
36
+
37
+ ## Eval dataset:
38
+ !wget https://www.openslr.org/resources/103/Marathi_test.zip -P data/marathi
39
+ !unzip -P "K3[2?do9" data/marathi/Marathi_test.zip -d data/marathi/.
40
+ !tar -xzf data/marathi/Marathi_test.tar.gz -C data/marathi/.
41
+ !wget https://www.openslr.org/resources/103/Hindi_test.zip -P data/hindi
42
+ !unzip -P "w9I2{3B*" data/hindi/Hindi_test.zip -d data/hindi/.
43
+ !tar -xzf data/hindi/Hindi_test.tar.gz -C data/hindi/.
44
+ !wget -O test.csv 'https://filebin.net/snrz6bt13usv8w2e/test_large.csv?t=ps3n99ho'
45
+ If the download does not work, paste this link into your browser: https://filebin.net/snrz6bt13usv8w2e/test_large.csv
46
+
47
  ## Usage
48
+ The model can be used directly (without a language model) as follows, assuming you have a dataset with Marathi text and path fields:
49
+
50
+
51
 
52
  ```python
53
  import torch
56
  from datasets import load_dataset
57
  from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
58
 
59
+ from datasets import load_metric, Dataset
60
+ from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
61
 
62
+ wer = load_metric("wer")
63
+ processor = Wav2Vec2Processor.from_pretrained('tanmaylaud/wav2vec2-large-xlsr-hindi-marathi')
64
+ model = Wav2Vec2ForCTC.from_pretrained('tanmaylaud/wav2vec2-large-xlsr-hindi-marathi').to("cuda")
65
 
66
  # Preprocessing the datasets.
67
  # We need to read the audio files as arrays
68
  def speech_file_to_array_fn(batch):
69
+ batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"])
70
+ speech_array, sampling_rate = torchaudio.load(batch["path"])
71
+ batch["speech"] = speech_array[0].numpy()
72
+ batch["sampling_rate"] = sampling_rate
73
+ batch["target_text"] = batch["sentence"]
74
+ batch["speech"] = librosa.resample(np.asarray(batch["speech"]), sampling_rate, 16_000)
75
+ batch["sampling_rate"] = 16_000
76
  return batch
77
 
78
  test_data= test_data.map(speech_file_to_array_fn)
85
 
86
  print("Prediction:", processor.batch_decode(predicted_ids))
87
  print("Reference:", test_data["text"][:2])
88
+ ```
 
89
 
90
+ #### Code for Evaluation on OpenSLR (Hindi + Marathi: https://filebin.net/snrz6bt13usv8w2e/test_large.csv)
91
  ```python
92
  import torchaudio
 
 
93
  import torch
94
  import librosa
95
  import numpy as np
96
  import re
97
 
98
+ test = Dataset.from_csv('test.csv')
 
 
99
 
 
100
 
101
+ chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"\“\%\‘\”\�\।]'
102
 
103
  # Preprocessing the datasets.
104
  # We need to read the audio files as arrays
121
  with torch.no_grad():
122
  logits = model(inputs.input_values.to("cuda"), attention_mask=inputs.attention_mask.to("cuda")).logits
123
  pred_ids = torch.argmax(logits, dim=-1)
 
124
  # we do not want to group tokens when computing the metrics
125
+ batch["pred_strings"] = processor.batch_decode(pred_ids)
126
+ return batch
127
+
128
+ test = test.map(evaluate, batched=True, batch_size=32)
129
+ print("WER: {:2f}".format(100 * wer.compute(predictions=test["pred_strings"], references=test["sentence"])))
130
+ ```
131
+
132
+ #### Code for Evaluation on Common Voice Hindi (Common Voice does not have Marathi yet)
133
+ ```python
134
+ import torchaudio
135
+ import torch
136
+ import librosa
137
+ import numpy as np
138
+ import re
139
+ from datasets import load_dataset
140
+
141
+ chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"\“\%\‘\”\�\।]'
142
+
143
+ # Preprocessing the datasets.
144
+ # We need to read the audio files as arrays
145
+ def speech_file_to_array_fn(batch):
146
+ batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"])
147
+ speech_array, sampling_rate = torchaudio.load(batch["path"])
148
+ batch["speech"] = speech_array[0].numpy()
149
+ batch["sampling_rate"] = sampling_rate
150
+ batch["target_text"] = batch["sentence"]
151
+ batch["speech"] = librosa.resample(np.asarray(batch["speech"]), sampling_rate, 16_000)
152
+ batch["sampling_rate"] = 16_000
153
+ return batch
154
+
155
+ #Run prediction on batch
156
+ def evaluate(batch):
157
+ inputs = processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)
158
+ with torch.no_grad():
159
+ logits = model(inputs.input_values.to("cuda"), attention_mask=inputs.attention_mask.to("cuda")).logits
160
+ pred_ids = torch.argmax(logits, dim=-1)
161
+ # we do not want to group tokens when computing the metrics
162
+ batch["pred_strings"] = processor.batch_decode(pred_ids)
163
  return batch
164
 
165
+
166
+ test_data = load_dataset("common_voice", "hi", split="test")
167
+ test_data = test_data.map(speech_file_to_array_fn)
168
+ test_data = test_data.map(evaluate, batched=True, batch_size=32)
169
+ print("WER: {:2f}".format(100 * wer.compute(predictions=test_data["pred_strings"],
170
+ references=test_data["sentence"])))
171
  ```
172
 
173
  Link to eval notebook : https://colab.research.google.com/drive/1nZRTgKfxCD9cvy90wikTHkg2il3zgcqW#scrollTo=cXWFbhb0d7DT