tanmaylaud committed on
Commit
d7aa717
1 Parent(s): 6750fe1

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +8 -2
README.md CHANGED
@@ -90,6 +90,8 @@ print("Prediction:", processor.batch_decode(predicted_ids))
90
  print("Reference:", test_data["text"][:2])
91
  ```
92
 
 
 
93
  # Code for Evaluation on OpenSLR (Hindi + Marathi : https://filebin.net/snrz6bt13usv8w2e/test_large.csv)
94
  ```python
95
  import torchaudio
@@ -101,7 +103,7 @@ import re
101
  test = Dataset.from_csv('test.csv')
102
 
103
 
104
- chars_to_ignore_regex = '[\\\\,\\\\?\\\\.\\\\!\\\\-\\\\;\\\\:\\\\"\\\\“\\\\%\\\\‘\\\\”\\\\�\\\\।]'
105
 
106
  # Preprocessing the datasets.
107
  # We need to read the audio files as arrays
@@ -132,6 +134,8 @@ test = test.map(evaluate, batched=True, batch_size=32)
132
  print("WER: {:2f}".format(100 * wer.compute(predictions=test["pred_strings"], references=test["sentence"])))
133
  ```
134
 
 
 
135
  #### Code for Evaluation on Common Voice Hindi (Common Voice does not have Marathi yet)
136
  ```python
137
  import torchaudio
@@ -141,7 +145,7 @@ import numpy as np
141
  import re
142
  from datasets import load_dataset
143
 
144
- chars_to_ignore_regex = '[\\\\,\\\\?\\\\.\\\\!\\\\-\\\\;\\\\:\\\\"\\\\“\\\\%\\\\‘\\\\”\\\\�\\\\।]'
145
 
146
  # Preprocessing the datasets.
147
  # We need to read the audio files as arrays
@@ -176,4 +180,6 @@ print("WER: {:2f}".format(100 * wer.compute(predictions=test_data["pred_strings"
176
  Link to eval notebook : https://colab.research.google.com/drive/1nZRTgKfxCD9cvy90wikTHkg2il3zgcqW#scrollTo=cXWFbhb0d7DT
177
 
178
  WER: 24.944955% (OpenSLR Hindi+Marathi Test set: https://filebin.net/snrz6bt13usv8w2e/test_large.csv)
 
 
179
  WER: 49.303944% (Common Voice Hindi Test Split)
90
  print("Reference:", test_data["text"][:2])
91
  ```
92
 
93
+
94
+
95
  # Code for Evaluation on OpenSLR (Hindi + Marathi : https://filebin.net/snrz6bt13usv8w2e/test_large.csv)
96
  ```python
97
  import torchaudio
103
  test = Dataset.from_csv('test.csv')
104
 
105
 
106
+ chars_to_ignore_regex = '[\\\\\\\\,\\\\\\\\?\\\\\\\\.\\\\\\\\!\\\\\\\\-\\\\\\\\;\\\\\\\\:\\\\\\\\"\\\\\\\\“\\\\\\\\%\\\\\\\\‘\\\\\\\\”\\\\\\\\�\\\\\\\\।]'
107
 
108
  # Preprocessing the datasets.
109
  # We need to read the audio files as arrays
134
  print("WER: {:2f}".format(100 * wer.compute(predictions=test["pred_strings"], references=test["sentence"])))
135
  ```
136
 
137
+
138
+
139
  #### Code for Evaluation on Common Voice Hindi (Common Voice does not have Marathi yet)
140
  ```python
141
  import torchaudio
145
  import re
146
  from datasets import load_dataset
147
 
148
+ chars_to_ignore_regex = '[\\\\\\\\,\\\\\\\\?\\\\\\\\.\\\\\\\\!\\\\\\\\-\\\\\\\\;\\\\\\\\:\\\\\\\\"\\\\\\\\“\\\\\\\\%\\\\\\\\‘\\\\\\\\”\\\\\\\\�\\\\\\\\।]'
149
 
150
  # Preprocessing the datasets.
151
  # We need to read the audio files as arrays
180
  Link to eval notebook : https://colab.research.google.com/drive/1nZRTgKfxCD9cvy90wikTHkg2il3zgcqW#scrollTo=cXWFbhb0d7DT
181
 
182
  WER: 24.944955% (OpenSLR Hindi+Marathi Test set: https://filebin.net/snrz6bt13usv8w2e/test_large.csv)
183
+
184
+
185
  WER: 49.303944% (Common Voice Hindi Test Split)