tanmaylaud committed 16efafc (parent: cbeb18d): Update README.md
Added eval for Common Voice Hindi.

README.md (updated):
# Wav2Vec2-Large-XLSR-53-Hindi-Marathi

Fine-tuned facebook/wav2vec2-large-xlsr-53 on Hindi and Marathi using the OpenSLR SLR64 datasets. When using this model, make sure that your speech input is sampled at 16 kHz.
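If your audio is not already sampled at 16 kHz, one way to resample it up front is with torchaudio (a minimal sketch; the file path is a placeholder):

```python
import torchaudio

# Load a clip and resample it to the 16 kHz the model expects.
speech_array, sampling_rate = torchaudio.load("example.wav")  # placeholder path
if sampling_rate != 16_000:
    resampler = torchaudio.transforms.Resample(orig_freq=sampling_rate, new_freq=16_000)
    speech_array = resampler(speech_array)
```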

## Installation

```bash
pip install git+https://github.com/huggingface/transformers.git datasets librosa torch==1.7.0 torchaudio==0.7.0 jiwer
```

## Eval dataset

```bash
wget https://www.openslr.org/resources/103/Marathi_test.zip -P data/marathi
unzip -P "K3[2?do9" data/marathi/Marathi_test.zip -d data/marathi/.
tar -xzf data/marathi/Marathi_test.tar.gz -C data/marathi/.
wget https://www.openslr.org/resources/103/Hindi_test.zip -P data/hindi
unzip -P "w9I2{3B*" data/hindi/Hindi_test.zip -d data/hindi/.
tar -xzf data/hindi/Hindi_test.tar.gz -C data/hindi/.
wget -O test.csv 'https://filebin.net/snrz6bt13usv8w2e/test_large.csv?t=ps3n99ho'
```

If the download does not work, paste this link into a browser: https://filebin.net/snrz6bt13usv8w2e/test_large.csv
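It can be worth sanity-checking the downloaded manifest before use (a quick sketch; assumes pandas is available and that the CSV carries `path` and `sentence` columns):

```python
import pandas as pd

# Peek at the evaluation manifest downloaded above.
df = pd.read_csv('test.csv')
print(df.columns.tolist())  # expected to include: path, sentence
print(len(df), "utterances")
```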

## Usage

The model can be used directly (without a language model) as follows, assuming you have a dataset with Marathi transcripts in a `sentence` column and audio file paths in a `path` column:
```python
import torch
import torchaudio
import librosa
import numpy as np
import re
from datasets import load_dataset, load_metric, Dataset
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

wer = load_metric("wer")

processor = Wav2Vec2Processor.from_pretrained('tanmaylaud/wav2vec2-large-xlsr-hindi-marathi')
model = Wav2Vec2ForCTC.from_pretrained('tanmaylaud/wav2vec2-large-xlsr-hindi-marathi').to("cuda")

chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"\“\%\‘\”\�\।]'

# Preprocessing the datasets.
# We need to read the audio files as arrays and resample them to 16 kHz.
def speech_file_to_array_fn(batch):
    batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"])
    speech_array, sampling_rate = torchaudio.load(batch["path"])
    batch["speech"] = speech_array[0].numpy()
    batch["sampling_rate"] = sampling_rate
    batch["target_text"] = batch["sentence"]
    batch["speech"] = librosa.resample(np.asarray(batch["speech"]), sampling_rate, 16_000)
    batch["sampling_rate"] = 16_000
    return batch

# test_data: a datasets.Dataset with "sentence" (transcript) and "path" (audio file) columns.
test_data = test_data.map(speech_file_to_array_fn)
inputs = processor(test_data["speech"][:2], sampling_rate=16_000, return_tensors="pt", padding=True)

with torch.no_grad():
    logits = model(inputs.input_values.to("cuda"), attention_mask=inputs.attention_mask.to("cuda")).logits

predicted_ids = torch.argmax(logits, dim=-1)

print("Prediction:", processor.batch_decode(predicted_ids))
print("Reference:", test_data["sentence"][:2])
```
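If you downloaded `test.csv` in the Eval dataset step, one way to construct `test_data` before running the snippet above (a sketch, assuming the CSV has `path` and `sentence` columns, as the evaluation code below also assumes):

```python
from datasets import Dataset

# Build test_data from the CSV downloaded above.
test_data = Dataset.from_csv('test.csv')
```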

#### Code for Evaluation on OpenSLR (Hindi + Marathi: https://filebin.net/snrz6bt13usv8w2e/test_large.csv)

```python
import torchaudio
import torch
import librosa
import numpy as np
import re
from datasets import Dataset

# Reuses the processor, model, and wer metric loaded in the Usage section above.
test = Dataset.from_csv('test.csv')

chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"\“\%\‘\”\�\।]'

# Preprocessing the datasets.
# We need to read the audio files as arrays.
def speech_file_to_array_fn(batch):
    batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"])
    speech_array, sampling_rate = torchaudio.load(batch["path"])
    batch["speech"] = speech_array[0].numpy()
    batch["sampling_rate"] = sampling_rate
    batch["target_text"] = batch["sentence"]
    batch["speech"] = librosa.resample(np.asarray(batch["speech"]), sampling_rate, 16_000)
    batch["sampling_rate"] = 16_000
    return batch

test = test.map(speech_file_to_array_fn)

# Run prediction on a batch.
def evaluate(batch):
    inputs = processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)
    with torch.no_grad():
        logits = model(inputs.input_values.to("cuda"), attention_mask=inputs.attention_mask.to("cuda")).logits
    pred_ids = torch.argmax(logits, dim=-1)
    batch["pred_strings"] = processor.batch_decode(pred_ids)
    return batch

test = test.map(evaluate, batched=True, batch_size=32)
print("WER: {:.2f}".format(100 * wer.compute(predictions=test["pred_strings"], references=test["sentence"])))
```
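Before evaluating the full set, you might smoke-test the pipeline on a small subset (a sketch reusing `test`, `evaluate`, and `wer` from the block above):

```python
# Sanity-check the pipeline on the first 64 utterances only.
subset = test.select(range(64))
subset = subset.map(evaluate, batched=True, batch_size=32)
print("Subset WER: {:.2f}".format(100 * wer.compute(predictions=subset["pred_strings"],
                                                    references=subset["sentence"])))
```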

#### Code for Evaluation on Common Voice Hindi (Common Voice does not have Marathi yet)

```python
import torchaudio
import torch
import librosa
import numpy as np
import re
from datasets import load_dataset

# Reuses the processor, model, and wer metric loaded in the Usage section above.
chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"\“\%\‘\”\�\।]'

# Preprocessing the datasets.
# We need to read the audio files as arrays.
def speech_file_to_array_fn(batch):
    batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"])
    speech_array, sampling_rate = torchaudio.load(batch["path"])
    batch["speech"] = speech_array[0].numpy()
    batch["sampling_rate"] = sampling_rate
    batch["target_text"] = batch["sentence"]
    batch["speech"] = librosa.resample(np.asarray(batch["speech"]), sampling_rate, 16_000)
    batch["sampling_rate"] = 16_000
    return batch

# Run prediction on a batch.
def evaluate(batch):
    inputs = processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)
    with torch.no_grad():
        logits = model(inputs.input_values.to("cuda"), attention_mask=inputs.attention_mask.to("cuda")).logits
    pred_ids = torch.argmax(logits, dim=-1)
    batch["pred_strings"] = processor.batch_decode(pred_ids)
    return batch

test_data = load_dataset("common_voice", "hi", split="test")
test_data = test_data.map(speech_file_to_array_fn)
test_data = test_data.map(evaluate, batched=True, batch_size=32)
print("WER: {:.2f}".format(100 * wer.compute(predictions=test_data["pred_strings"],
                                             references=test_data["sentence"])))
```
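The Common Voice Hindi test split can take a while to decode; `datasets` split slicing lets you load just a fraction for a quick pass (a sketch):

```python
from datasets import load_dataset

# Load only the first 10% of the test split for a quicker evaluation pass.
test_small = load_dataset("common_voice", "hi", split="test[:10%]")
```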

Link to eval notebook: https://colab.research.google.com/drive/1nZRTgKfxCD9cvy90wikTHkg2il3zgcqW#scrollTo=cXWFbhb0d7DT