# Wav2Vec2-Large-XLSR-53-Marathi
### Fine-tuned facebook/wav2vec2-large-xlsr-53 on Marathi using the OpenSLR SLR64 dataset and the InterSpeech 2021 Marathi datasets. Note that the OpenSLR data contains only female voices; please keep this in mind before using the model for your task. When using this model, make sure that your speech input is sampled at 16 kHz.
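
If your audio is not already sampled at 16 kHz, resample it first. A minimal sketch using torchaudio (with a hypothetical input file `clip.wav`, not part of the original card):

```
import torchaudio

# Hypothetical input file; torchaudio.load returns (waveform, sampling_rate).
speech_array, sampling_rate = torchaudio.load("clip.wav")
if sampling_rate != 16_000:
    # Resample the waveform to the 16 kHz rate the model expects.
    resampler = torchaudio.transforms.Resample(orig_freq=sampling_rate, new_freq=16_000)
    speech_array = resampler(speech_array)
```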

## Usage
The model can be used directly (without a language model) as follows, assuming you have a dataset with Marathi `text` and `audio_path` fields:

```
import torch
import torchaudio
import librosa
from datasets import load_dataset
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

# test_data = #TODO: WRITE YOUR CODE TO LOAD THE TEST DATASET. For sample see the Colab link in Training Section.

processor = Wav2Vec2Processor.from_pretrained("gchhablani/wav2vec2-large-xlsr-mr-3")
model = Wav2Vec2ForCTC.from_pretrained("gchhablani/wav2vec2-large-xlsr-mr-3")

# Preprocessing the datasets.
# We need to read the audio files as arrays.
def speech_file_to_array_fn(batch):
    speech_array, sampling_rate = torchaudio.load(batch["audio_path"])
    # The source sampling rate can vary, so resample to 16 kHz.
    batch["speech"] = librosa.resample(speech_array[0].numpy(), orig_sr=sampling_rate, target_sr=16_000)
    return batch

test_data = test_data.map(speech_file_to_array_fn)
inputs = processor(test_data["speech"][:2], sampling_rate=16_000, return_tensors="pt", padding=True)

with torch.no_grad():
    logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits

predicted_ids = torch.argmax(logits, dim=-1)

print("Prediction:", processor.batch_decode(predicted_ids))
print("Reference:", test_data["text"][:2])
```

## Evaluation
The model can be evaluated as follows on 10% of the Marathi data from OpenSLR.

```
import torch
import torchaudio
import librosa
from datasets import load_dataset, load_metric
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import re

# test_data = #TODO: WRITE YOUR CODE TO LOAD THE TEST DATASET. For sample see the Colab link in Training Section.

wer = load_metric("wer")
processor = Wav2Vec2Processor.from_pretrained("gchhablani/wav2vec2-large-xlsr-mr-3")
model = Wav2Vec2ForCTC.from_pretrained("gchhablani/wav2vec2-large-xlsr-mr-3")
model.to("cuda")

chars_to_ignore_regex = r'[\,\?\.\!\-\;\:\"\“\%\‘\”\�\–\…]'

# Preprocessing the datasets.
# We need to normalize the text and read the audio files as arrays.
def speech_file_to_array_fn(batch):
    batch["text"] = re.sub(chars_to_ignore_regex, '', batch["text"]).lower()
    speech_array, sampling_rate = torchaudio.load(batch["audio_path"])
    batch["speech"] = librosa.resample(speech_array[0].numpy(), orig_sr=sampling_rate, target_sr=16_000)
    return batch

test_data = test_data.map(speech_file_to_array_fn)

# Run batched inference on the GPU and collect the predicted transcriptions.
def evaluate(batch):
    inputs = processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)
    with torch.no_grad():
        logits = model(inputs.input_values.to("cuda"), attention_mask=inputs.attention_mask.to("cuda")).logits
    pred_ids = torch.argmax(logits, dim=-1)
    batch["pred_strings"] = processor.batch_decode(pred_ids)
    return batch

result = test_data.map(evaluate, batched=True, batch_size=8)
print("WER: {:.2f}".format(100 * wer.compute(predictions=result["pred_strings"], references=result["text"])))
```
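
The 10% split above is assumed to already exist. One hedged way to produce it with `datasets` (assuming a hypothetical metadata CSV `slr64.csv` with `text` and `audio_path` columns):

```
from datasets import load_dataset

# Hypothetical metadata file for the OpenSLR SLR64 audio.
full_data = load_dataset("csv", data_files="slr64.csv")["train"]
# Hold out 10% for evaluation; seed fixed for reproducibility.
test_data = full_data.train_test_split(test_size=0.1, seed=42)["test"]
```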