gchhablani
committed on
Commit
•
4543731
1
Parent(s):
c30dc58
Update README.md
Browse files
README.md
CHANGED
@@ -27,8 +27,7 @@ model-index:
|
|
27 |
|
28 |
# Wav2Vec2-Large-XLSR-53-Marathi
|
29 |
|
30 |
-
Fine-tuned [facebook/wav2vec2-large-xlsr-53](https://huggingface.co/facebook/wav2vec2-large-xlsr-53) on Marathi using the [OpenSLR SLR64](http://openslr.org/64/) dataset.
|
31 |
-
When using this model, make sure that your speech input is sampled at 16kHz.
|
32 |
|
33 |
## Usage
|
34 |
|
@@ -50,15 +49,15 @@ resampler = torchaudio.transforms.Resample(48_000, 16_000) # The original data w
|
|
50 |
# Preprocessing the datasets.
|
51 |
# We need to read the audio files as arrays
|
52 |
def speech_file_to_array_fn(batch):
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
|
57 |
test_dataset = test_dataset.map(speech_file_to_array_fn)
|
58 |
inputs = processor(test_dataset["speech"][:2], sampling_rate=16_000, return_tensors="pt", padding=True)
|
59 |
|
60 |
with torch.no_grad():
|
61 |
-
|
62 |
|
63 |
predicted_ids = torch.argmax(logits, dim=-1)
|
64 |
|
@@ -86,30 +85,30 @@ processor = Wav2Vec2Processor.from_pretrained("gchhablani/wav2vec2-large-xlsr-mr
|
|
86 |
model = Wav2Vec2ForCTC.from_pretrained("gchhablani/wav2vec2-large-xlsr-mr")
|
87 |
model.to("cuda")
|
88 |
|
89 |
-
chars_to_ignore_regex = '[
|
90 |
resampler = torchaudio.transforms.Resample(48_000, 16_000)
|
91 |
|
92 |
# Preprocessing the datasets.
|
93 |
# We need to read the audio files as arrays
|
94 |
def speech_file_to_array_fn(batch):
|
95 |
-
|
96 |
-
|
97 |
-
|
98 |
-
|
99 |
|
100 |
test_dataset = test_dataset.map(speech_file_to_array_fn)
|
101 |
|
102 |
# Preprocessing the datasets.
|
103 |
# We need to read the audio files as arrays
|
104 |
def evaluate(batch):
|
105 |
-
|
106 |
|
107 |
-
|
108 |
-
|
109 |
|
110 |
-
|
111 |
-
|
112 |
-
|
113 |
|
114 |
result = test_dataset.map(evaluate, batched=True, batch_size=8)
|
115 |
|
@@ -121,4 +120,4 @@ print("WER: {:2f}".format(100 * wer.compute(predictions=result["pred_strings"],
|
|
121 |
## Training
|
122 |
|
123 |
90% of the OpenSLR Marathi dataset was used for training.
|
124 |
-
The colab notebook used for training can be found [here](https://colab.research.google.com/drive/1_BbLyLqDUsXG3RpSULfLRjC6UY3RjwME?usp=sharing)
|
27 |
|
28 |
# Wav2Vec2-Large-XLSR-53-Marathi
|
29 |
|
30 |
+
Fine-tuned [facebook/wav2vec2-large-xlsr-53](https://huggingface.co/facebook/wav2vec2-large-xlsr-53) on Marathi using the [OpenSLR SLR64](http://openslr.org/64/) dataset. Note that this data contains only female voices. Please keep this in mind before using the model for your task. When using this model, make sure that your speech input is sampled at 16kHz.
|
|
|
31 |
|
32 |
## Usage
|
33 |
|
49 |
# Preprocessing the datasets.
|
50 |
# We need to read the audio files as arrays
|
51 |
def speech_file_to_array_fn(batch):
|
52 |
+
\tspeech_array, sampling_rate = torchaudio.load(batch["path"])
|
53 |
+
\tbatch["speech"] = resampler(speech_array).squeeze().numpy()
|
54 |
+
\treturn batch
|
55 |
|
56 |
test_dataset = test_dataset.map(speech_file_to_array_fn)
|
57 |
inputs = processor(test_dataset["speech"][:2], sampling_rate=16_000, return_tensors="pt", padding=True)
|
58 |
|
59 |
with torch.no_grad():
|
60 |
+
\tlogits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits
|
61 |
|
62 |
predicted_ids = torch.argmax(logits, dim=-1)
|
63 |
|
85 |
model = Wav2Vec2ForCTC.from_pretrained("gchhablani/wav2vec2-large-xlsr-mr")
|
86 |
model.to("cuda")
|
87 |
|
88 |
+
chars_to_ignore_regex = '[\\,\\?\\.\\!\\-\\;\\:\\"\\“\\%\\‘\\”\\�\\–\\…]'
|
89 |
resampler = torchaudio.transforms.Resample(48_000, 16_000)
|
90 |
|
91 |
# Preprocessing the datasets.
|
92 |
# We need to read the audio files as arrays
|
93 |
def speech_file_to_array_fn(batch):
|
94 |
+
\tbatch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower()
|
95 |
+
\tspeech_array, sampling_rate = torchaudio.load(batch["path"])
|
96 |
+
\tbatch["speech"] = resampler(speech_array).squeeze().numpy()
|
97 |
+
\treturn batch
|
98 |
|
99 |
test_dataset = test_dataset.map(speech_file_to_array_fn)
|
100 |
|
101 |
# Preprocessing the datasets.
|
102 |
# We need to read the audio files as arrays
|
103 |
def evaluate(batch):
|
104 |
+
\tinputs = processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)
|
105 |
|
106 |
+
\twith torch.no_grad():
|
107 |
+
\t\tlogits = model(inputs.input_values.to("cuda"), attention_mask=inputs.attention_mask.to("cuda")).logits
|
108 |
|
109 |
+
\tpred_ids = torch.argmax(logits, dim=-1)
|
110 |
+
\tbatch["pred_strings"] = processor.batch_decode(pred_ids)
|
111 |
+
\treturn batch
|
112 |
|
113 |
result = test_dataset.map(evaluate, batched=True, batch_size=8)
|
114 |
|
120 |
## Training
|
121 |
|
122 |
90% of the OpenSLR Marathi dataset was used for training.
|
123 |
+
The colab notebook used for training can be found [here](https://colab.research.google.com/drive/1_BbLyLqDUsXG3RpSULfLRjC6UY3RjwME?usp=sharing).
|