Update README.md
Browse files
README.md
CHANGED
@@ -4,6 +4,7 @@ datasets:
|
|
4 |
- Indic TTS Malayalam Speech Corpus
|
5 |
- Openslr Malayalam Speech Corpus
|
6 |
- SMC Malayalam Speech Corpus
|
|
|
7 |
metrics:
|
8 |
- wer
|
9 |
tags:
|
@@ -25,12 +26,12 @@ model-index:
|
|
25 |
metrics:
|
26 |
- name: Test WER
|
27 |
type: wer
|
28 |
-
value:
|
29 |
---
|
30 |
|
31 |
# Wav2Vec2-Large-XLSR-53-ml
|
32 |
|
33 |
-
Fine-tuned [facebook/wav2vec2-large-xlsr-53](https://huggingface.co/facebook/wav2vec2-large-xlsr-53) on ml using the [Indic TTS Malayalam Speech Corpus (via Kaggle)](https://www.kaggle.com/kavyamanohar/indic-tts-malayalam-speech-corpus), [Openslr Malayalam Speech Corpus](http://openslr.org/63/), [SMC Malayalam Speech Corpus](https://blog.smc.org.in/malayalam-speech-corpus/).
|
34 |
When using this model, make sure that your speech input is sampled at 16kHz.
|
35 |
|
36 |
## Usage
|
@@ -84,35 +85,44 @@ import re
|
|
84 |
from datasets import load_dataset, load_metric
|
85 |
from pathlib import Path
|
86 |
|
|
|
87 |
data_dir = Path('<path-to-custom-dataset>')
|
88 |
|
89 |
dataset_folders = {
|
|
|
90 |
'openslr': 'openslr',
|
91 |
'indic-tts': 'indic-tts-ml',
|
|
|
92 |
}
|
93 |
|
94 |
# Set directories for datasets
|
95 |
openslr_male_dir = data_dir / dataset_folders['openslr'] / 'male'
|
96 |
openslr_female_dir = data_dir / dataset_folders['openslr'] / 'female'
|
|
|
97 |
indic_tts_male_dir = data_dir / dataset_folders['indic-tts'] / 'male'
|
98 |
indic_tts_female_dir = data_dir / dataset_folders['indic-tts'] / 'female'
|
|
|
99 |
|
100 |
-
# Load the datasets
|
101 |
openslr_male = load_dataset("json", data_files=[f"{str(openslr_male_dir.absolute())}/sample_{i}.json" for i in range(2023)], split="train")
|
102 |
openslr_female = load_dataset("json", data_files=[f"{str(openslr_female_dir.absolute())}/sample_{i}.json" for i in range(2103)], split="train")
|
|
|
103 |
indic_tts_male = load_dataset("json", data_files=[f"{str(indic_tts_male_dir.absolute())}/sample_{i}.json" for i in range(5649)], split="train")
|
104 |
indic_tts_female = load_dataset("json", data_files=[f"{str(indic_tts_female_dir.absolute())}/sample_{i}.json" for i in range(2950)], split="train")
|
|
|
105 |
|
106 |
# Create test split as 20%, set random seed as well.
|
107 |
test_size = 0.2
|
108 |
random_seed=1
|
109 |
openslr_male_splits = openslr_male.train_test_split(test_size=test_size, seed=random_seed)
|
110 |
openslr_female_splits = openslr_female.train_test_split(test_size=test_size, seed=random_seed)
|
|
|
111 |
indic_tts_male_splits = indic_tts_male.train_test_split(test_size=test_size, seed=random_seed)
|
112 |
indic_tts_female_splits = indic_tts_female.train_test_split(test_size=test_size, seed=random_seed)
|
|
|
113 |
|
114 |
# Get combined test dataset
|
115 |
-
split_list = [openslr_male_splits, openslr_female_splits, indic_tts_male_splits, indic_tts_female_splits]
|
116 |
test_dataset = datasets.concatenate_datasets([split['test'] for split in split_list])
|
117 |
|
118 |
wer = load_metric("wer")
|
@@ -121,19 +131,28 @@ processor = Wav2Vec2Processor.from_pretrained("gvs/wav2vec2-large-xlsr-malayalam
|
|
121 |
model = Wav2Vec2ForCTC.from_pretrained("gvs/wav2vec2-large-xlsr-malayalam")
|
122 |
model.to("cuda")
|
123 |
|
124 |
-
|
125 |
-
|
|
|
126 |
|
127 |
-
|
|
|
128 |
|
129 |
# Preprocessing the datasets.
|
130 |
# We need to read the audio files as arrays
|
131 |
def speech_file_to_array_fn(batch):
|
132 |
-
|
133 |
-
|
134 |
-
|
135 |
-
|
136 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
137 |
|
138 |
test_dataset = test_dataset.map(speech_file_to_array_fn)
|
139 |
|
@@ -154,11 +173,11 @@ result = test_dataset.map(evaluate, batched=True, batch_size=8)
|
|
154 |
print("WER: {:.2f}".format(100 * wer.compute(predictions=result["pred_strings"], references=result["sentence"])))
|
155 |
```
|
156 |
|
157 |
-
**Test Result**:
|
158 |
|
159 |
|
160 |
## Training
|
161 |
|
162 |
-
A combined dataset was created using [Indic TTS Malayalam Speech Corpus (via Kaggle)](https://www.kaggle.com/kavyamanohar/indic-tts-malayalam-speech-corpus), [Openslr Malayalam Speech Corpus](http://openslr.org/63/), [SMC Malayalam Speech Corpus](https://blog.smc.org.in/malayalam-speech-corpus/). The datasets were downloaded and converted to HF Dataset format using [this notebook](https://github.com/gauthamsuresh09/wav2vec2-large-xlsr-53-malayalam/blob/main/make_hf_dataset.ipynb)
|
163 |
|
164 |
The notebook used for training and evaluation can be found [here](https://github.com/gauthamsuresh09/wav2vec2-large-xlsr-53-malayalam/blob/main/fine-tune-xlsr-wav2vec2-on-malayalam-asr-with-transformers.ipynb)
|
4 |
- Indic TTS Malayalam Speech Corpus
|
5 |
- Openslr Malayalam Speech Corpus
|
6 |
- SMC Malayalam Speech Corpus
|
7 |
+
- IIIT-H Indic Speech Databases
|
8 |
metrics:
|
9 |
- wer
|
10 |
tags:
|
26 |
metrics:
|
27 |
- name: Test WER
|
28 |
type: wer
|
29 |
+
value: 28.43
|
30 |
---
|
31 |
|
32 |
# Wav2Vec2-Large-XLSR-53-ml
|
33 |
|
34 |
+
Fine-tuned [facebook/wav2vec2-large-xlsr-53](https://huggingface.co/facebook/wav2vec2-large-xlsr-53) on ml using the [Indic TTS Malayalam Speech Corpus (via Kaggle)](https://www.kaggle.com/kavyamanohar/indic-tts-malayalam-speech-corpus), [Openslr Malayalam Speech Corpus](http://openslr.org/63/), [SMC Malayalam Speech Corpus](https://blog.smc.org.in/malayalam-speech-corpus/) and [IIIT-H Indic Speech Databases](http://speech.iiit.ac.in/index.php/research-svl/69.html).
|
35 |
When using this model, make sure that your speech input is sampled at 16kHz.
|
36 |
|
37 |
## Usage
|
85 |
from datasets import load_dataset, load_metric
|
86 |
from pathlib import Path
|
87 |
|
88 |
+
# The custom dataset needs to be created using notebook mentioned at the end of this file
|
89 |
data_dir = Path('<path-to-custom-dataset>')
|
90 |
|
91 |
dataset_folders = {
|
92 |
+
'iiit': 'iiit_mal_abi',
|
93 |
'openslr': 'openslr',
|
94 |
'indic-tts': 'indic-tts-ml',
|
95 |
+
'msc-reviewed': 'msc-reviewed-speech-v1.0+20200825',
|
96 |
}
|
97 |
|
98 |
# Set directories for datasets
|
99 |
openslr_male_dir = data_dir / dataset_folders['openslr'] / 'male'
|
100 |
openslr_female_dir = data_dir / dataset_folders['openslr'] / 'female'
|
101 |
+
iiit_dir = data_dir / dataset_folders['iiit']
|
102 |
indic_tts_male_dir = data_dir / dataset_folders['indic-tts'] / 'male'
|
103 |
indic_tts_female_dir = data_dir / dataset_folders['indic-tts'] / 'female'
|
104 |
+
msc_reviewed_dir = data_dir / dataset_folders['msc-reviewed']
|
105 |
|
106 |
+
# Load the datasets
|
107 |
openslr_male = load_dataset("json", data_files=[f"{str(openslr_male_dir.absolute())}/sample_{i}.json" for i in range(2023)], split="train")
|
108 |
openslr_female = load_dataset("json", data_files=[f"{str(openslr_female_dir.absolute())}/sample_{i}.json" for i in range(2103)], split="train")
|
109 |
+
iiit = load_dataset("json", data_files=[f"{str(iiit_dir.absolute())}/sample_{i}.json" for i in range(1000)], split="train")
|
110 |
indic_tts_male = load_dataset("json", data_files=[f"{str(indic_tts_male_dir.absolute())}/sample_{i}.json" for i in range(5649)], split="train")
|
111 |
indic_tts_female = load_dataset("json", data_files=[f"{str(indic_tts_female_dir.absolute())}/sample_{i}.json" for i in range(2950)], split="train")
|
112 |
+
msc_reviewed = load_dataset("json", data_files=[f"{str(msc_reviewed_dir.absolute())}/sample_{i}.json" for i in range(1541)], split="train")
|
113 |
|
114 |
# Create test split as 20%, set random seed as well.
|
115 |
test_size = 0.2
|
116 |
random_seed=1
|
117 |
openslr_male_splits = openslr_male.train_test_split(test_size=test_size, seed=random_seed)
|
118 |
openslr_female_splits = openslr_female.train_test_split(test_size=test_size, seed=random_seed)
|
119 |
+
iiit_splits = iiit.train_test_split(test_size=test_size, seed=random_seed)
|
120 |
indic_tts_male_splits = indic_tts_male.train_test_split(test_size=test_size, seed=random_seed)
|
121 |
indic_tts_female_splits = indic_tts_female.train_test_split(test_size=test_size, seed=random_seed)
|
122 |
+
msc_reviewed_splits = msc_reviewed.train_test_split(test_size=test_size, seed=random_seed)
|
123 |
|
124 |
# Get combined test dataset
|
125 |
+
split_list = [openslr_male_splits, openslr_female_splits, indic_tts_male_splits, indic_tts_female_splits, msc_reviewed_splits, iiit_splits]
|
126 |
test_dataset = datasets.concatenate_datasets([split['test'] for split in split_list])
|
127 |
|
128 |
wer = load_metric("wer")
|
131 |
model = Wav2Vec2ForCTC.from_pretrained("gvs/wav2vec2-large-xlsr-malayalam")
|
132 |
model.to("cuda")
|
133 |
|
134 |
+
resamplers = {
|
135 |
+
48000: torchaudio.transforms.Resample(48_000, 16_000),
|
136 |
+
}
|
137 |
|
138 |
+
chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"\“\%\‘\”\�Utrnle\_]'
|
139 |
+
unicode_ignore_regex = r'[\u200e]'
|
140 |
|
141 |
# Preprocessing the datasets.
|
142 |
# We need to read the audio files as arrays
|
143 |
def speech_file_to_array_fn(batch):
|
144 |
+
batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"])
|
145 |
+
batch["sentence"] = re.sub(unicode_ignore_regex, '', batch["sentence"])
|
146 |
+
speech_array, sampling_rate = torchaudio.load(batch["path"])
|
147 |
+
# Resample if its not in 16kHz
|
148 |
+
if sampling_rate != 16000:
|
149 |
+
batch["speech"] = resamplers[sampling_rate](speech_array).squeeze().numpy()
|
150 |
+
else:
|
151 |
+
batch["speech"] = speech_array.squeeze().numpy()
|
152 |
+
# If more than one dimension is present, pick first one
|
153 |
+
if batch["speech"].ndim > 1:
|
154 |
+
batch["speech"] = batch["speech"][0]
|
155 |
+
return batch
|
156 |
|
157 |
test_dataset = test_dataset.map(speech_file_to_array_fn)
|
158 |
|
173 |
print("WER: {:.2f}".format(100 * wer.compute(predictions=result["pred_strings"], references=result["sentence"])))
|
174 |
```
|
175 |
|
176 |
+
**Test Result (WER)**: 28.43 %
|
177 |
|
178 |
|
179 |
## Training
|
180 |
|
181 |
+
A combined dataset was created using [Indic TTS Malayalam Speech Corpus (via Kaggle)](https://www.kaggle.com/kavyamanohar/indic-tts-malayalam-speech-corpus), [Openslr Malayalam Speech Corpus](http://openslr.org/63/), [SMC Malayalam Speech Corpus](https://blog.smc.org.in/malayalam-speech-corpus/) and [IIIT-H Indic Speech Databases](http://speech.iiit.ac.in/index.php/research-svl/69.html). The datasets were downloaded and converted to HF Dataset format using [this notebook](https://github.com/gauthamsuresh09/wav2vec2-large-xlsr-53-malayalam/blob/main/make_hf_dataset.ipynb)
|
182 |
|
183 |
The notebook used for training and evaluation can be found [here](https://github.com/gauthamsuresh09/wav2vec2-large-xlsr-53-malayalam/blob/main/fine-tune-xlsr-wav2vec2-on-malayalam-asr-with-transformers.ipynb)
|