---
language: su
datasets:
- openslr
metrics:
- wer
tags:
- audio
- automatic-speech-recognition
- speech
- xlsr-fine-tuning-week
license: apache-2.0
model-index:
- name: XLSR Wav2Vec2 Sundanese by cahya
  results:
  - task:
      name: Speech Recognition
      type: automatic-speech-recognition
    dataset:
      name: OpenSLR High quality TTS data for Sundanese
      type: OpenSLR
      args: su
    metrics:
    - name: Test WER
      type: wer
      value: 6.19
---

# Wav2Vec2-Large-XLSR-Sundanese

Fine-tuned [facebook/wav2vec2-large-xlsr-53](https://huggingface.co/facebook/wav2vec2-large-xlsr-53)
on the [OpenSLR High quality TTS data for Sundanese](https://openslr.org/44/).
When using this model, make sure that your speech input is sampled at 16kHz.
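If your recordings use a different sampling rate, resample them to 16 kHz first. Below is a minimal sketch using torchaudio; the file name `audio.wav` is a placeholder.

```python
import torchaudio

# Load a waveform at its native sampling rate ("audio.wav" is a placeholder path).
speech_array, sampling_rate = torchaudio.load("audio.wav")

# Resample to the 16 kHz input rate the model expects.
resampler = torchaudio.transforms.Resample(orig_freq=sampling_rate, new_freq=16_000)
speech_16k = resampler(speech_array).squeeze().numpy()
```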
## Usage
The model can be used directly (without a language model) as follows:
```python
import torch
import torchaudio
from datasets import Dataset
from datasets.utils.download_manager import DownloadManager
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
from pathlib import Path
import pandas as pd


def load_dataset_sundanese():
    # Download both speaker sets of OpenSLR SLR44 and build a train/test split.
    urls = [
        "https://www.openslr.org/resources/44/su_id_female.zip",
        "https://www.openslr.org/resources/44/su_id_male.zip"
    ]
    dm = DownloadManager()
    download_dirs = dm.download_and_extract(urls)
    data_dirs = [
        Path(download_dirs[0])/"su_id_female/wavs",
        Path(download_dirs[1])/"su_id_male/wavs",
    ]
    filenames = [
        Path(download_dirs[0])/"su_id_female/line_index.tsv",
        Path(download_dirs[1])/"su_id_male/line_index.tsv",
    ]

    dfs = []
    # The two line_index.tsv files use slightly different separators,
    # so each one is parsed with its own regex separator.
    dfs.append(pd.read_csv(filenames[0], sep='\t4?\t', names=["path", "sentence"]))
    dfs.append(pd.read_csv(filenames[1], sep='\t\t', names=["path", "sentence"]))

    # Turn the bare utterance IDs into absolute paths to the wav files.
    for i in range(len(data_dirs)):
        dfs[i]["path"] = dfs[i]["path"].apply(lambda name: str(data_dirs[i]) + "/" + name + ".wav")
    df = pd.concat(dfs)
    dataset = Dataset.from_pandas(df)
    dataset = dataset.remove_columns('__index_level_0__')

    return dataset.train_test_split(test_size=0.1, seed=1)

dataset = load_dataset_sundanese()
test_dataset = dataset['test']

processor = Wav2Vec2Processor.from_pretrained("cahya/wav2vec2-large-xlsr-sundanese")
model = Wav2Vec2ForCTC.from_pretrained("cahya/wav2vec2-large-xlsr-sundanese")

resampler = torchaudio.transforms.Resample(48_000, 16_000)

# Preprocessing the datasets.
# We need to read the audio files as arrays.
def speech_file_to_array_fn(batch):
    speech_array, sampling_rate = torchaudio.load(batch["path"])
    batch["speech"] = resampler(speech_array).squeeze().numpy()
    return batch

test_dataset = test_dataset.map(speech_file_to_array_fn)
inputs = processor(test_dataset[:2]["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)

with torch.no_grad():
    logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits

predicted_ids = torch.argmax(logits, dim=-1)

print("Prediction:", processor.batch_decode(predicted_ids))
print("Reference:", test_dataset[:2]["sentence"])
```
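
For a quick sanity check on a single recording, the same pipeline collapses to a few lines. This is a minimal sketch: `audio.wav` is a placeholder for a mono recording already sampled at 16 kHz.

```python
import torch
import torchaudio
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

processor = Wav2Vec2Processor.from_pretrained("cahya/wav2vec2-large-xlsr-sundanese")
model = Wav2Vec2ForCTC.from_pretrained("cahya/wav2vec2-large-xlsr-sundanese")

# "audio.wav" is a placeholder; it should already be sampled at 16 kHz.
speech_array, sampling_rate = torchaudio.load("audio.wav")
inputs = processor(speech_array.squeeze().numpy(), sampling_rate=16_000, return_tensors="pt", padding=True)

with torch.no_grad():
    logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits

print(processor.batch_decode(torch.argmax(logits, dim=-1)))
```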
## Evaluation

The model can be evaluated as follows or using the [notebook](https://github.com/cahya-wirawan/indonesian-speech-recognition/blob/main/XLSR_Wav2Vec2_for_Indonesian_Evaluation-Sundanese.ipynb).

```python
import torch
import torchaudio
from datasets import load_metric, Dataset
from datasets.utils.download_manager import DownloadManager
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import re
from pathlib import Path
import pandas as pd


def load_dataset_sundanese():
    # Download both speaker sets of OpenSLR SLR44 and build a train/test split.
    urls = [
        "https://www.openslr.org/resources/44/su_id_female.zip",
        "https://www.openslr.org/resources/44/su_id_male.zip"
    ]
    dm = DownloadManager()
    download_dirs = dm.download_and_extract(urls)
    data_dirs = [
        Path(download_dirs[0])/"su_id_female/wavs",
        Path(download_dirs[1])/"su_id_male/wavs",
    ]
    filenames = [
        Path(download_dirs[0])/"su_id_female/line_index.tsv",
        Path(download_dirs[1])/"su_id_male/line_index.tsv",
    ]

    dfs = []
    # The two line_index.tsv files use slightly different separators,
    # so each one is parsed with its own regex separator.
    dfs.append(pd.read_csv(filenames[0], sep='\t4?\t', names=["path", "sentence"]))
    dfs.append(pd.read_csv(filenames[1], sep='\t\t', names=["path", "sentence"]))

    # Turn the bare utterance IDs into absolute paths to the wav files.
    for i in range(len(data_dirs)):
        dfs[i]["path"] = dfs[i]["path"].apply(lambda name: str(data_dirs[i]) + "/" + name + ".wav")
    df = pd.concat(dfs)
    dataset = Dataset.from_pandas(df)
    dataset = dataset.remove_columns('__index_level_0__')

    return dataset.train_test_split(test_size=0.1, seed=1)

dataset = load_dataset_sundanese()
test_dataset = dataset['test']

wer = load_metric("wer")

processor = Wav2Vec2Processor.from_pretrained("cahya/wav2vec2-large-xlsr-sundanese")
model = Wav2Vec2ForCTC.from_pretrained("cahya/wav2vec2-large-xlsr-sundanese")
model.to("cuda")

chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"\“\%\‘\'\”_\�]'
resampler = torchaudio.transforms.Resample(48_000, 16_000)

# Preprocessing the datasets.
# We need to normalize the transcripts and read the audio files as arrays.
def speech_file_to_array_fn(batch):
    batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower()
    speech_array, sampling_rate = torchaudio.load(batch["path"])
    batch["speech"] = resampler(speech_array).squeeze().numpy()
    return batch

test_dataset = test_dataset.map(speech_file_to_array_fn)

# Run batched inference over the test set and collect the predictions.
def evaluate(batch):
    inputs = processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)

    with torch.no_grad():
        logits = model(inputs.input_values.to("cuda"), attention_mask=inputs.attention_mask.to("cuda")).logits

    pred_ids = torch.argmax(logits, dim=-1)
    batch["pred_strings"] = processor.batch_decode(pred_ids)
    return batch

result = test_dataset.map(evaluate, batched=True, batch_size=8)

print("WER: {:.2f}".format(100 * wer.compute(predictions=result["pred_strings"], references=result["sentence"])))
```

**Test Result**: 6.19 %
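
Here WER (word error rate) is the fraction of word-level substitutions, deletions, and insertions needed to turn the predictions into the references. As a toy illustration of the `wer.compute` call above (the sentences are made up for this example):

```python
from datasets import load_metric

wer = load_metric("wer")
# One substituted word out of four reference words -> WER = 0.25.
print(wer.compute(predictions=["abdi badé ka pasar"],
                  references=["abdi badé ka kota"]))
```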

## Training

The [OpenSLR High quality TTS data for Sundanese](https://openslr.org/44/) was used for training.
The script used for training can be found [here](https://github.com/cahya-wirawan/indonesian-speech-recognition/blob/main/XLSR_Wav2Vec2_for_Indonesian_Evaluation-Sundanese.ipynb),
and the script used to [evaluate it](https://github.com/cahya-wirawan/indonesian-speech-recognition/blob/main/XLSR_Wav2Vec2_for_Indonesian_Evaluation-Sundanese.ipynb) as well.