|
""" Script to load, transform and upload swedish NST dataset to 🤗 datasets. |
|
|
|
Dataset source: https://www.nb.no/sprakbanken/en/resource-catalogue/oai-nb-no-sbr-56/ |
|
|
|
Procedure: |
|
1. Loop over annotations |
|
2. Decide whether to discard specific item |
|
3. Create DatasetDict = { |
|
features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment'], |
|
num_rows: 11030 |
|
} |
|
3b. Mapping common_voice <---> NST |
|
- 'client_id': info.Speaker_ID |
|
- 'path': val_recording.file |
|
- 'audio': wav file (binary) |
|
- 'sentence': val_recording.text |
|
- 'up_votes': 0 |
|
- 'down_votes': 0 |
|
- 'age': info.Age |
|
- 'gender': info.Sex |
|
- 'accent': "" |
|
- 'locale': "sv" |
|
- 'segment': "" |
|
4. Dump to parquet |
|
5. Upload to hub |
|
|
|
Filter out: |
|
- examples with single words |
|
- examples with single characters |
|
- examples with words splitted in single characters |
|
- remove "\\Punkt", "\\Komma" from sentences |
|
|
|
""" |
|
|
|
import json
import os

import pandas as pd
import torchaudio
from datasets import load_dataset
|
|
hf_dataset_repo = "marinone94/nst_sv"
# Local paths to the downloaded NST archive; adjust to your environment.
audio_files_path = "/Users/emiliomarinone/datasets/nst_sv/audio_files"
annotations_path = "/Users/emiliomarinone/datasets/nst_sv/annotations"
|
|
def load_audio_file(rel_filepath):
    """Load a wav file and return it in the 🤗 datasets audio format."""
    audio_filepath = f"{audio_files_path}/{rel_filepath}"
    data_waveform, sampling_rate = torchaudio.load(audio_filepath)
    # Keep the first channel only, as a 1-D numpy array.
    return {
        "path": rel_filepath,
        "array": data_waveform[0].t().numpy(),
        "sampling_rate": sampling_rate,
    }
|
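# Hypothetical usage (the path is illustrative; NST recordings are typically
# sampled at 16 kHz):
# load_audio_file("sv_0001/sv_0001_001-2.wav")
# -> {"path": "sv_0001/sv_0001_001-2.wav", "array": array([...]), "sampling_rate": 16000}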
|
|
|
|
def is_record_valid(text):
    """Return True if the transcript passes the filters in the module docstring."""
    text_split = text.split()

    # Discard empty texts, single words and single characters.
    if len(text_split) < 2:
        return False

    # Discard texts whose words have all been split into single characters.
    if all(len(token) == 1 for token in text_split):
        return False

    return True
|
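# Illustrative cases (hypothetical inputs):
#   is_record_valid("hej")        -> False  (single word)
#   is_record_valid("h e j")      -> False  (words split into single characters)
#   is_record_valid("hej på dig") -> True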
|
|
|
|
def clean_text(text):
    """Remove the NST punctuation markers from a transcript."""
    text = text.replace("\\Komma", "").replace("\\Punkt", "")
    # Stripping the markers can leave double spaces behind; collapse them.
    return " ".join(text.split())
|
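# e.g. (hypothetical transcript):
# clean_text("hej \\Komma hur är läget \\Punkt") -> "hej hur är läget"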
|
|
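# Each annotation JSON is expected to expose at least the fields accessed in
# create_dataset_rows below. A rough sketch inferred from that code, with
# hypothetical values (not the full NST schema):
#
# {
#     "pid": "sv_0001",
#     "info": {"Speaker_ID": "123", "Age": "35", "Sex": "Female"},
#     "val_recordings": [
#         {"file": "001.wav", "text": "hej på dig \\Punkt"}
#     ]
# }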
|
|
def create_dataset_rows(annotation_filename):
    """Parse one annotation file and return a list of dataset rows,
    one per valid validation recording."""
    annotations_filepath = os.path.join(annotations_path, annotation_filename)
    with open(annotations_filepath, "r") as f:
        annotation = json.load(f)

    dataset_rows = []
    for recording in annotation["val_recordings"]:
        if is_record_valid(recording["text"]):
            # Audio files live under <pid>/<pid>_<file>, with a "-2" suffix
            # before the .wav extension.
            rel_filepath = f'{annotation["pid"]}/{annotation["pid"]}_{recording["file"]}'.replace(".wav", "-2.wav")
            # Mirror the common_voice schema (step 3b in the module docstring).
            dataset_row = {
                "client_id": annotation["info"]["Speaker_ID"],
                "path": rel_filepath,
                "audio": load_audio_file(rel_filepath),
                "sentence": clean_text(recording["text"]),
                "up_votes": 0,
                "down_votes": 0,
                "age": annotation["info"]["Age"],
                "gender": annotation["info"]["Sex"],
                "accent": "",
                "locale": "sv",
                "segment": "",
            }
            dataset_rows.append(dataset_row)

    return dataset_rows
|
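# Build the rows from the annotation files, dump them to parquet and upload
# the dataset to the hub. Note: push_to_hub requires prior authentication,
# e.g. via `huggingface-cli login`.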
dataset_rows = []
for i, filename in enumerate(os.listdir(annotations_path)):
    dataset_rows.extend(create_dataset_rows(filename))
    # NOTE: only the first two annotation files are processed here; remove
    # this break to build the full dataset.
    if i == 1:
        break

df = pd.DataFrame(dataset_rows)
df.to_parquet("dataset.parquet")
dataset = load_dataset("parquet", data_files="dataset.parquet")
dataset.push_to_hub(hf_dataset_repo)
|
|