""" Script to load, transform and upload swedish NST dataset to 🤗 datasets.

Dataset source:
    https://www.nb.no/sprakbanken/en/resource-catalogue/oai-nb-no-sbr-56/

Procedure:
1. Loop over annotations
2. Decide whether to discard specific item
3. Create DatasetDict = {
       features: ['client_id', 'path', 'audio', 'sentence', 'up_votes',
                  'down_votes', 'age', 'gender', 'accent', 'locale', 'segment'],
       num_rows: 11030
   }
3b. Mapping common_voice <---> NST
    - 'client_id':  info.Speaker_ID
    - 'path':       val_recording.file
    - 'audio':      wav file (binary)
    - 'sentence':   val_recording.text
    - 'up_votes':   0
    - 'down_votes': 0
    - 'age':        info.Age
    - 'gender':     info.Sex
    - 'accent':     ""
    - 'locale':     "sv"
    - 'segment':    ""
4. Dump to parquet
5. Upload to hub

Filter out:
- examples with single words
- examples with single characters
- examples with words splitted in single characters
- remove "\\Punkt", "\\Komma" from sentences
"""
import json
import os

import pandas as pd
import torchaudio
from datasets import load_dataset

hf_dataset_repo = "marinone94/nst_sv"
audio_files_path = "/Users/emiliomarinone/datasets/nst_sv/audio_files"
annotations_path = "/Users/emiliomarinone/datasets/nst_sv/annotations"


def load_audio_file(rel_filepath):
    """Load a wav file and return it as a common-voice style audio dict.

    Args:
        rel_filepath: Path of the wav file, relative to ``audio_files_path``.

    Returns:
        Dict with keys ``path``, ``array`` (1-D numpy array holding the
        first channel) and ``sampling_rate``.
    """
    audio_filepath = f"{audio_files_path}/{rel_filepath}"
    data_waveform, sampling_rate = torchaudio.load(audio_filepath)
    return {
        "path": rel_filepath,
        # Keep only the first channel as a 1-D numpy array.
        "array": data_waveform[0].t().numpy(),
        "sampling_rate": sampling_rate,
    }


def is_record_valid(text):
    """Return True if a transcript is worth keeping.

    Discards single-word transcripts and transcripts consisting entirely
    of single-character tokens (words spelled out letter by letter).
    """
    tokens = text.split()
    if len(tokens) < 2:
        return False
    # Reject e.g. "a b c", where every token is one character long.
    return not all(len(token) == 1 for token in tokens)


def clean_text(text):
    """Strip the NST punctuation markers (Komma / Punkt) from a transcript."""
    return text.replace("\\\\Komma", "").replace("\\\\Punkt", "")


def create_dataset_row(annotation_filename):
    """Build dataset rows for every valid recording in one annotation file.

    Note: despite the singular name (kept for backward compatibility), this
    returns a *list* of rows — one per valid ``val_recordings`` entry.

    Args:
        annotation_filename: File name (not a path) of a JSON annotation
            file located in ``annotations_path``.

    Returns:
        List of dicts following the common_voice column layout.
    """
    annotations_filepath = os.path.join(annotations_path, annotation_filename)
    with open(annotations_filepath, "r", encoding="utf-8") as f:
        annotation = json.load(f)

    dataset_rows = []
    for recording in annotation["val_recordings"]:
        if not is_record_valid(recording["text"]):
            continue
        # Audio files on disk carry a "-2" suffix not present in the
        # annotation's file name.
        rel_filepath = (
            f'{annotation["pid"]}/{annotation["pid"]}_{recording["file"]}'
            .replace(".wav", "-2.wav")
        )
        dataset_rows.append({
            "client_id": annotation["info"]["Speaker_ID"],
            "path": rel_filepath,
            "audio": load_audio_file(rel_filepath),
            "sentence": clean_text(recording["text"]),
            "up_votes": 0,
            "down_votes": 0,
            "age": annotation["info"]["Age"],
            "gender": annotation["info"]["Sex"],
            "accent": "",
            "locale": "sv",
            "segment": "",
        })
    return dataset_rows


def main():
    """Assemble rows from annotation files, dump to parquet, push to hub."""
    dataset_rows = []
    for i, filename in enumerate(os.listdir(annotations_path)):
        dataset_rows.extend(create_dataset_row(filename))
        # NOTE(review): only the first two annotation files are processed —
        # this looks like a debugging limit; drop the break for a full run.
        if i == 1:
            break

    df = pd.DataFrame(dataset_rows)
    df.to_parquet("dataset.parquet")

    # Round-trip through parquet so the hub receives a proper Dataset.
    dataset = load_dataset("parquet", data_files="dataset.parquet")
    dataset.push_to_hub(hf_dataset_repo)


if __name__ == "__main__":
    main()