""" Script to load, transform and upload swedish NST dataset to 🤗 datasets. Dataset source: https://www.nb.no/sprakbanken/en/resource-catalogue/oai-nb-no-sbr-56/ Procedure: 1. Loop over annotations 2. Decide whether to discard specific item 3. Create DatasetDict = { features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment'], num_rows: 11030 } 3b. Mapping common_voice <---> NST - 'client_id': info.Speaker_ID - 'path': val_recording.file - 'audio': wav file (binary) - 'sentence': val_recording.text - 'up_votes': 0 - 'down_votes': 0 - 'age': info.Age - 'gender': info.Sex - 'accent': "" - 'locale': "sv" - 'segment': "" 4. Dump to parquet 5. Upload to hub Filter out: - single words - single characters - words splitted in single characters """ import json import os from datasets import DatasetDict hf_dataset_repo = "marinone94/nst_sv" audio_files_path = "/Users/emiliomarinone/datasets/nst_sv/audio_files" annotations_path = "/Users/emiliomarinone/datasets/nst_sv/annotations" def load_audio_file(filepath): return None def is_record_valid(text): text_split = text.split() if len(text_split) < 2: return False is_all_single_chars = True for token in text_split: if len(token) != 1: is_all_single_chars = False break if is_all_single_chars: return False return True def create_dataset_row(annotation_filename): annotations_filepath = os.path.join(annotations_path, annotation_filename) with open(annotations_filepath, "r") as f: annotation = json.load(f) dataset_rows = [] for recording in annotation["val_recordings"]: if is_record_valid(recording["text"]): audio_filepath = f'{audio_files_path}/{annotation["pid"]}/{annotation["pid"]}_{recording["file"]}' dataset_row = { "client_id": annotation["info"]["Speaker_ID"], 'path': recording["file"], 'audio': load_audio_file(audio_filepath), 'sentence': recording["text"], 'up_votes': 0, 'down_votes': 0, 'age': annotation["info"]["Age"], 'gender': annotation["info"]["Sex"], 'accent': "", 'locale': "sv", 'segment': "" } dataset_rows.append(dataset_row) return dataset_rows dataset_rows = [] for i, filename in enumerate(os.listdir(annotations_path)): dataset_rows.extend(create_dataset_row(filename)) if i == 5: break from pprint import pformat pformat(dataset_rows) # dataset = DatasetDict(dataset_rows) # with open("temp.json", "w") as f: # json.dump(f, dataset_rows)