""" Script to load, transform and upload swedish NST dataset to 🤗 datasets.
Dataset source: https://www.nb.no/sprakbanken/en/resource-catalogue/oai-nb-no-sbr-56/
Procedure:
1. Loop over annotations
2. Decide whether to discard a specific item
3. Create DatasetDict = {
features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment'],
num_rows: 11030
}
3b. Mapping common_voice <---> NST
- 'client_id': info.Speaker_ID
- 'path': val_recording.file
- 'audio': wav file (binary)
- 'sentence': val_recording.text
- 'up_votes': 0
- 'down_votes': 0
- 'age': info.Age
- 'gender': info.Sex
- 'accent': ""
- 'locale': "sv"
- 'segment': ""
4. Dump to parquet
5. Upload to hub
Filter out:
- single words
- single characters
- words split into single characters
"""
import json
import os

from datasets import DatasetDict

hf_dataset_repo = "marinone94/nst_sv"
audio_files_path = "/Users/emiliomarinone/datasets/nst_sv/audio_files"
annotations_path = "/Users/emiliomarinone/datasets/nst_sv/annotations"

def load_audio_file(filepath):
    # Read the raw wav bytes: the "audio" feature maps to the wav file binary.
    with open(filepath, "rb") as f:
        return f.read()

def is_record_valid(text):
    text_split = text.split()
    if len(text_split) < 2:
        return False
    is_all_single_chars = True
    for token in text_split:
        if len(token) != 1:
            is_all_single_chars = False
            break
    if is_all_single_chars:
        return False
    return True
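
# Illustrative sanity checks for the filter (the example inputs are assumptions,
# not taken from the NST annotations): single words and texts made only of
# single characters are discarded, full sentences are kept.
assert not is_record_valid("hej")
assert not is_record_valid("a b c")
assert is_record_valid("det här är en mening")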

def create_dataset_row(annotation_filename):
    annotations_filepath = os.path.join(annotations_path, annotation_filename)
    with open(annotations_filepath, "r") as f:
        annotation = json.load(f)

    dataset_rows = []
    for recording in annotation["val_recordings"]:
        if is_record_valid(recording["text"]):
            audio_filepath = f'{audio_files_path}/{annotation["pid"]}/{annotation["pid"]}_{recording["file"]}'
            dataset_row = {
                "client_id": annotation["info"]["Speaker_ID"],
                "path": recording["file"],
                "audio": load_audio_file(audio_filepath),
                "sentence": recording["text"],
                "up_votes": 0,
                "down_votes": 0,
                "age": annotation["info"]["Age"],
                "gender": annotation["info"]["Sex"],
                "accent": "",
                "locale": "sv",
                "segment": ""
            }
            dataset_rows.append(dataset_row)
    return dataset_rows

dataset_rows = []
for i, filename in enumerate(os.listdir(annotations_path)):
    dataset_rows.extend(create_dataset_row(filename))
    # Process only the first six annotation files.
    if i == 5:
        break

from pprint import pformat
print(pformat(dataset_rows))

# dataset = DatasetDict(dataset_rows)
# with open("temp.json", "w") as f:
#     json.dump(dataset_rows, f)
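
# One possible completion of the commented-out block above, covering steps 4 and 5
# of the docstring (dump to parquet, upload to hub). A minimal, untested sketch:
# it assumes the dataset is built from dataset_rows with datasets.Dataset.from_list;
# the "train" split name and the nst_sv_train.parquet filename are assumptions,
# not part of the original script.
# from datasets import Dataset
# dataset = DatasetDict({"train": Dataset.from_list(dataset_rows)})
# dataset["train"].to_parquet("nst_sv_train.parquet")
# dataset.push_to_hub(hf_dataset_repo)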