marinone94 committed on
Commit 2b994e2
1 Parent(s): 06d47ea

remove dataset scripts

Files changed (1)
  1. upload_nst_sv_to_hf_dataset.py +0 -116
upload_nst_sv_to_hf_dataset.py DELETED
@@ -1,116 +0,0 @@
- """ Script to load, transform and upload the Swedish NST dataset to 🤗 datasets.
-
- Dataset source: https://www.nb.no/sprakbanken/en/resource-catalogue/oai-nb-no-sbr-56/
-
- Procedure:
- 1. Loop over the annotation files
- 2. Decide whether to discard each item
- 3. Create DatasetDict = {
-        features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment'],
-        num_rows: 11030
-    }
- 3b. Mapping common_voice <---> NST
-     - 'client_id': info.Speaker_ID
-     - 'path': val_recording.file
-     - 'audio': wav file (binary)
-     - 'sentence': val_recording.text
-     - 'up_votes': 0
-     - 'down_votes': 0
-     - 'age': info.Age
-     - 'gender': info.Sex
-     - 'accent': ""
-     - 'locale': "sv"
-     - 'segment': ""
- 4. Dump to parquet
- 5. Upload to hub
-
- Filter out:
- - examples consisting of a single word
- - examples consisting of a single character
- - examples whose words are split into single characters
- Also remove "\\Punkt" and "\\Komma" from sentences.
-
- """
-
- import json
- import os
-
- import pandas as pd
- import torchaudio
-
- from datasets import load_dataset
-
-
- hf_dataset_repo = "marinone94/nst_sv"
- audio_files_path = "/Users/emiliomarinone/datasets/nst_sv/audio_files"
- annotations_path = "/Users/emiliomarinone/datasets/nst_sv/annotations"
-
-
- def load_audio_file(rel_filepath):
-     # Load the waveform and return a dict shaped like the 🤗 Audio feature
-     audio_filepath = f'{audio_files_path}/{rel_filepath}'
-     data_waveform, sampling_rate = torchaudio.load(audio_filepath)
-     return {
-         "path": rel_filepath,
-         "array": data_waveform[0].t().numpy(),
-         "sampling_rate": sampling_rate
-     }
-
-
- def is_record_valid(text):
-     text_split = text.split()
-
-     # Discard empty and single-word examples
-     if len(text_split) < 2:
-         return False
-
-     # Discard examples where every token is a single character
-     if all(len(token) == 1 for token in text_split):
-         return False
-
-     return True
-
-
- def clean_text(text):
-     # Strip the NST punctuation markers from the transcript
-     return text.replace("\\\\Komma", "").replace("\\\\Punkt", "")
-
-
-
- def create_dataset_row(annotation_filename):
-     annotations_filepath = os.path.join(annotations_path, annotation_filename)
-     with open(annotations_filepath, "r") as f:
-         annotation = json.load(f)
-
-     # Build one common_voice-style row per valid recording in the file
-     dataset_rows = []
-     for recording in annotation["val_recordings"]:
-         if is_record_valid(recording["text"]):
-             # Audio files are stored as <pid>/<pid>_<file>, with a "-2.wav" suffix
-             rel_filepath = f'{annotation["pid"]}/{annotation["pid"]}_{recording["file"]}'.replace(".wav", "-2.wav")
-             dataset_row = {
-                 "client_id": annotation["info"]["Speaker_ID"],
-                 "path": rel_filepath,
-                 "audio": load_audio_file(rel_filepath),
-                 "sentence": clean_text(recording["text"]),
-                 "up_votes": 0,
-                 "down_votes": 0,
-                 "age": annotation["info"]["Age"],
-                 "gender": annotation["info"]["Sex"],
-                 "accent": "",
-                 "locale": "sv",
-                 "segment": ""
-             }
-             dataset_rows.append(dataset_row)
-
-     return dataset_rows
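-
- # For reference, the annotation JSON parsed above is assumed to look
- # roughly like this (only the keys accessed by this script are shown;
- # all values are illustrative placeholders, not real NST data):
- # {
- #     "pid": "<recording session id>",
- #     "info": {"Speaker_ID": "...", "Age": "...", "Sex": "..."},
- #     "val_recordings": [{"file": "<name>.wav", "text": "<transcript>"}]
- # }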
-
-
- dataset_rows = []
- for i, filename in enumerate(os.listdir(annotations_path)):
-     dataset_rows.extend(create_dataset_row(filename))
-     # NOTE: debug limit, only the first two annotation files are processed
-     if i == 1:
-         break
-
- df = pd.DataFrame(dataset_rows)
- df.to_parquet("dataset.parquet")
- dataset = load_dataset("parquet", data_files="dataset.parquet")
- dataset.push_to_hub(hf_dataset_repo)
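For reference, once the script has run, the uploaded dataset can be loaded back with the standard datasets API. A minimal sketch, assuming the marinone94/nst_sv repo exists and is accessible ("train" is the default split name produced by the parquet load above):

from datasets import load_dataset

# Load the dataset uploaded by the script above
dataset = load_dataset("marinone94/nst_sv", split="train")

# Each row follows the common_voice-style mapping from the docstring
row = dataset[0]
print(row["client_id"], row["sentence"])
print(row["audio"]["sampling_rate"])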