""" Script to load, transform and upload swedish NST dataset to 🤗 datasets.
Dataset source: https://www.nb.no/sprakbanken/en/resource-catalogue/oai-nb-no-sbr-56/
Procedure:
1. Loop over annotations
2. Decide whether to discard a specific item
3. Create DatasetDict = {
features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment'],
num_rows: 11030
}
3b. Mapping common_voice <---> NST
- 'client_id': info.Speaker_ID
- 'path': val_recording.file
- 'audio': wav file (binary)
- 'sentence': val_recording.text
- 'up_votes': 0
- 'down_votes': 0
- 'age': info.Age
- 'gender': info.Sex
- 'accent': ""
- 'locale': "sv"
- 'segment': ""
4. Dump to parquet
5. Upload to hub
Filter out:
- single words
- single characters
- words split into single characters
"""
import json
import os

from datasets import DatasetDict

hf_dataset_repo = "marinone94/nst_sv"
audio_files_path = "/Users/emiliomarinone/datasets/nst_sv/audio_files"
annotations_path = "/Users/emiliomarinone/datasets/nst_sv/annotations"

def load_audio_file(filepath):
    # Read the raw wav bytes: the "audio" feature maps to the wav file binary.
    with open(filepath, "rb") as f:
        return f.read()

def is_record_valid(text):
    text_split = text.split()
    if len(text_split) < 2:
        return False
    is_all_single_chars = True
    for token in text_split:
        if len(token) != 1:
            is_all_single_chars = False
            break
    if is_all_single_chars:
        return False
    return True
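
# Illustrative sanity checks for the filter (the example inputs are assumptions,
# not taken from the NST annotations): single words and texts made only of
# single characters are discarded, full sentences are kept.
assert not is_record_valid("hej")
assert not is_record_valid("a b c")
assert is_record_valid("det här är en mening")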

def create_dataset_row(annotation_filename):
    annotations_filepath = os.path.join(annotations_path, annotation_filename)
    with open(annotations_filepath, "r") as f:
        annotation = json.load(f)

    dataset_rows = []
    for recording in annotation["val_recordings"]:
        if is_record_valid(recording["text"]):
            audio_filepath = f'{audio_files_path}/{annotation["pid"]}/{annotation["pid"]}_{recording["file"]}'
            dataset_row = {
                "client_id": annotation["info"]["Speaker_ID"],
                "path": recording["file"],
                "audio": load_audio_file(audio_filepath),
                "sentence": recording["text"],
                "up_votes": 0,
                "down_votes": 0,
                "age": annotation["info"]["Age"],
                "gender": annotation["info"]["Sex"],
                "accent": "",
                "locale": "sv",
                "segment": ""
            }
            dataset_rows.append(dataset_row)
    return dataset_rows

dataset_rows = []
for i, filename in enumerate(os.listdir(annotations_path)):
    dataset_rows.extend(create_dataset_row(filename))
    # Process only the first six annotation files.
    if i == 5:
        break

from pprint import pformat
print(pformat(dataset_rows))

# dataset = DatasetDict(dataset_rows)
# with open("temp.json", "w") as f:
#     json.dump(dataset_rows, f)
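
# One possible completion of the commented-out block above, covering steps 4 and 5
# of the docstring (dump to parquet, upload to hub). A minimal, untested sketch:
# it assumes the dataset is built from dataset_rows with datasets.Dataset.from_list;
# the "train" split name and the nst_sv_train.parquet filename are assumptions,
# not part of the original script.
# from datasets import Dataset
# dataset = DatasetDict({"train": Dataset.from_list(dataset_rows)})
# dataset["train"].to_parquet("nst_sv_train.parquet")
# dataset.push_to_hub(hf_dataset_repo)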