xls-r-300m-sv-robust / upload_nst_sv_to_hf_dataset.py

add nst sv to hf dataset

06d47ea over 2 years ago

No virus

3.34 kB

	""" Script to load, transform and upload swedish NST dataset to 🤗 datasets.

	Dataset source: https://www.nb.no/sprakbanken/en/resource-catalogue/oai-nb-no-sbr-56/

	Procedure:
	1. Loop over annotations
	2. Decide whether to discard specific item
	3. Create DatasetDict = {
	features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment'],
	num_rows: 11030
	}
	3b. Mapping common_voice <---> NST
	- 'client_id': info.Speaker_ID
	- 'path': val_recording.file
	- 'audio': wav file (binary)
	- 'sentence': val_recording.text
	- 'up_votes': 0
	- 'down_votes': 0
	- 'age': info.Age
	- 'gender': info.Sex
	- 'accent': ""
	- 'locale': "sv"
	- 'segment': ""
	4. Dump to parquet
	5. Upload to hub

	Filter out:
	- examples with single words
	- examples with single characters
	- examples with words split into single characters
	- remove "\\Punkt", "\\Komma" from sentences

	"""

	import json
	import os

	import pandas as pd
	import torchaudio

	from datasets import load_dataset


	# Hugging Face Hub repository id of the target dataset (the same value the
	# upload step at the end of this script pushes to).
	hf_dataset_repo = "marinone94/nst_sv"
	# Local directory that relative audio paths are resolved against.
	# NOTE: machine-specific absolute path; adjust before running elsewhere.
	audio_files_path = "/Users/emiliomarinone/datasets/nst_sv/audio_files"
	# Local directory whose entries are read as JSON annotation files.
	# NOTE: machine-specific absolute path; adjust before running elsewhere.
	annotations_path = "/Users/emiliomarinone/datasets/nst_sv/annotations"


	def load_audio_file(rel_filepath):
	    """Load one audio file and package it as an ``audio`` feature dict.

	    Args:
	        rel_filepath: Path of the audio file relative to ``audio_files_path``.

	    Returns:
	        A dict with the keys ``path`` (the relative path exactly as given),
	        ``array`` (index 0 of the loaded waveform, converted to a NumPy
	        array) and ``sampling_rate`` (as reported by ``torchaudio.load``).
	    """
	    absolute_path = "{}/{}".format(audio_files_path, rel_filepath)
	    waveform, sample_rate = torchaudio.load(absolute_path)

	    # Only index 0 of the first dimension is kept.
	    # NOTE(review): presumably this is the first audio channel - confirm.
	    first_row = waveform[0]
	    samples = first_row.t().numpy()

	    return {
	        "path": rel_filepath,
	        "array": samples,
	        "sampling_rate": sample_rate,
	    }


	def is_record_valid(text):
	    """Tell whether a transcription should be kept in the dataset.

	    A text is rejected when it has fewer than two whitespace-separated
	    tokens, or when every token is exactly one character long (a word
	    spelled out letter by letter).

	    Args:
	        text: The transcription to check.

	    Returns:
	        ``True`` if the text has at least two tokens and at least one of
	        them is not a single character, ``False`` otherwise.
	    """
	    tokens = text.split()
	    has_several_tokens = len(tokens) >= 2
	    # At least one token must be something other than a lone character.
	    has_non_single_char_token = any(len(tok) != 1 for tok in tokens)
	    return has_several_tokens and has_non_single_char_token


	def clean_text(text):
	    """Strip the spoken-punctuation markers from a transcription.

	    Args:
	        text: The transcription to clean.

	    Returns:
	        ``text`` with every occurrence of the marker patterns removed.
	        Surrounding whitespace is left untouched.
	    """
	    # NOTE(review): each pattern is two literal backslashes followed by the
	    # marker name - confirm this matches the decoded annotation text.
	    markers = ("\\\\Komma", "\\\\Punkt")
	    cleaned = text
	    for marker in markers:
	        cleaned = cleaned.replace(marker, "")
	    return cleaned


	def create_dataset_row(annotation_filename):
	    """Build the dataset rows for every usable recording of one annotation file.

	    The annotation file is read as JSON from ``annotations_path``. Each entry
	    of its ``val_recordings`` list becomes one row laid out like a Common
	    Voice example, unless its cleaned text is rejected by ``is_record_valid``.

	    Args:
	        annotation_filename: File name of the annotation, relative to
	            ``annotations_path``.

	    Returns:
	        A list of row dicts with the keys ``client_id``, ``path``, ``audio``,
	        ``sentence``, ``up_votes``, ``down_votes``, ``age``, ``gender``,
	        ``accent``, ``locale`` and ``segment``. The list is empty when no
	        recording passes validation.
	    """
	    annotations_filepath = os.path.join(annotations_path, annotation_filename)
	    # Read explicitly as UTF-8 (the JSON interchange encoding) so non-ASCII
	    # characters do not depend on the platform's locale default.
	    with open(annotations_filepath, "r", encoding="utf-8") as f:
	        annotation = json.load(f)

	    dataset_rows = []
	    for recording in annotation["val_recordings"]:
	        # Clean first and validate the cleaned text: otherwise a single word
	        # followed by a punctuation marker counts as two tokens and slips
	        # through the single-word filter.
	        sentence = clean_text(recording["text"])
	        if not is_record_valid(sentence):
	            continue

	        # NOTE(review): the audio file name gets a "-2" suffix before the
	        # extension; presumably a channel/microphone index - confirm.
	        rel_filepath = f'{annotation["pid"]}/{annotation["pid"]}_{recording["file"]}'.replace(".wav", "-2.wav")
	        dataset_rows.append({
	            "client_id": annotation["info"]["Speaker_ID"],
	            'path': rel_filepath,
	            'audio': load_audio_file(rel_filepath),
	            'sentence': sentence,
	            'up_votes': 0,
	            'down_votes': 0,
	            'age': annotation["info"]["Age"],
	            'gender': annotation["info"]["Sex"],
	            'accent': "",
	            'locale': "sv",
	            'segment': "",
	        })

	    return dataset_rows


	if __name__ == "__main__":
	    # Guarded so that importing this module (e.g. to reuse the helpers) does
	    # not trigger file conversion and an upload.

	    # The original loop stopped after the second directory entry; the cap is
	    # kept but named. NOTE(review): this looks like a debugging limit - raise
	    # or remove it to convert the full dataset.
	    max_annotation_files = 2

	    # Sort for a reproducible selection (os.listdir order is arbitrary) and
	    # skip hidden entries such as ".DS_Store", which are not annotations.
	    annotation_filenames = sorted(
	        name for name in os.listdir(annotations_path)
	        if not name.startswith(".")
	    )

	    dataset_rows = []
	    for filename in annotation_filenames[:max_annotation_files]:
	        dataset_rows.extend(create_dataset_row(filename))

	    # Round-trip through a local parquet file, then publish to the Hub.
	    df = pd.DataFrame(dataset_rows)
	    df.to_parquet("dataset.parquet")
	    dataset = load_dataset("parquet", data_files="dataset.parquet")
	    # Use the module constant instead of repeating the repository id.
	    dataset.push_to_hub(hf_dataset_repo)