# amu-bigos-data-dash / run-analysis.py
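"""Generate per-config statistics and content reports for a speech dataset
hosted on the Hugging Face Hub. For every dataset config, the script computes
audio, text, and speaker-metadata statistics, renders metadata distribution
plots, and writes two JSON reports (dataset_statistics.json and
dataset_contents.json)."""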
import argparse
import json
import os
from datasets import load_dataset, get_dataset_config_names, Features, Value
from utils import (
    num_of_samples_per_split, uniq_utts_per_split, words_per_split,
    uniq_words_per_split, chars_per_split, uniq_chars_per_split,
    total_audio_duration_per_split, average_audio_duration_per_split,
    average_utterance_length_chars_per_split, average_utterance_length_words_per_split,
    speakers_per_split, meta_cov_per_split,
    # uniq_utts_per_speaker,  # not imported yet; see the commented-out call below
    meta_distribution_text, meta_distribution_violin_plot, recordings_per_speaker,
    speech_rate_words_per_split, speech_rate_chars_per_split,
)
# TODO: move these output paths to a constants module
output_dir_plots = "./plots"
output_dir_reports = "./reports"
os.makedirs(output_dir_plots, exist_ok=True)
os.makedirs(output_dir_reports, exist_ok=True)
# read the dataset name and secret test split flag from the command line
parser = argparse.ArgumentParser()
parser.add_argument("--dataset", type=str, required=True, help="Name of the dataset to generate reports for")
parser.add_argument('--no_secret_test_split', action='store_true', help="Set this flag if references for the test split should NOT be retrieved from the secret distribution")
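# By default, test-split references are read from a companion "<dataset>-secret" repo;
# pass --no_secret_test_split for datasets that have no secret test split.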
args = parser.parse_args()
dataset_name = args.dataset
print("Generating reports for dataset: {}".format(dataset_name))
if not args.no_secret_test_split:
    dataset_name_secret = "-".join([dataset_name, "secret"])
    # check if the secret repo exists
    print("Using secret test split repo: {}".format(dataset_name_secret))
    try:
        dataset_configs_secret = get_dataset_config_names(dataset_name_secret)
    except Exception as e:
        print("Config for secret dataset {} cannot be retrieved: {}".format(dataset_name_secret, e))
output_dir_reports_dataset = os.path.join(output_dir_reports, dataset_name)
os.makedirs(output_dir_reports_dataset, exist_ok=True)
# get dataset config names
dataset_configs = get_dataset_config_names(dataset_name)
# initialize output structures
dataset_statistics = {}
output_fn_stats = os.path.join(output_dir_reports_dataset, "dataset_statistics.json")
dataset_contents = {}
output_fn_contents = os.path.join(output_dir_reports_dataset, "dataset_contents.json")
# specify features to load; skip loading of audio data
features_to_load = Features({
    'audioname': Value(dtype='string', id=None),
    'split': Value(dtype='string', id=None),
    'dataset': Value(dtype='string', id=None),
    'speaker_id': Value(dtype='string', id=None),
    'ref_orig': Value(dtype='string', id=None),
    'audio_duration_samples': Value(dtype='int32', id=None),
    'audio_duration_seconds': Value(dtype='float32', id=None),
    'samplingrate_orig': Value(dtype='int32', id=None),
    'sampling_rate': Value(dtype='int32', id=None),
    'audiopath_bigos': Value(dtype='string', id=None),
    'audiopath_local': Value(dtype='string', id=None),
    'speaker_age': Value(dtype='string', id=None),
    'speaker_sex': Value(dtype='string', id=None),
})
for config_name in dataset_configs:
print("Generating stats for {}".format(config_name))
dataset_statistics[config_name] = {}
dataset_contents[config_name] = {}
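    # features=... restricts which columns are decoded (audio payloads stay on disk);
    # trust_remote_code=True lets the dataset repo's own loading script run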
    dataset_hf_subset = load_dataset(dataset_name, config_name, features=features_to_load, trust_remote_code=True)
    if not args.no_secret_test_split:
        dataset_hf_subset_secret = load_dataset(dataset_name_secret, config_name, features=features_to_load, trust_remote_code=True)
    else:
        dataset_hf_subset_secret = None
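    # NOTE: the *_per_split helpers below are assumed to fall back to the public
    # references when dataset_hf_subset_secret is None (no secret test split)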
    # audio content size
    dataset_statistics[config_name]["samples"] = num_of_samples_per_split(dataset_hf_subset)
    dataset_statistics[config_name]["audio[h]"] = total_audio_duration_per_split(dataset_hf_subset)
    dataset_statistics[config_name]["speakers"] = speakers_per_split(dataset_hf_subset)
    # text content size
    # metrics based on transcriptions (references) - requires reading the secret repo for the test split
    dataset_statistics[config_name]["words"] = words_per_split(dataset_hf_subset, dataset_hf_subset_secret)
    dataset_statistics[config_name]["chars"] = chars_per_split(dataset_hf_subset, dataset_hf_subset_secret)
    # text content derived features
    dataset_statistics[config_name]["utts_unique"], dataset_contents[config_name]["unique_utts"] = uniq_utts_per_split(dataset_hf_subset, dataset_hf_subset_secret)
    dataset_statistics[config_name]["words_unique"], dataset_contents[config_name]["unique_words"] = uniq_words_per_split(dataset_hf_subset, dataset_hf_subset_secret)
    dataset_statistics[config_name]["chars_unique"], dataset_contents[config_name]["unique_chars"] = uniq_chars_per_split(dataset_hf_subset, dataset_hf_subset_secret)
    dataset_statistics[config_name]["average_utterance_length[words]"] = average_utterance_length_words_per_split(dataset_hf_subset, dataset_hf_subset_secret)
    dataset_statistics[config_name]["average_utterance_length[chars]"] = average_utterance_length_chars_per_split(dataset_hf_subset, dataset_hf_subset_secret)
    dataset_statistics[config_name]["samples_per_spk_stats"], dataset_contents[config_name]["samples_per_spk"] = recordings_per_speaker(dataset_hf_subset)
    # audio content derived features
    dataset_statistics[config_name]["words_per_sec"] = speech_rate_words_per_split(dataset_hf_subset, dataset_hf_subset_secret)
    dataset_statistics[config_name]["chars_per_sec"] = speech_rate_chars_per_split(dataset_hf_subset, dataset_hf_subset_secret)
    dataset_statistics[config_name]["average_audio_duration[s]"] = average_audio_duration_per_split(dataset_hf_subset)
    # metadata coverage per subset in percent - speaker sex and age
    dataset_statistics[config_name]["meta_cov_sex"] = meta_cov_per_split(dataset_hf_subset, 'speaker_sex')
    dataset_statistics[config_name]["meta_cov_age"] = meta_cov_per_split(dataset_hf_subset, 'speaker_age')
    # metadata value distribution per subset
    dataset_statistics[config_name]["meta_dist_sex"] = meta_distribution_text(dataset_hf_subset, 'speaker_sex')
    dataset_statistics[config_name]["meta_dist_age"] = meta_distribution_text(dataset_hf_subset, 'speaker_age')
    # dataset_statistics[config_name] = uniq_utts_per_speaker(dataset_hf_subset)
    # TODO: number of words per speaker (min, max, med, avg, std)
    # distribution of audio duration per sex
    output_dir_plots_subset = os.path.join(output_dir_plots, config_name)
    os.makedirs(output_dir_plots_subset, exist_ok=True)
    meta_distribution_violin_plot(dataset_hf_subset, output_dir_plots_subset, 'audio_duration_seconds', 'speaker_sex')
    # distribution of audio duration per age
    meta_distribution_violin_plot(dataset_hf_subset, output_dir_plots_subset, 'audio_duration_seconds', 'speaker_age')
# save dataset statistics dict to storage as a JSON file
with open(output_fn_stats, 'w') as f:
    json.dump(dataset_statistics, f, indent=2)
# save dataset content analysis to storage
with open(output_fn_contents, 'w') as f:
    json.dump(dataset_contents, f, indent=2)
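# Example usage (dataset name below is a placeholder):
#   python run-analysis.py --dataset <org>/<dataset-name>
#   python run-analysis.py --dataset <org>/<dataset-name> --no_secret_test_split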