import os import json from datasets import load_dataset, get_dataset_config_names, Features, Value from utils import num_of_samples_per_split, uniq_utts_per_split, words_per_split, uniq_words_per_split, chars_per_split, uniq_chars_per_split from utils import total_audio_duration_per_split, average_audio_duration_per_split, average_utterance_length_chars_per_split, average_utterance_length_words_per_split, speakers_per_split, meta_cov_per_split #, uniq_utts_per_speaker from utils import meta_distribution_text, meta_distribution_violin_plot, recordings_per_speaker, speech_rate_words_per_split, speech_rate_chars_per_split import argparse # move to constants output_dir_plots = "./plots" output_dir_reports = "./reports" os.makedirs(output_dir_plots, exist_ok=True) os.makedirs(output_dir_plots, exist_ok=True) # get as cmd line args # read from command line argument parser = argparse.ArgumentParser() parser.add_argument("--dataset", type=str, required=True, help="Name of the dataset to generate reports for") parser.add_argument('--no_secret_test_split', action='store_false', help="Should references for test split be retrieved from the secret distribution?") args = parser.parse_args() dataset_name = args.dataset print("Generating reports for dataset: {}".format(dataset_name)) if not (args.no_secret_test_split): dataset_name_secret = str.join("-", [dataset_name, "secret"]) # check if secret repo exists print(dataset_name_secret) try: dataset_configs_secret = get_dataset_config_names(dataset_name_secret) except: print("Config for secret dataset {} cannot be retrieved!".format(dataset_name_secret)) output_dir_reports_dataset = os.path.join(output_dir_reports, dataset_name) os.makedirs(output_dir_reports_dataset, exist_ok=True) # get dataset config names dataset_configs = get_dataset_config_names(dataset_name) # initialize output structures dataset_statistics = {} output_fn_stats = os.path.join(output_dir_reports_dataset, "dataset_statistics.json") dataset_contents = {} output_fn_contents = os.path.join(output_dir_reports_dataset, "dataset_contents.json") # specify features to load. Skip loading of audio data features_to_load = Features({'audioname': Value(dtype='string', id=None), 'split': Value(dtype='string', id=None), 'dataset': Value(dtype='string', id=None), 'speaker_id': Value(dtype='string', id=None), 'ref_orig': Value(dtype='string', id=None), 'audio_duration_samples': Value(dtype='int32', id=None), 'audio_duration_seconds': Value(dtype='float32', id=None), 'samplingrate_orig': Value(dtype='int32', id=None), 'sampling_rate': Value(dtype='int32', id=None), 'audiopath_bigos': Value(dtype='string', id=None), 'audiopath_local': Value(dtype='string', id=None), 'speaker_age': Value(dtype='string', id=None), 'speaker_sex': Value(dtype='string', id=None)}) for config_name in dataset_configs: print("Generating stats for {}".format(config_name)) dataset_statistics[config_name] = {} dataset_contents[config_name] = {} dataset_hf_subset = load_dataset(dataset_name, config_name, features=features_to_load, trust_remote_code=True) if not (args.no_secret_test_split): dataset_hf_subset_secret = load_dataset(dataset_name_secret, config_name, features=features_to_load, trust_remote_code=True) else: dataset_hf_subset_secret = None #audio content size dataset_statistics[config_name]["samples"] = num_of_samples_per_split(dataset_hf_subset) dataset_statistics[config_name]["audio[h]"] = total_audio_duration_per_split(dataset_hf_subset) dataset_statistics[config_name]["speakers"] = speakers_per_split(dataset_hf_subset) # text content size # metrics based on transcriptions (references) - requires reading secret repo for test split dataset_statistics[config_name]["words"] = words_per_split(dataset_hf_subset, dataset_hf_subset_secret) dataset_statistics[config_name]["chars"] = chars_per_split(dataset_hf_subset, dataset_hf_subset_secret) # text content derived features dataset_statistics[config_name]["utts_unique"], dataset_contents[config_name]["unique_utts"] = uniq_utts_per_split(dataset_hf_subset, dataset_hf_subset_secret) dataset_statistics[config_name]["words_unique"], dataset_contents[config_name]["unique_words"] = uniq_words_per_split(dataset_hf_subset, dataset_hf_subset_secret) dataset_statistics[config_name]["chars_unique"], dataset_contents[config_name]["unique_chars"] = uniq_chars_per_split(dataset_hf_subset, dataset_hf_subset_secret) dataset_statistics[config_name]["average_utterance_length[words]"] = average_utterance_length_words_per_split(dataset_hf_subset, dataset_hf_subset_secret) dataset_statistics[config_name]["average_utterance_length[chars]"] = average_utterance_length_chars_per_split(dataset_hf_subset, dataset_hf_subset_secret) dataset_statistics[config_name]["samples_per_spk_stats"], dataset_contents[config_name]["samples_per_spk"] = recordings_per_speaker(dataset_hf_subset) # audio content derived features dataset_statistics[config_name]["words_per_sec"] = speech_rate_words_per_split(dataset_hf_subset, dataset_hf_subset_secret) dataset_statistics[config_name]["chars_per_sec"] = speech_rate_chars_per_split(dataset_hf_subset, dataset_hf_subset_secret) dataset_statistics[config_name]["average_audio_duration[s]"] = average_audio_duration_per_split(dataset_hf_subset) # metadata coverage per subset in percent - speaker accent dataset_statistics[config_name]["meta_cov_sex"] = meta_cov_per_split(dataset_hf_subset, 'speaker_sex') dataset_statistics[config_name]["meta_cov_age"] = meta_cov_per_split(dataset_hf_subset, 'speaker_age') # speech rate per subset dataset_statistics[config_name]["meta_dist_sex"] = meta_distribution_text(dataset_hf_subset, 'speaker_sex') dataset_statistics[config_name]["meta_dist_age"] = meta_distribution_text(dataset_hf_subset, 'speaker_age') # dataset_statistics[config_name] = uniq_utts_per_speaker(dataset_hf_subset) # number of words per speaker (min, max, med, avg, std) # distribution of audio duration per subset output_dir_plots_subset = os.path.join(output_dir_plots, config_name) meta_distribution_violin_plot(dataset_hf_subset, output_dir_plots_subset, 'audio_duration_seconds', 'speaker_sex') # distribution of audio duration per age meta_distribution_violin_plot(dataset_hf_subset, output_dir_plots_subset, 'audio_duration_seconds', 'speaker_age') # save datasets statistics dict to storage as JSON file with open(output_fn_stats, 'w') as f: json.dump(dataset_statistics, f) # save dataset content analysis to storage with open(output_fn_contents, 'w') as f: json.dump(dataset_contents, f)