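"""Generate statistics and content reports for an ASR dataset hosted on the Hugging Face Hub.

For every configuration (subset) of the dataset, this script computes per-split
statistics (sample counts, audio duration, speaker counts, word/character counts,
speech rates, metadata coverage and distributions), saves violin plots of audio
duration by speaker sex and age, and writes two JSON reports:
dataset_statistics.json and dataset_contents.json.

Example invocation (the script filename below is illustrative):
    python generate_reports.py --dataset amu-cai/pl-asr-bigos-v2
"""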
import os
import json
from datasets import load_dataset, get_dataset_config_names, Features, Value
from utils import num_of_samples_per_split, uniq_utts_per_split, words_per_split, uniq_words_per_split, chars_per_split, uniq_chars_per_split
from utils import audio_duration_per_split, speakers_per_split, meta_cov_per_split
# TODO: import uniq_utts_per_speaker once available (see commented-out call below)
from utils import meta_distribution_text, meta_distribution_violin_plot, recordings_per_speaker, speech_rate_words_per_split, speech_rate_chars_per_split
import argparse
# TODO: move to constants
output_dir_plots = "./plots"
output_dir_reports = "./reports"
os.makedirs(output_dir_plots, exist_ok=True)
os.makedirs(output_dir_reports, exist_ok=True)

# command line arguments
parser = argparse.ArgumentParser()
parser.add_argument("--dataset", type=str, required=True, help="Name of the dataset to generate reports for")
# NOTE: argparse's type=bool treats any non-empty string (including "False") as True,
# so an explicit on/off action is used instead (adds --no-secret_test_split as well)
parser.add_argument("--secret_test_split", action=argparse.BooleanOptionalAction, default=True, help="Should references for the test split be retrieved from the secret distribution?")

args = parser.parse_args()

dataset_name = args.dataset
print("Generating reports for dataset: {}".format(dataset_name))

dataset_name_secret = None
if args.secret_test_split:
    dataset_name_secret = "-".join([dataset_name, "secret"])
    # check if the secret repo exists; fall back to public references if it does not
    print("Secret dataset: {}".format(dataset_name_secret))
    try:
        dataset_configs_secret = get_dataset_config_names(dataset_name_secret)
    except Exception:
        print("Configs for secret dataset {} cannot be retrieved! Falling back to public references.".format(dataset_name_secret))
        args.secret_test_split = False

# example: dataset_name = "amu-cai/pl-asr-bigos-v2"
output_dir_reports_dataset = os.path.join(output_dir_reports, dataset_name)
os.makedirs(output_dir_reports_dataset, exist_ok=True)

# get dataset config names
dataset_configs = get_dataset_config_names(dataset_name)

# initialize output structures
dataset_statistics = {}
output_fn_stats = os.path.join(output_dir_reports_dataset, "dataset_statistics.json")

dataset_contents = {}
output_fn_contents = os.path.join(output_dir_reports_dataset, "dataset_contents.json")
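# dataset_statistics holds numeric summaries per config and split;
# dataset_contents holds the underlying inventories (unique utterances, words,
# characters, samples per speaker) from which those summaries are derived.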

# specify the features to load explicitly; omitting the audio column skips
# downloading and decoding the audio data
features_to_load = Features({
    'audioname': Value(dtype='string'),
    'split': Value(dtype='string'),
    'dataset': Value(dtype='string'),
    'speaker_id': Value(dtype='string'),
    'ref_orig': Value(dtype='string'),
    'audio_duration_samples': Value(dtype='int32'),
    'audio_duration_seconds': Value(dtype='float32'),
    'samplingrate_orig': Value(dtype='int32'),
    'sampling_rate': Value(dtype='int32'),
    'audiopath_bigos': Value(dtype='string'),
    'audiopath_local': Value(dtype='string'),
    'speaker_age': Value(dtype='string'),
    'speaker_sex': Value(dtype='string'),
})

for config_name in dataset_configs:
    print("Generating stats for {}".format(config_name))
    
    dataset_statistics[config_name] = {}
    dataset_contents[config_name] = {}

    dataset_hf_subset = load_dataset(dataset_name, config_name, features=features_to_load, trust_remote_code=True)
    # without the secret split, the reference-based helpers below are assumed
    # to fall back to the public test references when passed None
    dataset_hf_subset_secret = None
    if args.secret_test_split:
        dataset_hf_subset_secret = load_dataset(dataset_name_secret, config_name, features=features_to_load, trust_remote_code=True)

    dataset_statistics[config_name]["samples"] = num_of_samples_per_split(dataset_hf_subset)
    dataset_statistics[config_name]["audio[h]"] = audio_duration_per_split(dataset_hf_subset)
    dataset_statistics[config_name]["speakers"] = speakers_per_split(dataset_hf_subset)

    # metrics based on transcriptions (references) - requires reading secret repo for test split
    dataset_statistics[config_name]["utts_unique"], dataset_contents[config_name]["unique_utts"] = uniq_utts_per_split(dataset_hf_subset, dataset_hf_subset_secret)
    dataset_statistics[config_name]["words"] = words_per_split(dataset_hf_subset, dataset_hf_subset_secret)
    dataset_statistics[config_name]["words_unique"], dataset_contents[config_name]["unique_words"] = uniq_words_per_split(dataset_hf_subset, dataset_hf_subset_secret)
    dataset_statistics[config_name]["chars"] = chars_per_split(dataset_hf_subset, dataset_hf_subset_secret)
    dataset_statistics[config_name]["chars_unique"], dataset_contents[config_name]["unique_chars"] = uniq_chars_per_split(dataset_hf_subset, dataset_hf_subset_secret)
    dataset_statistics[config_name]["words_per_sec"] = speech_rate_words_per_split(dataset_hf_subset, dataset_hf_subset_secret)
    dataset_statistics[config_name]["chars_per_sec"] = speech_rate_chars_per_split(dataset_hf_subset, dataset_hf_subset_secret)

    # metadata coverage per subset in percent - speaker sex and age
    dataset_statistics[config_name]["meta_cov_sex"] = meta_cov_per_split(dataset_hf_subset, 'speaker_sex')
    dataset_statistics[config_name]["meta_cov_age"] = meta_cov_per_split(dataset_hf_subset, 'speaker_age')

    # metadata value distributions per subset - speaker sex and age
    dataset_statistics[config_name]["meta_dist_sex"] = meta_distribution_text(dataset_hf_subset, 'speaker_sex')
    dataset_statistics[config_name]["meta_dist_age"] = meta_distribution_text(dataset_hf_subset, 'speaker_age')

    dataset_statistics[config_name]["samples_per_spk"], dataset_contents[config_name]["samples_per_spk"]  = recordings_per_speaker(dataset_hf_subset)
    # dataset_statistics[config_name] = uniq_utts_per_speaker(dataset_hf_subset)
    # number of words per speaker (min, max, med, avg, std)

    # distribution of audio duration per speaker sex
    output_dir_plots_subset = os.path.join(output_dir_plots, config_name)
    os.makedirs(output_dir_plots_subset, exist_ok=True)  # in case the plotting helper does not create it
    meta_distribution_violin_plot(dataset_hf_subset, output_dir_plots_subset, 'audio_duration_seconds', 'speaker_sex')

    # distribution of audio duration per speaker age
    meta_distribution_violin_plot(dataset_hf_subset, output_dir_plots_subset, 'audio_duration_seconds', 'speaker_age')

# save dataset statistics dict to storage as a JSON file
with open(output_fn_stats, 'w', encoding='utf-8') as f:
    json.dump(dataset_statistics, f, indent=2, ensure_ascii=False)

# save dataset content analysis to storage
with open(output_fn_contents, 'w', encoding='utf-8') as f:
    json.dump(dataset_contents, f, indent=2, ensure_ascii=False)