mj-new committed
Commit 3533dd6
Parent(s): 25f0e74

Added support for datasets without secret test split

app.py CHANGED
@@ -18,8 +18,17 @@ from datasets import get_dataset_config_names
 # TODO - compare the datasets
 
 st.set_page_config(layout="wide")
+metrics_size_audio = ["samples", "audio[h]", "speakers"]
+metrics_size_text = ["samples", "words", "chars"]
+metrics_size = metrics_size_audio + metrics_size_text
+metrics_features_text_uniq = ["utts_unique", "words_unique", "chars_unique"]
+metrics_features_speech_rate = ["words_per_sec", "chars_per_sec"]
+metrics_features_duration = ["average_audio_duration[s]", "average_utterance_length[words]", "average_utterance_length[chars]"]
+metrics_features_meta = ["meta_cov_sex", "meta_cov_age"]
+metrics_features = metrics_features_text_uniq + metrics_features_speech_rate + metrics_features_duration + metrics_features_meta
 
-about, analysis_bigos, analysis_bigos_pelcra = st.tabs(["About BIGOS datasets", "BIGOS V2 analysis", "PELCRA for BIGOS analysis"])
+
+about, analysis_bigos, analysis_bigos_diagnostic, analysis_bigos_pelcra = st.tabs(["About BIGOS datasets", "BIGOS V2 analysis", "BIGOS V2 diagnostic", "PELCRA for BIGOS analysis"])
 #analysis_bigos_diagnostic
 #########################################BIGOS################################################
 with about:
@@ -30,7 +39,6 @@ with about:
 
 with analysis_bigos:
     dataset_name = "amu-cai/pl-asr-bigos-v2"
-    #dataset_secret = "amu-cai/pl-asr-bigos-v2-secret"
    dataset_short_name = "BIGOS"
    dataset_version = "V2"
 
@@ -51,30 +59,93 @@ with analysis_bigos:
 
 
     st.header("Dataset level metrics")
-    metrics_size = ["samples", "audio[h]", "speakers", "words", "chars"]
     df_sum_stats_agg = extract_stats_to_agg(df_multindex_for_agg, metrics_size)
 
     # split dataframe into separate dataframes for easier analysis and visualization
-    st.subheader("Dataset size (audio)")
-    df_sum_stats_audio = df_sum_stats_agg[["audio[h]", "samples", "speakers"]]
+    st.subheader("Audio content size")
+    df_sum_stats_audio = df_sum_stats_agg[metrics_size_audio]
     st.dataframe(df_sum_stats_audio)
 
-    st.subheader("Dataset size (text)")
-    df_sum_stats_text = df_sum_stats_agg[["samples", "words", "chars"]]
+    st.subheader("Text content size")
+    df_sum_stats_text = df_sum_stats_agg[metrics_size_text]
     st.dataframe(df_sum_stats_text)
 
+    df_sum_stats_all_splits = extract_stats_all_splits(df_multindex_all_splits, metrics_features)
+
+    st.subheader("Utterances, vocabulary and alphabet space")
+    df_sum_stats_feats_text = df_sum_stats_all_splits[metrics_features_text_uniq]
+    st.dataframe(df_sum_stats_feats_text)
+
+    st.subheader("Speech rates")
+    df_sum_stats_feats_speech_rate = df_sum_stats_all_splits[metrics_features_speech_rate]
+    st.dataframe(df_sum_stats_feats_speech_rate)
+
+    st.subheader("Average utterance lengths and audio duration")
+    df_sum_stats_feats_durations = df_sum_stats_all_splits[metrics_features_duration]
+    st.dataframe(df_sum_stats_feats_durations)
+
+    st.subheader("Metadata coverage")
+    df_sum_stats_feats_meta = df_sum_stats_all_splits[metrics_features_meta]
+    st.dataframe(df_sum_stats_feats_meta)
+
+    st.header("BIGOS subsets (source datasets) cards")
+    for subset in dataset_configs:
+        st.subheader("Dataset card for: {}".format(subset))
+        df_metrics_subset_size = extract_stats_for_dataset_card(df_multindex_for_agg, subset, metrics_size, add_total=True)
+        st.dataframe(df_metrics_subset_size)
+        df_metrics_subset_features = extract_stats_for_dataset_card(df_multindex_for_agg, subset, metrics_features, add_total=False)
+        st.dataframe(df_metrics_subset_features)
+
+with analysis_bigos_diagnostic:
+    dataset_name = "amu-cai/pl-asr-bigos-v2-diagnostic"
+    dataset_short_name = "BIGOS diagnostic"
+    dataset_version = "V2"
+
+    dataset_configs = get_dataset_config_names(dataset_name, trust_remote_code=True)
+    # remove "all" subset, which is always the last config type
+    dataset_configs.pop()
+    print(dataset_configs)
+    # read the reports for public and secret datasets
+    [stats_dict_public, contents_dict_public] = read_reports(dataset_name)
+
+    # update the metrics for test split with the secret dataset statistics
+    #stats_dict_public = add_test_split_stats_from_secret_dataset(stats_dict_public, stats_dict_secret)
+    df_multindex_for_agg = dict_to_multindex_df(stats_dict_public, all_splits=False)
+    df_multindex_all_splits = dict_to_multindex_df(stats_dict_public, all_splits=True)
+
+    # extract metrics from dictionary and convert to various dataframes for easier analysis and visualization
+    #st.header("Summary statistics")
 
-    metrics_features = ["utts_unique", "words_unique", "chars_unique", "words_per_sec", "chars_per_sec", "average_audio_duration[s]"]
+
+    st.header("Dataset level metrics")
+    df_sum_stats_agg = extract_stats_to_agg(df_multindex_for_agg, metrics_size)
+
+    # split dataframe into separate dataframes for easier analysis and visualization
+    st.subheader("Audio content size")
+    df_sum_stats_audio = df_sum_stats_agg[metrics_size_audio]
+    st.dataframe(df_sum_stats_audio)
+
+    st.subheader("Text content size")
+    df_sum_stats_text = df_sum_stats_agg[metrics_size_text]
+    st.dataframe(df_sum_stats_text)
 
     df_sum_stats_all_splits = extract_stats_all_splits(df_multindex_all_splits, metrics_features)
 
-    st.subheader("Dataset features (text)")
-    df_sum_stats_feats_text = df_sum_stats_all_splits[metrics_features[0:3]]
+    st.subheader("Utterances, vocabulary and alphabet space")
+    df_sum_stats_feats_text = df_sum_stats_all_splits[metrics_features_text_uniq]
     st.dataframe(df_sum_stats_feats_text)
 
-    st.subheader("Dataset features (audio)")
-    df_sum_stats_feats_audio = df_sum_stats_all_splits[metrics_features[3:]]
-    st.dataframe(df_sum_stats_feats_audio)
+    st.subheader("Speech rates")
+    df_sum_stats_feats_speech_rate = df_sum_stats_all_splits[metrics_features_speech_rate]
+    st.dataframe(df_sum_stats_feats_speech_rate)
+
+    st.subheader("Average utterance lengths and audio duration")
+    df_sum_stats_feats_durations = df_sum_stats_all_splits[metrics_features_duration]
+    st.dataframe(df_sum_stats_feats_durations)
+
+    st.subheader("Metadata coverage")
+    df_sum_stats_feats_meta = df_sum_stats_all_splits[metrics_features_meta]
+    st.dataframe(df_sum_stats_feats_meta)
 
     st.header("BIGOS subsets (source datasets) cards")
     for subset in dataset_configs:
@@ -84,14 +155,11 @@ with analysis_bigos:
         df_metrics_subset_features = extract_stats_for_dataset_card(df_multindex_for_agg, subset, metrics_features, add_total=False)
         st.dataframe(df_metrics_subset_features)
 
-
 
 #########################################PELCRA################################################
 with analysis_bigos_pelcra:
 
     dataset_name = "pelcra/pl-asr-pelcra-for-bigos"
-    #dataset_secret = "pelcra/pl-asr-pelcra-for-bigos-secret"
-
     dataset_short_name = "PELCRA"
 
     # local version with granted gated access
@@ -113,35 +181,35 @@ with analysis_bigos_pelcra:
     # extract metrics from dictionary and convert to various dataframes for easier analysis and visualization
     #st.header("Summary statistics")
 
-
     st.header("Dataset level metrics")
-    metrics_size = ["samples", "audio[h]", "speakers", "words", "chars"]
     df_sum_stats_agg = extract_stats_to_agg(df_multindex_for_agg, metrics_size)
 
-    #st.dataframe(df_sum_stats_agg)
-    #print(df_sum_stats.columns)
-
     # split dataframe into separate dataframes for easier analysis and visualization
-    st.subheader("Dataset size (audio)")
-    df_sum_stats_audio = df_sum_stats_agg[["audio[h]", "samples", "speakers"]]
+    st.subheader("Audio content size")
+    df_sum_stats_audio = df_sum_stats_agg[metrics_size_audio]
     st.dataframe(df_sum_stats_audio)
 
-    st.subheader("Dataset size (text)")
-    df_sum_stats_text = df_sum_stats_agg[["samples", "words", "chars"]]
+    st.subheader("Text content size")
+    df_sum_stats_text = df_sum_stats_agg[metrics_size_text]
    st.dataframe(df_sum_stats_text)
 
-
-    metrics_features = ["utts_unique", "words_unique", "chars_unique", "words_per_sec", "chars_per_sec", "average_audio_duration[s]"]
-
     df_sum_stats_all_splits = extract_stats_all_splits(df_multindex_all_splits, metrics_features)
 
-    st.subheader("Dataset features (text)")
-    df_sum_stats_feats_text = df_sum_stats_all_splits[metrics_features[0:3]]
+    st.subheader("Utterances, vocabulary and alphabet space")
+    df_sum_stats_feats_text = df_sum_stats_all_splits[metrics_features_text_uniq]
     st.dataframe(df_sum_stats_feats_text)
 
-    st.subheader("Dataset features (audio)")
-    df_sum_stats_feats_audio = df_sum_stats_all_splits[metrics_features[3:]]
-    st.dataframe(df_sum_stats_feats_audio)
+    st.subheader("Speech rates")
+    df_sum_stats_feats_speech_rate = df_sum_stats_all_splits[metrics_features_speech_rate]
+    st.dataframe(df_sum_stats_feats_speech_rate)
+
+    st.subheader("Average utterance lengths and audio duration")
+    df_sum_stats_feats_durations = df_sum_stats_all_splits[metrics_features_duration]
+    st.dataframe(df_sum_stats_feats_durations)
+
+    st.subheader("Metadata coverage")
+    df_sum_stats_feats_meta = df_sum_stats_all_splits[metrics_features_meta]
+    st.dataframe(df_sum_stats_feats_meta)
 
     st.header("BIGOS subsets (source datasets) cards")
     for subset in dataset_configs:
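
The same four "feature" subsections are now repeated verbatim in the BIGOS, diagnostic and PELCRA tabs, driven by the shared metric lists defined at the top of app.py. Below is a minimal sketch of driving the rendering from one table of (title, metric list) pairs; this is a possible further refactor, not part of this commit, and FEATURE_SECTIONS / render_feature_sections are hypothetical names (the metric groups themselves are copied from the diff above):

    import pandas as pd
    import streamlit as st

    # Metric groups as defined at the top of app.py in this commit.
    metrics_features_text_uniq = ["utts_unique", "words_unique", "chars_unique"]
    metrics_features_speech_rate = ["words_per_sec", "chars_per_sec"]
    metrics_features_duration = ["average_audio_duration[s]", "average_utterance_length[words]", "average_utterance_length[chars]"]
    metrics_features_meta = ["meta_cov_sex", "meta_cov_age"]

    # Hypothetical: one (subheader title, columns) pair per subsection.
    FEATURE_SECTIONS = [
        ("Utterances, vocabulary and alphabet space", metrics_features_text_uniq),
        ("Speech rates", metrics_features_speech_rate),
        ("Average utterance lengths and audio duration", metrics_features_duration),
        ("Metadata coverage", metrics_features_meta),
    ]

    def render_feature_sections(df_sum_stats_all_splits: pd.DataFrame) -> None:
        # Render one subheader and one table per metric group.
        for title, columns in FEATURE_SECTIONS:
            st.subheader(title)
            st.dataframe(df_sum_stats_all_splits[columns])

Calling render_feature_sections(df_sum_stats_all_splits) inside each tab would keep the three analysis tabs in sync by construction.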
reports/amu-cai/pl-asr-bigos-v2-diagnostic/dataset_contents.json ADDED
The diff for this file is too large to render. See raw diff
 
reports/amu-cai/pl-asr-bigos-v2-diagnostic/dataset_statistics.json ADDED
@@ -0,0 +1 @@
+ {"pjatk-clarin_mobile-15": {"samples": {"test": 45, "train": 47, "validation": 45, "all_splits": 137}, "audio[h]": {"test": 0.16, "train": 0.16, "validation": 0.16, "all_splits": 0.48}, "speakers": {"test": 11, "train": 34, "validation": 10, "all_splits": 55}, "words": {"test": 1194, "train": 1268, "validation": 1203, "all_splits": 3665}, "chars": {"test": 8027, "train": 8258, "validation": 8109, "all_splits": 24394}, "utts_unique": {"test": 45, "train": 47, "validation": 45, "all_splits": 137}, "words_unique": {"test": 809, "train": 856, "validation": 809, "all_splits": 2161}, "chars_unique": {"test": 33, "train": 33, "validation": 33, "all_splits": 32}, "average_utterance_length[words]": {"test": 26.53, "train": 26.98, "validation": 26.73, "all_splits": 26.75}, "average_utterance_length[chars]": {"test": 178.38, "train": 175.7, "validation": 180.2, "all_splits": 178.06}, "samples_per_spk_stats": {"test": {"average": 4.09, "std": 2.07, "median": 4.0, "min": 1, "max": 7}, "train": {"average": 1.38, "std": 0.59, "median": 1.0, "min": 1, "max": 3}, "validation": {"average": 4.5, "std": 2.29, "median": 4.5, "min": 1, "max": 9}, "all_splits": {"average": 2.49, "std": 2.01, "median": 2.0, "min": 1, "max": 9}}, "words_per_sec": {"test": 2.06, "train": 2.17, "validation": 2.05, "all_splits": 2.09}, "chars_per_sec": {"test": 11.79, "train": 11.95, "validation": 11.78, "all_splits": 11.84}, "average_audio_duration[s]": {"test": 12.88, "train": 12.45, "validation": 13.03, "all_splits": 12.78}, "meta_cov_sex": {"test": "N/A", "train": "N/A", "validation": "N/A", "all_splits": "N/A"}, "meta_cov_age": {"test": "N/A", "train": "N/A", "validation": "N/A", "all_splits": "N/A"}, "meta_dist_sex": {"test": "N/A", "train": "N/A", "validation": "N/A", "all_splits": "N/A"}, "meta_dist_age": {"test": "N/A", "train": "N/A", "validation": "N/A", "all_splits": "N/A"}}, "pjatk-clarin_studio-15": {"samples": {"test": 42, "train": 44, "validation": 40, "all_splits": 126}, "audio[h]": {"test": 0.16, "train": 0.17, "validation": 0.17, "all_splits": 0.5}, "speakers": {"test": 32, "train": 43, "validation": 31, "all_splits": 106}, "words": {"test": 1660, "train": 1798, "validation": 1756, "all_splits": 5214}, "chars": {"test": 6549, "train": 6937, "validation": 6896, "all_splits": 20382}, "utts_unique": {"test": 42, "train": 44, "validation": 40, "all_splits": 126}, "words_unique": {"test": 593, "train": 636, "validation": 618, "all_splits": 1638}, "chars_unique": {"test": 34, "train": 34, "validation": 35, "all_splits": 35}, "average_utterance_length[words]": {"test": 39.52, "train": 40.86, "validation": 43.9, "all_splits": 41.38}, "average_utterance_length[chars]": {"test": 155.93, "train": 157.66, "validation": 172.4, "all_splits": 161.76}, "samples_per_spk_stats": {"test": {"average": 1.31, "std": 0.58, "median": 1.0, "min": 1, "max": 3}, "train": {"average": 1.02, "std": 0.15, "median": 1.0, "min": 1, "max": 2}, "validation": {"average": 1.29, "std": 0.52, "median": 1.0, "min": 1, "max": 3}, "all_splits": {"average": 1.19, "std": 0.46, "median": 1.0, "min": 1, "max": 3}}, "words_per_sec": {"test": 2.81, "train": 3.02, "validation": 2.96, "all_splits": 2.93}, "chars_per_sec": {"test": 8.29, "train": 8.63, "validation": 8.65, "all_splits": 8.52}, "average_audio_duration[s]": {"test": 14.04, "train": 13.54, "validation": 14.86, "all_splits": 14.12}, "meta_cov_sex": {"test": "N/A", "train": "N/A", "validation": "N/A", "all_splits": "N/A"}, "meta_cov_age": {"test": "N/A", "train": "N/A", "validation": "N/A", 
"all_splits": "N/A"}, "meta_dist_sex": {"test": "N/A", "train": "N/A", "validation": "N/A", "all_splits": "N/A"}, "meta_dist_age": {"test": "N/A", "train": "N/A", "validation": "N/A", "all_splits": "N/A"}}, "mailabs-corpus_librivox-19": {"samples": {"test": 42, "train": 48, "validation": 47, "all_splits": 137}, "audio[h]": {"test": 0.09, "train": 0.1, "validation": 0.09, "all_splits": 0.28}, "speakers": {"test": 32, "train": 34, "validation": 35, "all_splits": 101}, "words": {"test": 687, "train": 798, "validation": 721, "all_splits": 2206}, "chars": {"test": 4416, "train": 5230, "validation": 4691, "all_splits": 14337}, "utts_unique": {"test": 42, "train": 48, "validation": 47, "all_splits": 137}, "words_unique": {"test": 524, "train": 560, "validation": 539, "all_splits": 1412}, "chars_unique": {"test": 66, "train": 61, "validation": 69, "all_splits": 71}, "average_utterance_length[words]": {"test": 16.36, "train": 16.62, "validation": 15.34, "all_splits": 16.1}, "average_utterance_length[chars]": {"test": 105.14, "train": 108.96, "validation": 99.81, "all_splits": 104.65}, "samples_per_spk_stats": {"test": {"average": 1.31, "std": 0.53, "median": 1.0, "min": 1, "max": 3}, "train": {"average": 1.41, "std": 0.65, "median": 1.0, "min": 1, "max": 3}, "validation": {"average": 1.34, "std": 0.67, "median": 1.0, "min": 1, "max": 4}, "all_splits": {"average": 1.36, "std": 0.62, "median": 1.0, "min": 1, "max": 4}}, "words_per_sec": {"test": 2.22, "train": 2.18, "validation": 2.16, "all_splits": 2.18}, "chars_per_sec": {"test": 12.04, "train": 12.12, "validation": 11.87, "all_splits": 12.01}, "average_audio_duration[s]": {"test": 7.37, "train": 7.62, "validation": 7.12, "all_splits": 7.37}, "meta_cov_sex": {"test": 100.0, "train": 100.0, "validation": 100.0, "all_splits": 100.0}, "meta_cov_age": {"test": "N/A", "train": "N/A", "validation": "N/A", "all_splits": "N/A"}, "meta_dist_sex": {"test": {"male": 0.02, "female": 0.0}, "train": {"male": 0.0, "female": 0.0}, "validation": {"male": 0.0, "female": 0.09}, "all_splits": {"male": 0.2, "female": 0.8}}, "meta_dist_age": {"test": "N/A", "train": "N/A", "validation": "N/A", "all_splits": "N/A"}}, "pwr-azon_read-20": {"samples": {"test": 77, "train": 83, "validation": 95, "all_splits": 255}, "audio[h]": {"test": 0.17, "train": 0.17, "validation": 0.16, "all_splits": 0.5}, "speakers": {"test": 6, "train": 17, "validation": 4, "all_splits": 27}, "words": {"test": 780, "train": 805, "validation": 875, "all_splits": 2460}, "chars": {"test": 6744, "train": 6778, "validation": 7516, "all_splits": 21038}, "utts_unique": {"test": 73, "train": 82, "validation": 93, "all_splits": 239}, "words_unique": {"test": 589, "train": 618, "validation": 657, "all_splits": 1612}, "chars_unique": {"test": 33, "train": 33, "validation": 33, "all_splits": 32}, "average_utterance_length[words]": {"test": 10.13, "train": 9.7, "validation": 9.21, "all_splits": 9.65}, "average_utterance_length[chars]": {"test": 87.58, "train": 81.66, "validation": 79.12, "all_splits": 82.5}, "samples_per_spk_stats": {"test": {"average": 12.83, "std": 1.57, "median": 13.0, "min": 11, "max": 15}, "train": {"average": 4.88, "std": 2.0, "median": 5.0, "min": 1, "max": 8}, "validation": {"average": 23.75, "std": 6.61, "median": 23.5, "min": 15, "max": 33}, "all_splits": {"average": 9.44, "std": 7.45, "median": 7.0, "min": 1, "max": 33}}, "words_per_sec": {"test": 1.31, "train": 1.35, "validation": 1.48, "all_splits": 1.38}, "chars_per_sec": {"test": 10.04, "train": 10.0, "validation": 11.24, 
"all_splits": 10.42}, "average_audio_duration[s]": {"test": 7.72, "train": 7.2, "validation": 6.22, "all_splits": 6.99}, "meta_cov_sex": {"test": 100.0, "train": 100.0, "validation": 100.0, "all_splits": 100.0}, "meta_cov_age": {"test": "N/A", "train": "N/A", "validation": "N/A", "all_splits": "N/A"}, "meta_dist_sex": {"test": {"male": 0.14, "female": 0.86}, "train": {"male": 0.28, "female": 0.72}, "validation": {"male": 0.51, "female": 0.49}, "all_splits": {"male": 0.32, "female": 0.68}}, "meta_dist_age": {"test": "N/A", "train": "N/A", "validation": "N/A", "all_splits": "N/A"}}, "pwr-azon_spont-20": {"samples": {"test": 39, "train": 30, "validation": 31, "all_splits": 100}, "audio[h]": {"test": 0.17, "train": 0.16, "validation": 0.16, "all_splits": 0.49}, "speakers": {"test": 2, "train": 12, "validation": 2, "all_splits": 16}, "words": {"test": 1305, "train": 1190, "validation": 1711, "all_splits": 4206}, "chars": {"test": 8491, "train": 7791, "validation": 10497, "all_splits": 26779}, "utts_unique": {"test": 39, "train": 30, "validation": 31, "all_splits": 100}, "words_unique": {"test": 598, "train": 651, "validation": 724, "all_splits": 1646}, "chars_unique": {"test": 33, "train": 33, "validation": 33, "all_splits": 32}, "average_utterance_length[words]": {"test": 33.46, "train": 39.67, "validation": 55.19, "all_splits": 42.06}, "average_utterance_length[chars]": {"test": 217.72, "train": 259.7, "validation": 338.61, "all_splits": 267.79}, "samples_per_spk_stats": {"test": {"average": 19.5, "std": 7.5, "median": 19.5, "min": 12, "max": 27}, "train": {"average": 2.5, "std": 1.5, "median": 2.0, "min": 1, "max": 5}, "validation": {"average": 15.5, "std": 6.5, "median": 15.5, "min": 9, "max": 22}, "all_splits": {"average": 6.25, "std": 7.56, "median": 4.0, "min": 1, "max": 27}}, "words_per_sec": {"test": 2.18, "train": 2.03, "validation": 2.88, "all_splits": 2.36}, "chars_per_sec": {"test": 11.98, "train": 11.24, "validation": 14.81, "all_splits": 12.68}, "average_audio_duration[s]": {"test": 15.38, "train": 19.57, "validation": 19.13, "all_splits": 17.8}, "meta_cov_sex": {"test": 100.0, "train": 100.0, "validation": 100.0, "all_splits": 100.0}, "meta_cov_age": {"test": "N/A", "train": "N/A", "validation": "N/A", "all_splits": "N/A"}, "meta_dist_sex": {"test": {"male": 0.31, "female": 0.69}, "train": {"male": 0.53, "female": 0.47}, "validation": {"male": 1.0, "female": 0.0}, "all_splits": {"male": 0.59, "female": 0.41}}, "meta_dist_age": {"test": "N/A", "train": "N/A", "validation": "N/A", "all_splits": "N/A"}}, "pwr-maleset-unk": {"samples": {"test": 116, "train": 111, "validation": 118, "all_splits": 345}, "audio[h]": {"test": 0.16, "train": 0.16, "validation": 0.16, "all_splits": 0.48}, "speakers": {"test": 1, "train": 1, "validation": 1, "all_splits": 3}, "words": {"test": 1030, "train": 957, "validation": 972, "all_splits": 2959}, "chars": {"test": 7223, "train": 6779, "validation": 6467, "all_splits": 20469}, "utts_unique": {"test": 116, "train": 111, "validation": 117, "all_splits": 340}, "words_unique": {"test": 750, "train": 710, "validation": 692, "all_splits": 1866}, "chars_unique": {"test": 41, "train": 46, "validation": 49, "all_splits": 52}, "average_utterance_length[words]": {"test": 8.88, "train": 8.62, "validation": 8.24, "all_splits": 8.58}, "average_utterance_length[chars]": {"test": 62.27, "train": 61.07, "validation": 54.81, "all_splits": 59.33}, "samples_per_spk_stats": {"test": {"average": 116.0, "std": 0.0, "median": 116.0, "min": 116, "max": 116}, "train": 
{"average": 111.0, "std": 0.0, "median": 111.0, "min": 111, "max": 111}, "validation": {"average": 118.0, "std": 0.0, "median": 118.0, "min": 118, "max": 118}, "all_splits": {"average": 115.0, "std": 2.94, "median": 116.0, "min": 111, "max": 118}}, "words_per_sec": {"test": 1.73, "train": 1.68, "validation": 1.73, "all_splits": 1.71}, "chars_per_sec": {"test": 10.43, "train": 10.21, "validation": 9.79, "all_splits": 10.15}, "average_audio_duration[s]": {"test": 5.12, "train": 5.14, "validation": 4.76, "all_splits": 5.0}, "meta_cov_sex": {"test": 100.0, "train": 100.0, "validation": 100.0, "all_splits": 100.0}, "meta_cov_age": {"test": "N/A", "train": "N/A", "validation": "N/A", "all_splits": "N/A"}, "meta_dist_sex": {"test": {"male": 1.0, "female": 0.0}, "train": {"male": 1.0, "female": 0.0}, "validation": {"male": 1.0, "female": 0.0}, "all_splits": {"male": 1.0, "female": 0.0}}, "meta_dist_age": {"test": "N/A", "train": "N/A", "validation": "N/A", "all_splits": "N/A"}}, "pwr-shortwords-unk": {"samples": {"test": 86, "train": 102, "validation": 82, "all_splits": 270}, "audio[h]": {"test": 0.13, "train": 0.16, "validation": 0.12, "all_splits": 0.41000000000000003}, "speakers": {"test": 1, "train": 1, "validation": 1, "all_splits": 3}, "words": {"test": 822, "train": 984, "validation": 755, "all_splits": 2561}, "chars": {"test": 5649, "train": 6727, "validation": 5224, "all_splits": 17600}, "utts_unique": {"test": 84, "train": 92, "validation": 78, "all_splits": 232}, "words_unique": {"test": 585, "train": 644, "validation": 543, "all_splits": 1458}, "chars_unique": {"test": 37, "train": 47, "validation": 41, "all_splits": 48}, "average_utterance_length[words]": {"test": 9.56, "train": 9.65, "validation": 9.21, "all_splits": 9.49}, "average_utterance_length[chars]": {"test": 65.69, "train": 65.95, "validation": 63.71, "all_splits": 65.19}, "samples_per_spk_stats": {"test": {"average": 86.0, "std": 0.0, "median": 86.0, "min": 86, "max": 86}, "train": {"average": 102.0, "std": 0.0, "median": 102.0, "min": 102, "max": 102}, "validation": {"average": 82.0, "std": 0.0, "median": 82.0, "min": 82, "max": 82}, "all_splits": {"average": 90.0, "std": 8.64, "median": 86.0, "min": 82, "max": 102}}, "words_per_sec": {"test": 1.76, "train": 1.74, "validation": 1.75, "all_splits": 1.75}, "chars_per_sec": {"test": 10.33, "train": 10.13, "validation": 10.34, "all_splits": 10.26}, "average_audio_duration[s]": {"test": 5.43, "train": 5.56, "validation": 5.27, "all_splits": 5.43}, "meta_cov_sex": {"test": 100.0, "train": 100.0, "validation": 100.0, "all_splits": 100.0}, "meta_cov_age": {"test": "N/A", "train": "N/A", "validation": "N/A", "all_splits": "N/A"}, "meta_dist_sex": {"test": {"male": 1.0, "female": 0.0}, "train": {"male": 1.0, "female": 0.0}, "validation": {"male": 1.0, "female": 0.0}, "all_splits": {"male": 1.0, "female": 0.0}}, "meta_dist_age": {"test": "N/A", "train": "N/A", "validation": "N/A", "all_splits": "N/A"}}, "pwr-viu-unk": {"samples": {"test": 247, "train": 382, "validation": 266, "all_splits": 895}, "audio[h]": {"test": 0.09, "train": 0.15, "validation": 0.1, "all_splits": 0.33999999999999997}, "speakers": {"test": 1, "train": 1, "validation": 1, "all_splits": 3}, "words": {"test": 438, "train": 691, "validation": 457, "all_splits": 1586}, "chars": {"test": 2785, "train": 4470, "validation": 3012, "all_splits": 10267}, "utts_unique": {"test": 13, "train": 13, "validation": 13, "all_splits": 13}, "words_unique": {"test": 18, "train": 18, "validation": 18, "all_splits": 18}, 
"chars_unique": {"test": 28, "train": 28, "validation": 28, "all_splits": 27}, "average_utterance_length[words]": {"test": 1.77, "train": 1.81, "validation": 1.72, "all_splits": 1.77}, "average_utterance_length[chars]": {"test": 11.28, "train": 11.7, "validation": 11.32, "all_splits": 11.47}, "samples_per_spk_stats": {"test": {"average": 247.0, "std": 0.0, "median": 247.0, "min": 247, "max": 247}, "train": {"average": 382.0, "std": 0.0, "median": 382.0, "min": 382, "max": 382}, "validation": {"average": 266.0, "std": 0.0, "median": 266.0, "min": 266, "max": 266}, "all_splits": {"average": 298.33, "std": 59.67, "median": 266.0, "min": 247, "max": 382}}, "words_per_sec": {"test": 1.29, "train": 1.27, "validation": 1.25, "all_splits": 1.27}, "chars_per_sec": {"test": 6.92, "train": 6.95, "validation": 7.01, "all_splits": 6.96}, "average_audio_duration[s]": {"test": 1.37, "train": 1.42, "validation": 1.37, "all_splits": 1.39}, "meta_cov_sex": {"test": 100.0, "train": 100.0, "validation": 100.0, "all_splits": 100.0}, "meta_cov_age": {"test": "N/A", "train": "N/A", "validation": "N/A", "all_splits": "N/A"}, "meta_dist_sex": {"test": {"male": 1.0, "female": 0.0}, "train": {"male": 1.0, "female": 0.0}, "validation": {"male": 1.0, "female": 0.0}, "all_splits": {"male": 1.0, "female": 0.0}}, "meta_dist_age": {"test": "N/A", "train": "N/A", "validation": "N/A", "all_splits": "N/A"}}, "google-fleurs-22": {"samples": {"test": 56, "train": 47, "validation": 62, "all_splits": 165}, "audio[h]": {"test": 0.17, "train": 0.16, "validation": 0.16, "all_splits": 0.49}, "speakers": {"test": 1, "train": 1, "validation": 1, "all_splits": 3}, "words": {"test": 1149, "train": 930, "validation": 1100, "all_splits": 3179}, "chars": {"test": 8120, "train": 6479, "validation": 7817, "all_splits": 22416}, "utts_unique": {"test": 53, "train": 47, "validation": 55, "all_splits": 155}, "words_unique": {"test": 769, "train": 685, "validation": 705, "all_splits": 1908}, "chars_unique": {"test": 49, "train": 47, "validation": 45, "all_splits": 51}, "average_utterance_length[words]": {"test": 20.52, "train": 19.79, "validation": 17.74, "all_splits": 19.27}, "average_utterance_length[chars]": {"test": 145.0, "train": 137.85, "validation": 126.08, "all_splits": 135.85}, "samples_per_spk_stats": {"test": {"average": 56.0, "std": 0.0, "median": 56.0, "min": 56, "max": 56}, "train": {"average": 47.0, "std": 0.0, "median": 47.0, "min": 47, "max": 47}, "validation": {"average": 62.0, "std": 0.0, "median": 62.0, "min": 62, "max": 62}, "all_splits": {"average": 55.0, "std": 6.16, "median": 56.0, "min": 47, "max": 62}}, "words_per_sec": {"test": 1.93, "train": 1.6, "validation": 1.9, "all_splits": 1.81}, "chars_per_sec": {"test": 11.74, "train": 9.54, "validation": 11.63, "all_splits": 10.97}, "average_audio_duration[s]": {"test": 10.61, "train": 12.38, "validation": 9.32, "all_splits": 10.63}, "meta_cov_sex": {"test": "N/A", "train": "N/A", "validation": "N/A", "all_splits": "N/A"}, "meta_cov_age": {"test": "N/A", "train": "N/A", "validation": "N/A", "all_splits": "N/A"}, "meta_dist_sex": {"test": "N/A", "train": "N/A", "validation": "N/A", "all_splits": "N/A"}, "meta_dist_age": {"test": "N/A", "train": "N/A", "validation": "N/A", "all_splits": "N/A"}}, "polyai-minds14-21": {"samples": {"test": 24, "train": 26, "validation": 34, "all_splits": 84}, "audio[h]": {"test": 0.15, "train": 0.15, "validation": 0.16, "all_splits": 0.45999999999999996}, "speakers": {"test": 3, "train": 3, "validation": 3, "all_splits": 9}, "words": {"test": 456, 
"train": 524, "validation": 605, "all_splits": 1585}, "chars": {"test": 2874, "train": 3315, "validation": 3849, "all_splits": 10038}, "utts_unique": {"test": 24, "train": 26, "validation": 34, "all_splits": 84}, "words_unique": {"test": 253, "train": 274, "validation": 294, "all_splits": 551}, "chars_unique": {"test": 44, "train": 47, "validation": 49, "all_splits": 52}, "average_utterance_length[words]": {"test": 19.0, "train": 20.15, "validation": 17.79, "all_splits": 18.87}, "average_utterance_length[chars]": {"test": 119.75, "train": 127.5, "validation": 113.21, "all_splits": 119.5}, "samples_per_spk_stats": {"test": {"average": 8.0, "std": 9.2, "median": 2.0, "min": 1, "max": 21}, "train": {"average": 8.67, "std": 7.32, "median": 4.0, "min": 3, "max": 19}, "validation": {"average": 11.33, "std": 10.37, "median": 4.0, "min": 4, "max": 26}, "all_splits": {"average": 9.33, "std": 10.37, "median": 4.0, "min": 4, "max": 26}}, "words_per_sec": {"test": 0.82, "train": 0.97, "validation": 1.02, "all_splits": 0.94}, "chars_per_sec": {"test": 4.35, "train": 5.19, "validation": 5.47, "all_splits": 5.01}, "average_audio_duration[s]": {"test": 23.19, "train": 20.7, "validation": 17.44, "all_splits": 20.1}, "meta_cov_sex": {"test": "N/A", "train": "N/A", "validation": "N/A", "all_splits": "N/A"}, "meta_cov_age": {"test": "N/A", "train": "N/A", "validation": "N/A", "all_splits": "N/A"}, "meta_dist_sex": {"test": "N/A", "train": "N/A", "validation": "N/A", "all_splits": "N/A"}, "meta_dist_age": {"test": "N/A", "train": "N/A", "validation": "N/A", "all_splits": "N/A"}}, "all": {"samples": {"test": 774, "train": 920, "validation": 820, "all_splits": 2514}, "audio[h]": {"test": 1.45, "train": 1.54, "validation": 1.45, "all_splits": 4.44}, "speakers": {"test": 83, "train": 130, "validation": 71, "all_splits": 284}, "words": {"test": 9521, "train": 9945, "validation": 10155, "all_splits": 29621}, "chars": {"test": 60887, "train": 62773, "validation": 64087, "all_splits": 187747}, "utts_unique": {"test": 531, "train": 540, "validation": 552, "all_splits": 1558}, "words_unique": {"test": 4319, "train": 4502, "validation": 4384, "all_splits": 10565}, "chars_unique": {"test": 80, "train": 75, "validation": 81, "all_splits": 88}, "average_utterance_length[words]": {"test": 12.3, "train": 10.81, "validation": 12.38, "all_splits": 11.78}, "average_utterance_length[chars]": {"test": 78.67, "train": 68.23, "validation": 78.15, "all_splits": 74.68}, "samples_per_spk_stats": {"test": {"average": 8.6, "std": 30.0, "median": 1.0, "min": 1, "max": 247}, "train": {"average": 6.26, "std": 33.63, "median": 1.0, "min": 1, "max": 382}, "validation": {"average": 9.21, "std": 32.11, "median": 1.0, "min": 1, "max": 266}, "all_splits": {"average": 7.71, "std": 32.56, "median": 1.0, "min": 1, "max": 382}}, "words_per_sec": {"test": 1.82, "train": 1.8, "validation": 1.94, "all_splits": 1.85}, "chars_per_sec": {"test": 9.83, "train": 9.55, "validation": 10.32, "all_splits": 9.89}, "average_audio_duration[s]": {"test": 6.75, "train": 6.01, "validation": 6.38, "all_splits": 6.36}, "meta_cov_sex": {"test": 78.42, "train": 82.17, "validation": 77.93, "all_splits": 79.63}, "meta_cov_age": {"test": "N/A", "train": "N/A", "validation": "N/A", "all_splits": "N/A"}, "meta_dist_sex": {"test": {"male": 0.78, "female": 0.15}, "train": {"male": 0.84, "female": 0.1}, "validation": {"male": 0.85, "female": 0.08}, "all_splits": {"male": 0.88, "female": 0.12}}, "meta_dist_age": {"test": "N/A", "train": "N/A", "validation": "N/A", "all_splits": 
"N/A"}}}
reports/amu-cai/pl-asr-bigos-v2/dataset_contents.json CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:43e808b081d9b692c2469396565fb967105fd815894a7eaded34e89969dbc890
+oid sha256:51bc49c43a89556c627ad2b57143bceedc0a5510de81f2ceffc471200cdc7ff2
 size 46668863
reports/amu-cai/pl-asr-bigos-v2/dataset_statistics.json CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:0acb30a9a215f9c96b567b8753f565f400eac2366df6dba6248ccba859e190e3
-size 23940
+oid sha256:3914471cf4e90fa7cef52beae12fe5a1162ae90a73b1e2e97cc38f9222ea8831
+size 26953
reports/pelcra/pl-asr-pelcra-for-bigos/dataset_contents.json CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:9cea38447dc7485c0f628eba6e52f45e24d1d467fbe23c065162d6b36455ab1d
+oid sha256:5ccb4a2a854270fcd53a55ce09443c05392c3f15413013724a7975f7db9019ca
 size 95274266
reports/pelcra/pl-asr-pelcra-for-bigos/dataset_statistics.json CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:0ab97523e5f4776bb566ed57c38126004bfac43f64bb3177e9ae39f1ee6e51d5
-size 30399
+oid sha256:18ea2da5c59735f8dcd2d78bffcbbda26e414abcdbf607ab4a66e15aff65acac
+size 33533
run-analysis.py CHANGED
@@ -2,7 +2,7 @@ import os
 import json
 from datasets import load_dataset, get_dataset_config_names, Features, Value
 from utils import num_of_samples_per_split, uniq_utts_per_split, words_per_split, uniq_words_per_split, chars_per_split, uniq_chars_per_split
-from utils import total_audio_duration_per_split, average_audio_duration_per_split, speakers_per_split, meta_cov_per_split
+from utils import total_audio_duration_per_split, average_audio_duration_per_split, average_utterance_length_chars_per_split, average_utterance_length_words_per_split, speakers_per_split, meta_cov_per_split
 #, uniq_utts_per_speaker
 from utils import meta_distribution_text, meta_distribution_violin_plot, recordings_per_speaker, speech_rate_words_per_split, speech_rate_chars_per_split
 import argparse
@@ -16,14 +16,17 @@ os.makedirs(output_dir_plots, exist_ok=True)
 # read from command line argument
 parser = argparse.ArgumentParser()
 parser.add_argument("--dataset", type=str, required=True, help="Name of the dataset to generate reports for")
-parser.add_argument("--secret_test_split", default=True, type=bool, help="Should references for test split be retrieved from the secret distribution?")
+parser.add_argument('--no_secret_test_split', action='store_true', help="Skip retrieving references for the test split from the secret distribution")
 
 args = parser.parse_args()
 
+
 dataset_name = args.dataset
 print("Generating reports for dataset: {}".format(dataset_name))
-if (args.secret_test_split):
+if not (args.no_secret_test_split):
+
     dataset_name_secret = str.join("-", [dataset_name, "secret"])
+
     # check if secret repo exists
     print(dataset_name_secret)
     try:
@@ -31,7 +34,6 @@ if (args.secret_test_split):
     except:
         print("Config for secret dataset {} cannot be retrieved!".format(dataset_name_secret))
 
-#dataset_name = "amu-cai/pl-asr-bigos-v2"
 output_dir_reports_dataset = os.path.join(output_dir_reports, dataset_name)
 os.makedirs(output_dir_reports_dataset, exist_ok=True)
 
@@ -55,8 +57,11 @@ for config_name in dataset_configs:
     dataset_contents[config_name] = {}
 
     dataset_hf_subset = load_dataset(dataset_name, config_name, features=features_to_load, trust_remote_code=True)
-    if(args.secret_test_split):
+
+    if not (args.no_secret_test_split):
         dataset_hf_subset_secret = load_dataset(dataset_name_secret, config_name, features=features_to_load, trust_remote_code=True)
+    else:
+        dataset_hf_subset_secret = None
 
     #audio content size
     dataset_statistics[config_name]["samples"] = num_of_samples_per_split(dataset_hf_subset)
@@ -73,6 +78,10 @@ for config_name in dataset_configs:
     dataset_statistics[config_name]["words_unique"], dataset_contents[config_name]["unique_words"] = uniq_words_per_split(dataset_hf_subset, dataset_hf_subset_secret)
     dataset_statistics[config_name]["chars_unique"], dataset_contents[config_name]["unique_chars"] = uniq_chars_per_split(dataset_hf_subset, dataset_hf_subset_secret)
 
+    dataset_statistics[config_name]["average_utterance_length[words]"] = average_utterance_length_words_per_split(dataset_hf_subset, dataset_hf_subset_secret)
+    dataset_statistics[config_name]["average_utterance_length[chars]"] = average_utterance_length_chars_per_split(dataset_hf_subset, dataset_hf_subset_secret)
+    dataset_statistics[config_name]["samples_per_spk_stats"], dataset_contents[config_name]["samples_per_spk"] = recordings_per_speaker(dataset_hf_subset)
+
     # audio content derived features
     dataset_statistics[config_name]["words_per_sec"] = speech_rate_words_per_split(dataset_hf_subset, dataset_hf_subset_secret)
     dataset_statistics[config_name]["chars_per_sec"] = speech_rate_chars_per_split(dataset_hf_subset, dataset_hf_subset_secret)
@@ -86,7 +95,6 @@ for config_name in dataset_configs:
     dataset_statistics[config_name]["meta_dist_sex"] = meta_distribution_text(dataset_hf_subset, 'speaker_sex')
     dataset_statistics[config_name]["meta_dist_age"] = meta_distribution_text(dataset_hf_subset, 'speaker_age')
 
-    dataset_statistics[config_name]["samples_per_spk"], dataset_contents[config_name]["samples_per_spk"] = recordings_per_speaker(dataset_hf_subset)
     # dataset_statistics[config_name] = uniq_utts_per_speaker(dataset_hf_subset)
     # number of words per speaker (min, max, med, avg, std)
 
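
The old --secret_test_split option used type=bool, which argparse applies to the raw string, so any non-empty value (including "False") parsed as True; this commit replaces it with an opt-out flag. A minimal sketch of the pattern (store_true keeps the previous default of reading test references from the secret repo, and passing the flag disables it; the dataset name is taken from the diff above):

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--dataset", type=str, required=True,
                        help="Name of the dataset to generate reports for")
    parser.add_argument("--no_secret_test_split", action="store_true",
                        help="Skip retrieving test split references from the secret distribution")

    # Parse an example command line instead of sys.argv, for illustration only.
    args = parser.parse_args(["--dataset", "amu-cai/pl-asr-bigos-v2"])

    if not args.no_secret_test_split:
        # Default path: derive the gated companion repo name.
        dataset_name_secret = "-".join([args.dataset, "secret"])
        print(dataset_name_secret)  # amu-cai/pl-asr-bigos-v2-secret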
 
utils.py CHANGED
@@ -77,6 +77,58 @@ def average_audio_duration_per_split(dataset_hf):
     out_dict["all_splits"] = round(audio_length_total_seconds / samples_all,2)
     return out_dict
 
+def average_utterance_length_chars_per_split(dataset_hf, dataset_hf_secret):
+    # input - huggingface dataset object (plus optional secret dataset with test split references)
+    # output - dictionary with average utterance length in characters per split
+    out_dict = {}
+    metric = "average_utterance_length[chars]"
+    print("Calculating {}".format(metric))
+    chars_all = 0
+    samples_all = 0
+    for split in dataset_hf.keys():
+        # use the secret references for the test split when available
+        if (split == "test" and dataset_hf_secret is not None):
+            utts_split = dataset_hf_secret[split]["ref_orig"]
+        else:
+            utts_split = dataset_hf[split]["ref_orig"]
+        words_split = " ".join(utts_split).split(" ")
+        chars_split = " ".join(words_split)
+        chars_split_count = len(chars_split)
+        chars_all += chars_split_count
+        samples_split = len(utts_split)
+        samples_all += samples_split
+        #print(split, chars_split_count)
+        out_dict[split] = round(chars_split_count/samples_split, 2)
+
+    # add average over all splits
+    out_dict["all_splits"] = round(chars_all/samples_all, 2)
+    return out_dict
+
+def average_utterance_length_words_per_split(dataset_hf, dataset_hf_secret):
+    # input - huggingface dataset object (plus optional secret dataset with test split references)
+    # output - dictionary with average utterance length in words per split
+    out_dict = {}
+    metric = "average_utterance_length[words]"
+    print("Calculating {}".format(metric))
+    words_all = 0
+    samples_all = 0
+    for split in dataset_hf.keys():
+        # use the secret references for the test split when available
+        if (split == "test" and dataset_hf_secret is not None):
+            utts_split = dataset_hf_secret[split]["ref_orig"]
+        else:
+            utts_split = dataset_hf[split]["ref_orig"]
+        words_split_count = len(" ".join(utts_split).split(" "))
+        words_all += words_split_count
+        samples_split = len(utts_split)
+        samples_all += samples_split
+        #print(split, words_split_count)
+        out_dict[split] = round(words_split_count/samples_split, 2)
+
+    # add average over all splits
+    out_dict["all_splits"] = round(words_all/samples_all, 2)
+    return out_dict
+
 def speakers_per_split(dataset_hf):
     # input - huggingface dataset object
     # output - dictionary with statistics about audio duration per split
@@ -106,7 +158,7 @@ def uniq_utts_per_split(dataset_hf, dataset_hf_secret):
     utts_all = []
     for split in dataset_hf.keys():
         # extract speakers from file_id
-        if (split == "test"):
+        if (split == "test" and dataset_hf_secret is not None):
             utts_split = dataset_hf_secret[split]["ref_orig"]
         else:
             utts_split = dataset_hf[split]["ref_orig"]
@@ -129,7 +181,7 @@ def words_per_split(dataset_hf, dataset_hf_secret):
 
     for split in dataset_hf.keys():
         # extract speakers from file_id
-        if (split == "test"):
+        if (split == "test" and dataset_hf_secret is not None):
             utts_all = dataset_hf_secret[split]["ref_orig"]
         else:
             utts_all = dataset_hf[split]["ref_orig"]
@@ -153,7 +205,7 @@ def uniq_words_per_split(dataset_hf, dataset_hf_secret):
 
     for split in dataset_hf.keys():
         # extract speakers from file_id
-        if (split == "test"):
+        if (split == "test" and dataset_hf_secret is not None):
             utts_all = dataset_hf_secret[split]["ref_orig"]
         else:
             utts_all = dataset_hf[split]["ref_orig"]
@@ -185,7 +237,7 @@ def chars_per_split(dataset_hf, dataset_hf_secret):
 
     for split in dataset_hf.keys():
         # extract speakers from file_id
-        if (split=="test"):
+        if (split=="test" and dataset_hf_secret is not None):
             utts_all = dataset_hf_secret[split]["ref_orig"]
         else:
             utts_all = dataset_hf[split]["ref_orig"]
@@ -210,7 +262,7 @@ def uniq_chars_per_split(dataset_hf, dataset_hf_secret):
 
     for split in dataset_hf.keys():
         # extract speakers from file_id
-        if(split == "test"):
+        if(split == "test" and dataset_hf_secret is not None):
             utts_all = dataset_hf_secret[split]["ref_orig"]
         else:
             utts_all = dataset_hf[split]["ref_orig"]
@@ -256,17 +308,13 @@ def meta_cov_per_split(dataset_hf, meta_field):
             out_dict[split] = "N/A"
             continue
         meta_info_not_null_all += meta_info_not_null_count
-        meta_info_coverage = round(meta_info_not_null_count / meta_info_count, 2)
-        #print(split, meta_info_coverage)
-
-        # add number of samples for all splits
+        meta_info_coverage = round(meta_info_not_null_count / meta_info_count * 100, 2)
         out_dict[split] = meta_info_coverage
 
-    # add number of samples for all splits
     if (meta_info_not_null_all == 0):
         out_dict["all_splits"] = "N/A"
     else:
-        out_dict["all_splits"] = round(meta_info_not_null_all/meta_info_all,2 )
+        out_dict["all_splits"] = round(meta_info_not_null_all/meta_info_all * 100, 2)
     return out_dict
 
 
@@ -282,7 +330,7 @@ def speech_rate_words_per_split(dataset_hf, dataset_hf_secret):
 
     for split in dataset_hf.keys():
         # extract speakers from file_id
-        if (split == "test"):
+        if (split == "test" and dataset_hf_secret is not None):
             utts_split = dataset_hf_secret[split]["ref_orig"]
         else:
             utts_split = dataset_hf[split]["ref_orig"]
@@ -292,9 +340,8 @@ def speech_rate_words_per_split(dataset_hf, dataset_hf_secret):
         audio_split_length_seconds = sum(dataset_hf[split]["audio_duration_seconds"])
         audio_total_length_seconds += audio_split_length_seconds
         speech_rate = round(words_split_count / audio_split_length_seconds, 2)
-        #print(split, speech_rate)
         out_dict[split] = speech_rate
-    # add number of samples for all splits
+
     out_dict["all_splits"] = round(words_all_count / audio_total_length_seconds, 2)
     return out_dict
 
@@ -310,7 +357,7 @@ def speech_rate_chars_per_split(dataset_hf, dataset_hf_secret):
 
     for split in dataset_hf.keys():
         # extract speakers from file_id
-        if (split == "test"):
+        if (split == "test" and dataset_hf_secret is not None):
             utts_split = dataset_hf_secret[split]["ref_orig"]
         else:
             utts_split = dataset_hf[split]["ref_orig"]
@@ -320,9 +367,8 @@ def speech_rate_chars_per_split(dataset_hf, dataset_hf_secret):
         audio_split_length_seconds = sum(dataset_hf[split]["audio_duration_seconds"])
         audio_total_length_seconds += audio_split_length_seconds
         speech_rate = round(chars_split_count / audio_split_length_seconds, 2)
-        #print(split, speech_rate)
         out_dict[split] = speech_rate
-    # add number of samples for all splits
+
     out_dict["all_splits"] = round(chars_all_count / audio_total_length_seconds, 2)
     return out_dict
 
@@ -362,7 +408,6 @@ def meta_distribution_text(dataset_hf, meta_field):
             out_dict[split][bucket] = round(values_count/len(meta_info_not_null),2)
         #print(split, out_dict[split])
 
-    # add number of samples for all splits
     if (no_meta):
         out_dict["all_splits"] = "N/A"
     return out_dict
@@ -428,6 +473,7 @@ def recordings_per_speaker(dataset_hf):
         recordings_total += recordings_split
 
         average_recordings_per_speaker = round( recordings_split / speakers_split,2)
+
         out_dict_stats[split]["average"] = average_recordings_per_speaker
         out_dict_stats[split]["std"] = round(np.std(list(recordings_per_speaker_stats_dict_split.values())),2)
         out_dict_stats[split]["median"] = np.median(list(recordings_per_speaker_stats_dict_split.values()))