Commit 25f0e74 by mj-new (parent: 5d90238)

Added average audio utterance duration calculation
app.py CHANGED
@@ -64,7 +64,7 @@ with analysis_bigos:
     st.dataframe(df_sum_stats_text)
 
 
-    metrics_features = ["utts_unique", "words_unique", "chars_unique", "words_per_sec", "chars_per_sec"]
+    metrics_features = ["utts_unique", "words_unique", "chars_unique", "words_per_sec", "chars_per_sec", "average_audio_duration[s]"]
 
     df_sum_stats_all_splits = extract_stats_all_splits(df_multindex_all_splits, metrics_features)
 
@@ -131,12 +131,12 @@ with analysis_bigos_pelcra:
     st.dataframe(df_sum_stats_text)
 
 
-    metrics_features = ["utts_unique", "words_unique", "chars_unique", "words_per_sec", "chars_per_sec"]
+    metrics_features = ["utts_unique", "words_unique", "chars_unique", "words_per_sec", "chars_per_sec", "average_audio_duration[s]"]
 
     df_sum_stats_all_splits = extract_stats_all_splits(df_multindex_all_splits, metrics_features)
 
     st.subheader("Dataset features (text)")
-    df_sum_stats_feats_text = df_sum_stats_all_splits[metrics_features[0:2]]
+    df_sum_stats_feats_text = df_sum_stats_all_splits[metrics_features[0:3]]
     st.dataframe(df_sum_stats_feats_text)
 
     st.subheader("Dataset features (audio)")
reports/amu-cai/pl-asr-bigos-v2/dataset_contents.json CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:f9d906a6794c9928df18c639c5b6095f304a64e3b1aa89a7a7ae62ec6e5ea398
+oid sha256:43e808b081d9b692c2469396565fb967105fd815894a7eaded34e89969dbc890
 size 46668863
reports/amu-cai/pl-asr-bigos-v2/dataset_statistics.json CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:8c6f9fbd2061fe698519347857bd047c5d756d9358d988d86f8684d7d533c390
-size 22627
+oid sha256:0acb30a9a215f9c96b567b8753f565f400eac2366df6dba6248ccba859e190e3
+size 23940
reports/pelcra/pl-asr-pelcra-for-bigos/dataset_contents.json CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:341199e4818ae3327eb100d85d0e2ed4d7a3bf81d0f70914d3c434ad763eb30c
+oid sha256:9cea38447dc7485c0f628eba6e52f45e24d1d467fbe23c065162d6b36455ab1d
 size 95274266
reports/pelcra/pl-asr-pelcra-for-bigos/dataset_statistics.json CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:65f591c54e5a1daedc640482f88fd65acd0b2fb193e01f4ed58fb81c54d70cea
-size 29098
+oid sha256:0ab97523e5f4776bb566ed57c38126004bfac43f64bb3177e9ae39f1ee6e51d5
+size 30399
run-analysis.py CHANGED
@@ -2,7 +2,7 @@ import os
 import json
 from datasets import load_dataset, get_dataset_config_names, Features, Value
 from utils import num_of_samples_per_split, uniq_utts_per_split, words_per_split, uniq_words_per_split, chars_per_split, uniq_chars_per_split
-from utils import audio_duration_per_split, speakers_per_split, meta_cov_per_split
+from utils import total_audio_duration_per_split, average_audio_duration_per_split, speakers_per_split, meta_cov_per_split
 #, uniq_utts_per_speaker
 from utils import meta_distribution_text, meta_distribution_violin_plot, recordings_per_speaker, speech_rate_words_per_split, speech_rate_chars_per_split
 import argparse
@@ -58,19 +58,26 @@ for config_name in dataset_configs:
     if(args.secret_test_split):
         dataset_hf_subset_secret = load_dataset(dataset_name_secret, config_name, features=features_to_load, trust_remote_code=True)
 
+    # audio content size
     dataset_statistics[config_name]["samples"] = num_of_samples_per_split(dataset_hf_subset)
-    dataset_statistics[config_name]["audio[h]"] = audio_duration_per_split(dataset_hf_subset)
+    dataset_statistics[config_name]["audio[h]"] = total_audio_duration_per_split(dataset_hf_subset)
     dataset_statistics[config_name]["speakers"] = speakers_per_split(dataset_hf_subset)
 
+    # text content size
     # metrics based on transcriptions (references) - requires reading secret repo for test split
-    dataset_statistics[config_name]["utts_unique"], dataset_contents[config_name]["unique_utts"] = uniq_utts_per_split(dataset_hf_subset, dataset_hf_subset_secret)
     dataset_statistics[config_name]["words"] = words_per_split(dataset_hf_subset, dataset_hf_subset_secret)
-    dataset_statistics[config_name]["words_unique"], dataset_contents[config_name]["unique_words"] = uniq_words_per_split(dataset_hf_subset, dataset_hf_subset_secret)
     dataset_statistics[config_name]["chars"] = chars_per_split(dataset_hf_subset, dataset_hf_subset_secret)
+
+    # text content derived features
+    dataset_statistics[config_name]["utts_unique"], dataset_contents[config_name]["unique_utts"] = uniq_utts_per_split(dataset_hf_subset, dataset_hf_subset_secret)
+    dataset_statistics[config_name]["words_unique"], dataset_contents[config_name]["unique_words"] = uniq_words_per_split(dataset_hf_subset, dataset_hf_subset_secret)
     dataset_statistics[config_name]["chars_unique"], dataset_contents[config_name]["unique_chars"] = uniq_chars_per_split(dataset_hf_subset, dataset_hf_subset_secret)
+
+    # audio content derived features
     dataset_statistics[config_name]["words_per_sec"] = speech_rate_words_per_split(dataset_hf_subset, dataset_hf_subset_secret)
     dataset_statistics[config_name]["chars_per_sec"] = speech_rate_chars_per_split(dataset_hf_subset, dataset_hf_subset_secret)
-
+    dataset_statistics[config_name]["average_audio_duration[s]"] = average_audio_duration_per_split(dataset_hf_subset)
+
     # metadata coverage per subset in percent - speaker accent
     dataset_statistics[config_name]["meta_cov_sex"] = meta_cov_per_split(dataset_hf_subset, 'speaker_sex')
     dataset_statistics[config_name]["meta_cov_age"] = meta_cov_per_split(dataset_hf_subset, 'speaker_age')
utils.py CHANGED
@@ -32,7 +32,7 @@ def num_of_samples_per_split(dataset_hf):
 
     return out_dict
 
-def audio_duration_per_split(dataset_hf):
+def total_audio_duration_per_split(dataset_hf):
     # input - huggingface dataset object
     # output - dictionary with statistics about audio duration per split
     out_dict = {}
@@ -52,6 +52,29 @@ def audio_duration_per_split(dataset_hf):
     out_dict["all_splits"] = sum(out_dict.values())
     return out_dict
 
+
+def average_audio_duration_per_split(dataset_hf):
+    # input - huggingface dataset object
+    # output - dictionary with the average audio duration in seconds per split
+    out_dict = {}
+    metric = "average_audio_duration[s]"
+    print("Calculating {}".format(metric))
+    samples_all = 0
+    audio_length_total_seconds = 0
+    for split in dataset_hf.keys():
+        # total audio duration and sample count for this split
+        audio_length_split_seconds = sum(dataset_hf[split]["audio_duration_seconds"])
+        audio_length_total_seconds += audio_length_split_seconds
+
+        samples_split = len(dataset_hf[split]["audio_duration_seconds"])
+        samples_all += samples_split
+        audio_average_length_seconds = round(audio_length_split_seconds / samples_split, 2)
+        out_dict[split] = audio_average_length_seconds
+
+    # average over all splits combined
+    out_dict["all_splits"] = round(audio_length_total_seconds / samples_all, 2)
+    return out_dict
+
 def speakers_per_split(dataset_hf):
     # input - huggingface dataset object
     # output - dictionary with statistics about speakers per split
@@ -350,7 +373,6 @@ def meta_distribution_text(dataset_hf, meta_field):
     return out_dict
 
 
-
 def recordings_per_speaker(dataset_hf):
     recordings_per_speaker_stats_dict = {}
 
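A quick way to sanity-check the new helper (a minimal sketch, assuming the datasets library is installed and that each split carries the "audio_duration_seconds" column the function reads):

from datasets import Dataset, DatasetDict
from utils import average_audio_duration_per_split

# toy DatasetDict with per-sample durations in seconds
toy = DatasetDict({
    "train": Dataset.from_dict({"audio_duration_seconds": [4.0, 6.0]}),
    "test":  Dataset.from_dict({"audio_duration_seconds": [2.0]}),
})

print(average_audio_duration_per_split(toy))
# expected: {'train': 5.0, 'test': 2.0, 'all_splits': 4.0}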