mj-new committed on
Commit
e283f70
1 Parent(s): d5cbb7a

Alpha version with tabs

Browse files
.python-version ADDED
@@ -0,0 +1 @@
 
 
1
+ streamlit
__pycache__/app_utils.cpython-310.pyc CHANGED
Binary files a/__pycache__/app_utils.cpython-310.pyc and b/__pycache__/app_utils.cpython-310.pyc differ
 
__pycache__/contants.cpython-310.pyc CHANGED
Binary files a/__pycache__/contants.cpython-310.pyc and b/__pycache__/contants.cpython-310.pyc differ
 
__pycache__/utils.cpython-310.pyc CHANGED
Binary files a/__pycache__/utils.cpython-310.pyc and b/__pycache__/utils.cpython-310.pyc differ
 
app.py CHANGED
@@ -2,100 +2,133 @@ import pandas as pd
2
  import streamlit as st
3
 
4
  from app_utils import filter_dataframe, calculate_height_to_display
5
- from contants import WELCOME_TEXT, CITATION_TEXT
6
  from utils import BASE_SUMMARY_METRICS
7
- from utils import load_catalog, load_taxonomy
8
  from utils import datasets_count_and_size, datasets_count_and_size_standard, metadata_coverage, catalog_summary_statistics
9
 
10
  import matplotlib.pyplot as plt
11
  import seaborn as sns
12
 
13
- st.set_page_config(layout="wide")
14
-
15
- st.title("Polish Speech Datasets Catalog and Survey analysis")
16
 
17
- st.write(WELCOME_TEXT)
18
 
19
- st.write(CITATION_TEXT)
20
 
 
21
  # Cache the dataframe so it's only loaded once
22
- df_cat = load_catalog()
23
- df_tax = load_taxonomy()
24
 
25
  # Filter out non available datasets
26
- df_cat_available = df_cat[df_cat['Available online'] == 'yes']
27
  # Available and free
28
- df_cat_available_free = df_cat[(df_cat['Available online'] == 'yes') & (df_cat['Price - non-commercial usage'] == 'free')]
29
 
30
  # Available and paid
31
- df_cat_available_paid = df_cat[(df_cat['Available online'] == 'yes') & (df_cat['Price - non-commercial usage'] != 'free')]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
 
33
- # Display catalog contents
34
- st.dataframe(filter_dataframe(df_cat), hide_index=True, use_container_width=True)
 
35
 
36
- # Display taxonomy contents
37
 
38
- # Display summary statistics
39
- st.header("Polish ASR speech datasets summary statistics")
40
- df_summary_metrics = catalog_summary_statistics(df_cat)
 
41
 
42
- df_basic_stats = df_summary_metrics.loc[BASE_SUMMARY_METRICS[0:5]]
43
- st.dataframe(df_basic_stats, use_container_width=False)
44
 
45
- st.header("Speech data available across Polish ASR speech datasets")
46
- df_stats_audio_available = df_summary_metrics.loc[BASE_SUMMARY_METRICS[5:10]]
47
- st.dataframe(df_stats_audio_available, use_container_width=False)
 
48
 
49
- st.header("Transcribed data available across Polish ASR speech datasets")
50
- df_stats_transcribed_available = df_summary_metrics.loc[BASE_SUMMARY_METRICS[10:15]]
51
- st.dataframe(df_stats_transcribed_available, use_container_width=False)
 
52
 
 
 
 
 
53
 
54
- # Display distribution of datasets created per year
55
- st.header("Polish ASR speech datasets created in 1997-2023")
56
- col_groupby = ['Creation year']
57
- df_datasets_per_speech_type = datasets_count_and_size(df_cat, col_groupby, col_sort=col_groupby, col_percent=None, col_sum=['Size audio transcribed [hours]','Audio recordings', 'Speakers'], col_count = ['Dataset ID'])
58
 
59
- st.dataframe(df_datasets_per_speech_type, use_container_width=False)
 
 
60
 
61
- st.header("Institutions contributing Polish ASR speech dataset")
62
- col_groupby = ['Publisher']
63
- df_datasets_per_publisher = datasets_count_and_size(df_cat, col_groupby, col_sort='Count Dataset ID', col_percent=None, col_sum=['Size audio transcribed [hours]','Audio recordings', 'Speakers'], col_count = ['Dataset ID'])
64
- st.dataframe(df_datasets_per_publisher, use_container_width=False)
 
65
 
66
- st.header("Repositories hosting Polish ASR speech datasets")
67
- col_groupby = ['Repository']
68
- df_datasets_per_repo = datasets_count_and_size(df_cat, col_groupby, col_sort='Count Dataset ID', col_percent=None, col_sum=['Size audio transcribed [hours]','Audio recordings', 'Speakers'], col_count = ['Dataset ID'])
69
- st.dataframe(df_datasets_per_repo, use_container_width=False)
 
70
 
71
- st.header("Public domain Polish ASR speech datasets")
72
- col_groupby = ['License', "Dataset ID"]
73
- df_datasets_public = datasets_count_and_size(df_cat_available_free, col_groupby, col_sort='License', col_percent=None, col_sum=['Size audio transcribed [hours]','Audio recordings', 'Speakers'], col_count = [])
74
- st.dataframe(df_datasets_public, use_container_width=False)
 
75
 
76
- st.header("Commercialy available Polish ASR speech datasets")
77
- col_groupby = ['License', "Dataset ID"]
78
- df_datasets_paid = datasets_count_and_size(df_cat_available_paid, col_groupby, col_sort='License', col_percent=None, col_sum=['Size audio transcribed [hours]','Audio recordings', 'Speakers'], col_count = [])
79
- st.dataframe(df_datasets_paid, use_container_width=False)
80
 
81
- st.header("Coverage of metadata across Polish ASR speech datasets")
82
- df_meta_all_flat, df_meta_all_pivot = metadata_coverage(df_cat, df_cat_available_free, df_cat_available_paid)
83
- st.dataframe(df_meta_all_pivot, use_container_width=False)
84
 
85
- # Display distribution of datasets for various speech types
86
- st.header("Datasets per speech type")
87
- col_groupby = ['Speech type']
88
- df_datasets_per_speech_type = datasets_count_and_size(df_cat, col_groupby, col_sort=col_groupby, col_percent = ['Size audio transcribed [hours]'], col_sum = ['Size audio transcribed [hours]','Audio recordings', 'Speakers'], col_count = ['Dataset ID'])
89
- st.dataframe(df_datasets_per_speech_type, use_container_width=False)
90
 
91
- # Display distribution of datasets for various speech types
92
- st.header("Distribution of available speech data per audio device - Public domain datasets")
93
- col_groupby = ['Audio device']
94
- df_datasets_per_device = datasets_count_and_size(df_cat_available_free, col_groupby, col_sort=col_groupby, col_percent = ['Size audio transcribed [hours]'], col_sum = ['Size audio transcribed [hours]','Audio recordings', 'Speakers'], col_count = ['Dataset ID'])
95
- st.dataframe(df_datasets_per_device, use_container_width=False)
96
 
97
- # Display distribution of datasets for various speech types
98
- st.header("Distribution of available speech data per audio device - Commercial datasets")
99
- col_groupby = ['Audio device']
100
- df_datasets_per_device = datasets_count_and_size(df_cat_available_paid, col_groupby, col_sort=col_groupby, col_percent = ['Size audio transcribed [hours]'], col_sum = ['Size audio transcribed [hours]','Audio recordings', 'Speakers'], col_count = ['Dataset ID'])
101
- st.dataframe(df_datasets_per_device, use_container_width=False)
 
2
  import streamlit as st
3
 
4
  from app_utils import filter_dataframe, calculate_height_to_display
5
+ from contants import INFO_CATALOG, INFO_BENCHMARK, INFO_SURVEY, CITATION_CATALOG, CITATION_BENCHMARK, CITATION_SURVEY
6
  from utils import BASE_SUMMARY_METRICS
7
+ from utils import load_data_catalog, load_data_taxonomy, load_bench_catalog, load_bench_taxonomy
8
  from utils import datasets_count_and_size, datasets_count_and_size_standard, metadata_coverage, catalog_summary_statistics
9
 
10
  import matplotlib.pyplot as plt
11
  import seaborn as sns
12
 
 
 
 
13
 
14
+ st.set_page_config(layout="wide")
15
 
 
16
 
17
+ # Load PL ASR data survey data
18
  # Cache the dataframe so it's only loaded once
19
+ df_data_cat = load_data_catalog()
20
+ df_data_tax = load_data_taxonomy()
21
 
22
  # Filter out non available datasets
23
+ df_data_cat_available = df_data_cat[df_data_cat['Available online'] == 'yes']
24
  # Available and free
25
+ df_data_cat_available_free = df_data_cat[(df_data_cat['Available online'] == 'yes') & (df_data_cat['Price - non-commercial usage'] == 'free')]
26
 
27
  # Available and paid
28
+ df_data_cat_available_paid = df_data_cat[(df_data_cat['Available online'] == 'yes') & (df_data_cat['Price - non-commercial usage'] != 'free')]
29
+
30
+
31
+ # Load PL ASR benchmarks survey data
32
+ df_bench_cat = load_bench_catalog()
33
+ df_bench_tax = load_bench_taxonomy()
34
+
35
+ data_cat, data_survey, bench_cat, bench_survey = st.tabs(["PL ASR speech datasets catalog", "PL ASR speech data survey", "PL ASR benchmarks catalog", "PL ASR benchmarks survey"])
36
+
37
+
38
+ with data_cat:
39
+ st.title("Polish ASR Speech Datasets Catalog")
40
+
41
+ st.markdown(INFO_CATALOG, unsafe_allow_html=True)
42
+
43
+ st.header("How to use?")
44
+ # sent = st.text_area("Text", WELCOME_TEXT, height = 275)
45
+
46
+ st.header("How to cite?")
47
+ st.code(CITATION_CATALOG)
48
+
49
+ # Display catalog contents
50
+ st.dataframe(filter_dataframe(df_data_cat, "datasets"), hide_index=True, use_container_width=True)
51
+
52
+ # Display taxonomy contents
53
+
54
+ with data_survey:
55
+ # Display summary statistics
56
+ st.title("Polish ASR Speech Datasets Survey")
57
+
58
+ st.header("Polish ASR speech datasets summary statistics")
59
+ df_summary_metrics = catalog_summary_statistics(df_data_cat)
60
+
61
+ df_basic_stats = df_summary_metrics.loc[BASE_SUMMARY_METRICS[0:5]]
62
+ st.dataframe(df_basic_stats, use_container_width=False)
63
+
64
+ st.header("Speech data available across Polish ASR speech datasets")
65
+ df_stats_audio_available = df_summary_metrics.loc[BASE_SUMMARY_METRICS[5:10]]
66
+ st.dataframe(df_stats_audio_available, use_container_width=False)
67
 
68
+ st.header("Transcribed data available across Polish ASR speech datasets")
69
+ df_stats_transcribed_available = df_summary_metrics.loc[BASE_SUMMARY_METRICS[10:15]]
70
+ st.dataframe(df_stats_transcribed_available, use_container_width=False)
71
 
 
72
 
73
+ # Display distribution of datasets created per year
74
+ st.header("Polish ASR speech datasets created in 1997-2023")
75
+ col_groupby = ['Creation year']
76
+ df_datasets_per_speech_type = datasets_count_and_size(df_data_cat, col_groupby, col_sort=col_groupby, col_percent=None, col_sum=['Size audio transcribed [hours]','Audio recordings', 'Speakers'], col_count = ['Dataset ID'])
77
 
78
+ st.dataframe(df_datasets_per_speech_type, use_container_width=False)
 
79
 
80
+ st.header("Institutions contributing Polish ASR speech dataset")
81
+ col_groupby = ['Publisher']
82
+ df_datasets_per_publisher = datasets_count_and_size(df_data_cat, col_groupby, col_sort='Count Dataset ID', col_percent=None, col_sum=['Size audio transcribed [hours]','Audio recordings', 'Speakers'], col_count = ['Dataset ID'])
83
+ st.dataframe(df_datasets_per_publisher, use_container_width=False)
84
 
85
+ st.header("Repositories hosting Polish ASR speech datasets")
86
+ col_groupby = ['Repository']
87
+ df_datasets_per_repo = datasets_count_and_size(df_data_cat, col_groupby, col_sort='Count Dataset ID', col_percent=None, col_sum=['Size audio transcribed [hours]','Audio recordings', 'Speakers'], col_count = ['Dataset ID'])
88
+ st.dataframe(df_datasets_per_repo, use_container_width=False)
89
 
90
+ st.header("Public domain Polish ASR speech datasets")
91
+ col_groupby = ['License', "Dataset ID"]
92
+ df_datasets_public = datasets_count_and_size(df_data_cat_available_free, col_groupby, col_sort='License', col_percent=None, col_sum=['Size audio transcribed [hours]','Audio recordings', 'Speakers'], col_count = [])
93
+ st.dataframe(df_datasets_public, use_container_width=False)
94
 
95
+ st.header("Commercialy available Polish ASR speech datasets")
96
+ col_groupby = ['License', "Dataset ID"]
97
+ df_datasets_paid = datasets_count_and_size(df_data_cat_available_paid, col_groupby, col_sort='License', col_percent=None, col_sum=['Size audio transcribed [hours]','Audio recordings', 'Speakers'], col_count = [])
98
+ st.dataframe(df_datasets_paid, use_container_width=False)
99
 
100
+ st.header("Coverage of metadata across Polish ASR speech datasets")
101
+ df_meta_all_flat, df_meta_all_pivot = metadata_coverage(df_data_cat, df_data_cat_available_free, df_data_cat_available_paid)
102
+ st.dataframe(df_meta_all_pivot, use_container_width=False)
103
 
104
+ # Display distribution of datasets for various speech types
105
+ st.header("Datasets per speech type")
106
+ col_groupby = ['Speech type']
107
+ df_datasets_per_speech_type = datasets_count_and_size(df_data_cat, col_groupby, col_sort=col_groupby, col_percent = ['Size audio transcribed [hours]'], col_sum = ['Size audio transcribed [hours]','Audio recordings', 'Speakers'], col_count = ['Dataset ID'])
108
+ st.dataframe(df_datasets_per_speech_type, use_container_width=False)
109
 
110
+ # Display distribution of public domain speech data per audio device
111
+ st.header("Distribution of available speech data per audio device - Public domain datasets")
112
+ col_groupby = ['Audio device']
113
+ df_datasets_per_device = datasets_count_and_size(df_data_cat_available_free, col_groupby, col_sort=col_groupby, col_percent = ['Size audio transcribed [hours]'], col_sum = ['Size audio transcribed [hours]','Audio recordings', 'Speakers'], col_count = ['Dataset ID'])
114
+ st.dataframe(df_datasets_per_device, use_container_width=False)
115
 
116
+ # Display distribution of commercial speech data per audio device
117
+ st.header("Distribution of available speech data per audio device - Commercial datasets")
118
+ col_groupby = ['Audio device']
119
+ df_datasets_per_device = datasets_count_and_size(df_data_cat_available_paid, col_groupby, col_sort=col_groupby, col_percent = ['Size audio transcribed [hours]'], col_sum = ['Size audio transcribed [hours]','Audio recordings', 'Speakers'], col_count = ['Dataset ID'])
120
+ st.dataframe(df_datasets_per_device, use_container_width=False)
121
 
122
+ with bench_cat:
123
+ st.write("Benchmarks catalog")
124
+ # TODO - load and display benchmarks catalog
125
+ st.title("Polish ASR Benchmarks Catalog")
126
 
127
+ st.write(INFO_BENCHMARK)
 
 
128
 
129
+ st.write(CITATION_BENCHMARK)
 
 
 
 
130
 
131
+ # Display catalog contents
132
+ st.dataframe(filter_dataframe(df_bench_cat, "benchmarks"), hide_index=True, use_container_width=True)
 
 
 
133
 
134
+ # Display taxonomy contents
 
 
 
 
app_utils.py CHANGED
@@ -18,7 +18,7 @@ def calculate_height_to_display(df):
18
 
19
  return calculated_height
20
 
21
- def filter_dataframe(df: pd.DataFrame) -> pd.DataFrame:
22
  """
23
  Adds a UI on top of a dataframe to let viewers filter columns
24
 
@@ -28,7 +28,12 @@ def filter_dataframe(df: pd.DataFrame) -> pd.DataFrame:
28
  Returns:
29
  pd.DataFrame: Filtered dataframe
30
  """
31
- modify = st.checkbox("Use filters on speech data catalog")
 
 
 
 
 
32
 
33
  if not modify:
34
  return df
 
18
 
19
  return calculated_height
20
 
21
+ def filter_dataframe(df: pd.DataFrame, target) -> pd.DataFrame:
22
  """
23
  Adds a UI on top of a dataframe to let viewers filter columns
24
 
 
28
  Returns:
29
  pd.DataFrame: Filtered dataframe
30
  """
31
+ if(target == "datasets"):
32
+ modify = st.checkbox("Use filters on speech data catalog")
33
+ elif(target == "benchmarks"):
34
+ modify = st.checkbox("Use filters on benchmarks catalog")
35
+ else:
36
+ print("Invalid target")
37
 
38
  if not modify:
39
  return df
contants.py CHANGED
@@ -1,5 +1,17 @@
1
- WELCOME_TEXT = "This dashboard complements [Polish Speech Datasets Catalog](https://github.com/goodmike31/pl-asr-speech-data-survey) with:\n \
2
- a. Dynamic filtering of catalog content\n \
3
- b. Summary statistics about Polish ASR speech datasets\n"
 
 
 
 
4
 
5
- CITATION_TEXT="Please cite this work as: TODO\n"
 
 
 
 
 
 
 
 
 
1
+ INFO_CATALOG = "This dashboard complements *Polish ASR Speech Datasets Catalog* available on [GitHub](https://github.com/goodmike31/pl-asr-speech-data-survey) and [Google Sheets](https://docs.google.com/spreadsheets/d/181EDfwZNtHgHFOMaKNtgKssrYDX4tXTJ9POMzBsCRlI/edit#gid=0) by providing:<br> \
2
+ * More convenient browsing of the catalog content (see the *How to use?* section below) <br>\
3
+ * Analysis of datasets utility for the purpose of ASR evaluation (see the *Dataset Utility Index* tab) <br>\
4
+ * Analysis of the state of Polish ASR speech data (see the *Polish ASR Speech Data Survey* tab) <br> \
5
+ IMPORTANT - Please share your feedback [HERE](https://forms.gle/EWJ6YfbJJTyEzQs66). <br>\
6
+ Your feedback will help to assess the state of Polish ASR speech data from the community perspective.<br>\
7
+ Each response is granted 50 PLN for the charity of choice."
8
 
9
+ INFO_BENCHMARK = "TODO"
10
+
11
+ INFO_SURVEY = "This dashboard complements [Polish Speech Datasets Survey]"
12
+
13
+ CITATION_CATALOG="Please cite this work as: TODO"
14
+
15
+ CITATION_BENCHMARK="Please cite this work as: TODO"
16
+
17
+ CITATION_SURVEY="Please cite this work as: TODO"
utils.py CHANGED
@@ -43,19 +43,35 @@ def download_tsv_from_google_sheet(sheet_url):
43
  return None
44
 
45
  @st.cache_data
46
- def load_catalog():
47
  print("Reading speech data catalog")
48
  catalog_url="https://docs.google.com/spreadsheets/d/181EDfwZNtHgHFOMaKNtgKssrYDX4tXTJ9POMzBsCRlI/edit#gid=0"
49
  df_catalog = download_tsv_from_google_sheet(catalog_url)
50
  return(df_catalog)
51
 
52
  @st.cache_data
53
- def load_taxonomy():
54
  print("Reading speech data survey taxonomy")
55
  taxonomy_url="https://docs.google.com/spreadsheets/d/181EDfwZNtHgHFOMaKNtgKssrYDX4tXTJ9POMzBsCRlI/edit#gid=2015613057"
56
  df_taxonomy = download_tsv_from_google_sheet(taxonomy_url)
57
  return(df_taxonomy)
58
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
59
  def datasets_count_and_size(df_cat, col_groupby, col_sort=None, col_percent=None, col_sum=['Size audio transcribed [hours]'], col_count=['Dataset ID']):
60
  """
61
  Function to generate a summary view of datasets by speech type and other relevant metrics.
 
43
  return None
44
 
45
  @st.cache_data
46
+ def load_data_catalog():
47
  print("Reading speech data catalog")
48
  catalog_url="https://docs.google.com/spreadsheets/d/181EDfwZNtHgHFOMaKNtgKssrYDX4tXTJ9POMzBsCRlI/edit#gid=0"
49
  df_catalog = download_tsv_from_google_sheet(catalog_url)
50
  return(df_catalog)
51
 
52
  @st.cache_data
53
+ def load_data_taxonomy():
54
  print("Reading speech data survey taxonomy")
55
  taxonomy_url="https://docs.google.com/spreadsheets/d/181EDfwZNtHgHFOMaKNtgKssrYDX4tXTJ9POMzBsCRlI/edit#gid=2015613057"
56
  df_taxonomy = download_tsv_from_google_sheet(taxonomy_url)
57
  return(df_taxonomy)
58
 
59
+
60
+ @st.cache_data
61
+ def load_bench_catalog():
62
+ print("Reading ASR benchmarks catalog")
63
+ catalog_url="https://docs.google.com/spreadsheets/d/1fVsE98Ulmt-EIEe4wx8sUdo7RLigDdAVjQxNpAJIrH8/edit#gid=0"
64
+ df_catalog = download_tsv_from_google_sheet(catalog_url)
65
+ return(df_catalog)
66
+
67
+ @st.cache_data
68
+ def load_bench_taxonomy():
69
+ print("Reading ASR benchmarks survey taxonomy")
70
+ taxonomy_url="https://docs.google.com/spreadsheets/d/181EDfwZNtHgHFOMaKNtgKssrYDX4tXTJ9POMzBsCRlI/edit#gid=2015613057"
71
+ df_taxonomy = download_tsv_from_google_sheet(taxonomy_url)
72
+ return(df_taxonomy)
73
+
74
+
75
  def datasets_count_and_size(df_cat, col_groupby, col_sort=None, col_percent=None, col_sum=['Size audio transcribed [hours]'], col_count=['Dataset ID']):
76
  """
77
  Function to generate a summary view of datasets by speech type and other relevant metrics.