mj-new committed on
Commit
ad8c37c
1 Parent(s): 4eee292

Fixed filtering of freely and commercially available datasets

Browse files
__pycache__/contants.cpython-310.pyc CHANGED
Binary files a/__pycache__/contants.cpython-310.pyc and b/__pycache__/contants.cpython-310.pyc differ
 
__pycache__/utils.cpython-310.pyc CHANGED
Binary files a/__pycache__/utils.cpython-310.pyc and b/__pycache__/utils.cpython-310.pyc differ
 
app.py CHANGED
@@ -1,5 +1,7 @@
1
  import pandas as pd
2
  import streamlit as st
 
 
3
 
4
  from app_utils import filter_dataframe, calculate_height_to_display
5
  from contants import INFO_CATALOG, CITATION_CATALOG, HOWTO_CATALOG,INFO_BENCHMARK, CITATION_BENCHMARK, HOWTO_BENCHMARK, INFO_MAIN, CITATION_MAIN, HOWTO_TAXONOMY_CAT
@@ -8,10 +10,6 @@ from utils import load_data_catalog, load_data_taxonomy, load_bench_catalog, lo
8
  from utils import datasets_count_and_size, datasets_count_and_size_standard, metadata_coverage, catalog_summary_statistics
9
  from utils import left_align, right_align
10
 
11
- import matplotlib.pyplot as plt
12
- import seaborn as sns
13
-
14
-
15
  st.set_page_config(layout="wide")
16
 
17
 
@@ -23,10 +21,10 @@ df_data_tax = load_data_taxonomy()
23
  # Filter out non available datasets
24
  df_data_cat_available = df_data_cat[df_data_cat['Available online'] == 'yes']
25
  # Available and free
26
- df_data_cat_available_free = df_data_cat[(df_data_cat['Available online'] == 'yes') & (df_data_cat['Price - non-commercial usage'] == 'free')]
27
 
28
  # Available and paid
29
- df_data_cat_available_paid = df_data_cat[(df_data_cat['Available online'] == 'yes') & (df_data_cat['Price - non-commercial usage'] != 'free')]
30
 
31
 
32
  # Load PL ASR benchmarks survey data
@@ -86,11 +84,17 @@ with data_survey:
86
 
87
  st.dataframe(df_datasets_per_year, use_container_width=False)
88
 
89
- st.header("Institutions contributing Polish ASR speech dataset")
90
  col_groupby = ['Publisher']
91
  df_datasets_per_publisher = datasets_count_and_size(df_data_cat, col_groupby, col_sort='Count Dataset ID', col_percent=None, col_sum=['Size audio transcribed [hours]','Audio recordings', 'Speakers'], col_count = ['Dataset ID'])
92
  st.dataframe(df_datasets_per_publisher, use_container_width=False)
93
 
 
 
 
 
 
 
94
  st.header("Repositories hosting Polish ASR speech datasets")
95
  col_groupby = ['Repository']
96
  df_datasets_per_repo = datasets_count_and_size(df_data_cat, col_groupby, col_sort='Count Dataset ID', col_percent=None, col_sum=['Size audio transcribed [hours]','Audio recordings', 'Speakers'], col_count = ['Dataset ID'])
@@ -114,19 +118,43 @@ with data_survey:
114
  st.header("Datasets per speech type")
115
  col_groupby = ['Speech type']
116
  df_datasets_per_speech_type = datasets_count_and_size(df_data_cat, col_groupby, col_sort=col_groupby, col_percent = ['Size audio transcribed [hours]'], col_sum = ['Size audio transcribed [hours]','Audio recordings', 'Speakers'], col_count = ['Dataset ID'])
 
 
117
  st.dataframe(df_datasets_per_speech_type, use_container_width=False)
118
 
 
 
 
 
 
 
 
 
 
119
  # Display distribution of datasets for various speech types
120
  st.header("Distribution of available speech data per audio device - Public domain datasets")
121
  col_groupby = ['Audio device']
122
- df_datasets_per_device = datasets_count_and_size(df_data_cat_available_free, col_groupby, col_sort=col_groupby, col_percent = ['Size audio transcribed [hours]'], col_sum = ['Size audio transcribed [hours]','Audio recordings', 'Speakers'], col_count = ['Dataset ID'])
123
- st.dataframe(df_datasets_per_device, use_container_width=False)
 
 
124
 
125
  # Display distribution of datasets for various speech types
126
  st.header("Distribution of available speech data per audio device - Commercial datasets")
127
  col_groupby = ['Audio device']
128
- df_datasets_per_device = datasets_count_and_size(df_data_cat_available_paid, col_groupby, col_sort=col_groupby, col_percent = ['Size audio transcribed [hours]'], col_sum = ['Size audio transcribed [hours]','Audio recordings', 'Speakers'], col_count = ['Dataset ID'])
129
- st.dataframe(df_datasets_per_device, use_container_width=False)
 
 
 
 
 
 
 
 
 
 
 
130
 
131
  with data_taxonomy:
132
  st.title("Polish ASR Speech Data Taxonomy")
 
1
  import pandas as pd
2
  import streamlit as st
3
+ import matplotlib.pyplot as plt
4
+ import seaborn as sns
5
 
6
  from app_utils import filter_dataframe, calculate_height_to_display
7
  from contants import INFO_CATALOG, CITATION_CATALOG, HOWTO_CATALOG,INFO_BENCHMARK, CITATION_BENCHMARK, HOWTO_BENCHMARK, INFO_MAIN, CITATION_MAIN, HOWTO_TAXONOMY_CAT
 
10
  from utils import datasets_count_and_size, datasets_count_and_size_standard, metadata_coverage, catalog_summary_statistics
11
  from utils import left_align, right_align
12
 
 
 
 
 
13
  st.set_page_config(layout="wide")
14
 
15
 
 
21
  # Filter out non available datasets
22
  df_data_cat_available = df_data_cat[df_data_cat['Available online'] == 'yes']
23
  # Available and free
24
+ df_data_cat_available_free = df_data_cat[(df_data_cat['Available online'] == 'yes') & (df_data_cat['Price - non-commercial usage'] == '0')]
25
 
26
  # Available and paid
27
+ df_data_cat_available_paid = df_data_cat[(df_data_cat['Available online'] == 'yes') & (df_data_cat['Price - non-commercial usage'] != '0')]
28
 
29
 
30
  # Load PL ASR benchmarks survey data
 
84
 
85
  st.dataframe(df_datasets_per_year, use_container_width=False)
86
 
87
+ st.header("Institutions contributing Polish ASR speech datasets")
88
  col_groupby = ['Publisher']
89
  df_datasets_per_publisher = datasets_count_and_size(df_data_cat, col_groupby, col_sort='Count Dataset ID', col_percent=None, col_sum=['Size audio transcribed [hours]','Audio recordings', 'Speakers'], col_count = ['Dataset ID'])
90
  st.dataframe(df_datasets_per_publisher, use_container_width=False)
91
 
92
+ st.header("Institutions contributing freely available Polish ASR speech datasets")
93
+ col_groupby = ['Publisher']
94
+ df_datasets_per_publisher_free = datasets_count_and_size(df_data_cat_available_free, col_groupby, col_sort='Count Dataset ID', col_percent=None, col_sum=['Size audio transcribed [hours]','Audio recordings', 'Speakers'], col_count = ['Dataset ID'])
95
+ st.dataframe(df_datasets_per_publisher_free, use_container_width=False)
96
+
97
+
98
  st.header("Repositories hosting Polish ASR speech datasets")
99
  col_groupby = ['Repository']
100
  df_datasets_per_repo = datasets_count_and_size(df_data_cat, col_groupby, col_sort='Count Dataset ID', col_percent=None, col_sum=['Size audio transcribed [hours]','Audio recordings', 'Speakers'], col_count = ['Dataset ID'])
 
118
  st.header("Datasets per speech type")
119
  col_groupby = ['Speech type']
120
  df_datasets_per_speech_type = datasets_count_and_size(df_data_cat, col_groupby, col_sort=col_groupby, col_percent = ['Size audio transcribed [hours]'], col_sum = ['Size audio transcribed [hours]','Audio recordings', 'Speakers'], col_count = ['Dataset ID'])
121
+ # sort by the size of audio transcribed
122
+ df_datasets_per_speech_type = df_datasets_per_speech_type.sort_values(by='Size audio transcribed [hours]', ascending=False)
123
  st.dataframe(df_datasets_per_speech_type, use_container_width=False)
124
 
125
+
126
+ # Display distribution of datasets for various speech types
127
+ st.header("Distribution of available speech data per audio device - All available datasets")
128
+ col_groupby = ['Audio device']
129
+ df_datasets_per_device_all = datasets_count_and_size(df_data_cat, col_groupby, col_sort=col_groupby, col_percent = ['Size audio transcribed [hours]'], col_sum = ['Size audio transcribed [hours]','Audio recordings', 'Speakers'], col_count = ['Dataset ID'])
130
+ # sort by the size of audio transcribed
131
+ df_datasets_per_device_all = df_datasets_per_device_all.sort_values(by='Size audio transcribed [hours]', ascending=False)
132
+ st.dataframe(df_datasets_per_device_all, use_container_width=False)
133
+
134
  # Display distribution of datasets for various speech types
135
  st.header("Distribution of available speech data per audio device - Public domain datasets")
136
  col_groupby = ['Audio device']
137
+ df_datasets_per_device_free = datasets_count_and_size(df_data_cat_available_free, col_groupby, col_sort=col_groupby, col_percent = ['Size audio transcribed [hours]'], col_sum = ['Size audio transcribed [hours]','Audio recordings', 'Speakers'], col_count = ['Dataset ID'])
138
+ # sort by the size of audio transcribed
139
+ df_datasets_per_device_free = df_datasets_per_device_free.sort_values(by='Size audio transcribed [hours]', ascending=False)
140
+ st.dataframe(df_datasets_per_device_free, use_container_width=False)
141
 
142
  # Display distribution of datasets for various speech types
143
  st.header("Distribution of available speech data per audio device - Commercial datasets")
144
  col_groupby = ['Audio device']
145
+ df_datasets_per_device_paid = datasets_count_and_size(df_data_cat_available_paid, col_groupby, col_sort=col_groupby, col_percent = ['Size audio transcribed [hours]'], col_sum = ['Size audio transcribed [hours]','Audio recordings', 'Speakers'], col_count = ['Dataset ID'])
146
+ # sort by the size of audio transcribed
147
+ df_datasets_per_device_paid = df_datasets_per_device_paid.sort_values(by='Size audio transcribed [hours]', ascending=False)
148
+ st.dataframe(df_datasets_per_device_paid, use_container_width=False)
149
+
150
+ # Display distribution of datasets for various speech types
151
+ st.header("Datasets per sampling rate")
152
+ col_groupby = ['Sampling rate [Hz]']
153
+ df_datasets_per_sr = datasets_count_and_size(df_data_cat, col_groupby, col_sort=col_groupby, col_percent = ['Size audio transcribed [hours]'], col_sum = ['Size audio transcribed [hours]','Audio recordings', 'Speakers'], col_count = ['Dataset ID'])
154
+ # sort by the size of audio transcribed
155
+ df_datasets_per_sr = df_datasets_per_sr.sort_values(by='Size audio transcribed [hours]', ascending=False)
156
+ st.dataframe(df_datasets_per_sr, use_container_width=False)
157
+
158
 
159
  with data_taxonomy:
160
  st.title("Polish ASR Speech Data Taxonomy")
contants.py CHANGED
@@ -1,5 +1,5 @@
1
  INFO_MAIN= " Welcome to the Polish ASR Survey dashboard! <br> \
2
- You can use this dashboard to learn about the state of Polish ASR speech data and benchmarks. <br> \
3
  The dashboard is built upon the [*Polish ASR Speech Datasets Catalog*](https://github.com/goodmike31/pl-asr-speech-data-survey) and [*Polish ASR Benchmarks Catalog*](https://docs.google.com/spreadsheets/d/1fVsE98Ulmt-EIEe4wx8sUdo7RLigDdAVjQxNpAJIrH8/edit?usp=sharing). <br><br> \
4
  The dashboard is divided into the following tabs: <br> \
5
  * **About Polish ASR Survey** - general information about the survey, references, and contact points <br> \
@@ -11,7 +11,7 @@ The dashboard is divided into the following tabs: <br> \
11
  * **ASR Benchmarks Taxonomy** - explanation of the columns in the *Polish ASR Benchmarks Catalog* <br> \
12
  Please visit respective tab to learn how to use it and provide feedback. <br><br> \
13
  If you want to share your feedback regarding the Speech Data catalog, please use this [FORM](https://forms.gle/EWJ6YfbJJTyEzQs66). <br><br> \
14
- If you are looking for the latest ASR benchmarks for Polish, please visit the [BIGOS/PELCRA ASR leaderboard](https://huggingface.co/spaces/amu-cai/pl-asr-bigos-bench-dash). <br><br> \
15
  You can also contact the author via [email](mailto:michal.junczyk@amu.edu.pl) or [LinkedIn](https://www.linkedin.com/in/michaljunczyk/).<br>"
16
 
17
  CITATION_MAIN = "@misc{junczyk-2024-pl-asr-survey <br> \
 
1
  INFO_MAIN= " Welcome to the Polish ASR Survey dashboard! <br> \
2
+ You can use it to learn about the state of Polish ASR speech data and benchmarks. <br> \
3
  The dashboard is built upon the [*Polish ASR Speech Datasets Catalog*](https://github.com/goodmike31/pl-asr-speech-data-survey) and [*Polish ASR Benchmarks Catalog*](https://docs.google.com/spreadsheets/d/1fVsE98Ulmt-EIEe4wx8sUdo7RLigDdAVjQxNpAJIrH8/edit?usp=sharing). <br><br> \
4
  The dashboard is divided into the following tabs: <br> \
5
  * **About Polish ASR Survey** - general information about the survey, references, and contact points <br> \
 
11
  * **ASR Benchmarks Taxonomy** - explanation of the columns in the *Polish ASR Benchmarks Catalog* <br> \
12
  Please visit respective tab to learn how to use it and provide feedback. <br><br> \
13
  If you want to share your feedback regarding the Speech Data catalog, please use this [FORM](https://forms.gle/EWJ6YfbJJTyEzQs66). <br><br> \
14
+ If you are looking for the latest ASR benchmarks for Polish, please visit the [AMU ASR leaderboard](https://huggingface.co/spaces/amu-cai/pl-asr-leaderboard). <br><br> \
15
  You can also contact the author via [email](mailto:michal.junczyk@amu.edu.pl) or [LinkedIn](https://www.linkedin.com/in/michaljunczyk/).<br>"
16
 
17
  CITATION_MAIN = "@misc{junczyk-2024-pl-asr-survey <br> \