Spaces:
Running
Running
mj-new
committed on
Commit
•
ad8c37c
1
Parent(s):
4eee292
Fixed filtering of freely and commercially available datasets
Browse files
- __pycache__/contants.cpython-310.pyc +0 -0
- __pycache__/utils.cpython-310.pyc +0 -0
- app.py +39 -11
- contants.py +2 -2
__pycache__/contants.cpython-310.pyc
CHANGED
Binary files a/__pycache__/contants.cpython-310.pyc and b/__pycache__/contants.cpython-310.pyc differ
|
|
__pycache__/utils.cpython-310.pyc
CHANGED
Binary files a/__pycache__/utils.cpython-310.pyc and b/__pycache__/utils.cpython-310.pyc differ
|
|
app.py
CHANGED
@@ -1,5 +1,7 @@
|
|
1 |
import pandas as pd
|
2 |
import streamlit as st
|
|
|
|
|
3 |
|
4 |
from app_utils import filter_dataframe, calculate_height_to_display
|
5 |
from contants import INFO_CATALOG, CITATION_CATALOG, HOWTO_CATALOG,INFO_BENCHMARK, CITATION_BENCHMARK, HOWTO_BENCHMARK, INFO_MAIN, CITATION_MAIN, HOWTO_TAXONOMY_CAT
|
@@ -8,10 +10,6 @@ from utils import load_data_catalog, load_data_taxonomy, load_bench_catalog, lo
|
|
8 |
from utils import datasets_count_and_size, datasets_count_and_size_standard, metadata_coverage, catalog_summary_statistics
|
9 |
from utils import left_align, right_align
|
10 |
|
11 |
-
import matplotlib.pyplot as plt
|
12 |
-
import seaborn as sns
|
13 |
-
|
14 |
-
|
15 |
st.set_page_config(layout="wide")
|
16 |
|
17 |
|
@@ -23,10 +21,10 @@ df_data_tax = load_data_taxonomy()
|
|
23 |
# Filter out non available datasets
|
24 |
df_data_cat_available = df_data_cat[df_data_cat['Available online'] == 'yes']
|
25 |
# Available and free
|
26 |
-
df_data_cat_available_free = df_data_cat[(df_data_cat['Available online'] == 'yes') & (df_data_cat['Price - non-commercial usage'] == '
|
27 |
|
28 |
# Available and paid
|
29 |
-
df_data_cat_available_paid = df_data_cat[(df_data_cat['Available online'] == 'yes') & (df_data_cat['Price - non-commercial usage'] != '
|
30 |
|
31 |
|
32 |
# Load PL ASR benchmarks survey data
|
@@ -86,11 +84,17 @@ with data_survey:
|
|
86 |
|
87 |
st.dataframe(df_datasets_per_year, use_container_width=False)
|
88 |
|
89 |
-
st.header("Institutions contributing Polish ASR speech
|
90 |
col_groupby = ['Publisher']
|
91 |
df_datasets_per_publisher = datasets_count_and_size(df_data_cat, col_groupby, col_sort='Count Dataset ID', col_percent=None, col_sum=['Size audio transcribed [hours]','Audio recordings', 'Speakers'], col_count = ['Dataset ID'])
|
92 |
st.dataframe(df_datasets_per_publisher, use_container_width=False)
|
93 |
|
|
|
|
|
|
|
|
|
|
|
|
|
94 |
st.header("Repositories hosting Polish ASR speech datasets")
|
95 |
col_groupby = ['Repository']
|
96 |
df_datasets_per_repo = datasets_count_and_size(df_data_cat, col_groupby, col_sort='Count Dataset ID', col_percent=None, col_sum=['Size audio transcribed [hours]','Audio recordings', 'Speakers'], col_count = ['Dataset ID'])
|
@@ -114,19 +118,43 @@ with data_survey:
|
|
114 |
st.header("Datasets per speech type")
|
115 |
col_groupby = ['Speech type']
|
116 |
df_datasets_per_speech_type = datasets_count_and_size(df_data_cat, col_groupby, col_sort=col_groupby, col_percent = ['Size audio transcribed [hours]'], col_sum = ['Size audio transcribed [hours]','Audio recordings', 'Speakers'], col_count = ['Dataset ID'])
|
|
|
|
|
117 |
st.dataframe(df_datasets_per_speech_type, use_container_width=False)
|
118 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
119 |
# Display distribution of datasets for various speech types
|
120 |
st.header("Distribution of available speech data per audio device - Public domain datasets")
|
121 |
col_groupby = ['Audio device']
|
122 |
-
|
123 |
-
|
|
|
|
|
124 |
|
125 |
# Display distribution of datasets for various speech types
|
126 |
st.header("Distribution of available speech data per audio device - Commercial datasets")
|
127 |
col_groupby = ['Audio device']
|
128 |
-
|
129 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
130 |
|
131 |
with data_taxonomy:
|
132 |
st.title("Polish ASR Speech Data Taxonomy")
|
|
|
1 |
import pandas as pd
|
2 |
import streamlit as st
|
3 |
+
import matplotlib.pyplot as plt
|
4 |
+
import seaborn as sns
|
5 |
|
6 |
from app_utils import filter_dataframe, calculate_height_to_display
|
7 |
from contants import INFO_CATALOG, CITATION_CATALOG, HOWTO_CATALOG,INFO_BENCHMARK, CITATION_BENCHMARK, HOWTO_BENCHMARK, INFO_MAIN, CITATION_MAIN, HOWTO_TAXONOMY_CAT
|
|
|
10 |
from utils import datasets_count_and_size, datasets_count_and_size_standard, metadata_coverage, catalog_summary_statistics
|
11 |
from utils import left_align, right_align
|
12 |
|
|
|
|
|
|
|
|
|
13 |
st.set_page_config(layout="wide")
|
14 |
|
15 |
|
|
|
21 |
# Filter out non available datasets
|
22 |
df_data_cat_available = df_data_cat[df_data_cat['Available online'] == 'yes']
|
23 |
# Available and free
|
24 |
+
df_data_cat_available_free = df_data_cat[(df_data_cat['Available online'] == 'yes') & (df_data_cat['Price - non-commercial usage'] == '0')]
|
25 |
|
26 |
# Available and paid
|
27 |
+
df_data_cat_available_paid = df_data_cat[(df_data_cat['Available online'] == 'yes') & (df_data_cat['Price - non-commercial usage'] != '0')]
|
28 |
|
29 |
|
30 |
# Load PL ASR benchmarks survey data
|
|
|
84 |
|
85 |
st.dataframe(df_datasets_per_year, use_container_width=False)
|
86 |
|
87 |
+
st.header("Institutions contributing Polish ASR speech datasets")
|
88 |
col_groupby = ['Publisher']
|
89 |
df_datasets_per_publisher = datasets_count_and_size(df_data_cat, col_groupby, col_sort='Count Dataset ID', col_percent=None, col_sum=['Size audio transcribed [hours]','Audio recordings', 'Speakers'], col_count = ['Dataset ID'])
|
90 |
st.dataframe(df_datasets_per_publisher, use_container_width=False)
|
91 |
|
92 |
+
st.header("Institutions contributing freely available Polish ASR speech datasets")
|
93 |
+
col_groupby = ['Publisher']
|
94 |
+
df_datasets_per_publisher_free = datasets_count_and_size(df_data_cat_available_free, col_groupby, col_sort='Count Dataset ID', col_percent=None, col_sum=['Size audio transcribed [hours]','Audio recordings', 'Speakers'], col_count = ['Dataset ID'])
|
95 |
+
st.dataframe(df_datasets_per_publisher_free, use_container_width=False)
|
96 |
+
|
97 |
+
|
98 |
st.header("Repositories hosting Polish ASR speech datasets")
|
99 |
col_groupby = ['Repository']
|
100 |
df_datasets_per_repo = datasets_count_and_size(df_data_cat, col_groupby, col_sort='Count Dataset ID', col_percent=None, col_sum=['Size audio transcribed [hours]','Audio recordings', 'Speakers'], col_count = ['Dataset ID'])
|
|
|
118 |
st.header("Datasets per speech type")
|
119 |
col_groupby = ['Speech type']
|
120 |
df_datasets_per_speech_type = datasets_count_and_size(df_data_cat, col_groupby, col_sort=col_groupby, col_percent = ['Size audio transcribed [hours]'], col_sum = ['Size audio transcribed [hours]','Audio recordings', 'Speakers'], col_count = ['Dataset ID'])
|
121 |
+
# sort by the size of audio transcribed
|
122 |
+
df_datasets_per_speech_type = df_datasets_per_speech_type.sort_values(by='Size audio transcribed [hours]', ascending=False)
|
123 |
st.dataframe(df_datasets_per_speech_type, use_container_width=False)
|
124 |
|
125 |
+
|
126 |
+
# Display distribution of datasets for various speech types
|
127 |
+
st.header("Distribution of available speech data per audio device - All available datasets")
|
128 |
+
col_groupby = ['Audio device']
|
129 |
+
df_datasets_per_device_all = datasets_count_and_size(df_data_cat, col_groupby, col_sort=col_groupby, col_percent = ['Size audio transcribed [hours]'], col_sum = ['Size audio transcribed [hours]','Audio recordings', 'Speakers'], col_count = ['Dataset ID'])
|
130 |
+
# sort by the size of audio transcribed
|
131 |
+
df_datasets_per_device_all = df_datasets_per_device_all.sort_values(by='Size audio transcribed [hours]', ascending=False)
|
132 |
+
st.dataframe(df_datasets_per_device_all, use_container_width=False)
|
133 |
+
|
134 |
# Display distribution of datasets for various speech types
|
135 |
st.header("Distribution of available speech data per audio device - Public domain datasets")
|
136 |
col_groupby = ['Audio device']
|
137 |
+
df_datasets_per_device_free = datasets_count_and_size(df_data_cat_available_free, col_groupby, col_sort=col_groupby, col_percent = ['Size audio transcribed [hours]'], col_sum = ['Size audio transcribed [hours]','Audio recordings', 'Speakers'], col_count = ['Dataset ID'])
|
138 |
+
# sort by the size of audio transcribed
|
139 |
+
df_datasets_per_device_free = df_datasets_per_device_free.sort_values(by='Size audio transcribed [hours]', ascending=False)
|
140 |
+
st.dataframe(df_datasets_per_device_free, use_container_width=False)
|
141 |
|
142 |
# Display distribution of datasets for various speech types
|
143 |
st.header("Distribution of available speech data per audio device - Commercial datasets")
|
144 |
col_groupby = ['Audio device']
|
145 |
+
df_datasets_per_device_paid = datasets_count_and_size(df_data_cat_available_paid, col_groupby, col_sort=col_groupby, col_percent = ['Size audio transcribed [hours]'], col_sum = ['Size audio transcribed [hours]','Audio recordings', 'Speakers'], col_count = ['Dataset ID'])
|
146 |
+
# sort by the size of audio transcribed
|
147 |
+
df_datasets_per_device_paid = df_datasets_per_device_paid.sort_values(by='Size audio transcribed [hours]', ascending=False)
|
148 |
+
st.dataframe(df_datasets_per_device_paid, use_container_width=False)
|
149 |
+
|
150 |
+
# Display distribution of datasets for various speech types
|
151 |
+
st.header("Datasets per sampling rate")
|
152 |
+
col_groupby = ['Sampling rate [Hz]']
|
153 |
+
df_datasets_per_sr = datasets_count_and_size(df_data_cat, col_groupby, col_sort=col_groupby, col_percent = ['Size audio transcribed [hours]'], col_sum = ['Size audio transcribed [hours]','Audio recordings', 'Speakers'], col_count = ['Dataset ID'])
|
154 |
+
# sort by the size of audio transcribed
|
155 |
+
df_datasets_per_sr = df_datasets_per_sr.sort_values(by='Size audio transcribed [hours]', ascending=False)
|
156 |
+
st.dataframe(df_datasets_per_sr, use_container_width=False)
|
157 |
+
|
158 |
|
159 |
with data_taxonomy:
|
160 |
st.title("Polish ASR Speech Data Taxonomy")
|
contants.py
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
INFO_MAIN= " Welcome to the Polish ASR Survey dashboard! <br> \
|
2 |
-
You can use
|
3 |
The dashboard is built upon the [*Polish ASR Speech Datasets Catalog*](https://github.com/goodmike31/pl-asr-speech-data-survey) and [*Polish ASR Benchmarks Catalog*](https://docs.google.com/spreadsheets/d/1fVsE98Ulmt-EIEe4wx8sUdo7RLigDdAVjQxNpAJIrH8/edit?usp=sharing). <br><br> \
|
4 |
The dashboard is divided into the following tabs: <br> \
|
5 |
* **About Polish ASR Survey** - general information about the survey, references, and contact points <br> \
|
@@ -11,7 +11,7 @@ The dashboard is divided into the following tabs: <br> \
|
|
11 |
* **ASR Benchmarks Taxonomy** - explanation of the columns in the *Polish ASR Benchmarks Catalog* <br> \
|
12 |
Please visit respective tab to learn how to use it and provide feedback. <br><br> \
|
13 |
If you want to share your feedback regarding the Speech Data catalog, please use this [FORM](https://forms.gle/EWJ6YfbJJTyEzQs66). <br><br> \
|
14 |
-
If you are looking for the latest ASR benchmarks for Polish, please visit the [
|
15 |
You can also contact the author via [email](mailto:michal.junczyk@amu.edu.pl) or [LinkedIn](https://www.linkedin.com/in/michaljunczyk/).<br>"
|
16 |
|
17 |
CITATION_MAIN = "@misc{junczyk-2024-pl-asr-survey <br> \
|
|
|
1 |
INFO_MAIN= " Welcome to the Polish ASR Survey dashboard! <br> \
|
2 |
+
You can use it to learn about the state of Polish ASR speech data and benchmarks. <br> \
|
3 |
The dashboard is built upon the [*Polish ASR Speech Datasets Catalog*](https://github.com/goodmike31/pl-asr-speech-data-survey) and [*Polish ASR Benchmarks Catalog*](https://docs.google.com/spreadsheets/d/1fVsE98Ulmt-EIEe4wx8sUdo7RLigDdAVjQxNpAJIrH8/edit?usp=sharing). <br><br> \
|
4 |
The dashboard is divided into the following tabs: <br> \
|
5 |
* **About Polish ASR Survey** - general information about the survey, references, and contact points <br> \
|
|
|
11 |
* **ASR Benchmarks Taxonomy** - explanation of the columns in the *Polish ASR Benchmarks Catalog* <br> \
|
12 |
Please visit respective tab to learn how to use it and provide feedback. <br><br> \
|
13 |
If you want to share your feedback regarding the Speech Data catalog, please use this [FORM](https://forms.gle/EWJ6YfbJJTyEzQs66). <br><br> \
|
14 |
+
If you are looking for the latest ASR benchmarks for Polish, please visit the [AMU ASR leaderboard](https://huggingface.co/spaces/amu-cai/pl-asr-leaderboard). <br><br> \
|
15 |
You can also contact the author via [email](mailto:michal.junczyk@amu.edu.pl) or [LinkedIn](https://www.linkedin.com/in/michaljunczyk/).<br>"
|
16 |
|
17 |
CITATION_MAIN = "@misc{junczyk-2024-pl-asr-survey <br> \
|