Spaces:
Sleeping
Sleeping
mj-new
committed on
Commit
•
968224e
1
Parent(s):
de78526
Improved description and references
Browse files- __pycache__/app_utils.cpython-310.pyc +0 -0
- __pycache__/contants.cpython-310.pyc +0 -0
- app.py +12 -6
- app_utils.py +2 -2
- contants.py +65 -22
__pycache__/app_utils.cpython-310.pyc
CHANGED
Binary files a/__pycache__/app_utils.cpython-310.pyc and b/__pycache__/app_utils.cpython-310.pyc differ
|
|
__pycache__/contants.cpython-310.pyc
CHANGED
Binary files a/__pycache__/contants.cpython-310.pyc and b/__pycache__/contants.cpython-310.pyc differ
|
|
app.py
CHANGED
@@ -2,7 +2,7 @@ import pandas as pd
|
|
2 |
import streamlit as st
|
3 |
|
4 |
from app_utils import filter_dataframe, calculate_height_to_display
|
5 |
-
from contants import INFO_CATALOG, CITATION_CATALOG, HOWTO_CATALOG,INFO_BENCHMARK, CITATION_BENCHMARK,
|
6 |
from utils import BASE_SUMMARY_METRICS
|
7 |
from utils import load_data_catalog, load_data_taxonomy, load_bench_catalog, load_bench_taxonomy
|
8 |
from utils import datasets_count_and_size, datasets_count_and_size_standard, metadata_coverage, catalog_summary_statistics
|
@@ -32,9 +32,16 @@ df_data_cat_available_paid = df_data_cat[(df_data_cat['Available online'] == 'ye
|
|
32 |
df_bench_cat = load_bench_catalog()
|
33 |
df_bench_tax = load_bench_taxonomy()
|
34 |
|
35 |
-
data_cat, data_taxonomy, data_survey, bench_cat,
|
36 |
|
37 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
38 |
with data_cat:
|
39 |
st.title("Polish ASR Speech Datasets Catalog")
|
40 |
|
@@ -43,14 +50,13 @@ with data_cat:
|
|
43 |
st.header("How to use?")
|
44 |
st.markdown(HOWTO_CATALOG, unsafe_allow_html=True)
|
45 |
|
46 |
-
st.header("How to cite?")
|
47 |
-
st.markdown(CITATION_CATALOG, unsafe_allow_html=True)
|
48 |
-
|
49 |
# Display catalog contents
|
50 |
st.header("Browse the catalog content")
|
51 |
st.dataframe(filter_dataframe(df_data_cat, "datasets"), hide_index=True, use_container_width=True)
|
52 |
|
53 |
-
|
|
|
|
|
54 |
|
55 |
with data_survey:
|
56 |
# Display summary statistics
|
|
|
2 |
import streamlit as st
|
3 |
|
4 |
from app_utils import filter_dataframe, calculate_height_to_display
|
5 |
+
from contants import INFO_CATALOG, CITATION_CATALOG, HOWTO_CATALOG,INFO_BENCHMARK, CITATION_BENCHMARK, HOWTO_BENCHMARK, INFO_MAIN, CITATION_MAIN
|
6 |
from utils import BASE_SUMMARY_METRICS
|
7 |
from utils import load_data_catalog, load_data_taxonomy, load_bench_catalog, load_bench_taxonomy
|
8 |
from utils import datasets_count_and_size, datasets_count_and_size_standard, metadata_coverage, catalog_summary_statistics
|
|
|
32 |
df_bench_cat = load_bench_catalog()
|
33 |
df_bench_tax = load_bench_taxonomy()
|
34 |
|
35 |
+
about, data_cat, data_taxonomy, data_survey, bench_cat, bench_survey, bench_taxonomy = st.tabs(["PL ASR survey", "PL ASR Speech Data **Catalog**", "PL ASR Speech data **Survey**", "ASR Speech Data **Taxonomy**", "PL ASR Benchmarks Catalog", "PL ASR Benchmarks Survey", "ASR Benchmarks Taxonomy"])
|
36 |
|
37 |
|
38 |
+
with about:
|
39 |
+
st.title("About Polish ASR Survey")
|
40 |
+
st.markdown(INFO_MAIN, unsafe_allow_html=True)
|
41 |
+
|
42 |
+
st.header("How to cite this resource?")
|
43 |
+
st.markdown(CITATION_MAIN, unsafe_allow_html=True)
|
44 |
+
|
45 |
with data_cat:
|
46 |
st.title("Polish ASR Speech Datasets Catalog")
|
47 |
|
|
|
50 |
st.header("How to use?")
|
51 |
st.markdown(HOWTO_CATALOG, unsafe_allow_html=True)
|
52 |
|
|
|
|
|
|
|
53 |
# Display catalog contents
|
54 |
st.header("Browse the catalog content")
|
55 |
st.dataframe(filter_dataframe(df_data_cat, "datasets"), hide_index=True, use_container_width=True)
|
56 |
|
57 |
+
st.header("How to cite this resource?")
|
58 |
+
st.markdown(CITATION_CATALOG, unsafe_allow_html=True)
|
59 |
+
|
60 |
|
61 |
with data_survey:
|
62 |
# Display summary statistics
|
app_utils.py
CHANGED
@@ -29,9 +29,9 @@ def filter_dataframe(df: pd.DataFrame, target) -> pd.DataFrame:
|
|
29 |
pd.DataFrame: Filtered dataframe
|
30 |
"""
|
31 |
if(target == "datasets"):
|
32 |
-
modify = st.checkbox("
|
33 |
elif(target == "benchmarks"):
|
34 |
-
modify = st.checkbox("
|
35 |
else:
|
36 |
print("Invalid target")
|
37 |
|
|
|
29 |
pd.DataFrame: Filtered dataframe
|
30 |
"""
|
31 |
if(target == "datasets"):
|
32 |
+
modify = st.checkbox("Enable filters to browse ASR speech data catalog")
|
33 |
elif(target == "benchmarks"):
|
34 |
+
modify = st.checkbox("Enable filters to browse ASR benchmarks catalog")
|
35 |
else:
|
36 |
print("Invalid target")
|
37 |
|
contants.py
CHANGED
@@ -1,28 +1,71 @@
|
|
1 |
-
|
2 |
-
|
3 |
-
|
4 |
-
|
5 |
-
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
howpublished={\\url{https://github.com/goodmike31/pl-asr-speech-data-survey}}*<br>"
|
17 |
-
|
18 |
-
HOWTO_CATALOG = "You can use the filters on the left to browse the catalog content. <br> \
|
19 |
-
Please refer to the **Data Catalog Taxonomy** tab for the explanation of the columns. <br>"
|
20 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
21 |
INFO_BENCHMARK = "TODO"
|
22 |
|
23 |
-
|
|
|
|
|
|
|
|
|
|
|
24 |
|
|
|
|
|
|
|
|
|
25 |
|
26 |
-
CITATION_BENCHMARK="Please cite this work as: TODO"
|
27 |
|
28 |
-
|
|
|
1 |
+
INFO_MAIN= " Welcome to the Polish ASR Survey dashboard! <br> \
|
2 |
+
You can use this dashboard to learn about the state of Polish ASR speech data and benchmarks. <br> \
|
3 |
+
The dashboard is built upon the [*Polish ASR Speech Datasets Catalog*](https://github.com/goodmike31/pl-asr-speech-data-survey) and [*Polish ASR Benchmarks Catalog*](https://github.com/goodmike31/pl-asr-benchmarks-survey). <br><br> \
|
4 |
+
The dashboard is divided into the following tabs: <br> \
|
5 |
+
* **About Polish ASR Survey** - general information about the survey, references, and contact points <br> \
|
6 |
+
* **Polish ASR Speech Data Catalog** - detailed information about the speech data available for Polish ASR <br> \
|
7 |
+
* **Polish ASR Speech Data Survey** - analysis of the state of Polish ASR speech data <br> \
|
8 |
+
* **ASR Speech Data Taxonomy** - explanation of the columns in the *Polish ASR Speech Datasets Catalog* <br> \
|
9 |
+
* **Polish ASR Benchmarks Catalog** - detailed information about the benchmarks available for Polish ASR <br> \
|
10 |
+
* **Polish ASR Benchmarks Survey** - analysis of the state of Polish ASR benchmarks <br> \
|
11 |
+
* **ASR Benchmarks Taxonomy** - explanation of the columns in the *Polish ASR Benchmarks Catalog* <br> \
|
12 |
+
Please visit the respective tab to learn how to use it and provide feedback. <br><br> \
|
13 |
+
If you want to share your feedback regarding the Speech Data catalog, please use this [FORM](https://forms.gle/EWJ6YfbJJTyEzQs66). <br><br> \
|
14 |
+
If you are looking for the latest ASR benchmarks for Polish, please visit the [BIGOS/PELCRA ASR leaderboard](https://huggingface.co/spaces/amu-cai/pl-asr-bigos-bench-dash). <br><br> \
|
15 |
+
You can also contact the author via [email](mailto:michal.junczyk@amu.edu.pl) or [LinkedIn](https://www.linkedin.com/in/michaljunczyk/).<br>"
|
|
|
|
|
|
|
|
|
16 |
|
17 |
+
CITATION_MAIN = "@misc{junczyk-2024-pl-asr-survey, <br> \
|
18 |
+
title = {Polish ASR Survey}, <br> \
|
19 |
+
author = {Michał Junczyk}, <br> \
|
20 |
+
year = {2024}, <br> \
|
21 |
+
publisher = {Hugging Face}, <br> \
|
22 |
+
url = {https://huggingface.co/spaces/amu-cai/pl-asr-survey} }"
|
23 |
+
|
24 |
+
# TODO
|
25 |
+
# * Analysis of datasets utility for the purpose of ASR evaluation (see the **Dataset Utility Index** tab) <br>\
|
26 |
+
############################################################################################################
|
27 |
+
|
28 |
+
INFO_CATALOG= "This dashboard complements *Polish ASR Speech Datasets Catalog* available on [GitHub](https://github.com/goodmike31/pl-asr-speech-data-survey) and [Google Sheets](https://docs.google.com/spreadsheets/d/181EDfwZNtHgHFOMaKNtgKssrYDX4tXTJ9POMzBsCRlI/edit#gid=0) by providing:<br> \
|
29 |
+
* More convenient browsing of the catalog content (*see the **How to use?** section below*) <br>\
|
30 |
+
* Up-to-date analysis of the state of Polish ASR speech data (*see the **Polish ASR Speech Data Survey** tab*) <br><br> \
|
31 |
+
IMPORTANT - If you want to share your feedback regarding the catalog, please use this [FORM](https://forms.gle/EWJ6YfbJJTyEzQs66). Each response is granted 50 PLN for the charity of your choice. <br>\
|
32 |
+
Your feedback will be helpful to assess the state of Polish ASR speech data from the community perspective.<br><br> \
|
33 |
+
If you want to report a missing dataset or request a correction of descriptions, please follow the steps described on [GitHub](https://github.com/goodmike31/pl-asr-speech-data-survey?tab=readme-ov-file#how-to-contribute-to-the-polish-asr-speech-datasets-catalog) <br> \
|
34 |
+
You can also contact the author via [email](mailto:michal.junczyk@amu.edu.pl) or [LinkedIn](https://www.linkedin.com/in/michaljunczyk/).<br>"
|
35 |
+
|
36 |
+
CITATION_CATALOG="@article{Junczyk+2024+27+52, <br>\
|
37 |
+
url = {https://doi.org/10.1515/psicl-2023-0019},<br>\
|
38 |
+
title = {A survey of Polish ASR speech datasets},<br>\
|
39 |
+
author = {Michał Junczyk},<br>\
|
40 |
+
pages = {27--52},<br>\
|
41 |
+
volume = {60},<br>\
|
42 |
+
number = {1},<br>\
|
43 |
+
journal = {Poznan Studies in Contemporary Linguistics},<br>\
|
44 |
+
doi = {doi:10.1515/psicl-2023-0019},<br>\
|
45 |
+
year = {2024},<br>\
|
46 |
+
lastchecked = {2024-03-10}<br>\
|
47 |
+
}"
|
48 |
+
|
49 |
+
HOWTO_CATALOG = "To browse the catalog content using filters you must enable them first. <br> \
|
50 |
+
You can also sort the columns by clicking on the column header. <br> \
|
51 |
+
Depending on the column type, you can use the search box to filter the content. <br> \
|
52 |
+
Please refer to the **ASR Speech Data Taxonomy** tab for the explanation of the columns. <br> \
|
53 |
+
If you are looking for insights derived from the data collected in the catalog, please go to the **Polish ASR Speech Data Survey** tab. <br>"
|
54 |
+
|
55 |
+
############################################################################################################
|
56 |
INFO_BENCHMARK = "TODO"
|
57 |
|
58 |
+
CITATION_BENCHMARK="@misc{junczyk-2023-pl-asr-speech-data-catalog, <br> \
|
59 |
+
title = {Polish ASR Speech Datasets Catalog}, <br> \
|
60 |
+
author = {Michał Junczyk}, <br> \
|
61 |
+
year = {2023}, <br> \
|
62 |
+
publisher = {Github}, <br> \
|
63 |
+
url = {https://github.com/goodmike31/pl-asr-speech-data-survey} }"
|
64 |
|
65 |
+
HOWTO_BENCHMARK = "You can use the filters to browse the catalog content. <br> \
|
66 |
+
You can also sort the columns by clicking on the column header. <br> \
|
67 |
+
Depending on the column type, you can use the search box to filter the content. <br> \
|
68 |
+
Please refer to the **ASR Benchmarks Catalog Taxonomy** tab for the explanation of the columns. <br>"
|
69 |
|
|
|
70 |
|
71 |
+
############################################################################################################
|