Spaces:

amu-cai
/

pl-asr-survey

Running

App Files Files Community

mj-new commited on Mar 10, 2024

Commit

968224e

1 Parent(s): de78526

Improved description and references

Browse files

Files changed (5) hide show

__pycache__/app_utils.cpython-310.pyc +0 -0
__pycache__/contants.cpython-310.pyc +0 -0
app.py +12 -6
app_utils.py +2 -2
contants.py +65 -22

__pycache__/app_utils.cpython-310.pyc CHANGED Viewed

Binary files a/__pycache__/app_utils.cpython-310.pyc and b/__pycache__/app_utils.cpython-310.pyc differ

__pycache__/contants.cpython-310.pyc CHANGED Viewed

Binary files a/__pycache__/contants.cpython-310.pyc and b/__pycache__/contants.cpython-310.pyc differ

app.py CHANGED Viewed

@@ -2,7 +2,7 @@ import pandas as pd
 import streamlit as st
 from app_utils import filter_dataframe, calculate_height_to_display
-from contants import INFO_CATALOG, CITATION_CATALOG, HOWTO_CATALOG,INFO_BENCHMARK, CITATION_BENCHMARK, INFO_SURVEY, CITATION_SURVEY
 from utils import BASE_SUMMARY_METRICS
 from utils import  load_data_catalog, load_data_taxonomy, load_bench_catalog, load_bench_taxonomy
 from utils import datasets_count_and_size, datasets_count_and_size_standard, metadata_coverage, catalog_summary_statistics
@@ -32,9 +32,16 @@ df_data_cat_available_paid = df_data_cat[(df_data_cat['Available online'] == 'ye
 df_bench_cat = load_bench_catalog()
 df_bench_tax = load_bench_taxonomy()
-data_cat, data_taxonomy, data_survey, bench_cat, bench_taxonomy, bench_survey = st.tabs(["PL ASR speech data **catalog**", "PL ASR speech data **survey**", "ASR speech data **taxonomy**", "PL ASR benchmarks catalog", "ASR benchmarks taxonomy", "PL ASR benchmarks survey"])
 with data_cat:
     st.title("Polish ASR Speech Datasets Catalog")
@@ -43,14 +50,13 @@ with data_cat:
     st.header("How to use?")
     st.markdown(HOWTO_CATALOG, unsafe_allow_html=True)
-    st.header("How to cite?")
-    st.markdown(CITATION_CATALOG, unsafe_allow_html=True)
     # Display catalog contents
     st.header("Browse the catalog content")
     st.dataframe(filter_dataframe(df_data_cat, "datasets"), hide_index=True, use_container_width=True)
-    # Display taxonomy contents
 with data_survey:
     # Display summary statistics

 import streamlit as st
 from app_utils import filter_dataframe, calculate_height_to_display
+from contants import INFO_CATALOG, CITATION_CATALOG, HOWTO_CATALOG,INFO_BENCHMARK, CITATION_BENCHMARK, HOWTO_BENCHMARK, INFO_MAIN, CITATION_MAIN
 from utils import BASE_SUMMARY_METRICS
 from utils import  load_data_catalog, load_data_taxonomy, load_bench_catalog, load_bench_taxonomy
 from utils import datasets_count_and_size, datasets_count_and_size_standard, metadata_coverage, catalog_summary_statistics
 df_bench_cat = load_bench_catalog()
 df_bench_tax = load_bench_taxonomy()
+about, data_cat, data_taxonomy, data_survey, bench_cat, bench_survey, bench_taxonomy = st.tabs(["PL ASR survey", "PL ASR Speech Data **Catalog**", "PL ASR Speech data **Survey**", "ASR Speech Data **Taxonomy**", "PL ASR Benchmarks Catalog", "PL ASR Benchmarks Survey", "ASR Benchmarks Taxonomy"])
+with about:
+    st.title("About Polish ASR Survey")
+    st.markdown(INFO_MAIN, unsafe_allow_html=True)
+    st.header("How to cite this resource?")
+    st.markdown(CITATION_MAIN, unsafe_allow_html=True)
 with data_cat:
     st.title("Polish ASR Speech Datasets Catalog")
     st.header("How to use?")
     st.markdown(HOWTO_CATALOG, unsafe_allow_html=True)
     # Display catalog contents
     st.header("Browse the catalog content")
     st.dataframe(filter_dataframe(df_data_cat, "datasets"), hide_index=True, use_container_width=True)
+    st.header("How to cite this resource?")
+    st.markdown(CITATION_CATALOG, unsafe_allow_html=True)
 with data_survey:
     # Display summary statistics

app_utils.py CHANGED Viewed

@@ -29,9 +29,9 @@ def filter_dataframe(df: pd.DataFrame, target) -> pd.DataFrame:
         pd.DataFrame: Filtered dataframe
     """
     if(target == "datasets"):
-        modify = st.checkbox("Use filters on speech data catalog")
     elif(target == "benchmarks"):
-        modify = st.checkbox("Use filters on benchmarks catalog")
     else:
         print("Invalid target")

         pd.DataFrame: Filtered dataframe
     """
     if(target == "datasets"):
+        modify = st.checkbox("Enable filters to browse ASR speech data catalog")
     elif(target == "benchmarks"):
+        modify = st.checkbox("Enable filters to browse ASR benchmarks catalog")
     else:
         print("Invalid target")

contants.py CHANGED Viewed

@@ -1,28 +1,71 @@
-INFO_CATALOG = "This dashboard complements *Polish ASR Speech Datasets Catalog* available on [GitHub](https://github.com/goodmike31/pl-asr-speech-data-survey) and [Google Sheets](https://docs.google.com/spreadsheets/d/181EDfwZNtHgHFOMaKNtgKssrYDX4tXTJ9POMzBsCRlI/edit#gid=0) by providing:<br> \
-* More convenient browsing of the catalog content (see the **How to use?** section below) <br>\
-* Analysis of datasets utility for the purpose of ASR evaluation (see the **Dataset Utility Index** tab) <br>\
-* Analysis of the state of Polish ASR speech data (see the **Polish ASR Speech Data Survey** tab <br><br> \
-IMPORTANT - You can share your feedback [HERE](https://forms.gle/EWJ6YfbJJTyEzQs66). <br>\
-Each response is granted 50 PLN for the charity of your choice. <br>\
-The feedback will help to assess the state of Polish ASR speech data from the community perspective.<br>\
-"
-CITATION_CATALOG="Please cite this work as: <br> \
-*@misc{pl-asr-speech-data-catalog, <br> \
-  title={Polish ASR Speech Datasets Catalog}, <br> \
-  author={Michał Junczyk}, <br> \
-  year={2023}, <br> \
-  publisher={Github}, <br> \
-  howpublished={\\url{https://github.com/goodmike31/pl-asr-speech-data-survey}}*<br>"
-HOWTO_CATALOG = "You can use the filters on the left to browse the catalog content. <br> \
-Please refer to the **Data Catalog Taxonomy** tab for the explanation of the columns. <br>"
 INFO_BENCHMARK = "TODO"
-INFO_SURVEY = "This dashboard complements [Polish Speech Datasets Survey]"
-CITATION_BENCHMARK="Please cite this work as: TODO"
-CITATION_SURVEY="Please cite this work as: TODO"

+INFO_MAIN= " Welcome to the Polish ASR Survey dashboard! <br> \
+You can use this dashboard to learn about the state of Polish ASR speech data and benchmarks. <br> \
+The dashboard is built upon the [*Polish ASR Speech Datasets Catalog*](https://github.com/goodmike31/pl-asr-speech-data-survey) and [*Polish ASR Benchmarks Catalog*](https://github.com/goodmike31/pl-asr-benchmarks-survey). <br><br> \
+The dashboard is divided into the following tabs: <br> \
+* **About Polish ASR Survey** - general information about the survey, references, and contact points <br> \
+* **Polish ASR Speech Data Catalog** - detailed information about the speech data available for Polish ASR <br> \
+* **Polish ASR Speech Data Survey** - analysis of the state of Polish ASR speech data <br> \
+* **ASR Speech Data Taxonomy** - explanation of the columns in the *Polish ASR Speech Datasets Catalog* <br> \
+* **Polish ASR Benchmarks Catalog** - detailed information about the benchmarks available for Polish ASR <br> \
+* **Polish ASR Benchmarks Survey** - analysis of the state of Polish ASR benchmarks <br> \
+* **ASR Benchmarks Taxonomy** - explanation of the columns in the *Polish ASR Benchmarks Catalog* <br> \
+Please visit the respective tab to learn how to use it and provide feedback. <br><br> \
+If you want to share your feedback regarding the Speech Data catalog, please use this [FORM](https://forms.gle/EWJ6YfbJJTyEzQs66). <br><br> \
+If you are looking for the latest ASR benchmarks for Polish, please visit the [BIGOS/PELCRA ASR leaderboard](https://huggingface.co/spaces/amu-cai/pl-asr-bigos-bench-dash). <br><br> \
+You can also contact the author via [email](mailto:michal.junczyk@amu.edu.pl) or [LinkedIn](https://www.linkedin.com/in/michaljunczyk/).<br>"
+CITATION_MAIN = "@misc{junczyk-2024-pl-asr-survey, <br> \
+  title = {Polish ASR Survey}, <br> \
+  author = {Michał Junczyk}, <br> \
+  year = {2024}, <br> \
+  publisher = {Hugging Face}, <br> \
+  url = {https://huggingface.co/spaces/amu-cai/pl-asr-survey} }"
+# TODO
+# * Analysis of datasets utility for the purpose of ASR evaluation (see the **Dataset Utility Index** tab) <br>\
+############################################################################################################
+INFO_CATALOG= "This dashboard complements *Polish ASR Speech Datasets Catalog* available on [GitHub](https://github.com/goodmike31/pl-asr-speech-data-survey) and [Google Sheets](https://docs.google.com/spreadsheets/d/181EDfwZNtHgHFOMaKNtgKssrYDX4tXTJ9POMzBsCRlI/edit#gid=0) by providing:<br> \
+* More convenient browsing of the catalog content (*see the **How to use?** section below*) <br>\
+* Up-to-date analysis of the state of Polish ASR speech data (*see the **Polish ASR Speech Data Survey** tab*) <br><br> \
+IMPORTANT - If you want to share your feedback regarding the catalog, please use this [FORM](https://forms.gle/EWJ6YfbJJTyEzQs66). Each response is granted 50 PLN for the charity of your choice. <br>\
+Your feedback will be helpful to assess the state of Polish ASR speech data from the community perspective.<br><br> \
+If you want to report a missing dataset or request correction of descriptions, please follow the steps described on [GitHub](https://github.com/goodmike31/pl-asr-speech-data-survey?tab=readme-ov-file#how-to-contribute-to-the-polish-asr-speech-datasets-catalog) <br> \
+You can also contact the author via [email](mailto:michal.junczyk@amu.edu.pl) or [LinkedIn](https://www.linkedin.com/in/michaljunczyk/).<br>"
+CITATION_CATALOG="@article{Junczyk+2024+27+52, <br>\
+url = {https://doi.org/10.1515/psicl-2023-0019},<br>\
+title = {A survey of Polish ASR speech datasets},<br>\
+author = {Michał Junczyk},<br>\
+pages = {27--52},<br>\
+volume = {60},<br>\
+number = {1},<br>\
+journal = {Poznan Studies in Contemporary Linguistics},<br>\
+doi = {doi:10.1515/psicl-2023-0019},<br>\
+year = {2024},<br>\
+lastchecked = {2024-03-10}<br>\
+}"
+HOWTO_CATALOG = "To browse the catalog content using filters you must enable them first. <br> \
+You can also sort the columns by clicking on the column header. <br> \
+Depending on the column type, you can use the search box to filter the content. <br> \
+Please refer to the **ASR Speech Data Taxonomy** tab for the explanation of the columns. <br> \
+If you are looking for insights derived from the data collected in the catalog, please go to the **Polish ASR Speech Data Survey** tab. <br>"
+############################################################################################################
 INFO_BENCHMARK = "TODO"
+CITATION_BENCHMARK="@misc{junczyk-2023-pl-asr-speech-data-catalog, <br> \
+  title = {Polish ASR Speech Datasets Catalog}, <br> \
+  author = {Michał Junczyk}, <br> \
+  year = {2023}, <br> \
+  publisher = {Github}, <br> \
+  url = {https://github.com/goodmike31/pl-asr-speech-data-survey} }"
+HOWTO_BENCHMARK = "You can use the filters to browse the catalog content. <br> \
+You can also sort the columns by clicking on the column header. <br> \
+Depending on the column type, you can use the search box to filter the content. <br> \
+Please refer to the **ASR Benchmarks Taxonomy** tab for the explanation of the columns. <br>"
+############################################################################################################