mj-new committed on
Commit
968224e
1 Parent(s): de78526

Improved description and references

Browse files
__pycache__/app_utils.cpython-310.pyc CHANGED
Binary files a/__pycache__/app_utils.cpython-310.pyc and b/__pycache__/app_utils.cpython-310.pyc differ
 
__pycache__/contants.cpython-310.pyc CHANGED
Binary files a/__pycache__/contants.cpython-310.pyc and b/__pycache__/contants.cpython-310.pyc differ
 
app.py CHANGED
@@ -2,7 +2,7 @@ import pandas as pd
2
  import streamlit as st
3
 
4
  from app_utils import filter_dataframe, calculate_height_to_display
5
- from contants import INFO_CATALOG, CITATION_CATALOG, HOWTO_CATALOG,INFO_BENCHMARK, CITATION_BENCHMARK, INFO_SURVEY, CITATION_SURVEY
6
  from utils import BASE_SUMMARY_METRICS
7
  from utils import load_data_catalog, load_data_taxonomy, load_bench_catalog, load_bench_taxonomy
8
  from utils import datasets_count_and_size, datasets_count_and_size_standard, metadata_coverage, catalog_summary_statistics
@@ -32,9 +32,16 @@ df_data_cat_available_paid = df_data_cat[(df_data_cat['Available online'] == 'ye
32
  df_bench_cat = load_bench_catalog()
33
  df_bench_tax = load_bench_taxonomy()
34
 
35
- data_cat, data_taxonomy, data_survey, bench_cat, bench_taxonomy, bench_survey = st.tabs(["PL ASR speech data **catalog**", "PL ASR speech data **survey**", "ASR speech data **taxonomy**", "PL ASR benchmarks catalog", "ASR benchmarks taxonomy", "PL ASR benchmarks survey"])
36
 
37
 
 
 
 
 
 
 
 
38
  with data_cat:
39
  st.title("Polish ASR Speech Datasets Catalog")
40
 
@@ -43,14 +50,13 @@ with data_cat:
43
  st.header("How to use?")
44
  st.markdown(HOWTO_CATALOG, unsafe_allow_html=True)
45
 
46
- st.header("How to cite?")
47
- st.markdown(CITATION_CATALOG, unsafe_allow_html=True)
48
-
49
  # Display catalog contents
50
  st.header("Browse the catalog content")
51
  st.dataframe(filter_dataframe(df_data_cat, "datasets"), hide_index=True, use_container_width=True)
52
 
53
- # Display taxonomy contents
 
 
54
 
55
  with data_survey:
56
  # Display summary statistics
 
2
  import streamlit as st
3
 
4
  from app_utils import filter_dataframe, calculate_height_to_display
5
+ from contants import INFO_CATALOG, CITATION_CATALOG, HOWTO_CATALOG,INFO_BENCHMARK, CITATION_BENCHMARK, HOWTO_BENCHMARK, INFO_MAIN, CITATION_MAIN
6
  from utils import BASE_SUMMARY_METRICS
7
  from utils import load_data_catalog, load_data_taxonomy, load_bench_catalog, load_bench_taxonomy
8
  from utils import datasets_count_and_size, datasets_count_and_size_standard, metadata_coverage, catalog_summary_statistics
 
32
  df_bench_cat = load_bench_catalog()
33
  df_bench_tax = load_bench_taxonomy()
34
 
35
+ about, data_cat, data_taxonomy, data_survey, bench_cat, bench_survey, bench_taxonomy = st.tabs(["PL ASR survey", "PL ASR Speech Data **Catalog**", "PL ASR Speech data **Survey**", "ASR Speech Data **Taxonomy**", "PL ASR Benchmarks Catalog", "PL ASR Benchmarks Survey", "ASR Benchmarks Taxonomy"])
36
 
37
 
38
+ with about:
39
+ st.title("About Polish ASR Survey")
40
+ st.markdown(INFO_MAIN, unsafe_allow_html=True)
41
+
42
+ st.header("How to cite this resource?")
43
+ st.markdown(CITATION_MAIN, unsafe_allow_html=True)
44
+
45
  with data_cat:
46
  st.title("Polish ASR Speech Datasets Catalog")
47
 
 
50
  st.header("How to use?")
51
  st.markdown(HOWTO_CATALOG, unsafe_allow_html=True)
52
 
 
 
 
53
  # Display catalog contents
54
  st.header("Browse the catalog content")
55
  st.dataframe(filter_dataframe(df_data_cat, "datasets"), hide_index=True, use_container_width=True)
56
 
57
+ st.header("How to cite this resource?")
58
+ st.markdown(CITATION_CATALOG, unsafe_allow_html=True)
59
+
60
 
61
  with data_survey:
62
  # Display summary statistics
app_utils.py CHANGED
@@ -29,9 +29,9 @@ def filter_dataframe(df: pd.DataFrame, target) -> pd.DataFrame:
29
  pd.DataFrame: Filtered dataframe
30
  """
31
  if(target == "datasets"):
32
- modify = st.checkbox("Use filters on speech data catalog")
33
  elif(target == "benchmarks"):
34
- modify = st.checkbox("Use filters on benchmarks catalog")
35
  else:
36
  print("Invalid target")
37
 
 
29
  pd.DataFrame: Filtered dataframe
30
  """
31
  if(target == "datasets"):
32
+ modify = st.checkbox("Enable filters to browse ASR speech data catalog")
33
  elif(target == "benchmarks"):
34
+ modify = st.checkbox("Enable filters to browse ASR benchmarks catalog")
35
  else:
36
  print("Invalid target")
37
 
contants.py CHANGED
@@ -1,28 +1,71 @@
1
- INFO_CATALOG = "This dashboard complements *Polish ASR Speech Datasets Catalog* available on [GitHub](https://github.com/goodmike31/pl-asr-speech-data-survey) and [Google Sheets](https://docs.google.com/spreadsheets/d/181EDfwZNtHgHFOMaKNtgKssrYDX4tXTJ9POMzBsCRlI/edit#gid=0) by providing:<br> \
2
- * More convenient browsing of the catalog content (see the **How to use?** section below) <br>\
3
- * Analysis of datasets utility for the purpose of ASR evaluation (see the **Dataset Utility Index** tab) <br>\
4
- * Analysis of the state of Polish ASR speech data (see the **Polish ASR Speech Data Survey** tab <br><br> \
5
- IMPORTANT - You can share your feedback [HERE](https://forms.gle/EWJ6YfbJJTyEzQs66). <br>\
6
- Each response is granted 50 PLN for the charity of your choice. <br>\
7
- The feedback will help to assess the state of Polish ASR speech data from the community perspective.<br>\
8
- "
9
-
10
- CITATION_CATALOG="Please cite this work as: <br> \
11
- *@misc{pl-asr-speech-data-catalog, <br> \
12
- title={Polish ASR Speech Datasets Catalog}, <br> \
13
- author={Michał Junczyk}, <br> \
14
- year={2023}, <br> \
15
- publisher={Github}, <br> \
16
- howpublished={\\url{https://github.com/goodmike31/pl-asr-speech-data-survey}}*<br>"
17
-
18
- HOWTO_CATALOG = "You can use the filters on the left to browse the catalog content. <br> \
19
- Please refer to the **Data Catalog Taxonomy** tab for the explanation of the columns. <br>"
20
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
  INFO_BENCHMARK = "TODO"
22
 
23
- INFO_SURVEY = "This dashboard complements [Polish Speech Datasets Survey]"
 
 
 
 
 
24
 
 
 
 
 
25
 
26
- CITATION_BENCHMARK="Please cite this work as: TODO"
27
 
28
- CITATION_SURVEY="Please cite this work as: TODO"
 
1
+ INFO_MAIN= " Welcome to the Polish ASR Survey dashboard! <br> \
2
+ You can use this dashboard to learn about the state of Polish ASR speech data and benchmarks. <br> \
3
+ The dashboard is built upon the [*Polish ASR Speech Datasets Catalog*](https://github.com/goodmike31/pl-asr-speech-data-survey) and [*Polish ASR Benchmarks Catalog*](https://github.com/goodmike31/pl-asr-benchmarks-survey). <br><br> \
4
+ The dashboard is divided into the following tabs: <br> \
5
+ * **About Polish ASR Survey** - general information about the survey, references, and contact points <br> \
6
+ * **Polish ASR Speech Data Catalog** - detailed information about the speech data available for Polish ASR <br> \
7
+ * **Polish ASR Speech Data Survey** - analysis of the state of Polish ASR speech data <br> \
8
+ * **ASR Speech Data Taxonomy** - explanation of the columns in the *Polish ASR Speech Datasets Catalog* <br> \
9
+ * **Polish ASR Benchmarks Catalog** - detailed information about the benchmarks available for Polish ASR <br> \
10
+ * **Polish ASR Benchmarks Survey** - analysis of the state of Polish ASR benchmarks <br> \
11
+ * **ASR Benchmarks Taxonomy** - explanation of the columns in the *Polish ASR Benchmarks Catalog* <br> \
12
+ Please visit the respective tab to learn how to use it and provide feedback. <br><br> \
13
+ If you want to share your feedback regarding the Speech Data catalog, please use this [FORM](https://forms.gle/EWJ6YfbJJTyEzQs66). <br><br> \
14
+ If you are looking for the latest ASR benchmarks for Polish, please visit the [BIGOS/PELCRA ASR leaderboard](https://huggingface.co/spaces/amu-cai/pl-asr-bigos-bench-dash). <br><br> \
15
+ You can also contact the author via [email](mailto:michal.junczyk@amu.edu.pl) or [LinkedIn](https://www.linkedin.com/in/michaljunczyk/).<br>"
 
 
 
 
16
 
17
+ CITATION_MAIN = "@misc{junczyk-2024-pl-asr-survey, <br> \
18
+ title = {Polish ASR Survey}, <br> \
19
+ author = {Michał Junczyk}, <br> \
20
+ year = {2024}, <br> \
21
+ publisher = {Hugging Face}, <br> \
22
+ url = {https://huggingface.co/spaces/amu-cai/pl-asr-survey} }"
23
+
24
+ # TODO
25
+ # * Analysis of datasets utility for the purpose of ASR evaluation (see the **Dataset Utility Index** tab) <br>\
26
+ ############################################################################################################
27
+
28
+ INFO_CATALOG= "This dashboard complements *Polish ASR Speech Datasets Catalog* available on [GitHub](https://github.com/goodmike31/pl-asr-speech-data-survey) and [Google Sheets](https://docs.google.com/spreadsheets/d/181EDfwZNtHgHFOMaKNtgKssrYDX4tXTJ9POMzBsCRlI/edit#gid=0) by providing:<br> \
29
+ * More convenient browsing of the catalog content (*see the **How to use?** section below*) <br>\
30
+ * Up-to-date analysis of the state of Polish ASR speech data (*see the **Polish ASR Speech Data Survey** tab*) <br><br> \
31
+ IMPORTANT - If you want to share your feedback regarding the catalog, please use this [FORM](https://forms.gle/EWJ6YfbJJTyEzQs66). Each response is granted 50 PLN for the charity of your choice. <br>\
32
+ Your feedback will be helpful to assess the state of Polish ASR speech data from the community perspective.<br><br> \
33
+ If you want to report a missing dataset or request a correction of descriptions, please follow the steps described on [GitHub](https://github.com/goodmike31/pl-asr-speech-data-survey?tab=readme-ov-file#how-to-contribute-to-the-polish-asr-speech-datasets-catalog) <br> \
34
+ You can also contact the author via [email](mailto:michal.junczyk@amu.edu.pl) or [LinkedIn](https://www.linkedin.com/in/michaljunczyk/).<br>"
35
+
36
+ CITATION_CATALOG="@article{Junczyk+2024+27+52, <br>\
37
+ url = {https://doi.org/10.1515/psicl-2023-0019},<br>\
38
+ title = {A survey of Polish ASR speech datasets},<br>\
39
+ author = {Michał Junczyk},<br>\
40
+ pages = {27--52},<br>\
41
+ volume = {60},<br>\
42
+ number = {1},<br>\
43
+ journal = {Poznan Studies in Contemporary Linguistics},<br>\
44
+ doi = {doi:10.1515/psicl-2023-0019},<br>\
45
+ year = {2024},<br>\
46
+ lastchecked = {2024-03-10}<br>\
47
+ }"
48
+
49
+ HOWTO_CATALOG = "To browse the catalog content using filters you must enable them first. <br> \
50
+ You can also sort the columns by clicking on the column header. <br> \
51
+ Depending on the column type, you can use the search box to filter the content. <br> \
52
+ Please refer to the **ASR Speech Data Taxonomy** tab for the explanation of the columns. <br> \
53
+ If you are looking for insights derived from the data collected in the catalog, please go to the **Polish ASR Speech Data Survey** tab. <br>"
54
+
55
+ ############################################################################################################
56
  INFO_BENCHMARK = "TODO"
57
 
58
+ CITATION_BENCHMARK="@misc{junczyk-2023-pl-asr-speech-data-catalog, <br> \
59
+ title = {Polish ASR Speech Datasets Catalog}, <br> \
60
+ author = {Michał Junczyk}, <br> \
61
+ year = {2023}, <br> \
62
+ publisher = {Github}, <br> \
63
+ url = {https://github.com/goodmike31/pl-asr-speech-data-survey} }"
64
 
65
+ HOWTO_BENCHMARK = "You can use the filters to browse the catalog content. <br> \
66
+ You can also sort the columns by clicking on the column header. <br> \
67
+ Depending on the column type, you can use the search box to filter the content. <br> \
68
+ Please refer to the **ASR Benchmarks Taxonomy** tab for the explanation of the columns. <br>"
69
 
 
70
 
71
+ ############################################################################################################