mj-new committed
Commit 4eee292
Parent(s): d9c6196

Replaced no-info with None values
Files changed:
- __pycache__/utils.cpython-310.pyc +0 -0
- app.py +4 -2
- requirements.txt +2 -1
- utils.py +36 -7
__pycache__/utils.cpython-310.pyc
CHANGED
Binary files a/__pycache__/utils.cpython-310.pyc and b/__pycache__/utils.cpython-310.pyc differ
app.py
CHANGED
@@ -6,6 +6,7 @@ from contants import INFO_CATALOG, CITATION_CATALOG, HOWTO_CATALOG,INFO_BENCHMAR
 from utils import BASE_SUMMARY_METRICS
 from utils import load_data_catalog, load_data_taxonomy, load_bench_catalog, load_bench_taxonomy
 from utils import datasets_count_and_size, datasets_count_and_size_standard, metadata_coverage, catalog_summary_statistics
+from utils import left_align, right_align
 
 import matplotlib.pyplot as plt
 import seaborn as sns
@@ -66,6 +67,7 @@ with data_survey:
     df_summary_metrics = catalog_summary_statistics(df_data_cat)
 
     df_basic_stats = df_summary_metrics.loc[BASE_SUMMARY_METRICS[0:5]]
+
     st.dataframe(df_basic_stats, use_container_width=False)
 
     st.header("Speech data available across Polish ASR speech datasets")
@@ -80,9 +82,9 @@ with data_survey:
     # Display distribution of datasets created per year
     st.header("Polish ASR speech datasets created in 1997-2023")
     col_groupby = ['Creation year']
-
+    df_datasets_per_year = datasets_count_and_size(df_data_cat, col_groupby, col_sort=col_groupby, col_percent=None, col_sum=['Size audio transcribed [hours]','Audio recordings', 'Speakers'], col_count = ['Dataset ID'])
 
-    st.dataframe(
+    st.dataframe(df_datasets_per_year, use_container_width=False)
 
     st.header("Institutions contributing Polish ASR speech dataset")
     col_groupby = ['Publisher']
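The substantive app.py change is the new per-year table: datasets_count_and_size is now called with three col_sum columns and its result shown via st.dataframe. Only the helper's sorting and zero handling are visible in this commit, so the sketch below of what the call roughly computes is an assumption (the groupby/agg shape in particular); the column names come from the call itself, and df_data_cat is the catalog frame loaded earlier in app.py.

```python
# Hedged sketch of what df_datasets_per_year likely contains; the groupby/agg
# shape is assumed, only sorting and the 0 -> NaN step are visible in this diff.
per_year = (
    df_data_cat
    .groupby('Creation year', as_index=False)
    .agg({'Dataset ID': 'count',                       # col_count
          'Size audio transcribed [hours]': 'sum',     # col_sum
          'Audio recordings': 'sum',
          'Speakers': 'sum'})
    .sort_values(by='Creation year', ascending=False)  # col_sort = col_groupby
)
```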
requirements.txt
CHANGED
@@ -1,3 +1,4 @@
 seaborn
 matplotlib
-pandas
+pandas
+librosa
utils.py
CHANGED
@@ -1,6 +1,7 @@
 import requests
 import pandas as pd
 import streamlit as st
+import numpy as np
 
 catalog_last_update_date = pd.to_datetime('today').strftime('%Y-%m-%d')
 # TODO - extract from the catalog name
@@ -30,13 +31,14 @@ def download_tsv_from_google_sheet(sheet_url):
 
     # Send a GET request to download the TSV file
     response = requests.get(tsv_url)
-
+    response.encoding = 'utf-8'
+
     # Check if the request was successful
     if response.status_code == 200:
         # Read the TSV content into a pandas DataFrame
         from io import StringIO
         tsv_content = StringIO(response.text)
-        df = pd.read_csv(tsv_content, sep='\t')
+        df = pd.read_csv(tsv_content, sep='\t', encoding='utf-8')
        return df
     else:
         print("Failed to download the TSV file.")
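The two UTF-8 additions in download_tsv_from_google_sheet guard against mojibake in Polish text: when a response does not declare a charset, requests can fall back to ISO-8859-1 while decoding response.text, which garbles characters such as ą, ć and ł. A small standalone check, using a placeholder URL of the same shape the helper builds (not taken from this repo):

```python
import requests

# Placeholder Sheets export URL, for illustration only.
tsv_url = "https://docs.google.com/spreadsheets/d/<sheet-id>/export?format=tsv"

response = requests.get(tsv_url)
print(response.encoding)            # encoding implied by the HTTP headers
print(response.apparent_encoding)   # encoding guessed from the payload itself
response.encoding = 'utf-8'         # force UTF-8 before reading response.text,
                                    # mirroring the line added in this commit
```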
@@ -71,6 +73,22 @@ def load_bench_taxonomy():
     df_taxonomy = download_tsv_from_google_sheet(taxonomy_url)
     return(df_taxonomy)
 
+def style_floats(val):
+    """
+    Converts float to int if the fractional part is zero, formats floats with two decimal places,
+    and leaves strings unchanged.
+    """
+    # Check if value is a float and if it can be converted to an int without loss
+    if isinstance(val, float):
+        if val % 1 == 0:
+            return f"{int(val)}"  # Convert float with no fractional part to int
+        else:
+            return f"{val:.2f}"  # Format floats with two decimal places
+    elif isinstance(val, int):
+        return f"{val}"  # Handle pure integers separately (though likely unnecessary)
+    else:
+        return val  # Return strings unchanged
+
 
 def datasets_count_and_size(df_cat, col_groupby, col_sort=None, col_percent=None, col_sum=['Size audio transcribed [hours]'], col_count=['Dataset ID']):
     """
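style_floats acts as a per-cell formatter: floats with no fractional part are printed as integers, other floats get two decimal places, and everything else passes through untouched. The Styler calls that would apply it stay commented out later in this diff; a quick standalone check of its behaviour, assuming utils.py from this Space is importable:

```python
from utils import style_floats

print(style_floats(12.0))    # "12"   -> integral float collapses to int
print(style_floats(3.456))   # "3.46" -> two decimal places
print(style_floats(7))       # "7"    -> plain int
print(style_floats("n/a"))   # "n/a"  -> strings returned unchanged
```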
@@ -144,11 +162,13 @@ def datasets_count_and_size(df_cat, col_groupby, col_sort=None, col_percent=None
     # Sort by the provided column col_sort
     col_sort = col_groupby if col_sort is None else col_sort
     summary.sort_values(by=col_sort, ascending=False, inplace=True)
-
-
+
+    print(col_sum)
     for col in col_sum:
-
-
+        print(col)
+        #summary[col] = summary[col].apply(lambda x: str(int(x)) if float(x).is_integer() else str(x))
+        summary[col] = summary[col].replace(0, np.nan)
+
     return summary
 
 
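This hunk is what the commit message refers to: aggregated columns that come out as 0, previously used as a "no info" marker, are replaced with NaN so the displayed tables show the value as missing rather than as a misleading zero. A minimal illustration of the replacement:

```python
import numpy as np
import pandas as pd

summary = pd.DataFrame({'Speakers': [25, 0, 120]})   # 0 stands for "no information"
summary['Speakers'] = summary['Speakers'].replace(0, np.nan)
print(summary['Speakers'].tolist())   # -> [25.0, nan, 120.0]; dtype becomes float64
```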
@@ -210,6 +230,9 @@ def metadata_coverage(df_cat, df_cat_available_free, df_cat_available_paid):
     df_meta_all_pivot = df_meta_all_pivot.pivot(index='Metadata', columns='Type', values=[col_name_count, col_name_sum_size, col_name_percent])
     df_meta_all_pivot[col_name_count]=df_meta_all_pivot[col_name_count].astype(int)
 
+    #df_meta_all_pivot_styled = df_meta_all_pivot.style.map(style_floats)
+    #df_meta_all_flat_styled = df_meta_all_flat.style.map(style_floats)
+
     return(df_meta_all_flat, df_meta_all_pivot)
 
 
@@ -289,4 +312,10 @@ def catalog_summary_statistics(df_cat):
     metrics_df = pd.DataFrame(metrics_dict)
     metrics_df.reset_index(drop=True, inplace=True)
     metrics_df.set_index("Metric", inplace=True)
-    return(metrics_df)
+    return(metrics_df)
+
+def right_align(s, props='text-align: right;'):
+    return props
+
+def left_align(s, props='text-align: left;'):
+    return props