Spaces:
Sleeping
Sleeping
File size: 14,090 Bytes
d5cbb7a 4eee292 d5cbb7a 4eee292 d5cbb7a 4eee292 d5cbb7a e283f70 d5cbb7a e283f70 d5cbb7a e283f70 4eee292 e283f70 d5cbb7a 4eee292 d5cbb7a 4eee292 d5cbb7a 4eee292 d5cbb7a 4eee292 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 |
from io import StringIO

import numpy as np
import pandas as pd
import requests
import streamlit as st
# Shown as "Catalog last update date" in the summary table. Note this is
# simply the date at module import time, not the sheet's real update date.
catalog_last_update_date = pd.to_datetime('today').strftime('%Y-%m-%d')
# TODO - extract from the catalog name
# Row labels for the catalog summary table. The "Value" list assembled in
# catalog_summary_statistics() must stay in exactly this order.
BASE_SUMMARY_METRICS = [
"Catalog last update date",
"Unique Polish speech datasets producers",
"Identified datasets reported in the public domain",
"Datasets available to the public (free and paid)",
"Fraction of reported datasets available to the public [%]",
"Speech data reported in the public domain [hours]",
"Speech data available total [hours]",
"Speech data available free of charge [hours]",
"Speech data available commercially [hours]",
"Reported vs available speech data ratio [%]",
"Transcribed speech data reported in the public domain [hours]",
"Transcribed speech data available total [hours]",
"Transcribed speech data available free of charge [hours]",
"Transcribed speech data available commercially [hours]",
"Reported vs available transcribed speech data ratio [%]",
]
def download_tsv_from_google_sheet(sheet_url, timeout=30):
    """
    Download a Google Sheet as TSV and parse it into a DataFrame.

    Args:
    - sheet_url (str): a Google Sheets '/edit#gid=...' URL.
    - timeout (float): seconds to wait for the HTTP response before giving up.
      Without it a slow/unreachable server would hang the app indefinitely.

    Returns:
    - pd.DataFrame with the sheet contents, or None when the download fails.
    """
    # Rewrite the interactive URL into the TSV export endpoint
    tsv_url = sheet_url.replace('/edit#gid=', '/export?format=tsv&gid=')
    response = requests.get(tsv_url, timeout=timeout)
    # Force UTF-8: the catalog contains Polish diacritics
    response.encoding = 'utf-8'
    if response.status_code == 200:
        return pd.read_csv(StringIO(response.text), sep='\t', encoding='utf-8')
    print(f"Failed to download the TSV file (HTTP {response.status_code}).")
    return None
@st.cache_data
def load_data_catalog():
    """Fetch the speech data catalog sheet as a DataFrame (cached by Streamlit)."""
    print("Reading speech data catalog")
    url = "https://docs.google.com/spreadsheets/d/181EDfwZNtHgHFOMaKNtgKssrYDX4tXTJ9POMzBsCRlI/edit#gid=0"
    return download_tsv_from_google_sheet(url)
@st.cache_data
def load_data_taxonomy():
    """Fetch the speech data survey taxonomy sheet as a DataFrame (cached by Streamlit)."""
    print("Reading speech data survey taxonomy")
    url = "https://docs.google.com/spreadsheets/d/181EDfwZNtHgHFOMaKNtgKssrYDX4tXTJ9POMzBsCRlI/edit#gid=2015613057"
    return download_tsv_from_google_sheet(url)
@st.cache_data
def load_bench_catalog():
    """Fetch the ASR benchmarks catalog sheet as a DataFrame (cached by Streamlit)."""
    print("Reading ASR benchmarks catalog")
    url = "https://docs.google.com/spreadsheets/d/1fVsE98Ulmt-EIEe4wx8sUdo7RLigDdAVjQxNpAJIrH8/edit#gid=0"
    return download_tsv_from_google_sheet(url)
@st.cache_data
def load_bench_taxonomy():
    """Fetch the ASR benchmarks survey taxonomy sheet as a DataFrame (cached by Streamlit)."""
    print("Reading ASR benchmarks survey taxonomy")
    # NOTE(review): this spreadsheet ID is the same as the speech-data taxonomy
    # sheet (load_data_taxonomy), not the benchmarks sheet used in
    # load_bench_catalog — looks like a copy-paste; confirm the intended URL.
    url = "https://docs.google.com/spreadsheets/d/181EDfwZNtHgHFOMaKNtgKssrYDX4tXTJ9POMzBsCRlI/edit#gid=2015613057"
    return download_tsv_from_google_sheet(url)
def style_floats(val):
    """
    Format a cell value for display.

    Floats with no fractional part render as plain integers, all other
    floats with two decimal places; ints render as-is and any other value
    (e.g. strings) passes through unchanged.
    """
    if isinstance(val, float):
        # Whole-valued floats (e.g. 3.0) display as "3", not "3.00".
        return f"{int(val)}" if val % 1 == 0 else f"{val:.2f}"
    if isinstance(val, int):
        return str(val)
    return val
def datasets_count_and_size(df_cat, col_groupby, col_sort=None, col_percent=None, col_sum=None, col_count=None):
    """
    Generate a summary view of datasets grouped by one or more columns.

    Args:
    - df_cat (pd.DataFrame): The base dataframe containing dataset information.
      It is NOT modified (the function works on a copy).
    - col_groupby (str or list): The column(s) to group the datasets by.
    - col_sort (str or list): Column(s) or index level(s) to sort the result by
      (descending). Defaults to col_groupby.
    - col_percent (list): Column(s) whose row-wise sum is expressed as a
      percentage of the grand total in a 'Percent of total' column.
    - col_sum (str or list): The column(s) to sum. Defaults to
      ['Size audio transcribed [hours]'].
    - col_count (str or list): The column(s) to count. Defaults to
      ['Dataset ID']. Each count column is renamed to 'Count <name>'.

    Returns:
    - pd.DataFrame: summary indexed by col_groupby with count, sum and
      (optionally) percent columns; zero sums are shown as NaN (blank).
    """
    # Resolve defaults here instead of using mutable default arguments.
    col_sum = ['Size audio transcribed [hours]'] if col_sum is None else col_sum
    col_count = ['Dataset ID'] if col_count is None else col_count
    if not isinstance(col_sum, list):
        col_sum = [col_sum]
    if not isinstance(col_count, list):
        col_count = [col_count]
    if not isinstance(col_groupby, list):
        col_groupby = [col_groupby]

    # Work on a copy: the original mutated the caller's dataframe, which is
    # unsafe when the frame comes from a @st.cache_data-cached loader.
    df_cat = df_cat.copy()
    # Coerce the sum columns to numeric; unparsable entries become 0.
    for col in col_sum:
        df_cat[col] = pd.to_numeric(df_cat[col], errors='coerce').fillna(0)

    # Aggregate: sum the size columns, count the id columns.
    summary = df_cat.groupby(col_groupby).agg({
        **{col: 'sum' for col in col_sum},
        **{col: 'count' for col in col_count},
    }).reset_index()

    col_name_percent = 'Percent of total'
    if col_percent is not None:
        # Share of each group in the grand total of the col_percent column(s).
        total = summary[col_percent].sum(axis=1)
        summary[col_name_percent] = round(total / total.sum() * 100, 2)

    summary.sort_values(by=col_sum[0], ascending=False, inplace=True)
    summary.reset_index(drop=True, inplace=True)
    summary.set_index(col_groupby, inplace=True)

    # Rename each count column to a descriptive 'Count <name>' and cast to int.
    # (The original only handled a single count column and raised NameError
    # for more than one.)
    count_names = []
    for col in col_count:
        new_name = 'Count ' + col
        summary.rename(columns={col: new_name}, inplace=True)
        summary[new_name] = summary[new_name].astype(int)
        count_names.append(new_name)

    # Column order: counts first, then sums, then the percent column.
    ordered = count_names + col_sum
    if col_percent is not None:
        ordered = ordered + [col_name_percent]
    summary = summary[ordered]

    # Final sort: by the requested column(s), defaulting to the group key.
    col_sort = col_groupby if col_sort is None else col_sort
    summary.sort_values(by=col_sort, ascending=False, inplace=True)

    # Zero totals display as blanks (NaN) rather than 0.
    for col in col_sum:
        summary[col] = summary[col].replace(0, np.nan)
    return summary
def datasets_count_and_size_standard(df_cat, col_groupby):
    """Standard summary view: dataset count, transcribed hours (with percent
    of total), audio recordings and speakers, grouped by col_groupby."""
    return datasets_count_and_size(
        df_cat,
        col_groupby,
        col_sort=col_groupby,
        col_percent=['Size audio transcribed [hours]'],
        col_sum=['Size audio transcribed [hours]', 'Audio recordings', 'Speakers'],
        col_count=['Dataset ID'],
    )
def _meta_coverage_row(df_subset, meta_col):
    """
    Coverage of one metadata column within a catalog subset.

    Returns the summary row of datasets answering 'yes' for meta_col, or an
    all-zero row when no dataset in the subset reports that metadata.
    """
    summary = datasets_count_and_size_standard(df_subset, meta_col)
    if 'yes' in summary.index:
        return summary.loc['yes']
    return {
        'Size audio transcribed [hours]': 0,
        'Count Dataset ID': 0,
        'Percent of total': 0,
    }

def metadata_coverage(df_cat, df_cat_available_free, df_cat_available_paid):
    """
    Summarize how well datasets cover selected metadata fields.

    Args:
    - df_cat (pd.DataFrame): the full catalog.
    - df_cat_available_free (pd.DataFrame): available, free-of-charge subset.
    - df_cat_available_paid (pd.DataFrame): available, commercial subset.

    Returns:
    - tuple(pd.DataFrame, pd.DataFrame): (flat view with a 'Type' column
      distinguishing Free/Paid rows, pivoted view comparing Free vs Paid
      side by side per metadata field).
    """
    # TODO - add number of speakers and recordings.
    # Candidate fields not yet covered: 'Speaker id info',
    # 'Part of speech annotation', 'Named entity annotation', 'Emotion annotation'.
    meta_data_cols = ['Gender info', 'Age info', 'Accent info', 'Nativity info', 'Time alignement annotation']
    col_name_sum_size = 'Size audio transcribed [hours]'
    col_name_count = 'Count Dataset ID'
    col_name_percent = 'Percent of total'

    meta_coverage_paid_sets = {col: _meta_coverage_row(df_cat_available_paid, col) for col in meta_data_cols}
    meta_coverage_free_sets = {col: _meta_coverage_row(df_cat_available_free, col) for col in meta_data_cols}
    # NOTE(review): full-catalog coverage was computed but never used
    # downstream in the original; kept for parity — confirm before removing.
    meta_coverage_all_sets = {col: _meta_coverage_row(df_cat, col) for col in meta_data_cols}

    # Merge the free and paid coverage into one flat frame.
    df_meta_free = pd.DataFrame.from_dict(meta_coverage_free_sets, orient='index')
    df_meta_free[col_name_count] = df_meta_free[col_name_count].astype(int)
    df_meta_free['Type'] = 'Free'
    df_meta_paid = pd.DataFrame.from_dict(meta_coverage_paid_sets, orient='index')
    df_meta_paid[col_name_count] = df_meta_paid[col_name_count].astype(int)
    df_meta_paid['Type'] = 'Paid'
    df_meta_all_flat = pd.concat([df_meta_free, df_meta_paid])

    # Pivot so Free and Paid can be compared column by column per metric.
    df_meta_all_pivot = df_meta_all_flat.reset_index()
    df_meta_all_pivot = df_meta_all_pivot.rename(columns={'index': 'Metadata'})
    df_meta_all_pivot = df_meta_all_pivot.pivot(index='Metadata', columns='Type', values=[col_name_count, col_name_sum_size, col_name_percent])
    df_meta_all_pivot[col_name_count] = df_meta_all_pivot[col_name_count].astype(int)
    return (df_meta_all_flat, df_meta_all_pivot)
def catalog_summary_statistics(df_cat):
    """
    Generate summary statistics for the speech data catalog.

    Args:
    - df_cat (pd.DataFrame): the catalog; must contain the columns
      'Size audio total [hours]', 'Size audio transcribed [hours]',
      'Available online', 'Price - non-commercial usage' and 'Publisher'.
      It is NOT modified (the function works on a copy).

    Returns:
    - pd.DataFrame: one row per metric in BASE_SUMMARY_METRICS, indexed
      by "Metric".
    """
    col_name_transcribed = 'Size audio transcribed [hours]'
    col_name_audio = 'Size audio total [hours]'
    # Work on a copy: the original converted columns in place, mutating the
    # caller's (possibly @st.cache_data-cached) dataframe.
    df_cat = df_cat.copy()
    df_cat[col_name_audio] = pd.to_numeric(df_cat[col_name_audio], errors='coerce')
    df_cat[col_name_transcribed] = pd.to_numeric(df_cat[col_name_transcribed], errors='coerce')

    # Availability/price masks. Anything not explicitly 'free' (including
    # missing values) counts as paid, matching the original != 'free' test.
    is_available = df_cat['Available online'] == 'yes'
    is_free = df_cat['Price - non-commercial usage'] == 'free'
    df_cat_available = df_cat[is_available]
    df_cat_available_free = df_cat[is_available & is_free]
    df_cat_available_paid = df_cat[is_available & ~is_free]

    def _pct(part, whole):
        # Percentage rounded to 2 decimals; returns 0.0 instead of dividing
        # by zero when the catalog (or the denominator sum) is empty.
        return round(part / whole * 100, 2) if whole else 0.0

    # Basic counts
    identified_datasets_count = df_cat.shape[0]
    accessible_datasets_count = df_cat_available.shape[0]
    unique_producers_count = df_cat['Publisher'].nunique()
    accessible_datasets_fraction = _pct(accessible_datasets_count, identified_datasets_count)

    # Hours of audio, total and transcribed, by availability
    audio_reported = round(df_cat[col_name_audio].sum(), 2)
    audio_accessible = round(df_cat_available[col_name_audio].sum(), 2)
    audio_accessible_free = round(df_cat_available_free[col_name_audio].sum(), 2)
    audio_accessible_paid = round(df_cat_available_paid[col_name_audio].sum(), 2)
    transcribed_audio_reported = round(df_cat[col_name_transcribed].sum(), 2)
    transcribed_audio_accessible = round(df_cat_available[col_name_transcribed].sum(), 2)
    transcribed_audio_accessible_free = round(df_cat_available_free[col_name_transcribed].sum(), 2)
    transcribed_audio_accessible_paid = round(df_cat_available_paid[col_name_transcribed].sum(), 2)

    # Available vs reported ratios
    accessible_vs_reported_audio_ratio = _pct(audio_accessible, audio_reported)
    accessible_vs_reported_transcribed_ratio = _pct(transcribed_audio_accessible, transcribed_audio_reported)

    # Values must stay in the same order as BASE_SUMMARY_METRICS.
    metrics_df = pd.DataFrame({
        "Metric": BASE_SUMMARY_METRICS,
        "Value": [
            catalog_last_update_date,
            unique_producers_count,
            identified_datasets_count,
            accessible_datasets_count,
            accessible_datasets_fraction,
            audio_reported,
            audio_accessible,
            audio_accessible_free,
            audio_accessible_paid,
            accessible_vs_reported_audio_ratio,
            transcribed_audio_reported,
            transcribed_audio_accessible,
            transcribed_audio_accessible_free,
            transcribed_audio_accessible_paid,
            accessible_vs_reported_transcribed_ratio,
        ],
    })
    metrics_df.set_index("Metric", inplace=True)
    return metrics_df
def right_align(s, props='text-align: right;'):
    """Styler helper: always return the right-align CSS, ignoring the cell value."""
    return props
def left_align(s, props='text-align: left;'):
    """Styler helper: always return the left-align CSS, ignoring the cell value."""
    return props