import requests import pandas as pd import streamlit as st catalog_last_update_date = pd.to_datetime('today').strftime('%Y-%m-%d') # TODO - extract from the catalog name BASE_SUMMARY_METRICS = [ "Catalog last update date", "Unique Polish speech datasets producers", "Identified datasets reported in the public domain", "Datasets available to the public (free and paid)", "Fraction of reported datasets available to the public [%]", "Speech data reported in the public domain [hours]", "Speech data available total [hours]", "Speech data available free of charge [hours]", "Speech data available commercially [hours]", "Reported vs available speech data ratio [%]", "Transcribed speech data reported in the public domain [hours]", "Transcribed speech data available total [hours]", "Transcribed speech data available free of charge [hours]", "Transcribed speech data available commercially [hours]", "Reported vs available transcribed speech data ratio [%]", ] def download_tsv_from_google_sheet(sheet_url): # Modify the Google Sheet URL to export it as TSV tsv_url = sheet_url.replace('/edit#gid=', '/export?format=tsv&gid=') # Send a GET request to download the TSV file response = requests.get(tsv_url) # Check if the request was successful if response.status_code == 200: # Read the TSV content into a pandas DataFrame from io import StringIO tsv_content = StringIO(response.text) df = pd.read_csv(tsv_content, sep='\t') return df else: print("Failed to download the TSV file.") return None @st.cache_data def load_catalog(): print("Reading speech data catalog") catalog_url="https://docs.google.com/spreadsheets/d/181EDfwZNtHgHFOMaKNtgKssrYDX4tXTJ9POMzBsCRlI/edit#gid=0" df_catalog = download_tsv_from_google_sheet(catalog_url) return(df_catalog) @st.cache_data def load_taxonomy(): print("Reading speech data survey taxonomy") taxonomy_url="https://docs.google.com/spreadsheets/d/181EDfwZNtHgHFOMaKNtgKssrYDX4tXTJ9POMzBsCRlI/edit#gid=2015613057" df_taxonomy = download_tsv_from_google_sheet(taxonomy_url) return(df_taxonomy) def datasets_count_and_size(df_cat, col_groupby, col_sort=None, col_percent=None, col_sum=['Size audio transcribed [hours]'], col_count=['Dataset ID']): """ Function to generate a summary view of datasets by speech type and other relevant metrics. Args: - df_cat (pd.DataFrame): The base dataframe containing dataset information. - col_sum (str or list): The column(s) to sum. - col_count (str or list): The column(s) to count. - col_groupby (str or list): The column(s) to group the datasets by. - col_percent (str): The column to calculate the percentage of total. Returns: - pd.DataFrame: A dataframe summarizing datasets by speech type and other relevant metrics. """ # Convert col_sum, col_count, and col_groupby to lists if they are not already if not isinstance(col_sum, list): col_sum = [col_sum] if not isinstance(col_count, list): col_count = [col_count] if not isinstance(col_groupby, list): col_groupby = [col_groupby] # First, ensure that the data types and potential missing values are handled correctly for col in col_sum: num_values = df_cat[col].apply(lambda x: pd.to_numeric(x, errors='coerce')).fillna(0) df_cat[col] = num_values # Aggregating datasets by provided column type summary = df_cat.groupby(col_groupby).agg({ **{col: 'sum' for col in col_sum}, **{col: 'count' for col in col_count} }).reset_index() col_name_percent = 'Percent of total' if col_percent is not None: # Calculating the percentage total = summary[col_percent].sum(axis=1) summary[col_name_percent] = round(total / total.sum() * 100, 2) # Sorting the summary by the sum of the column summary.sort_values(by=col_sum[0], ascending=False, inplace=True) # Replacing index with the groupby column summary.reset_index(drop=True, inplace=True) summary.set_index(col_groupby, inplace=True) # Rename the column to a more descriptive name if len(col_count) == 0: col_name_count = None elif len(col_count) == 1: col_name_count = 'Count ' + col_count[0] summary.rename(columns={col_count[0]: col_name_count }, inplace=True) summary[col_name_count] = summary[col_name_count].astype(int) else: #TODO - add support for renaming multiple count columns pass # Make the order of columns as follows 'Count Dataset ID', Total transcribed [hours], 'Percent of total' if col_percent is None: if col_name_count not in summary.columns: summary = summary[col_sum] else: summary = summary[[col_name_count] + col_sum] else: if col_name_count not in summary.columns: summary = summary[col_sum + [col_name_percent]] else: summary = summary[[col_name_count] + col_sum + [col_name_percent]] # Sort by the provided column col_sort col_sort = col_groupby if col_sort is None else col_sort summary.sort_values(by=col_sort, ascending=False, inplace=True) # Replace 0 with no-info in columns with sum for col in col_sum: summary[col] = summary[col].replace(0, 'no-info') return summary def datasets_count_and_size_standard(df_cat, col_groupby): return datasets_count_and_size(df_cat, col_groupby, col_sort=col_groupby, col_percent=['Size audio transcribed [hours]'], col_sum=['Size audio transcribed [hours]','Audio recordings', 'Speakers'], col_count=['Dataset ID']) def metadata_coverage(df_cat, df_cat_available_free, df_cat_available_paid): #TODO - add number of speakers and recordings # 'Speaker id info', 'Part of speech annotation', 'Named entity annotation', 'Emotion annotation' meta_data_cols = ['Gender info', 'Age info', 'Accent info', 'Nativity info', 'Time alignement annotation'] meta_coverage_all_sets = {} meta_coverage_free_sets = {} meta_coverage_paid_sets = {} col_name_sum_size = 'Size audio transcribed [hours]' col_name_count = 'Count Dataset ID' col_name_percent = 'Percent of total' #, 'Named entity annotation', 'Emotion annotation'] for meta_data_col in meta_data_cols: df_datasets_per_meta_paid = datasets_count_and_size_standard(df_cat_available_paid, meta_data_col) #print(df_datasets_per_meta_paid) if 'yes' in df_datasets_per_meta_paid.index: meta_coverage_paid_sets[meta_data_col] = df_datasets_per_meta_paid.loc['yes'] else: meta_coverage_paid_sets[meta_data_col] = {col_name_sum_size:0, col_name_count:0, col_name_percent:0} df_datasets_per_meta_all = datasets_count_and_size_standard(df_cat, meta_data_col) #print(df_datasets_per_meta_all) # select row where index has value "yes" and column name is "Percent of total" if 'yes' in df_datasets_per_meta_all.index: meta_coverage_all_sets[meta_data_col] = df_datasets_per_meta_all.loc['yes'] else: meta_coverage_all_sets[meta_data_col] = {col_name_sum_size:0, col_name_count:0, col_name_percent:0} df_datasets_per_meta_free = datasets_count_and_size_standard(df_cat_available_free, meta_data_col) #print(df_datasets_per_meta_free) # check if index has value "yes", if not assign 0 if 'yes' in df_datasets_per_meta_free.index: meta_coverage_free_sets[meta_data_col] = df_datasets_per_meta_free.loc['yes'] else: meta_coverage_free_sets[meta_data_col] = {col_name_sum_size:0, col_name_count:0, col_name_percent:0} #merge all free and paid dataframes df_meta_free = pd.DataFrame.from_dict(meta_coverage_free_sets, orient='index') df_meta_free[col_name_count] = df_meta_free[col_name_count].astype(int) df_meta_paid = pd.DataFrame.from_dict(meta_coverage_paid_sets, orient='index') df_meta_paid[col_name_count] = df_meta_paid[col_name_count].astype(int) df_meta_free['Type'] = 'Free' df_meta_paid['Type'] = 'Paid' df_meta_all_flat = pd.concat([df_meta_free, df_meta_paid]) #transform to compare free and paid column by column df_meta_all_pivot = df_meta_all_flat.reset_index() df_meta_all_pivot = df_meta_all_pivot.rename(columns={'index':'Metadata'}) df_meta_all_pivot = df_meta_all_pivot.pivot(index='Metadata', columns='Type', values=[col_name_count, col_name_sum_size, col_name_percent]) df_meta_all_pivot[col_name_count]=df_meta_all_pivot[col_name_count].astype(int) return(df_meta_all_flat, df_meta_all_pivot) def catalog_summary_statistics(df_cat): """ Function to generate summary statistics for the speech data catalog. Args: - df_cat (pd.DataFrame): The base dataframe containing dataset information. Returns: - pd.DataFrame: A dataframe summarizing the speech data catalog. """ col_name_transcribed = 'Size audio transcribed [hours]' col_name_audio= 'Size audio total [hours]' # Convert numerical fields to numeric type df_cat[col_name_audio] = pd.to_numeric(df_cat[col_name_audio], errors='coerce') df_cat[col_name_transcribed] = pd.to_numeric(df_cat[col_name_transcribed], errors='coerce') # Filter out non-available datasets df_cat_available = df_cat[df_cat['Available online'] == 'yes'] df_cat_free = df_cat[df_cat['Price - non-commercial usage'] == 'free'] df_cat_commercial = df_cat[df_cat['Price - non-commercial usage'] != 'free'] # Available and free df_cat_available_free = df_cat[(df_cat['Available online'] == 'yes') & (df_cat['Price - non-commercial usage'] == 'free')] # Available and paid df_cat_available_paid = df_cat[(df_cat['Available online'] == 'yes') & (df_cat['Price - non-commercial usage'] != 'free')] # Basic Calculations identified_datasets_count = df_cat.shape[0] accessible_datasets_count = df_cat_available.shape[0] unique_producers_count = df_cat['Publisher'].nunique() accessible_datasets_fraction = round((accessible_datasets_count / identified_datasets_count) * 100, 2) # Total audio available and other dependent calculations audio_reported = round(df_cat[col_name_audio].sum(), 2) audio_accessible = round(df_cat_available[col_name_audio].sum(), 2) audio_accessible_free = round(df_cat_available_free[col_name_audio].sum(), 2) audio_accessible_paid = round(df_cat_available_paid[col_name_audio].sum(), 2) transcribed_audio_reported = round(df_cat[col_name_transcribed].sum(), 2) transcribed_audio_accessible = round(df_cat_available[col_name_transcribed].sum(), 2) transcribed_audio_accessible_free = round(df_cat_available_free[col_name_transcribed].sum(), 2) transcribed_audio_accessible_paid = round(df_cat_available_paid[col_name_transcribed].sum(), 2) # available vs Reported Speech Material Ratio accessible_vs_reported_audio_ratio = round((audio_accessible / audio_reported) * 100, 2) accessible_vs_reported_transcribed_ratio = round((transcribed_audio_accessible / transcribed_audio_reported) * 100, 2) # Finalizing the metrics dictionary metrics_dict = { "Metric": BASE_SUMMARY_METRICS, "Value": [ catalog_last_update_date, unique_producers_count, identified_datasets_count, accessible_datasets_count, accessible_datasets_fraction, audio_reported, audio_accessible, audio_accessible_free, audio_accessible_paid, accessible_vs_reported_audio_ratio, transcribed_audio_reported, transcribed_audio_accessible, transcribed_audio_accessible_free, transcribed_audio_accessible_paid, accessible_vs_reported_transcribed_ratio, ] } # Convert the dictionary into a DataFrame metrics_df = pd.DataFrame(metrics_dict) metrics_df.reset_index(drop=True, inplace=True) metrics_df.set_index("Metric", inplace=True) return(metrics_df)