mj-new committed
Commit • d5cbb7a
Parent(s): 2901944
Alpha version of the dataset catalog

Files changed:
- README.md +4 -4
- __pycache__/app_utils.cpython-310.pyc +0 -0
- __pycache__/contants.cpython-310.pyc +0 -0
- __pycache__/utils.cpython-310.pyc +0 -0
- analysis-playground.ipynb +0 -0
- app.py +101 -0
- app_utils.py +94 -0
- contants.py +5 -0
- requirements.txt +3 -0
- utils.py +276 -0
README.md CHANGED
```diff
@@ -1,10 +1,10 @@
 ---
-title: Pl Asr Survey
-emoji:
-colorFrom:
+title: Pl Asr Speech Data Survey
+emoji: 🏃
+colorFrom: pink
 colorTo: red
 sdk: streamlit
-sdk_version: 1.
+sdk_version: 1.31.1
 app_file: app.py
 pinned: false
 license: cc-by-sa-4.0
```
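
These front-matter keys are the Hugging Face Space configuration: `sdk: streamlit` with `sdk_version: 1.31.1` selects the runtime, `app_file: app.py` names the entry point, and the title/emoji/color keys control how the Space card is rendered.
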
__pycache__/app_utils.cpython-310.pyc ADDED
Binary file (2.28 kB)

__pycache__/contants.cpython-310.pyc ADDED
Binary file (482 Bytes)

__pycache__/utils.cpython-310.pyc ADDED
Binary file (7.26 kB)

analysis-playground.ipynb ADDED
The diff for this file is too large to render.

app.py ADDED
```python
import pandas as pd
import streamlit as st

from app_utils import filter_dataframe, calculate_height_to_display
from contants import WELCOME_TEXT, CITATION_TEXT
from utils import BASE_SUMMARY_METRICS
from utils import load_catalog, load_taxonomy
from utils import datasets_count_and_size, datasets_count_and_size_standard, metadata_coverage, catalog_summary_statistics

import matplotlib.pyplot as plt
import seaborn as sns

st.set_page_config(layout="wide")

st.title("Polish Speech Datasets Catalog and Survey analysis")

st.write(WELCOME_TEXT)

st.write(CITATION_TEXT)

# Cache the dataframe so it's only loaded once
df_cat = load_catalog()
df_tax = load_taxonomy()

# Filter out non-available datasets
df_cat_available = df_cat[df_cat['Available online'] == 'yes']

# Available and free
df_cat_available_free = df_cat[(df_cat['Available online'] == 'yes') & (df_cat['Price - non-commercial usage'] == 'free')]

# Available and paid
df_cat_available_paid = df_cat[(df_cat['Available online'] == 'yes') & (df_cat['Price - non-commercial usage'] != 'free')]

# Display catalog contents
st.dataframe(filter_dataframe(df_cat), hide_index=True, use_container_width=True)

# Display taxonomy contents

# Display summary statistics
st.header("Polish ASR speech datasets summary statistics")
df_summary_metrics = catalog_summary_statistics(df_cat)

df_basic_stats = df_summary_metrics.loc[BASE_SUMMARY_METRICS[0:5]]
st.dataframe(df_basic_stats, use_container_width=False)

st.header("Speech data available across Polish ASR speech datasets")
df_stats_audio_available = df_summary_metrics.loc[BASE_SUMMARY_METRICS[5:10]]
st.dataframe(df_stats_audio_available, use_container_width=False)

st.header("Transcribed data available across Polish ASR speech datasets")
df_stats_transcribed_available = df_summary_metrics.loc[BASE_SUMMARY_METRICS[10:15]]
st.dataframe(df_stats_transcribed_available, use_container_width=False)


# Display distribution of datasets created per year
st.header("Polish ASR speech datasets created in 1997-2023")
col_groupby = ['Creation year']
df_datasets_per_year = datasets_count_and_size(df_cat, col_groupby, col_sort=col_groupby, col_percent=None, col_sum=['Size audio transcribed [hours]', 'Audio recordings', 'Speakers'], col_count=['Dataset ID'])

st.dataframe(df_datasets_per_year, use_container_width=False)

st.header("Institutions contributing Polish ASR speech datasets")
col_groupby = ['Publisher']
df_datasets_per_publisher = datasets_count_and_size(df_cat, col_groupby, col_sort='Count Dataset ID', col_percent=None, col_sum=['Size audio transcribed [hours]', 'Audio recordings', 'Speakers'], col_count=['Dataset ID'])
st.dataframe(df_datasets_per_publisher, use_container_width=False)

st.header("Repositories hosting Polish ASR speech datasets")
col_groupby = ['Repository']
df_datasets_per_repo = datasets_count_and_size(df_cat, col_groupby, col_sort='Count Dataset ID', col_percent=None, col_sum=['Size audio transcribed [hours]', 'Audio recordings', 'Speakers'], col_count=['Dataset ID'])
st.dataframe(df_datasets_per_repo, use_container_width=False)

st.header("Public domain Polish ASR speech datasets")
col_groupby = ['License', 'Dataset ID']
df_datasets_public = datasets_count_and_size(df_cat_available_free, col_groupby, col_sort='License', col_percent=None, col_sum=['Size audio transcribed [hours]', 'Audio recordings', 'Speakers'], col_count=[])
st.dataframe(df_datasets_public, use_container_width=False)

st.header("Commercially available Polish ASR speech datasets")
col_groupby = ['License', 'Dataset ID']
df_datasets_paid = datasets_count_and_size(df_cat_available_paid, col_groupby, col_sort='License', col_percent=None, col_sum=['Size audio transcribed [hours]', 'Audio recordings', 'Speakers'], col_count=[])
st.dataframe(df_datasets_paid, use_container_width=False)

st.header("Coverage of metadata across Polish ASR speech datasets")
df_meta_all_flat, df_meta_all_pivot = metadata_coverage(df_cat, df_cat_available_free, df_cat_available_paid)
st.dataframe(df_meta_all_pivot, use_container_width=False)

# Display distribution of datasets for various speech types
st.header("Datasets per speech type")
col_groupby = ['Speech type']
df_datasets_per_speech_type = datasets_count_and_size(df_cat, col_groupby, col_sort=col_groupby, col_percent=['Size audio transcribed [hours]'], col_sum=['Size audio transcribed [hours]', 'Audio recordings', 'Speakers'], col_count=['Dataset ID'])
st.dataframe(df_datasets_per_speech_type, use_container_width=False)

# Display distribution of available speech data per audio device
st.header("Distribution of available speech data per audio device - Public domain datasets")
col_groupby = ['Audio device']
df_datasets_per_device = datasets_count_and_size(df_cat_available_free, col_groupby, col_sort=col_groupby, col_percent=['Size audio transcribed [hours]'], col_sum=['Size audio transcribed [hours]', 'Audio recordings', 'Speakers'], col_count=['Dataset ID'])
st.dataframe(df_datasets_per_device, use_container_width=False)

# Display distribution of available speech data per audio device
st.header("Distribution of available speech data per audio device - Commercial datasets")
col_groupby = ['Audio device']
df_datasets_per_device = datasets_count_and_size(df_cat_available_paid, col_groupby, col_sort=col_groupby, col_percent=['Size audio transcribed [hours]'], col_sum=['Size audio transcribed [hours]', 'Audio recordings', 'Speakers'], col_count=['Dataset ID'])
st.dataframe(df_datasets_per_device, use_container_width=False)
```
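
A note on the summary tables above: `catalog_summary_statistics` returns a frame indexed by metric name, and `app.py` splits it into thematic groups by slicing the shared `BASE_SUMMARY_METRICS` list and selecting rows with `.loc`. A minimal sketch of that pattern (toy metric names and values, not the real catalog):

```python
import pandas as pd

# Toy stand-in for the output of catalog_summary_statistics():
# one "Value" column, indexed by metric name.
metrics = ["Metric A", "Metric B", "Metric C", "Metric D", "Metric E"]
df_summary = pd.DataFrame({"Value": [1, 2, 3, 4, 5]},
                          index=pd.Index(metrics, name="Metric"))

# Slicing the shared metric list and selecting by label keeps each
# st.dataframe() call showing one coherent group of rows.
print(df_summary.loc[metrics[0:3]])  # first thematic group
print(df_summary.loc[metrics[3:5]])  # second thematic group
```
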
app_utils.py ADDED
```python
import pandas as pd
import streamlit as st

from pandas.api.types import (
    is_categorical_dtype,
    is_datetime64_any_dtype,
    is_numeric_dtype,
    is_object_dtype,
)


def calculate_height_to_display(df):
    # Calculate the height of the DataFrame display area
    num_rows = df.shape[0]
    row_height = 25  # Estimate of row height in pixels, adjust based on your layout/theme
    header_height = 50  # Estimate of header height in pixels
    padding = 20  # Extra padding in pixels
    calculated_height = num_rows * row_height + header_height + padding

    return calculated_height


def filter_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """
    Adds a UI on top of a dataframe to let viewers filter columns

    Args:
        df (pd.DataFrame): Original dataframe

    Returns:
        pd.DataFrame: Filtered dataframe
    """
    modify = st.checkbox("Use filters on speech data catalog")

    if not modify:
        return df

    df = df.copy()

    # Try to convert datetimes into a standard format (datetime, no timezone)
    for col in df.columns:
        if is_object_dtype(df[col]):
            try:
                df[col] = pd.to_datetime(df[col])
            except Exception:
                pass

        if is_datetime64_any_dtype(df[col]):
            df[col] = df[col].dt.tz_localize(None)

    modification_container = st.container()

    with modification_container:
        to_filter_columns = st.multiselect("Filter dataframe on", df.columns)
        for column in to_filter_columns:
            left, right = st.columns((1, 20))
            # Treat columns with < 10 unique values as categorical
            if is_categorical_dtype(df[column]) or df[column].nunique() < 10:
                user_cat_input = right.multiselect(
                    f"Values for {column}",
                    df[column].unique(),
                    default=list(df[column].unique()),
                )
                df = df[df[column].isin(user_cat_input)]
            elif is_numeric_dtype(df[column]):
                _min = float(df[column].min())
                _max = float(df[column].max())
                step = (_max - _min) / 100
                user_num_input = right.slider(
                    f"Values for {column}",
                    min_value=_min,
                    max_value=_max,
                    value=(_min, _max),
                    step=step,
                )
                df = df[df[column].between(*user_num_input)]
            elif is_datetime64_any_dtype(df[column]):
                user_date_input = right.date_input(
                    f"Values for {column}",
                    value=(
                        df[column].min(),
                        df[column].max(),
                    ),
                )
                if len(user_date_input) == 2:
                    user_date_input = tuple(map(pd.to_datetime, user_date_input))
                    start_date, end_date = user_date_input
                    df = df.loc[df[column].between(start_date, end_date)]
            else:
                user_text_input = right.text_input(
                    f"Substring or regex in {column}",
                )
                if user_text_input:
                    df = df[df[column].astype(str).str.contains(user_text_input)]

    return df
```
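
For context, `filter_dataframe` is self-contained and can be exercised outside this app; a minimal sketch with made-up columns (run it as a Streamlit script, since the widgets need a Streamlit session):

```python
# demo_filter.py - hypothetical demo; run with: streamlit run demo_filter.py
import pandas as pd
import streamlit as st

from app_utils import filter_dataframe

# Made-up catalog-like data; fewer than 10 unique values per column,
# so filter_dataframe offers categorical multiselect filters.
df = pd.DataFrame({
    "Dataset ID": ["d1", "d2", "d3"],
    "Available online": ["yes", "yes", "no"],
    "Speakers": [12, 340, 57],
})

# Shows a "Use filters..." checkbox; once ticked, per-column widgets appear.
st.dataframe(filter_dataframe(df))
```
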
contants.py ADDED
```python
WELCOME_TEXT = "This dashboard complements [Polish Speech Datasets Catalog](https://github.com/goodmike31/pl-asr-speech-data-survey) with:\n \
a. Dynamic filtering of catalog content\n \
b. Summary statistics about Polish ASR speech datasets\n"

CITATION_TEXT = "Please cite this work as: TODO\n"
```

requirements.txt ADDED
```
seaborn
matplotlib
pandas
```
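
Note that `app.py` and `utils.py` also import `streamlit` and `requests`, which are not pinned here; presumably the Streamlit Space runtime already provides them, but anyone running the app outside the Space would need to install them as well.
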
utils.py ADDED
```python
import requests
import pandas as pd
import streamlit as st

from io import StringIO

catalog_last_update_date = pd.to_datetime('today').strftime('%Y-%m-%d')
# TODO - extract from the catalog name

BASE_SUMMARY_METRICS = [
    "Catalog last update date",
    "Unique Polish speech datasets producers",
    "Identified datasets reported in the public domain",
    "Datasets available to the public (free and paid)",
    "Fraction of reported datasets available to the public [%]",
    "Speech data reported in the public domain [hours]",
    "Speech data available total [hours]",
    "Speech data available free of charge [hours]",
    "Speech data available commercially [hours]",
    "Reported vs available speech data ratio [%]",
    "Transcribed speech data reported in the public domain [hours]",
    "Transcribed speech data available total [hours]",
    "Transcribed speech data available free of charge [hours]",
    "Transcribed speech data available commercially [hours]",
    "Reported vs available transcribed speech data ratio [%]",
]


def download_tsv_from_google_sheet(sheet_url):
    # Modify the Google Sheet URL to export it as TSV
    tsv_url = sheet_url.replace('/edit#gid=', '/export?format=tsv&gid=')

    # Send a GET request to download the TSV file
    response = requests.get(tsv_url)

    # Check if the request was successful
    if response.status_code == 200:
        # Read the TSV content into a pandas DataFrame
        tsv_content = StringIO(response.text)
        df = pd.read_csv(tsv_content, sep='\t')
        return df
    else:
        print("Failed to download the TSV file.")
        return None


@st.cache_data
def load_catalog():
    print("Reading speech data catalog")
    catalog_url = "https://docs.google.com/spreadsheets/d/181EDfwZNtHgHFOMaKNtgKssrYDX4tXTJ9POMzBsCRlI/edit#gid=0"
    df_catalog = download_tsv_from_google_sheet(catalog_url)
    return df_catalog


@st.cache_data
def load_taxonomy():
    print("Reading speech data survey taxonomy")
    taxonomy_url = "https://docs.google.com/spreadsheets/d/181EDfwZNtHgHFOMaKNtgKssrYDX4tXTJ9POMzBsCRlI/edit#gid=2015613057"
    df_taxonomy = download_tsv_from_google_sheet(taxonomy_url)
    return df_taxonomy


def datasets_count_and_size(df_cat, col_groupby, col_sort=None, col_percent=None, col_sum=['Size audio transcribed [hours]'], col_count=['Dataset ID']):
    """
    Generate a summary view of datasets grouped by the given column(s).

    Args:
    - df_cat (pd.DataFrame): The base dataframe containing dataset information.
    - col_groupby (str or list): The column(s) to group the datasets by.
    - col_sort (str or list): The column(s) to sort the summary by (defaults to col_groupby).
    - col_percent (list): The column(s) whose share of the total is reported as a percentage.
    - col_sum (str or list): The column(s) to sum.
    - col_count (str or list): The column(s) to count.

    Returns:
    - pd.DataFrame: A dataframe summarizing datasets per group.
    """
    # Convert col_sum, col_count, and col_groupby to lists if they are not already
    if not isinstance(col_sum, list):
        col_sum = [col_sum]
    if not isinstance(col_count, list):
        col_count = [col_count]
    if not isinstance(col_groupby, list):
        col_groupby = [col_groupby]

    # Ensure summed columns are numeric; non-numeric and missing values become 0
    for col in col_sum:
        df_cat[col] = df_cat[col].apply(lambda x: pd.to_numeric(x, errors='coerce')).fillna(0)

    # Aggregate datasets by the provided grouping column(s)
    summary = df_cat.groupby(col_groupby).agg({
        **{col: 'sum' for col in col_sum},
        **{col: 'count' for col in col_count}
    }).reset_index()

    col_name_percent = 'Percent of total'
    if col_percent is not None:
        # Calculate each group's percentage of the overall total
        total = summary[col_percent].sum(axis=1)
        summary[col_name_percent] = round(total / total.sum() * 100, 2)

    # Sort the summary by the first summed column
    summary.sort_values(by=col_sum[0], ascending=False, inplace=True)

    # Replace the index with the groupby column(s)
    summary.reset_index(drop=True, inplace=True)
    summary.set_index(col_groupby, inplace=True)

    # Rename the count column to a more descriptive name
    col_name_count = None
    if len(col_count) == 1:
        col_name_count = 'Count ' + col_count[0]
        summary.rename(columns={col_count[0]: col_name_count}, inplace=True)
        summary[col_name_count] = summary[col_name_count].astype(int)
    elif len(col_count) > 1:
        # TODO - add support for renaming multiple count columns
        pass

    # Order columns as: 'Count Dataset ID', summed columns, 'Percent of total'
    if col_percent is None:
        if col_name_count not in summary.columns:
            summary = summary[col_sum]
        else:
            summary = summary[[col_name_count] + col_sum]
    else:
        if col_name_count not in summary.columns:
            summary = summary[col_sum + [col_name_percent]]
        else:
            summary = summary[[col_name_count] + col_sum + [col_name_percent]]

    # Sort by the provided column col_sort (defaults to the groupby column)
    col_sort = col_groupby if col_sort is None else col_sort
    summary.sort_values(by=col_sort, ascending=False, inplace=True)

    # Replace 0 with 'no-info' in the summed columns
    for col in col_sum:
        summary[col] = summary[col].replace(0, 'no-info')

    return summary


def datasets_count_and_size_standard(df_cat, col_groupby):
    return datasets_count_and_size(df_cat, col_groupby, col_sort=col_groupby, col_percent=['Size audio transcribed [hours]'], col_sum=['Size audio transcribed [hours]', 'Audio recordings', 'Speakers'], col_count=['Dataset ID'])


def metadata_coverage(df_cat, df_cat_available_free, df_cat_available_paid):
    # TODO - add number of speakers and recordings
    # Candidate columns not yet covered: 'Speaker id info', 'Part of speech annotation',
    # 'Named entity annotation', 'Emotion annotation'
    meta_data_cols = ['Gender info', 'Age info', 'Accent info', 'Nativity info', 'Time alignement annotation']
    meta_coverage_all_sets = {}
    meta_coverage_free_sets = {}
    meta_coverage_paid_sets = {}

    col_name_sum_size = 'Size audio transcribed [hours]'
    col_name_count = 'Count Dataset ID'
    col_name_percent = 'Percent of total'

    for meta_data_col in meta_data_cols:
        # Coverage among paid datasets; if no dataset has the metadata, assign zeros
        df_datasets_per_meta_paid = datasets_count_and_size_standard(df_cat_available_paid, meta_data_col)
        if 'yes' in df_datasets_per_meta_paid.index:
            meta_coverage_paid_sets[meta_data_col] = df_datasets_per_meta_paid.loc['yes']
        else:
            meta_coverage_paid_sets[meta_data_col] = {col_name_sum_size: 0, col_name_count: 0, col_name_percent: 0}

        # Coverage among all datasets: select the row where the index has value "yes"
        df_datasets_per_meta_all = datasets_count_and_size_standard(df_cat, meta_data_col)
        if 'yes' in df_datasets_per_meta_all.index:
            meta_coverage_all_sets[meta_data_col] = df_datasets_per_meta_all.loc['yes']
        else:
            meta_coverage_all_sets[meta_data_col] = {col_name_sum_size: 0, col_name_count: 0, col_name_percent: 0}

        # Coverage among free datasets
        df_datasets_per_meta_free = datasets_count_and_size_standard(df_cat_available_free, meta_data_col)
        if 'yes' in df_datasets_per_meta_free.index:
            meta_coverage_free_sets[meta_data_col] = df_datasets_per_meta_free.loc['yes']
        else:
            meta_coverage_free_sets[meta_data_col] = {col_name_sum_size: 0, col_name_count: 0, col_name_percent: 0}

    # Merge the free and paid dataframes
    df_meta_free = pd.DataFrame.from_dict(meta_coverage_free_sets, orient='index')
    df_meta_free[col_name_count] = df_meta_free[col_name_count].astype(int)

    df_meta_paid = pd.DataFrame.from_dict(meta_coverage_paid_sets, orient='index')
    df_meta_paid[col_name_count] = df_meta_paid[col_name_count].astype(int)

    df_meta_free['Type'] = 'Free'
    df_meta_paid['Type'] = 'Paid'
    df_meta_all_flat = pd.concat([df_meta_free, df_meta_paid])

    # Pivot to compare free and paid coverage column by column
    df_meta_all_pivot = df_meta_all_flat.reset_index()
    df_meta_all_pivot = df_meta_all_pivot.rename(columns={'index': 'Metadata'})
    df_meta_all_pivot = df_meta_all_pivot.pivot(index='Metadata', columns='Type', values=[col_name_count, col_name_sum_size, col_name_percent])
    df_meta_all_pivot[col_name_count] = df_meta_all_pivot[col_name_count].astype(int)

    return df_meta_all_flat, df_meta_all_pivot


def catalog_summary_statistics(df_cat):
    """
    Generate summary statistics for the speech data catalog.

    Args:
    - df_cat (pd.DataFrame): The base dataframe containing dataset information.

    Returns:
    - pd.DataFrame: A dataframe summarizing the speech data catalog.
    """
    col_name_transcribed = 'Size audio transcribed [hours]'
    col_name_audio = 'Size audio total [hours]'

    # Convert numerical fields to numeric type
    df_cat[col_name_audio] = pd.to_numeric(df_cat[col_name_audio], errors='coerce')
    df_cat[col_name_transcribed] = pd.to_numeric(df_cat[col_name_transcribed], errors='coerce')

    # Filter out non-available datasets
    df_cat_available = df_cat[df_cat['Available online'] == 'yes']
    df_cat_free = df_cat[df_cat['Price - non-commercial usage'] == 'free']
    df_cat_commercial = df_cat[df_cat['Price - non-commercial usage'] != 'free']

    # Available and free
    df_cat_available_free = df_cat[(df_cat['Available online'] == 'yes') & (df_cat['Price - non-commercial usage'] == 'free')]

    # Available and paid
    df_cat_available_paid = df_cat[(df_cat['Available online'] == 'yes') & (df_cat['Price - non-commercial usage'] != 'free')]

    # Basic calculations
    identified_datasets_count = df_cat.shape[0]
    accessible_datasets_count = df_cat_available.shape[0]
    unique_producers_count = df_cat['Publisher'].nunique()
    accessible_datasets_fraction = round((accessible_datasets_count / identified_datasets_count) * 100, 2)

    # Total audio available and other dependent calculations
    audio_reported = round(df_cat[col_name_audio].sum(), 2)
    audio_accessible = round(df_cat_available[col_name_audio].sum(), 2)
    audio_accessible_free = round(df_cat_available_free[col_name_audio].sum(), 2)
    audio_accessible_paid = round(df_cat_available_paid[col_name_audio].sum(), 2)

    transcribed_audio_reported = round(df_cat[col_name_transcribed].sum(), 2)
    transcribed_audio_accessible = round(df_cat_available[col_name_transcribed].sum(), 2)
    transcribed_audio_accessible_free = round(df_cat_available_free[col_name_transcribed].sum(), 2)
    transcribed_audio_accessible_paid = round(df_cat_available_paid[col_name_transcribed].sum(), 2)

    # Available vs reported speech material ratio
    accessible_vs_reported_audio_ratio = round((audio_accessible / audio_reported) * 100, 2)
    accessible_vs_reported_transcribed_ratio = round((transcribed_audio_accessible / transcribed_audio_reported) * 100, 2)

    # Finalize the metrics dictionary
    metrics_dict = {
        "Metric": BASE_SUMMARY_METRICS,
        "Value": [
            catalog_last_update_date,
            unique_producers_count,
            identified_datasets_count,
            accessible_datasets_count,
            accessible_datasets_fraction,
            audio_reported,
            audio_accessible,
            audio_accessible_free,
            audio_accessible_paid,
            accessible_vs_reported_audio_ratio,
            transcribed_audio_reported,
            transcribed_audio_accessible,
            transcribed_audio_accessible_free,
            transcribed_audio_accessible_paid,
            accessible_vs_reported_transcribed_ratio,
        ]
    }

    # Convert the dictionary into a DataFrame indexed by metric name
    metrics_df = pd.DataFrame(metrics_dict)
    metrics_df.reset_index(drop=True, inplace=True)
    metrics_df.set_index("Metric", inplace=True)
    return metrics_df
```
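
To make the aggregation contract of `datasets_count_and_size` concrete, a minimal sketch on toy data (made-up values; the column names follow the catalog schema used above). Non-numeric sizes are coerced to 0 and then displayed as `no-info`:

```python
import pandas as pd

from utils import datasets_count_and_size  # importing utils requires streamlit to be installed

# Toy catalog rows (made up); 'x' is a non-numeric size that gets coerced to 0.
df_toy = pd.DataFrame({
    "Dataset ID": ["d1", "d2", "d3"],
    "Speech type": ["read", "read", "spontaneous"],
    "Size audio transcribed [hours]": ["10", "5", "x"],
})

summary = datasets_count_and_size(
    df_toy,
    col_groupby=["Speech type"],
    col_percent=["Size audio transcribed [hours]"],
    col_sum=["Size audio transcribed [hours]"],
    col_count=["Dataset ID"],
)
print(summary)
# Index: 'Speech type'; columns: 'Count Dataset ID',
# 'Size audio transcribed [hours]' (15 for 'read', 'no-info' for 'spontaneous'),
# and 'Percent of total' (100.0 and 0.0).
```
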