File size: 14,090 Bytes
d5cbb7a
 
 
4eee292
d5cbb7a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4eee292
 
d5cbb7a
 
 
 
 
4eee292
d5cbb7a
 
 
 
 
 
e283f70
d5cbb7a
 
 
 
 
 
e283f70
d5cbb7a
 
 
 
 
e283f70
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4eee292
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e283f70
d5cbb7a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4eee292
 
d5cbb7a
4eee292
 
 
 
d5cbb7a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4eee292
 
 
d5cbb7a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4eee292
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
import requests
import pandas as pd
import streamlit as st
import numpy as np

# Date shown in the summary table as "Catalog last update date".
# NOTE(review): this is the date the app process started, not the date the
# catalog sheet was last edited — see the TODO below.
catalog_last_update_date = pd.to_datetime('today').strftime('%Y-%m-%d')
# TODO - extract from the catalog name

# Row labels for the catalog summary table. The order must match the order of
# the "Value" list built in catalog_summary_statistics().
BASE_SUMMARY_METRICS = [
            "Catalog last update date",
            "Unique Polish speech datasets producers",
            "Identified datasets reported in the public domain",
            "Datasets available to the public (free and paid)",
            "Fraction of reported datasets available to the public [%]",
            "Speech data reported in the public domain [hours]",
            "Speech data available total [hours]",
            "Speech data available free of charge [hours]",
            "Speech data available commercially  [hours]",
            "Reported vs available speech data ratio [%]",
            "Transcribed speech data reported in the public domain [hours]",
            "Transcribed speech data available total [hours]",
            "Transcribed speech data available free of charge [hours]",
            "Transcribed speech data available commercially [hours]",
            "Reported vs available transcribed speech data ratio [%]",

        ]

def download_tsv_from_google_sheet(sheet_url, timeout=30):
    """
    Download a Google Sheet as TSV and parse it into a DataFrame.

    Args:
    - sheet_url (str): Google Sheets "edit" URL containing '/edit#gid='.
    - timeout (float): Seconds to wait for the HTTP response (default 30).

    Returns:
    - pd.DataFrame or None: The parsed sheet, or None when the download fails.
    """
    from io import StringIO

    # Rewrite the interactive "edit" URL into the TSV export endpoint.
    tsv_url = sheet_url.replace('/edit#gid=', '/export?format=tsv&gid=')

    try:
        # A timeout prevents the Streamlit app from hanging on a stalled request.
        response = requests.get(tsv_url, timeout=timeout)
    except requests.RequestException:
        print("Failed to download the TSV file.")
        return None

    # Force UTF-8 decoding regardless of what the server advertises.
    response.encoding = 'utf-8'

    # Check if the request was successful
    if response.status_code == 200:
        df = pd.read_csv(StringIO(response.text), sep='\t', encoding='utf-8')
        return df
    else:
        print("Failed to download the TSV file.")
        return None

@st.cache_data
def load_data_catalog():
    """Fetch the Polish speech datasets catalog sheet (cached by Streamlit)."""
    print("Reading speech data catalog")
    url = "https://docs.google.com/spreadsheets/d/181EDfwZNtHgHFOMaKNtgKssrYDX4tXTJ9POMzBsCRlI/edit#gid=0"
    return download_tsv_from_google_sheet(url)

@st.cache_data
def load_data_taxonomy():
    """Fetch the speech data survey taxonomy sheet (cached by Streamlit)."""
    print("Reading speech data survey taxonomy")
    url = "https://docs.google.com/spreadsheets/d/181EDfwZNtHgHFOMaKNtgKssrYDX4tXTJ9POMzBsCRlI/edit#gid=2015613057"
    return download_tsv_from_google_sheet(url)


@st.cache_data
def load_bench_catalog():
    """Fetch the ASR benchmarks catalog sheet (cached by Streamlit)."""
    print("Reading ASR benchmarks catalog")
    url = "https://docs.google.com/spreadsheets/d/1fVsE98Ulmt-EIEe4wx8sUdo7RLigDdAVjQxNpAJIrH8/edit#gid=0"
    return download_tsv_from_google_sheet(url)

@st.cache_data
def load_bench_taxonomy():
    """Fetch the ASR benchmarks survey taxonomy sheet (cached by Streamlit)."""
    print("Reading ASR benchmarks survey taxonomy")
    url = "https://docs.google.com/spreadsheets/d/181EDfwZNtHgHFOMaKNtgKssrYDX4tXTJ9POMzBsCRlI/edit#gid=2015613057"
    return download_tsv_from_google_sheet(url)

def style_floats(val):
    """
    Format a cell value for display: whole-valued floats render as plain
    integers, other floats with two decimal places, and non-numeric values
    pass through unchanged.
    """
    if isinstance(val, float):
        # 5.0 -> "5", 2.5 -> "2.50"
        return f"{int(val)}" if val.is_integer() else f"{val:.2f}"
    if isinstance(val, int):
        return f"{val}"
    return val


def datasets_count_and_size(df_cat, col_groupby, col_sort=None, col_percent=None, col_sum=None, col_count=None):
    """
    Generate a summary view of datasets grouped by the given column(s).

    Args:
    - df_cat (pd.DataFrame): The base dataframe containing dataset information.
      It is NOT modified (a copy is taken before numeric coercion).
    - col_groupby (str or list): The column(s) to group the datasets by.
    - col_sort (str or list): Column(s) to sort the final summary by,
      descending; defaults to col_groupby.
    - col_percent (list): Column(s) used to compute each group's percentage
      of the overall total; None disables the percentage column.
    - col_sum (str or list): The column(s) to sum.
      Defaults to ['Size audio transcribed [hours]'].
    - col_count (str or list): The column(s) to count.
      Defaults to ['Dataset ID'].

    Returns:
    - pd.DataFrame: One row per group, indexed by the groupby column(s), with
      the count column first, then the summed columns, then (optionally) the
      'Percent of total' column. Zero sums are shown as NaN so styled tables
      render them as blanks.
    """
    # None sentinels instead of mutable default arguments.
    if col_sum is None:
        col_sum = ['Size audio transcribed [hours]']
    if col_count is None:
        col_count = ['Dataset ID']
    # Normalise scalar arguments to lists.
    if not isinstance(col_sum, list):
        col_sum = [col_sum]
    if not isinstance(col_count, list):
        col_count = [col_count]
    if not isinstance(col_groupby, list):
        col_groupby = [col_groupby]

    # Work on a copy so the caller's dataframe is not mutated.
    df_cat = df_cat.copy()

    # Coerce the size columns to numeric; non-numeric entries become 0.
    # (Vectorized pd.to_numeric instead of a per-element apply.)
    for col in col_sum:
        df_cat[col] = pd.to_numeric(df_cat[col], errors='coerce').fillna(0)

    # Aggregate: sum the size columns, count the id columns.
    summary = df_cat.groupby(col_groupby).agg({
        **{col: 'sum' for col in col_sum},
        **{col: 'count' for col in col_count}
    }).reset_index()

    col_name_percent = 'Percent of total'
    if col_percent is not None:
        # Each group's share of the grand total across the percent column(s).
        total = summary[col_percent].sum(axis=1)
        summary[col_name_percent] = round(total / total.sum() * 100, 2)

    # Sort by the first summed column, then replace the positional index
    # with the groupby column(s).
    summary.sort_values(by=col_sum[0], ascending=False, inplace=True)
    summary.reset_index(drop=True, inplace=True)
    summary.set_index(col_groupby, inplace=True)

    # Rename the count column to a more descriptive name.
    # Initialised to None so the column-ordering step below is safe even when
    # no renaming happened (the original raised NameError for len > 1).
    col_name_count = None
    if len(col_count) == 1:
        col_name_count = 'Count ' + col_count[0]
        summary.rename(columns={col_count[0]: col_name_count}, inplace=True)
        summary[col_name_count] = summary[col_name_count].astype(int)
    # TODO - add support for renaming multiple count columns

    # Column order: 'Count ...' first, then the sums, then 'Percent of total'.
    ordered = [col_name_count] if col_name_count in summary.columns else []
    ordered += col_sum
    if col_percent is not None:
        ordered += [col_name_percent]
    summary = summary[ordered]

    # Final sort (descending) by the requested column(s), defaulting to the
    # group key so rows come out in reverse-alphabetical group order.
    col_sort = col_groupby if col_sort is None else col_sort
    summary.sort_values(by=col_sort, ascending=False, inplace=True)

    # Show zero sums as NaN so they appear blank in styled output.
    for col in col_sum:
        summary[col] = summary[col].replace(0, np.nan)

    return summary


def datasets_count_and_size_standard(df_cat, col_groupby):
    """Summarise datasets with the standard sum/count/percent column set."""
    size_col = 'Size audio transcribed [hours]'
    return datasets_count_and_size(
        df_cat,
        col_groupby,
        col_sort=col_groupby,
        col_percent=[size_col],
        col_sum=[size_col, 'Audio recordings', 'Speakers'],
        col_count=['Dataset ID'],
    )

def metadata_coverage(df_cat, df_cat_available_free, df_cat_available_paid):
    """
    Summarise metadata annotation coverage for free vs paid datasets.

    Args:
    - df_cat (pd.DataFrame): Full catalog. Kept for interface compatibility;
      the original computed (and discarded) coverage over it, so it does not
      contribute to the returned tables.
    - df_cat_available_free (pd.DataFrame): Datasets available free of charge.
    - df_cat_available_paid (pd.DataFrame): Datasets available commercially.

    Returns:
    - tuple(pd.DataFrame, pd.DataFrame):
      * flat table: one row per metadata type per availability 'Type'
        (Free/Paid) with count, transcribed size and percent of total;
      * pivoted table: the same figures with Free and Paid side by side.
    """
    #TODO - add number of speakers and recordings
    # Candidates not yet covered: 'Speaker id info', 'Part of speech annotation',
    # 'Named entity annotation', 'Emotion annotation'
    meta_data_cols = ['Gender info', 'Age info', 'Accent info', 'Nativity info', 'Time alignement annotation']

    col_name_sum_size = 'Size audio transcribed [hours]'
    col_name_count = 'Count Dataset ID'
    col_name_percent = 'Percent of total'
    # Row used when a metadata column has no 'yes' entries at all.
    empty_row = {col_name_sum_size: 0, col_name_count: 0, col_name_percent: 0}

    def _coverage(df):
        # For each metadata column, take the stats of the group answering 'yes'.
        coverage = {}
        for meta_data_col in meta_data_cols:
            per_meta = datasets_count_and_size_standard(df, meta_data_col)
            if 'yes' in per_meta.index:
                coverage[meta_data_col] = per_meta.loc['yes']
            else:
                coverage[meta_data_col] = dict(empty_row)
        return coverage

    df_meta_free = pd.DataFrame.from_dict(_coverage(df_cat_available_free), orient='index')
    df_meta_free[col_name_count] = df_meta_free[col_name_count].astype(int)

    df_meta_paid = pd.DataFrame.from_dict(_coverage(df_cat_available_paid), orient='index')
    df_meta_paid[col_name_count] = df_meta_paid[col_name_count].astype(int)

    # Merge free and paid coverage into one flat table.
    df_meta_free['Type'] = 'Free'
    df_meta_paid['Type'] = 'Paid'
    df_meta_all_flat = pd.concat([df_meta_free, df_meta_paid])

    # Pivot so free and paid figures sit side by side, column by column.
    df_meta_all_pivot = df_meta_all_flat.reset_index()
    df_meta_all_pivot = df_meta_all_pivot.rename(columns={'index': 'Metadata'})
    df_meta_all_pivot = df_meta_all_pivot.pivot(index='Metadata', columns='Type', values=[col_name_count, col_name_sum_size, col_name_percent])
    df_meta_all_pivot[col_name_count] = df_meta_all_pivot[col_name_count].astype(int)

    return (df_meta_all_flat, df_meta_all_pivot)


def catalog_summary_statistics(df_cat):
    """
    Generate summary statistics for the speech data catalog.

    Args:
    - df_cat (pd.DataFrame): The base dataframe containing dataset information.
      It is NOT modified (a copy is taken before numeric coercion).

    Returns:
    - pd.DataFrame: Single-column dataframe indexed by "Metric" with one value
      per entry of BASE_SUMMARY_METRICS (same order).
    """

    col_name_transcribed = 'Size audio transcribed [hours]'
    col_name_audio = 'Size audio total [hours]'

    def _pct(part, whole):
        # Percentage rounded to 2 decimals; 0.0 for an empty denominator
        # (the original raised ZeroDivisionError on an empty catalog).
        return round((part / whole) * 100, 2) if whole else 0.0

    # Convert numerical fields to numeric type on a copy, so the caller's
    # dataframe is left untouched.
    df_cat = df_cat.copy()
    df_cat[col_name_audio] = pd.to_numeric(df_cat[col_name_audio], errors='coerce')
    df_cat[col_name_transcribed] = pd.to_numeric(df_cat[col_name_transcribed], errors='coerce')

    # Availability subsets.
    is_available = df_cat['Available online'] == 'yes'
    is_free = df_cat['Price - non-commercial usage'] == 'free'
    df_cat_available = df_cat[is_available]
    df_cat_available_free = df_cat[is_available & is_free]
    df_cat_available_paid = df_cat[is_available & ~is_free]

    # Basic calculations.
    identified_datasets_count = df_cat.shape[0]
    accessible_datasets_count = df_cat_available.shape[0]
    unique_producers_count = df_cat['Publisher'].nunique()
    accessible_datasets_fraction = _pct(accessible_datasets_count, identified_datasets_count)

    # Total audio volumes [hours].
    audio_reported = round(df_cat[col_name_audio].sum(), 2)
    audio_accessible = round(df_cat_available[col_name_audio].sum(), 2)
    audio_accessible_free = round(df_cat_available_free[col_name_audio].sum(), 2)
    audio_accessible_paid = round(df_cat_available_paid[col_name_audio].sum(), 2)

    # Transcribed audio volumes [hours].
    transcribed_audio_reported = round(df_cat[col_name_transcribed].sum(), 2)
    transcribed_audio_accessible = round(df_cat_available[col_name_transcribed].sum(), 2)
    transcribed_audio_accessible_free = round(df_cat_available_free[col_name_transcribed].sum(), 2)
    transcribed_audio_accessible_paid = round(df_cat_available_paid[col_name_transcribed].sum(), 2)

    # Available vs reported speech material ratios [%].
    accessible_vs_reported_audio_ratio = _pct(audio_accessible, audio_reported)
    accessible_vs_reported_transcribed_ratio = _pct(transcribed_audio_accessible, transcribed_audio_reported)

    # Values in the same order as BASE_SUMMARY_METRICS.
    metrics_dict = {
        "Metric": BASE_SUMMARY_METRICS,
        "Value": [
            catalog_last_update_date,
            unique_producers_count,
            identified_datasets_count,
            accessible_datasets_count,
            accessible_datasets_fraction,
            audio_reported,
            audio_accessible,
            audio_accessible_free,
            audio_accessible_paid,
            accessible_vs_reported_audio_ratio,
            transcribed_audio_reported,
            transcribed_audio_accessible,
            transcribed_audio_accessible_free,
            transcribed_audio_accessible_paid,
            accessible_vs_reported_transcribed_ratio,
        ]
    }

    # Convert the dictionary into a DataFrame indexed by metric name.
    metrics_df = pd.DataFrame(metrics_dict)
    metrics_df.set_index("Metric", inplace=True)
    return metrics_df

def right_align(s, props='text-align: right;'):
    """Pandas Styler callback: right-align a cell regardless of its value."""
    return props

def left_align(s, props='text-align: left;'):
    """Pandas Styler callback: left-align a cell regardless of its value."""
    return props