"
],
"text/plain": [
" Number of datasets Total transcribed [hours] \\\n",
"Speech type \n",
"read 25 3362.1 \n",
"conversational 13 1184.0 \n",
"various 4 1134.0 \n",
"public speech 8 275.0 \n",
"no info 3 31.0 \n",
"\n",
" Percent of total \n",
"Speech type \n",
"read 56.17 \n",
"conversational 19.78 \n",
"various 18.94 \n",
"public speech 4.59 \n",
"no info 0.52 "
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from utils import datasets_count_and_total_size\n",
"col_groupby = ['Speech type']\n",
"df_datasets_per_speech_type = datasets_count_and_total_size(df_cat, col_groupby)\n",
"df_datasets_per_speech_type\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/michal/Development/hugging-face/michaljunczyk/pl-asr-speech-data-survey-analysis/utils.py:48: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" df_cat[col_sum] = num_values\n"
]
},
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
"
\n",
"
\n",
"
Number of datasets
\n",
"
Total transcribed [hours]
\n",
"
Percent of total
\n",
"
\n",
"
\n",
"
Part of speech annotation
\n",
"
\n",
"
\n",
"
\n",
"
\n",
" \n",
" \n",
"
\n",
"
no
\n",
"
13
\n",
"
3172
\n",
"
100.0
\n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Number of datasets Total transcribed [hours] \\\n",
"Part of speech annotation \n",
"no 13 3172 \n",
"\n",
" Percent of total \n",
"Part of speech annotation \n",
"no 100.0 "
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Summarise the paid subset by 'Part of speech annotation' (dataset count, hours, percent).\n",
"# Pass an explicit copy: the helper assigns a column on the frame it receives (utils.py:48),\n",
"# which raises SettingWithCopyWarning and silently mutates the notebook-level frame\n",
"# when it is handed a slice of another DataFrame.\n",
"df_datasets_per_meta_paid = datasets_count_and_total_size(df_cat_available_paid.copy(), 'Part of speech annotation')\n",
"df_datasets_per_meta_paid\n"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" Number of datasets Total transcribed [hours] Percent of total\n",
"Gender info \n",
"yes 19 4874.1 81.42\n",
"no info 23 889.0 14.85\n",
"no 11 223.0 3.73\n",
" Number of datasets Total transcribed [hours] Percent of total\n",
"Age info \n",
"no info 33 4043.0 67.54\n",
"yes 8 1581.0 26.41\n",
"no 12 362.1 6.05\n",
" Number of datasets Total transcribed [hours] Percent of total\n",
"Accent info \n",
"no 49 4276.1 71.43\n",
"yes 4 1710.0 28.57\n",
" Number of datasets Total transcribed [hours] Percent of total\n",
"Nativity info \n",
"no 33 3254.0 54.36\n",
"yes 12 2648.1 44.24\n",
"no info 8 84.0 1.40\n",
" Number of datasets Total transcribed [hours] \\\n",
"Time alignement annotation \n",
"no 48 4852.1 \n",
"yes 5 1134.0 \n",
"\n",
" Percent of total \n",
"Time alignement annotation \n",
"no 81.06 \n",
"yes 18.94 \n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/michal/Development/hugging-face/michaljunczyk/pl-asr-speech-data-survey-analysis/utils.py:48: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" df_cat[col_sum] = num_values\n",
"/home/michal/Development/hugging-face/michaljunczyk/pl-asr-speech-data-survey-analysis/utils.py:48: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" df_cat[col_sum] = num_values\n",
"/home/michal/Development/hugging-face/michaljunczyk/pl-asr-speech-data-survey-analysis/utils.py:48: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" df_cat[col_sum] = num_values\n",
"/home/michal/Development/hugging-face/michaljunczyk/pl-asr-speech-data-survey-analysis/utils.py:48: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" df_cat[col_sum] = num_values\n",
"/home/michal/Development/hugging-face/michaljunczyk/pl-asr-speech-data-survey-analysis/utils.py:48: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" df_cat[col_sum] = num_values\n",
"/home/michal/Development/hugging-face/michaljunczyk/pl-asr-speech-data-survey-analysis/utils.py:48: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" df_cat[col_sum] = num_values\n",
"/home/michal/Development/hugging-face/michaljunczyk/pl-asr-speech-data-survey-analysis/utils.py:48: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" df_cat[col_sum] = num_values\n",
"/home/michal/Development/hugging-face/michaljunczyk/pl-asr-speech-data-survey-analysis/utils.py:48: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" df_cat[col_sum] = num_values\n",
"/home/michal/Development/hugging-face/michaljunczyk/pl-asr-speech-data-survey-analysis/utils.py:48: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" df_cat[col_sum] = num_values\n",
"/home/michal/Development/hugging-face/michaljunczyk/pl-asr-speech-data-survey-analysis/utils.py:48: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" df_cat[col_sum] = num_values\n"
]
}
],
"source": [
"from utils import metadata_coverage\n",
"df_meta_all_flat, df_meta_all_pivot = metadata_coverage(df_cat, df_cat_available_free, df_cat_available_paid)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "",
"text/plain": [
"