{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "\n", "analyzer_report_fn = \"/home/michal/Development/github/pl-asr-bigos-tools/data/analyzer-reports/bigos-20240425.json\"\n", "\n", "# read json\n", "import json\n", "with open(analyzer_report_fn, \"r\") as f:\n", " analyzer_report = json.load(f)\n", "\n", "print(analyzer_report)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def get_num_of_samples_per_split(dataset_hf):\n", " # input - huggingface dataset object\n", " # output - dictionary with statistics about number of samples per split\n", " out_dict = {}\n", " # number of samples per subset and split\n", " metric = \"samples_count\"\n", " print(\"Calculating {}\".format(metric))\n", "\n", " out_dict[metric] = {}\n", " for split in dataset_hf.keys():\n", " samples = dataset_hf[split].num_rows\n", " print(split, samples)\n", " out_dict[metric][split] = samples\n", " # add number of samples for all splits\n", " out_dict[metric][\"all_splits\"] = sum(out_dict[metric].values())\n", "\n", " return out_dict\n", "\n", "def get_audio_duration_per_split(dataset_hf):\n", " # input - huggingface dataset object\n", " # output - dictionary with statistics about audio duration per split\n", " out_dict = {}\n", " metric = \"audio_duration[h]\"\n", " print(\"Calculating {}\".format(metric))\n", "\n", " out_dict[metric] = {}\n", " for split in dataset_hf.keys():\n", " #sampling_rate = dataset_hf[split][\"sampling_rate\"][0]\n", " #audio_total_length_samples = 0\n", " #audio_total_length_samples = sum(len(audio_file[\"array\"]) for audio_file in dataset_hf[\"test\"][\"audio\"])\n", " audio_total_length_seconds = sum(len(audio_file[\"array\"]) / audio_file[\"sampling_rate\"] for audio_file in dataset_hf[split][\"audio\"])\n", " audio_total_length_hours = round(audio_total_length_seconds / 3600,2)\n", " out_dict[metric][split] = audio_total_length_hours\n", " print(split, audio_total_length_hours)\n", " # add number of samples for all splits\n", " out_dict[metric][\"all_splits\"] = sum(out_dict[metric].values())\n", " return out_dict" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def get_speakers_per_split(dataset_hf):\n", " # input - huggingface dataset object\n", " # output - dictionary with statistics about audio duration per split\n", " out_dict = {}\n", " metric = \"speakers_count\"\n", " print(\"Calculating {}\".format(metric))\n", "\n", " out_dict[metric] = {}\n", " for split in dataset_hf.keys():\n", " # extract speakers from file_id \n", " speakers_ids_all = [str(fileid).split(\"-\")[4] for fileid in dataset_hf[split][\"audioname\"]]\n", " speakers_ids_uniq = list(set(speakers_ids_all))\n", " speakers_count = len(speakers_ids_uniq)\n", " print(split, speakers_count)\n", " # add number of samples for all splits\n", " out_dict[metric][\"all_splits\"] = sum(out_dict[metric].values())\n", " return out_dict" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def get_uniq_utts_per_split(dataset_hf):\n", " # input - huggingface dataset object\n", " # output - dictionary with statistics about audio duration per split\n", " out_dict = {}\n", " metric = \"utterances_unique_count\"\n", " print(\"Calculating {}\".format(metric))\n", "\n", " out_dict[metric] = {}\n", " for split in dataset_hf.keys():\n", " # extract speakers from file_id \n", " utts_all = dataset_hf[split][\"ref_orig\"]\n", " utts_uniq = list(set(utts_all))\n", " utts_uniq_count = len(utts_uniq)\n", " print(split, utts_uniq_count)\n", " # add number of samples for all splits\n", " out_dict[metric][\"all_splits\"] = \"N/A\"\n", " return out_dict" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def get_words_per_split(dataset_hf):\n", " # input - huggingface dataset object\n", " # output - dictionary with statistics about audio duration per split\n", " out_dict = {}\n", " metric = \"words_count\"\n", " print(\"Calculating {}\".format(metric))\n", "\n", " out_dict[metric] = {}\n", " for split in dataset_hf.keys():\n", " # extract speakers from file_id \n", " utts_all = dataset_hf[split][\"ref_orig\"]\n", " utts_lenghts = [len(utt.split(\" \")) for utt in utts_all]\n", " words_all_count = sum(utts_lenghts)\n", " print(split, words_all_count)\n", " # add number of samples for all splits\n", " out_dict[metric][\"all_splits\"] = sum(out_dict[metric].values())\n", " return out_dict" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def get_unique_words_per_split(dataset_hf):\n", " # input - huggingface dataset object\n", " # output - dictionary with statistics about audio duration per split\n", " out_dict = {}\n", " out_words_list = []\n", " metric = \"words_unique\"\n", " print(\"Calculating {}\".format(metric))\n", "\n", " out_dict[metric] = {}\n", " for split in dataset_hf.keys():\n", " # extract speakers from file_id \n", " utts_all = dataset_hf[split][\"ref_orig\"]\n", " words_all = \" \".join(utts_all).split(\" \")\n", " words_uniq = list(set(words_all))\n", " out_words_list = out_words_list + words_uniq\n", " words_uniq_count = len(words_uniq)\n", " print(split, words_uniq_count)\n", " # add number of samples for all splits\n", " out_words_uniq = list(set((out_words_list)))\n", " out_words_uniq_count = len(out_words_uniq)\n", " out_dict[metric][\"all_splits\"] = out_words_uniq_count\n", " print(\"all\", out_words_uniq_count)\n", "\n", " return out_dict, out_words_uniq" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def get_chars_per_split(dataset_hf):\n", " # input - huggingface dataset object\n", " # output - dictionary with statistics about audio duration per split\n", " out_dict = {}\n", "\n", " metric = \"chars\"\n", " print(\"Calculating {}\".format(metric))\n", "\n", " out_dict[metric] = {}\n", " for split in dataset_hf.keys():\n", " # extract speakers from file_id \n", " utts_all = dataset_hf[split][\"ref_orig\"]\n", " words_all = \" \".join(utts_all).split(\" \")\n", " chars_all = \" \".join(words_all)\n", " chars_all_count = len(chars_all)\n", " print(split, chars_all_count)\n", " # add number of samples for all splits\n", " out_dict[metric][\"all_splits\"] = sum(out_dict[metric].values())\n", " return out_dict" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def get_unique_chars_per_split(dataset_hf):\n", " # input - huggingface dataset object\n", " # output - dictionary with statistics about audio duration per split\n", " out_dict = {}\n", " out_chars_list = []\n", " metric = \"chars_unique\"\n", " print(\"Calculating {}\".format(metric))\n", "\n", " out_dict[metric] = {}\n", " for split in dataset_hf.keys():\n", " # extract speakers from file_id \n", " utts_all = dataset_hf[split][\"ref_orig\"]\n", " words_all = \" \".join(utts_all).split(\" \")\n", " words_uniq = list(set(words_all))\n", " chars_uniq = list(set(\"\".join(words_uniq)))\n", " chars_uniq_count = len(chars_uniq)\n", " print(split, chars_uniq_count)\n", " out_chars_list = out_chars_list + chars_uniq\n", " # add number of samples for all splits\n", " out_chars_uniq = list(set((out_chars_list)))\n", " out_chars_uniq_count = len(out_chars_uniq)\n", " out_dict[metric][\"all_splits\"] = out_chars_uniq_count\n", " print(\"all\", out_chars_uniq_count)\n", "\n", " return out_dict, out_chars_uniq" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def get_meta_coverage_sex_per_split(dataset_hf):\n", " # input - huggingface dataset object\n", " # output - dictionary with statistics about audio duration per split\n", " out_dict = {}\n", " metric = \"meta_coverage_sex\"\n", " print(\"Calculating {}\".format(metric))\n", "\n", " out_dict[metric] = {}\n", " for split in dataset_hf.keys():\n", " \n", " # extract speakers from file_id\n", " meta_info = dataset_hf[split][\"speaker_sex\"]\n", "\n", " # calculate coverage\n", " meta_info_count = len(meta_info)\n", " meta_info_not_null_count = len([x for x in meta_info if x != \"N/A\"])\n", " meta_info_coverage = round(meta_info_not_null_count / meta_info_count, 2)\n", " print(split, meta_info_coverage)\n", "\n", " # add number of samples for all splits\n", " out_dict[metric][split] = meta_info_coverage\n", "\n", " # add number of samples for all splits\n", " out_dict[metric][\"all_splits\"] = \"N/A\"\n", " return out_dict" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def get_meta_coverage_age_per_split(dataset_hf):\n", " # input - huggingface dataset object\n", " # output - dictionary with statistics about audio duration per split\n", " out_dict = {}\n", " metric = \"meta_coverage_age\"\n", " print(\"Calculating {}\".format(metric))\n", "\n", " out_dict[metric] = {}\n", " for split in dataset_hf.keys():\n", " meta_info = dataset_hf[split][\"speaker_age\"]\n", " \n", " # calculate coverage\n", " meta_info_count = len(meta_info)\n", " meta_info_not_null_count = len([x for x in meta_info if x != \"N/A\"])\n", " meta_info_coverage = round(meta_info_not_null_count / meta_info_count, 2)\n", " print(split, meta_info_coverage)\n", "\n", " # add number of samples for all splits\n", " out_dict[metric][split] = meta_info_coverage\n", "\n", " # add number of samples for all splits\n", " out_dict[metric][\"all_splits\"] = \"N/A\"\n", " return out_dict" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def speech_rate_per_split(dataset_hf):\n", " # input - huggingface dataset object\n", " # output - dictionary with statistics about audio duration per split\n", " out_dict = {}\n", " metric = \"speech_rate\"\n", " print(\"Calculating {}\".format(metric))\n", "\n", " out_dict[metric] = {}\n", " for split in dataset_hf.keys():\n", " # extract speakers from file_id \n", " utts_all = dataset_hf[split][\"ref_orig\"]\n", " words_all = \" \".join(utts_all).split(\" \")\n", " words_all_count = len(words_all)\n", " audio_total_length_seconds = sum(len(audio_file[\"array\"]) / audio_file[\"sampling_rate\"] for audio_file in dataset_hf[split][\"audio\"])\n", " speech_rate = round(words_all_count / audio_total_length_seconds, 2)\n", " print(split, speech_rate)\n", " out_dict[metric][split] = speech_rate\n", " # add number of samples for all splits\n", " out_dict[metric][\"all_splits\"] = \"N/A\"\n", " return out_dict" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "\n", "# distribution of speaker age\n", "def get_speaker_age_distribution(dataset_hf):\n", " no_meta=False\n", " age_buckets = ['teens','twenties', 'thirties', 'fourties', 'fifties', 'sixties', 'seventies', 'eighties', 'nineties']\n", " # input - huggingface dataset object\n", " # output - dictionary with statistics about audio duration per split\n", " out_dict = {}\n", " metric = \"speaker_age_distribution\"\n", " print(\"Calculating {}\".format(metric))\n", "\n", " out_dict[metric] = {}\n", " values_count_total = {}\n", " for age in age_buckets:\n", " values_count_total[age]=0\n", " for split in dataset_hf.keys():\n", " meta_info = dataset_hf[split][\"speaker_age\"]\n", " meta_info_not_null = [x for x in meta_info if x != \"N/A\"]\n", " out_dict[metric][split] = {}\n", "\n", " if len(meta_info_not_null) == 0:\n", " out_dict[metric][split][age]=\"N/A\"\n", " no_meta=True\n", " continue\n", " for age in age_buckets:\n", " values_count = meta_info_not_null.count(age)\n", " values_count_total[age] += values_count\n", " out_dict[metric][split][age] = round(values_count/len(meta_info_not_null),2)\n", " print(split, out_dict[metric][split])\n", " \n", " # add number of samples for all splits\n", " if (no_meta):\n", " out_dict[metric][\"all_splits\"] = \"N/A\"\n", " return out_dict\n", " \n", " out_dict[metric][\"all_splits\"] = {}\n", " # calculate total number of samples in values_count_total\n", " for age in age_buckets:\n", " total_samples = sum(values_count_total.values())\n", " out_dict[metric][\"all_splits\"][age] = round(values_count_total[age]/total_samples,2)\n", " return out_dict\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "\n", "# distribution of speaker age\n", "def get_speaker_sex_distribution(dataset_hf):\n", " no_meta=False\n", " sex_types = ['male', 'female']\n", "\n", " # input - huggingface dataset object\n", " # output - dictionary with statistics about audio duration per split\n", " out_dict = {}\n", " metric = \"speaker_sex_distribution\"\n", " print(\"Calculating {}\".format(metric))\n", " out_dict[metric] = {}\n", " values_count_total = {}\n", " for sex in sex_types:\n", " values_count_total[sex]=0\n", " for split in dataset_hf.keys():\n", " meta_info = dataset_hf[split][\"speaker_sex\"]\n", " meta_info_not_null = [x for x in meta_info if x != \"N/A\"]\n", " out_dict[metric][split] = {}\n", "\n", " if len(meta_info_not_null) == 0:\n", " out_dict[metric][split][sex]=\"N/A\"\n", " no_meta=True\n", " continue\n", " for sex in sex_types:\n", " values_count = meta_info_not_null.count(sex)\n", " values_count_total[sex] += values_count\n", " out_dict[metric][split][sex] = round(values_count/len(meta_info_not_null),2)\n", " print(split, out_dict[metric][split])\n", " \n", " # add number of samples for all splits\n", " if (no_meta):\n", " out_dict[metric][\"all_splits\"] = \"N/A\"\n", " return out_dict\n", " \n", " out_dict[metric][\"all_splits\"] = {}\n", " # calculate total number of samples in values_count_total\n", " for sex in sex_types:\n", " total_samples = sum(values_count_total.values())\n", " out_dict[metric][\"all_splits\"][sex] = round(values_count_total[sex]/total_samples,2)\n", " return out_dict\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "recordings_per_speaker_stats_dict = {}\n", "def recordings_per_speaker_stats(dataset_hf):\n", " # input - huggingface dataset object\n", " # output - dictionary with statistics about audio duration per split\n", " out_dict = {}\n", " metric = \"recordings_per_speaker\"\n", " print(\"Calculating {}\".format(metric))\n", " \n", " recordings_per_speaker_stats_dict = {}\n", "\n", " out_dict[metric] = {}\n", " for split in dataset_hf.keys():\n", " # extract speakers from file_id \n", " audiopaths = dataset_hf[split][\"audioname\"]\n", " speaker_prefixes = [str(fileid).split(\"-\")[0:5] for fileid in audiopaths]\n", "\n", " # create dictionary with list of audio paths matching speaker prefix\n", " speakers_dict = {}\n", " # Create initial dictionary keys from speaker prefixes\n", " for speaker_prefix in speaker_prefixes:\n", " speaker_prefix_str = \"-\".join(speaker_prefix)\n", " speakers_dict[speaker_prefix_str] = []\n", "\n", " # Populate the dictionary with matching audio paths\n", " for audio_path in audiopaths:\n", " for speaker_prefix_str in speakers_dict.keys():\n", " if speaker_prefix_str in audio_path:\n", " speakers_dict[speaker_prefix_str].append(audio_path)\n", "\n", "\n", " # todo calculate recordings_per_speaker_stats_dict\n", " # iterate of speaker_dict prefixes and calculate number of recordings per speaker.\n", " recordings_per_speaker_stats_dict = {}\n", " for speaker_prefix_str in speakers_dict.keys():\n", " recordings_per_speaker_stats_dict[speaker_prefix_str] = len(speakers_dict[speaker_prefix_str])\n", " out_dict[metric][split] = {}\n", " \n", " out_dict[metric][split][\"recordings_per_speaker_list\"] = recordings_per_speaker_stats_dict \n", " \n", " # use recordings_per_speaker_stats to calculate statistics like min, max, avg, median, std\n", " out_dict[metric][split][\"recordings_per_speaker_stats\"] = {}\n", " speakers = len(list(recordings_per_speaker_stats_dict.keys()))\n", " recordings_total = len(audiopaths)\n", " average_recordings_per_speaker = round( recordings_total / speakers,2)\n", " out_dict[metric][split][\"recordings_per_speaker_stats\"][\"average\"] = average_recordings_per_speaker\n", " out_dict[metric][split][\"recordings_per_speaker_stats\"][\"std\"] = round(np.std(list(recordings_per_speaker_stats_dict.values())),2)\n", " out_dict[metric][split][\"recordings_per_speaker_stats\"][\"median\"] = np.median(list(recordings_per_speaker_stats_dict.values()))\n", "\n", " out_dict[metric][split][\"recordings_per_speaker_stats\"][\"min\"] = min(recordings_per_speaker_stats_dict.values())\n", " out_dict[metric][split][\"recordings_per_speaker_stats\"][\"max\"] = max(recordings_per_speaker_stats_dict.values())\n", "\n", " # add number of samples for all splits\n", " out_dict[metric][\"all_splits\"] = \"N/A\"\n", " return out_dict" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import seaborn as sns\n", "import matplotlib.pyplot as plt\n", "import pandas as pd\n", "from datasets import load_dataset\n", "import os\n", "\n", "def distribution_audio_duration(dataset_hf, output_dir, metric = \"audio_duration_seconds\", dimension = \"speaker_sex\"):\n", " # input - huggingface dataset object\n", " # output - figure with distribution of audio duration per sex\n", " out_dict = {}\n", "\n", " print(\"Calculating {}\".format(metric))\n", " out_dict[metric] = {}\n", " # drop samples for which dimension column values are equal to \"N/A\"\n", " for split in dataset_hf.keys():\n", " df_dataset = pd.DataFrame(dataset_hf[split])\n", " df_dataset = df_dataset.drop(columns=[\"audio\"])\n", " \n", " # remove values equal to \"N/A\" for column dimension\n", " df_filtered = df_dataset[df_dataset[dimension] != \"N/A\"] \n", " df_filtered = df_filtered[df_filtered[dimension] != \"other\"]\n", " # if df_filtered is empty, skip violin plot generation for this split and dimension\n", " if df_filtered.empty:\n", " print(\"No data for split {} and dimension {}\".format(split, dimension))\n", " continue\n", " plt.figure(figsize=(15, 10))\n", " sns.violinplot(data = df_filtered, hue=dimension, x='dataset', y=metric, split=True, fill = False, inner=\"box\", legend='auto', common_norm=True)\n", " plt.title('Violin plot of {} by {} for split {}'.format(metric, dimension, split))\n", " plt.xlabel(dimension)\n", " plt.ylabel(metric)\n", " plt.show()\n", " # save figure to file\n", " os.makedirs(output_dir, exist_ok=True)\n", " output_fn = os.path.join(output_dir, metric + \"-\" + dimension + \"-\" + split + \".png\") \n", " plt.savefig(output_fn)\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import os\n", "from datasets import load_dataset\n", "from datasets import get_dataset_config_names\n", "dataset_name = \"amu-cai/pl-asr-bigos-v2\"\n", "# get dataset config names\n", "dataset_config_names = get_dataset_config_names(dataset_name)\n", "# load dataset\n", "dataset_hf = load_dataset(dataset_name, \"all\")\n", "\n", "dataset_statistics={}\n", "dataset_contents = {}\n", "output_dir_plots = \"./plots\"\n", "os.makedirs(output_dir_plots, exist_ok=True)\n", "output_dir_reports = \"./reports\"\n", "os.makedirs(output_dir_plots, exist_ok=True)\n", "\n", "for config_name in dataset_config_names :\n", " print(config_name)\n", " dataset_hf_subset = load_dataset(dataset_name, config_name) \n", " #dataset_statistics[config_name] = get_num_of_samples_per_split(dataset_hf_subset)\n", " #dataset_statistics[config_name] = get_uniq_utts_per_split(dataset_hf_subset)\n", " #dataset_statistics[config_name] = get_words_per_split(dataset_hf_subset)\n", " \n", " #dataset_contents[config_name] = {}\n", "\n", " #dataset_statistics[config_name], dataset_contents[config_name][\"unique_words\"] = get_unique_words_per_split(dataset_hf_subset)\n", "\n", " #dataset_statistics[config_name] = get_chars_per_split(dataset_hf_subset)\n", " #dataset_statistics[config_name], dataset_contents[config_name][\"unique_chars\"] = get_unique_chars_per_split(dataset_hf_subset)\n", "\n", " #dataset_statistics[config_name] = get_audio_duration_per_split(dataset_hf_subset)\n", " #dataset_statistics[config_name] = get_speakers_per_split(dataset_hf_subset)\n", "\n", " #dataset_statistics[config_name] = get_meta_coverage_sex_per_split(dataset_hf_subset)\n", " #dataset_statistics[config_name] = get_meta_coverage_age_per_split(dataset_hf_subset)\n", " # metadata coverage per subset in percent - speaker accent\n", "\n", " # speech rate per subset\n", " #dataset_statistics[config_name] = speech_rate_per_split(dataset_hf_subset)\n", " #dataset_statistics[config_name] = get_speaker_age_distribution(dataset_hf_subset)\n", " #dataset_statistics[config_name] = get_speaker_sex_distribution(dataset_hf_subset)\n", " #print(dataset_statistics[config_name])\n", " \n", " dataset_statistics[config_name] = recordings_per_speaker_stats(dataset_hf_subset)\n", " #dataset_statistics[config_name] = uniq_utterances_per_speaker_stats(dataset_hf_subset)\n", " # number of words per speaker (min, max, med, avg, std)\n", "\n", "\n", " # distribution\n", " # distribution of audio duration per subset\n", " output_dir_plots_subset = os.path.join(output_dir_plots, config_name)\n", " dataset_statistics[config_name] = distribution_audio_duration(dataset_hf_subset, output_dir_plots_subset, 'audio_duration_seconds', 'speaker_sex')\n", " \n", " # distribution of audio duration per age\n", " dataset_statistics[config_name] = distribution_audio_duration(dataset_hf_subset, output_dir_plots_subset, 'audio_duration_seconds', 'speaker_age')\n", "\n", " \n", " # distribution of speaking rate per subset\n" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/home/michal/.pyenv/versions/3.10.11/envs/streamlit/lib/python3.10/site-packages/datasets/load.py:1486: FutureWarning: The repository for amu-cai/pl-asr-bigos-v2 contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/amu-cai/pl-asr-bigos-v2\n", "You can avoid this message in future by passing the argument `trust_remote_code=True`.\n", "Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.\n", " warnings.warn(\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Dataset({\n", " features: ['audioname', 'split', 'dataset', 'speaker_id', 'ref_orig', 'audio', 'audio_duration_samples', 'audio_duration_seconds', 'samplingrate_orig', 'sampling_rate', 'audiopath_bigos', 'audiopath_local', 'speaker_age', 'speaker_sex'],\n", " num_rows: 44\n", "})\n" ] } ], "source": [] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "ename": "FileNotFoundError", "evalue": "[Errno 2] No such file or directory: './reports/pelcra/pl-asr-pelcra-for-bigos/dataset_contents.json'", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)", "Cell \u001b[0;32mIn[2], line 22\u001b[0m\n\u001b[1;32m 19\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mopen\u001b[39m(json_stats_secret, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mr\u001b[39m\u001b[38;5;124m'\u001b[39m) \u001b[38;5;28;01mas\u001b[39;00m file:\n\u001b[1;32m 20\u001b[0m stats_dict_secret \u001b[38;5;241m=\u001b[39m json\u001b[38;5;241m.\u001b[39mload(file)\n\u001b[0;32m---> 22\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28;43mopen\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mjson_contents_public\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mr\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m)\u001b[49m \u001b[38;5;28;01mas\u001b[39;00m file:\n\u001b[1;32m 23\u001b[0m contents_dict_public \u001b[38;5;241m=\u001b[39m json\u001b[38;5;241m.\u001b[39mload(file)\n\u001b[1;32m 25\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mopen\u001b[39m(json_stats_public, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mr\u001b[39m\u001b[38;5;124m'\u001b[39m) \u001b[38;5;28;01mas\u001b[39;00m file:\n", "File \u001b[0;32m~/.pyenv/versions/3.10.11/envs/streamlit/lib/python3.10/site-packages/IPython/core/interactiveshell.py:324\u001b[0m, in \u001b[0;36m_modified_open\u001b[0;34m(file, *args, **kwargs)\u001b[0m\n\u001b[1;32m 317\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m file \u001b[38;5;129;01min\u001b[39;00m {\u001b[38;5;241m0\u001b[39m, \u001b[38;5;241m1\u001b[39m, \u001b[38;5;241m2\u001b[39m}:\n\u001b[1;32m 318\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[1;32m 319\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mIPython won\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mt let you open fd=\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mfile\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m by default \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 320\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mas it is likely to crash IPython. If you know what you are doing, \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 321\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124myou can use builtins\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m open.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 322\u001b[0m )\n\u001b[0;32m--> 324\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mio_open\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfile\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: './reports/pelcra/pl-asr-pelcra-for-bigos/dataset_contents.json'" ] } ], "source": [ "import json\n", "import pandas as pd\n", "\n", "#dataset_public = \"amu-cai/pl-asr-bigos-v2\"\n", "#dataset_secret = \"amu-cai/pl-asr-bigos-v2-secret\"\n", "\n", "dataset_public = \"pelcra/pl-asr-pelcra-for-bigos\"\n", "dataset_secret = \"pelcra/pl-asr-pelcra-for-bigos-secret\"\n", "\n", "json_contents_public = \"./reports/{}/dataset_contents.json\".format(dataset_public)\n", "json_stats_public = \"reports/{}/dataset_statistics.json\".format(dataset_public)\n", "\n", "json_contents_secret = \"./reports/{}/dataset_contents.json\".format(dataset_secret)\n", "json_stats_secret = \"reports/{}/dataset_statistics.json\".format(dataset_secret)\n", "\n", "with open(json_contents_secret, 'r') as file:\n", " contents_dict_secret = json.load(file)\n", "\n", "with open(json_stats_secret, 'r') as file:\n", " stats_dict_secret = json.load(file)\n", "\n", "with open(json_contents_public, 'r') as file:\n", " contents_dict_public = json.load(file)\n", "\n", "with open(json_stats_public, 'r') as file:\n", " stats_dict_public = json.load(file)\n", "\n" ] }, { "cell_type": "code", "execution_count": 116, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "ul-diabiz_poleval-22\n", "ul-spokes_mix_emo-18\n", "ul-spokes_mix_luz-18\n", "ul-spokes_mix_parl-18\n", "ul-spokes_biz_bio-23\n", "ul-spokes_biz_int-23\n", "ul-spokes_biz_luz-23\n", "ul-spokes_biz_pod-23\n", "ul-spokes_biz_pres-23\n", "ul-spokes_biz_vc-23\n", "ul-spokes_biz_vc2-23\n", "ul-spokes_biz_wyw-23\n", "all\n" ] } ], "source": [ "# merge contents if dictionaries for fields utts, words, words_unique, chars, chars_unique and speech_rate\n", "for dataset in stats_dict_public.keys():\n", " print(dataset)\n", " for metric in stats_dict_secret[dataset].keys():\n", " for split in stats_dict_secret[dataset][metric].keys():\n", " if split == \"test\":\n", " stats_dict_public[dataset][metric][split] = stats_dict_secret[dataset][metric][split]\n", " \n", "\n" ] }, { "cell_type": "code", "execution_count": 120, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " value\n", "metric split \n", "samples test 947\n", " train 7719\n", " validation 284\n", "utts_unique test 944\n", " train 7556\n", " validation 280\n", "words test 12051\n", " train 89255\n", " validation 3900\n", "words_unique test 2772\n", " train 12341\n", " validation 1209\n", "chars test 66433\n", " train 495454\n", " validation 23594\n", "audio[h] test 2.05\n", " train 16.59\n", " validation 1.04\n", "speakers test 24\n", " train 132\n", " validation 14\n", "speech_rate test 1.63\n", " train 1.49\n", " validation 1.04\n", "meta_cov_sex test N/A\n", " train N/A\n", " validation N/A\n", "meta_cov_age test N/A\n", " train N/A\n", " validation N/A\n", "meta_dist_sex test N/A\n", " train N/A\n", " validation N/A\n", "meta_dist_age test N/A\n", " train N/A\n", " validation N/A\n", "samples_per_spk test {'average': 39.46, 'std': 52.52, 'median': 22....\n", " train {'average': 58.48, 'std': 99.27, 'median': 24....\n", " validation {'average': 20.29, 'std': 7.64, 'median': 19.0...\n" ] } ], "source": [ "# Creating a MultiIndex DataFrame\n", "rows = []\n", "for dataset, metrics in stats_dict_public.items():\n", " if (dataset == \"all\"):\n", " continue\n", " for metric, splits in metrics.items():\n", " for split, value in splits.items():\n", " if (split == \"all_splits\"):\n", " continue\n", " rows.append((dataset, metric, split, value))\n", "\n", "# Convert to DataFrame\n", "df = pd.DataFrame(rows, columns=['dataset', 'metric', 'split', 'value'])\n", "df.set_index(['dataset', 'metric', 'split'], inplace=True)\n", "\n", "print(df.loc['ul-diabiz_poleval-22'])" ] }, { "cell_type": "code", "execution_count": 121, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
metricsamplesutts_uniquewordswords_uniquecharsaudio[h]speakers
dataset
ul-diabiz_poleval-22895087801052061632258548119.68170
ul-spokes_biz_bio-23549175413612782691375207694395275.96158
ul-spokes_biz_int-23110911012312366651416434.519
ul-spokes_biz_luz-2341966416417865931085354490695148.55158
ul-spokes_biz_pod-232280722762605852838073650700110.0113
ul-spokes_biz_pres-23171741715825184154253164281764.4955
ul-spokes_biz_vc-234527244710568780777543348648104.1378
ul-spokes_biz_vc2-232580225596755885998504526688162.0884
ul-spokes_biz_wyw-23113571120425951745114155298056.4138
ul-spokes_mix_emo-18243292106325238019819137969551.2340
ul-spokes_mix_luz-18209191966820458726106113242837.4821
ul-spokes_mix_parl-18865685211009921868166921024.5548
\n", "
" ], "text/plain": [ "metric samples utts_unique words words_unique chars \\\n", "dataset \n", "ul-diabiz_poleval-22 8950 8780 105206 16322 585481 \n", "ul-spokes_biz_bio-23 54917 54136 1278269 137520 7694395 \n", "ul-spokes_biz_int-23 1109 1101 23123 6665 141643 \n", "ul-spokes_biz_luz-23 41966 41641 786593 108535 4490695 \n", "ul-spokes_biz_pod-23 22807 22762 605852 83807 3650700 \n", "ul-spokes_biz_pres-23 17174 17158 251841 54253 1642817 \n", "ul-spokes_biz_vc-23 45272 44710 568780 77754 3348648 \n", "ul-spokes_biz_vc2-23 25802 25596 755885 99850 4526688 \n", "ul-spokes_biz_wyw-23 11357 11204 259517 45114 1552980 \n", "ul-spokes_mix_emo-18 24329 21063 252380 19819 1379695 \n", "ul-spokes_mix_luz-18 20919 19668 204587 26106 1132428 \n", "ul-spokes_mix_parl-18 8656 8521 100992 18681 669210 \n", "\n", "metric audio[h] speakers \n", "dataset \n", "ul-diabiz_poleval-22 19.68 170 \n", "ul-spokes_biz_bio-23 275.96 158 \n", "ul-spokes_biz_int-23 4.51 9 \n", "ul-spokes_biz_luz-23 148.55 158 \n", "ul-spokes_biz_pod-23 110.0 113 \n", "ul-spokes_biz_pres-23 64.49 55 \n", "ul-spokes_biz_vc-23 104.13 78 \n", "ul-spokes_biz_vc2-23 162.08 84 \n", "ul-spokes_biz_wyw-23 56.41 38 \n", "ul-spokes_mix_emo-18 51.23 40 \n", "ul-spokes_mix_luz-18 37.48 21 \n", "ul-spokes_mix_parl-18 24.55 48 " ] }, "execution_count": 121, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Get the total number of speakers, samples, unique utts, words, unique words, chars, unique chars and speech rate\n", "metrics = [\"samples\", \"utts_unique\", \"words\", \"words_unique\", \"chars\", \"audio[h]\", \"speakers\"]\n", " # unique utts, words, unique words, chars, unique chars and speech rate\n", "# filter the multiindex dataframe to leave only specific metrics\n", "df_total = df.loc[(slice(None), metrics), :]\n", "df_total = df_total.unstack(level ='split')\n", "df_total['value', 'total'] = df_total['value'].sum(axis=1)\n", "df_total.columns = df_total.columns.droplevel(0)\n", "columns_to_drop = ['test', 'train', 'validation']\n", "df_total.drop(columns = columns_to_drop, inplace = True)\n", "df_total = df_total.unstack(level ='metric')\n", "df_total.columns = df_total.columns.droplevel(0)\n", "df_total" ] }, { "cell_type": "code", "execution_count": 78, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
value
splittesttrainvalidationtotal
fair-mls-205192504251126072
google-fleurs-2275828413383937
mailabs-corpus_librivox-19150111834152714862
mozilla-common_voice_15-23889619119889536910
pjatk-clarin_mobile-1539228612423495
pjatk-clarin_studio-15140444401488
polyai-minds14-215346247562
pwr-azon_read-2058618203822788
pwr-azon_spont-204835751456
pwr-maleset-unk47737834784738
pwr-shortwords-unk9276186939
pwr-viu-unk26721462902703
Total14993710701288798950
\n", "
" ], "text/plain": [ " value \n", "split test train validation total\n", "fair-mls-20 519 25042 511 26072\n", "google-fleurs-22 758 2841 338 3937\n", "mailabs-corpus_librivox-19 1501 11834 1527 14862\n", "mozilla-common_voice_15-23 8896 19119 8895 36910\n", "pjatk-clarin_mobile-15 392 2861 242 3495\n", "pjatk-clarin_studio-15 1404 44 40 1488\n", "polyai-minds14-21 53 462 47 562\n", "pwr-azon_read-20 586 1820 382 2788\n", "pwr-azon_spont-20 48 357 51 456\n", "pwr-maleset-unk 477 3783 478 4738\n", "pwr-shortwords-unk 92 761 86 939\n", "pwr-viu-unk 267 2146 290 2703\n", "Total 14993 71070 12887 98950" ] }, "execution_count": 78, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Access all data where Metric is 'utts'\n", "df_utts = df.xs('samples', level='metric')\n", "\n", "# change split to columns\n", "df_utts = df_utts.unstack(level='split')\n", "df_utts\n", "\n", "# add column with total number of samples\n", "df_utts['value', 'total'] = df_utts['value'].sum(axis=1)\n", "df_utts\n", "\n", "# create a new row with total number of samples and concatenate it to the DataFrame\n", "df_total = df_utts.sum()\n", "df_total.name = ('Total')\n", "df_utts = pd.concat([df_utts, pd.DataFrame(df_total).T])\n", "df_utts\n", "\n" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " value\n", "dataset metric \n", "pjatk-clarin_mobile-15 samples 2861\n", " utts 2857\n", " words 74634\n", " words_unique 23166\n", " chars 507238\n", "... ...\n", "all meta_cov_sex 0.57\n", " meta_cov_age 0.24\n", " meta_dist_sex {'male': 0.64, 'female': 0.36}\n", " meta_dist_age {'teens': 0.03, 'twenties': 0.43, 'thirties': ...\n", " samples_per_spk {'average': 194.71, 'std': 689.86, 'median': 4...\n", "\n", "[169 rows x 1 columns]\n" ] } ], "source": [ "\n", "# Access all 'train' splits across all metrics\n", "print(df.xs('train', level='split'))\n", "\n", "# xs is the best for single level indexing. It can be chained, but is less effective than loc or boolean masking" ] }, { "cell_type": "code", "execution_count": 45, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " value\n", "dataset metric split \n", "mozilla-common_voice_15-23 samples test 8896\n", "mailabs-corpus_librivox-19 samples test 1501\n", "pjatk-clarin_studio-15 samples test 1404\n", "google-fleurs-22 samples test 758\n", "pwr-azon_read-20 samples test 586\n", "fair-mls-20 samples test 519\n", "pwr-maleset-unk samples test 477\n", "pjatk-clarin_mobile-15 samples test 392\n", "pwr-viu-unk samples test 267\n" ] } ], "source": [ "# Boolean masking for a more complex condition across levels\n", "mask_test_set = (df.index.get_level_values('metric') == 'samples') & (df.index.get_level_values('split') == 'test') \n", "df_test = df.loc[mask_test_set]\n", "# convert value to numbric\n", "# sort by value\n", "df_test = df_test.sort_values(by='value', ascending=False)\n", "# filter out values smaller than 100\n", "df_test = df_test[df_test['value'] > 100]\n", "\n", "# remove dataset \"all\"\n", "df_test = df_test.drop('all', level='dataset')\n", "print(df_test)\n" ] }, { "cell_type": "code", "execution_count": 60, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
value
splittesttrainvalidation
dataset
fair-mls-20N/AN/AN/A
google-fleurs-22N/AN/AN/A
mailabs-corpus_librivox-19N/AN/AN/A
mozilla-common_voice_15-23{'teens': 0.11, 'twenties': 0.38, 'thirties': ...{'teens': 0.03, 'twenties': 0.43, 'thirties': ...{'teens': 0.15, 'twenties': 0.46, 'thirties': ...
pjatk-clarin_mobile-15N/AN/AN/A
pjatk-clarin_studio-15N/AN/AN/A
polyai-minds14-21N/AN/AN/A
pwr-azon_read-20N/AN/AN/A
pwr-azon_spont-20N/AN/AN/A
pwr-maleset-unkN/AN/AN/A
pwr-shortwords-unkN/AN/AN/A
pwr-viu-unkN/AN/AN/A
\n", "
" ], "text/plain": [ " value \\\n", "split test \n", "dataset \n", "fair-mls-20 N/A \n", "google-fleurs-22 N/A \n", "mailabs-corpus_librivox-19 N/A \n", "mozilla-common_voice_15-23 {'teens': 0.11, 'twenties': 0.38, 'thirties': ... \n", "pjatk-clarin_mobile-15 N/A \n", "pjatk-clarin_studio-15 N/A \n", "polyai-minds14-21 N/A \n", "pwr-azon_read-20 N/A \n", "pwr-azon_spont-20 N/A \n", "pwr-maleset-unk N/A \n", "pwr-shortwords-unk N/A \n", "pwr-viu-unk N/A \n", "\n", " \\\n", "split train \n", "dataset \n", "fair-mls-20 N/A \n", "google-fleurs-22 N/A \n", "mailabs-corpus_librivox-19 N/A \n", "mozilla-common_voice_15-23 {'teens': 0.03, 'twenties': 0.43, 'thirties': ... \n", "pjatk-clarin_mobile-15 N/A \n", "pjatk-clarin_studio-15 N/A \n", "polyai-minds14-21 N/A \n", "pwr-azon_read-20 N/A \n", "pwr-azon_spont-20 N/A \n", "pwr-maleset-unk N/A \n", "pwr-shortwords-unk N/A \n", "pwr-viu-unk N/A \n", "\n", " \n", "split validation \n", "dataset \n", "fair-mls-20 N/A \n", "google-fleurs-22 N/A \n", "mailabs-corpus_librivox-19 N/A \n", "mozilla-common_voice_15-23 {'teens': 0.15, 'twenties': 0.46, 'thirties': ... \n", "pjatk-clarin_mobile-15 N/A \n", "pjatk-clarin_studio-15 N/A \n", "polyai-minds14-21 N/A \n", "pwr-azon_read-20 N/A \n", "pwr-azon_spont-20 N/A \n", "pwr-maleset-unk N/A \n", "pwr-shortwords-unk N/A \n", "pwr-viu-unk N/A " ] }, "execution_count": 60, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# What is the total distribution of age in common voice dataset and overall?\n", "df_age = df.xs('meta_dist_age', level='metric')\n", "df_age = df_age.unstack(level='split')\n", "#df_age['value', 'total'] = df_age['value'].sum(axis=1)\n", "df_age" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "from datasets import load_dataset \n", "pelcra = load_dataset(\"pelcra/pl-asr-pelcra-for-bigos\", \"all\", split=\"test\")\n", "df_test = pelcra.to_pandas()\n", "df_test = df_test.drop(columns=[\"audio\"])\n", "df_test.to_csv(\"test.tsv\", sep=\"\\t\",index=False)" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "import matplotlib.pyplot as plt\n", "import pandas as pd\n", "\n", "# Load the data from the TSV file into a DataFrame\n", "#file_path = '/mnt/data/test.tsv'\n", "#data = pd.read_csv(file_path, sep='\\t')\n", "data = df_test\n", "# Group the data by audio duration and calculate the count and total duration for each group\n", "duration_group = data.groupby('audio_duration_seconds').agg(\n", " sample_count=('audio_duration_seconds', 'size'),\n", " total_duration=('audio_duration_seconds', 'sum')\n", ").reset_index()\n", "\n", "# eliminate outliers - samples with duration longer than 150 seconds\n", "duration_group = duration_group[duration_group['audio_duration_seconds'] < 120]\n", "\n", "# Calculate the cumulative percentage of the total duration\n", "duration_group['cumulative_duration'] = (duration_group['total_duration'].cumsum() / \n", " duration_group['total_duration'].sum()) * 100\n", "\n", "# Plotting the data\n", "fig, ax1 = plt.subplots(figsize=(14, 8))\n", "\n", "# Left axis - Count of samples (blue line)\n", "ax1.set_xlabel('Sample Duration (seconds)')\n", "ax1.set_ylabel('Sample Count', color='blue')\n", "ax1.plot(duration_group['audio_duration_seconds'], duration_group['sample_count'], color='blue')\n", "ax1.tick_params(axis='y', labelcolor='blue')\n", "\n", "# Right axis - Total duration (orange line)\n", "ax2 = ax1.twinx()\n", "ax2.set_ylabel('Total Duration (hours)', color='orange')\n", "ax2.plot(duration_group['audio_duration_seconds'], duration_group['total_duration'] / 3600, color='orange')\n", "ax2.tick_params(axis='y', labelcolor='orange')\n", "\n", "# Adding Cumulative % (green dashed line)\n", "ax3 = ax1.twinx()\n", "ax3.spines[\"right\"].set_position((\"axes\", 1.15))\n", "ax3.set_ylabel('Cumulative % of Corpus Total', color='green')\n", "ax3.plot(duration_group['audio_duration_seconds'], duration_group['cumulative_duration'], color='green', linestyle='--')\n", "ax3.tick_params(axis='y', labelcolor='green')\n", "\n", "# Title and legend\n", "plt.title('Sample Duration Distributions')\n", "fig.tight_layout() # Adjust the layout to make room for the third y-axis\n", "\n", "# Show plot\n", "plt.show()\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/home/michal/.pyenv/versions/3.10.11/envs/streamlit/lib/python3.10/site-packages/datasets/load.py:1486: FutureWarning: The repository for amu-cai/pl-asr-bigos-v2 contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/amu-cai/pl-asr-bigos-v2\n", "You can avoid this message in future by passing the argument `trust_remote_code=True`.\n", "Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.\n", " warnings.warn(\n", "Downloading data: 100%|██████████| 976M/976M [01:07<00:00, 14.5MB/s] \n", "Downloading data: 100%|██████████| 78.8M/78.8M [00:05<00:00, 14.6MB/s]\n", "Downloading data: 100%|██████████| 129M/129M [00:08<00:00, 16.1MB/s] \n", "Downloading data: 100%|██████████| 934k/934k [00:00<00:00, 11.4MB/s]\n", "Downloading data: 100%|██████████| 77.5k/77.5k [00:00<00:00, 7.19MB/s]\n", "Downloading data: 100%|██████████| 52.6k/52.6k [00:00<00:00, 3.63MB/s]\n", "Generating test split: 22 examples [00:00, 206.09 examples/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Generating examples\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Generating test split: 392 examples [00:01, 310.58 examples/s]\n", "Generating train split: 36 examples [00:00, 335.23 examples/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Generating examples\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Generating train split: 2861 examples [00:08, 321.16 examples/s]\n", "Generating validation split: 34 examples [00:00, 330.58 examples/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Generating examples\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Generating validation split: 242 examples [00:00, 317.93 examples/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "DatasetDict({\n", " test: Dataset({\n", " features: ['audioname', 'split', 'dataset', 'speaker_id', 'ref_orig', 'audio', 'audio_duration_samples', 'audio_duration_seconds', 'samplingrate_orig', 'sampling_rate', 'audiopath_bigos', 'audiopath_local', 'speaker_age', 'speaker_sex'],\n", " num_rows: 392\n", " })\n", " train: Dataset({\n", " features: ['audioname', 'split', 'dataset', 'speaker_id', 'ref_orig', 'audio', 'audio_duration_samples', 'audio_duration_seconds', 'samplingrate_orig', 'sampling_rate', 'audiopath_bigos', 'audiopath_local', 'speaker_age', 'speaker_sex'],\n", " num_rows: 2861\n", " })\n", " validation: Dataset({\n", " features: ['audioname', 'split', 'dataset', 'speaker_id', 'ref_orig', 'audio', 'audio_duration_samples', 'audio_duration_seconds', 'samplingrate_orig', 'sampling_rate', 'audiopath_bigos', 'audiopath_local', 'speaker_age', 'speaker_sex'],\n", " num_rows: 242\n", " })\n", "})\n" ] } ], "source": [ "import os\n", "from datasets import load_dataset\n", "from datasets import get_dataset_config_names\n", "dataset_name = \"amu-cai/pl-asr-bigos-v2\"\n", "# get dataset config names\n", "dataset_config_names = get_dataset_config_names(dataset_name)\n", "# load dataset\n", "dataset_hf = load_dataset(dataset_name, \"pjatk-clarin_mobile-15\")\n", "\n", "print(dataset_hf)" ] } ], "metadata": { "kernelspec": { "display_name": "bigos-hf", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.11" } }, "nbformat": 4, "nbformat_minor": 2 }