{ "cells": [ { "cell_type": "code", "execution_count": 5, "id": "c9526c52", "metadata": {}, "outputs": [], "source": [ "import datasets\n", "from datasets import DatasetDict, load_dataset, load_metric" ] }, { "cell_type": "code", "execution_count": 44, "id": "663ff92e", "metadata": {}, "outputs": [], "source": [ "import re" ] }, { "cell_type": "code", "execution_count": 6, "id": "cc9f1c45", "metadata": {}, "outputs": [], "source": [ "dataset_name = \"mozilla-foundation/common_voice_7_0\"\n", "dataset_config_name = \"sv-SE\"\n", "train_split_name = \"train+validation\"\n", "use_auth_token = True" ] }, { "cell_type": "code", "execution_count": 7, "id": "21fd7030", "metadata": {}, "outputs": [], "source": [ "raw_datasets = DatasetDict()" ] }, { "cell_type": "code", "execution_count": 4, "id": "81a27912", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "92387075d7064947bfe8117d393afa30", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Downloading: 0%| | 0.00/9.88k [00:00\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0m raw_datasets[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtrain\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[43mload_dataset\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 2\u001b[0m \u001b[43m \u001b[49m\u001b[43mdataset_name\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 3\u001b[0m \u001b[43m \u001b[49m\u001b[43mdataset_config_name\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 4\u001b[0m \u001b[43m \u001b[49m\u001b[43msplit\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtrain_split_name\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 5\u001b[0m \u001b[43m \u001b[49m\u001b[43muse_auth_token\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43muse_auth_token\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 6\u001b[0m \u001b[43m)\u001b[49m\n", "File \u001b[0;32m~/Repos/datasets/src/datasets/load.py:1694\u001b[0m, in \u001b[0;36mload_dataset\u001b[0;34m(path, name, data_dir, data_files, split, cache_dir, features, download_config, download_mode, ignore_verifications, keep_in_memory, save_infos, revision, use_auth_token, task, streaming, script_version, **config_kwargs)\u001b[0m\n\u001b[1;32m 1691\u001b[0m try_from_hf_gcs \u001b[38;5;241m=\u001b[39m path \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m _PACKAGED_DATASETS_MODULES\n\u001b[1;32m 1693\u001b[0m \u001b[38;5;66;03m# Download and prepare data\u001b[39;00m\n\u001b[0;32m-> 1694\u001b[0m \u001b[43mbuilder_instance\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdownload_and_prepare\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1695\u001b[0m \u001b[43m \u001b[49m\u001b[43mdownload_config\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdownload_config\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1696\u001b[0m \u001b[43m \u001b[49m\u001b[43mdownload_mode\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdownload_mode\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1697\u001b[0m \u001b[43m \u001b[49m\u001b[43mignore_verifications\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mignore_verifications\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1698\u001b[0m \u001b[43m \u001b[49m\u001b[43mtry_from_hf_gcs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtry_from_hf_gcs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1699\u001b[0m \u001b[43m \u001b[49m\u001b[43muse_auth_token\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43muse_auth_token\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1700\u001b[0m 
\u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1702\u001b[0m \u001b[38;5;66;03m# Build dataset for splits\u001b[39;00m\n\u001b[1;32m 1703\u001b[0m keep_in_memory \u001b[38;5;241m=\u001b[39m (\n\u001b[1;32m 1704\u001b[0m keep_in_memory \u001b[38;5;28;01mif\u001b[39;00m keep_in_memory \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m is_small_dataset(builder_instance\u001b[38;5;241m.\u001b[39minfo\u001b[38;5;241m.\u001b[39mdataset_size)\n\u001b[1;32m 1705\u001b[0m )\n", "File \u001b[0;32m~/Repos/datasets/src/datasets/builder.py:595\u001b[0m, in \u001b[0;36mDatasetBuilder.download_and_prepare\u001b[0;34m(self, download_config, download_mode, ignore_verifications, try_from_hf_gcs, dl_manager, base_path, use_auth_token, **download_and_prepare_kwargs)\u001b[0m\n\u001b[1;32m 593\u001b[0m logger\u001b[38;5;241m.\u001b[39mwarning(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mHF google storage unreachable. Downloading and preparing it from source\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 594\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m downloaded_from_gcs:\n\u001b[0;32m--> 595\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_download_and_prepare\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 596\u001b[0m \u001b[43m \u001b[49m\u001b[43mdl_manager\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdl_manager\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mverify_infos\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mverify_infos\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mdownload_and_prepare_kwargs\u001b[49m\n\u001b[1;32m 597\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 598\u001b[0m \u001b[38;5;66;03m# Sync info\u001b[39;00m\n\u001b[1;32m 599\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39minfo\u001b[38;5;241m.\u001b[39mdataset_size \u001b[38;5;241m=\u001b[39m \u001b[38;5;28msum\u001b[39m(split\u001b[38;5;241m.\u001b[39mnum_bytes \u001b[38;5;28;01mfor\u001b[39;00m split \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39minfo\u001b[38;5;241m.\u001b[39msplits\u001b[38;5;241m.\u001b[39mvalues())\n", "File \u001b[0;32m~/Repos/datasets/src/datasets/builder.py:684\u001b[0m, in \u001b[0;36mDatasetBuilder._download_and_prepare\u001b[0;34m(self, dl_manager, verify_infos, **prepare_split_kwargs)\u001b[0m\n\u001b[1;32m 680\u001b[0m split_dict\u001b[38;5;241m.\u001b[39madd(split_generator\u001b[38;5;241m.\u001b[39msplit_info)\n\u001b[1;32m 682\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 683\u001b[0m \u001b[38;5;66;03m# Prepare split will record examples associated to the split\u001b[39;00m\n\u001b[0;32m--> 684\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_prepare_split\u001b[49m\u001b[43m(\u001b[49m\u001b[43msplit_generator\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mprepare_split_kwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 685\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mOSError\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m 686\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mOSError\u001b[39;00m(\n\u001b[1;32m 687\u001b[0m 
\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mCannot find data file. \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 688\u001b[0m \u001b[38;5;241m+\u001b[39m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmanual_download_instructions \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 689\u001b[0m \u001b[38;5;241m+\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124mOriginal error:\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 690\u001b[0m \u001b[38;5;241m+\u001b[39m \u001b[38;5;28mstr\u001b[39m(e)\n\u001b[1;32m 691\u001b[0m ) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;28mNone\u001b[39m\n", "File \u001b[0;32m~/Repos/datasets/src/datasets/builder.py:1083\u001b[0m, in \u001b[0;36mGeneratorBasedBuilder._prepare_split\u001b[0;34m(self, split_generator)\u001b[0m\n\u001b[1;32m 1075\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 1076\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m key, record \u001b[38;5;129;01min\u001b[39;00m utils\u001b[38;5;241m.\u001b[39mtqdm(\n\u001b[1;32m 1077\u001b[0m generator,\n\u001b[1;32m 1078\u001b[0m unit\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m examples\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 1081\u001b[0m disable\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mbool\u001b[39m(logging\u001b[38;5;241m.\u001b[39mget_verbosity() \u001b[38;5;241m==\u001b[39m logging\u001b[38;5;241m.\u001b[39mNOTSET),\n\u001b[1;32m 1082\u001b[0m ):\n\u001b[0;32m-> 1083\u001b[0m example \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43minfo\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfeatures\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mencode_example\u001b[49m\u001b[43m(\u001b[49m\u001b[43mrecord\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1084\u001b[0m writer\u001b[38;5;241m.\u001b[39mwrite(example, key)\n\u001b[1;32m 1085\u001b[0m \u001b[38;5;28;01mfinally\u001b[39;00m:\n", "File \u001b[0;32m~/Repos/datasets/src/datasets/features/features.py:1214\u001b[0m, in \u001b[0;36mFeatures.encode_example\u001b[0;34m(self, example)\u001b[0m\n\u001b[1;32m 1204\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 1205\u001b[0m \u001b[38;5;124;03mEncode example into a format for Arrow.\u001b[39;00m\n\u001b[1;32m 1206\u001b[0m \n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 1211\u001b[0m \u001b[38;5;124;03m :obj:`dict[str, Any]`\u001b[39;00m\n\u001b[1;32m 1212\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 1213\u001b[0m example \u001b[38;5;241m=\u001b[39m cast_to_python_objects(example)\n\u001b[0;32m-> 1214\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mencode_nested_example\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mexample\u001b[49m\u001b[43m)\u001b[49m\n", "File \u001b[0;32m~/Repos/datasets/src/datasets/features/features.py:976\u001b[0m, in \u001b[0;36mencode_nested_example\u001b[0;34m(schema, obj)\u001b[0m\n\u001b[1;32m 974\u001b[0m \u001b[38;5;66;03m# Nested structures: we allow dict, list/tuples, sequences\u001b[39;00m\n\u001b[1;32m 975\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(schema, \u001b[38;5;28mdict\u001b[39m):\n\u001b[0;32m--> 976\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m {\n\u001b[1;32m 977\u001b[0m k: encode_nested_example(sub_schema, sub_obj) 
\u001b[38;5;28;01mfor\u001b[39;00m k, (sub_schema, sub_obj) \u001b[38;5;129;01min\u001b[39;00m utils\u001b[38;5;241m.\u001b[39mzip_dict(schema, obj)\n\u001b[1;32m 978\u001b[0m }\n\u001b[1;32m 979\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(schema, (\u001b[38;5;28mlist\u001b[39m, \u001b[38;5;28mtuple\u001b[39m)):\n\u001b[1;32m 980\u001b[0m sub_schema \u001b[38;5;241m=\u001b[39m schema[\u001b[38;5;241m0\u001b[39m]\n", "File \u001b[0;32m~/Repos/datasets/src/datasets/features/features.py:976\u001b[0m, in \u001b[0;36m\u001b[0;34m(.0)\u001b[0m\n\u001b[1;32m 974\u001b[0m \u001b[38;5;66;03m# Nested structures: we allow dict, list/tuples, sequences\u001b[39;00m\n\u001b[1;32m 975\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(schema, \u001b[38;5;28mdict\u001b[39m):\n\u001b[0;32m--> 976\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m {\n\u001b[1;32m 977\u001b[0m k: encode_nested_example(sub_schema, sub_obj) \u001b[38;5;28;01mfor\u001b[39;00m k, (sub_schema, sub_obj) \u001b[38;5;129;01min\u001b[39;00m utils\u001b[38;5;241m.\u001b[39mzip_dict(schema, obj)\n\u001b[1;32m 978\u001b[0m }\n\u001b[1;32m 979\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(schema, (\u001b[38;5;28mlist\u001b[39m, \u001b[38;5;28mtuple\u001b[39m)):\n\u001b[1;32m 980\u001b[0m sub_schema \u001b[38;5;241m=\u001b[39m schema[\u001b[38;5;241m0\u001b[39m]\n", "File \u001b[0;32m~/Repos/datasets/src/datasets/utils/py_utils.py:153\u001b[0m, in \u001b[0;36mzip_dict\u001b[0;34m(*dicts)\u001b[0m\n\u001b[1;32m 150\u001b[0m \u001b[38;5;124;03m\"\"\"Iterate over items of dictionaries grouped by their keys.\"\"\"\u001b[39;00m\n\u001b[1;32m 151\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m key \u001b[38;5;129;01min\u001b[39;00m unique_values(itertools\u001b[38;5;241m.\u001b[39mchain(\u001b[38;5;241m*\u001b[39mdicts)): \u001b[38;5;66;03m# set merge all keys\u001b[39;00m\n\u001b[1;32m 152\u001b[0m \u001b[38;5;66;03m# Will raise KeyError if the dict don't have the same keys\u001b[39;00m\n\u001b[0;32m--> 153\u001b[0m \u001b[38;5;28;01myield\u001b[39;00m key, \u001b[38;5;28;43mtuple\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43md\u001b[49m\u001b[43m[\u001b[49m\u001b[43mkey\u001b[49m\u001b[43m]\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mfor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43md\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mdicts\u001b[49m\u001b[43m)\u001b[49m\n", "File \u001b[0;32m~/Repos/datasets/src/datasets/utils/py_utils.py:153\u001b[0m, in \u001b[0;36m\u001b[0;34m(.0)\u001b[0m\n\u001b[1;32m 150\u001b[0m \u001b[38;5;124;03m\"\"\"Iterate over items of dictionaries grouped by their keys.\"\"\"\u001b[39;00m\n\u001b[1;32m 151\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m key \u001b[38;5;129;01min\u001b[39;00m unique_values(itertools\u001b[38;5;241m.\u001b[39mchain(\u001b[38;5;241m*\u001b[39mdicts)): \u001b[38;5;66;03m# set merge all keys\u001b[39;00m\n\u001b[1;32m 152\u001b[0m \u001b[38;5;66;03m# Will raise KeyError if the dict don't have the same keys\u001b[39;00m\n\u001b[0;32m--> 153\u001b[0m \u001b[38;5;28;01myield\u001b[39;00m key, \u001b[38;5;28mtuple\u001b[39m(\u001b[43md\u001b[49m\u001b[43m[\u001b[49m\u001b[43mkey\u001b[49m\u001b[43m]\u001b[49m \u001b[38;5;28;01mfor\u001b[39;00m d \u001b[38;5;129;01min\u001b[39;00m dicts)\n", "\u001b[0;31mKeyError\u001b[0m: 'accents'" ] } ], "source": [ "raw_datasets[\"train\"] = load_dataset(\n", " dataset_name,\n", " dataset_config_name,\n", " 
split=train_split_name,\n", " use_auth_token=use_auth_token,\n", ")" ] }, { "cell_type": "code", "execution_count": 8, "id": "7945cada", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Reusing dataset common_voice (/Users/emiliomarinone/.cache/huggingface/datasets/mozilla-foundation___common_voice/sv-SE/7.0.0/fe20cac47c166e25b1f096ab661832e3da7cf298ed4a91dcaa1343ad972d175b)\n" ] } ], "source": [ "raw_datasets[\"test\"] = load_dataset(\n", "    dataset_name,\n", "    dataset_config_name,\n", "    split=\"test\",\n", "    use_auth_token=use_auth_token,\n", ")" ] }, { "cell_type": "code", "execution_count": 36, "id": "c98cb649", "metadata": {}, "outputs": [], "source": [ "training_data = raw_datasets[\"train\"]" ] }, { "cell_type": "code", "execution_count": 10, "id": "1aead6a1", "metadata": {}, "outputs": [], "source": [ "test_data = raw_datasets[\"test\"]" ] }, { "cell_type": "code", "execution_count": 37, "id": "97e9a626", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Dataset({\n", "    features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment'],\n", "    num_rows: 11030\n", "})" ] }, "execution_count": 37, "metadata": {}, "output_type": "execute_result" } ], "source": [ "training_data" ] }, { "cell_type": "code", "execution_count": 18, "id": "fc794e39", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Dataset({\n", "    features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment'],\n", "    num_rows: 4620\n", "})" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "test_data" ] }, { "cell_type": "code", "execution_count": 23, "id": "406af02e", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "datasets.arrow_dataset.Dataset" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "type(test_data)" ] }, { "cell_type": "code", "execution_count": 21, "id": "8cc0fa51", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'cv-corpus-7.0-2021-07-21/sv-SE/clips/common_voice_sv-SE_18711293.mp3'" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "test_data[0][\"path\"]" ] }, { "cell_type": "code", "execution_count": null, "id": "31b328fd", "metadata": {}, "outputs": [], "source": [ "# Map each training-set speaker (client_id) to the clips they recorded.\n", "train_speakers_dict = {}\n", "for record in training_data:\n", "    train_speakers_dict.setdefault(record[\"client_id\"], []).append(record[\"path\"])" ] }, { "cell_type": "code", "execution_count": null, "id": "7eba5861", "metadata": {}, "outputs": [], "source": [ "print(f\"Speakers in training set: {len(train_speakers_dict)}\")" ] }, { "cell_type": "code", "execution_count": null, "id": "17905c39", "metadata": {}, "outputs": [], "source": [ "# Map each test-set speaker (client_id) to the clips they recorded.\n", "test_speakers_dict = {}\n", "for record in test_data:\n", "    test_speakers_dict.setdefault(record[\"client_id\"], []).append(record[\"path\"])" ] }, { "cell_type": "code", "execution_count": null, "id": "25a25454", "metadata": {}, "outputs": [], "source": [ "print(f\"Speakers in test set: {len(test_speakers_dict)}\")" ] }, { "cell_type": "code",
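"execution_count": null, "id": "a1f3c5d7", "metadata": {}, "outputs": [], "source": [ "# Minimal sanity-check sketch: compute the train/test speaker overlap with a set\n", "# intersection. Assumes the train_speakers_dict and test_speakers_dict mappings\n", "# built above; it should agree with the loop-based count in the next cell.\n", "overlap_speakers = set(train_speakers_dict) & set(test_speakers_dict)\n", "print(f\"Speakers in both training and test sets: {len(overlap_speakers)}\")" ] }, { "cell_type": "code",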
"execution_count": 42, "id": "f72bdb7a", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Speakers in both training and test sets: 0\n" ] } ], "source": [ "c = 0\n", "for speaker in test_speakers_dict:\n", " if speaker in train_speakers_dict:\n", " c+=1\n", "print(f\"Speakers in both training and test sets: {c}\")" ] }, { "cell_type": "code", "execution_count": 45, "id": "ed6bc20b", "metadata": {}, "outputs": [], "source": [ "chars_to_ignore_regex = '[,?.!\\-\\;\\:\"“%‘”�—’…–]'\n", "def clean_text(text):\n", " return re.sub(chars_to_ignore_regex, \"\", text.lower())" ] }, { "cell_type": "code", "execution_count": 51, "id": "16b289be", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Avg tokens training data: 7.243336355394379\n" ] } ], "source": [ "num_tokens_train = 0\n", "for record in training_data:\n", " num_tokens_train += len(clean_text(record[\"sentence\"]).split())\n", "avg_tokens_train = num_tokens_train / training_data.num_rows\n", "print(f\"Avg tokens training data: {avg_tokens_train}\")" ] }, { "cell_type": "code", "execution_count": 52, "id": "364aff29", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Avg tokens training data: 7.074891774891775\n" ] } ], "source": [ "num_tokens_test = 0\n", "for record in test_data:\n", " num_tokens_test += len(clean_text(record[\"sentence\"]).split())\n", "avg_tokens_test = num_tokens_test / test_data.num_rows\n", "print(f\"Avg tokens training data: {avg_tokens_test}\")" ] }, { "cell_type": "code", "execution_count": null, "id": "4f906c9c", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.6" } }, "nbformat": 4, "nbformat_minor": 5 }