{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "c9526c52",
   "metadata": {},
   "outputs": [],
   "source": [
    "import datasets\n",
    "from datasets import DatasetDict, load_dataset, load_metric"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "id": "663ff92e",
   "metadata": {},
   "outputs": [],
   "source": [
    "import re"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "cc9f1c45",
   "metadata": {},
   "outputs": [],
   "source": [
    "dataset_name = \"mozilla-foundation/common_voice_7_0\"\n",
    "dataset_config_name = \"sv-SE\"\n",
    "train_split_name = \"train+validation\"\n",
    "use_auth_token = True"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "21fd7030",
   "metadata": {},
   "outputs": [],
   "source": [
    "raw_datasets = DatasetDict()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "id": "81a27912",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Reusing dataset common_voice (/Users/emiliomarinone/.cache/huggingface/datasets/mozilla-foundation___common_voice/sv-SE/7.0.0/33e08856cfa0d0665e837bcad73ffd920a0bc713ce8c5fffb55dbdf1c084d5ba)\n"
     ]
    }
   ],
   "source": [
    "raw_datasets[\"train\"] = load_dataset(\n",
    "    dataset_name,\n",
    "    dataset_config_name,\n",
    "    split=train_split_name,\n",
    "    use_auth_token=use_auth_token,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "id": "7945cada",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Reusing dataset common_voice (/Users/emiliomarinone/.cache/huggingface/datasets/mozilla-foundation___common_voice/sv-SE/7.0.0/33e08856cfa0d0665e837bcad73ffd920a0bc713ce8c5fffb55dbdf1c084d5ba)\n"
     ]
    }
   ],
   "source": [
    "raw_datasets[\"test\"] = load_dataset(\n",
    "    dataset_name,\n",
    "    dataset_config_name,\n",
    "    split=\"test\",\n",
    "    use_auth_token=use_auth_token,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "id": "c98cb649",
   "metadata": {},
   "outputs": [],
   "source": [
    "training_data = raw_datasets[\"train\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "id": "1aead6a1",
   "metadata": {},
   "outputs": [],
   "source": [
    "test_data = raw_datasets[\"test\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "id": "97e9a626",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Dataset({\n",
       "    features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment'],\n",
       "    num_rows: 11030\n",
       "})"
      ]
     },
     "execution_count": 37,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "training_data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "id": "fc794e39",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Dataset({\n",
       "    features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment'],\n",
       "    num_rows: 4620\n",
       "})"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "test_data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "id": "31b328fd",
   "metadata": {},
   "outputs": [],
   "source": [
    "train_speakers_dict = {}\n",
    "for record in training_data:\n",
    "    try:\n",
    "        speakers_dict[record[\"client_id\"]].append(record[\"path\"])\n",
    "    except:\n",
    "        speakers_dict[record[\"client_id\"]] = [record[\"path\"]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "id": "7eba5861",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(f\"Speakers in training set: {train_speakers_dict}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "id": "17905c39",
   "metadata": {},
   "outputs": [],
   "source": [
    "test_speakers_dict = {}\n",
    "for record in test_data:\n",
    "    try:\n",
    "        speakers_dict[record[\"client_id\"]].append(record[\"path\"])\n",
    "    except:\n",
    "        speakers_dict[record[\"client_id\"]] = [record[\"path\"]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "id": "25a25454",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "24"
      ]
     },
     "execution_count": 43,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(f\"Speakers in test set: {test_speakers_dict}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "id": "f72bdb7a",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Speakers in both training and test sets: 0\n"
     ]
    }
   ],
   "source": [
    "c = 0\n",
    "for speaker in test_speakers_dict:\n",
    "    if speaker in train_speakers_dict:\n",
    "        c+=1\n",
    "print(f\"Speakers in both training and test sets: {c}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "id": "ed6bc20b",
   "metadata": {},
   "outputs": [],
   "source": [
    "chars_to_ignore_regex = '[,?.!\\-\\;\\:\"“%‘”�—’…–]'\n",
    "def clean_text(text):\n",
    "    return re.sub(chars_to_ignore_regex, \"\", text.lower())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "id": "16b289be",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Avg tokens training data: 7.243336355394379\n"
     ]
    }
   ],
   "source": [
    "num_tokens_train = 0\n",
    "for record in training_data:\n",
    "    num_tokens_train += len(clean_text(record[\"sentence\"]).split())\n",
    "avg_tokens_train = num_tokens_train / training_data.num_rows\n",
    "print(f\"Avg tokens training data: {avg_tokens_train}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "id": "364aff29",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Avg tokens training data: 7.074891774891775\n"
     ]
    }
   ],
   "source": [
    "num_tokens_test = 0\n",
    "for record in test_data:\n",
    "    num_tokens_test += len(clean_text(record[\"sentence\"]).split())\n",
    "avg_tokens_test = num_tokens_test / test_data.num_rows\n",
    "print(f\"Avg tokens training data: {avg_tokens_test}\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}