{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "a3f1c0de",
   "metadata": {},
   "source": [
    "# Common Voice 7.0 (sv-SE): speaker overlap and sentence length\n",
    "\n",
    "Loads the `sv-SE` configuration of `mozilla-foundation/common_voice_7_0`, checks whether any speaker (`client_id`) appears in both the `train+validation` and `test` splits, and compares the average number of whitespace-separated tokens per cleaned sentence in each."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c9526c52",
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "\n",
    "import datasets\n",
    "from datasets import DatasetDict, load_dataset, load_metric"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "cc9f1c45",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Dataset id and language configuration passed to load_dataset.\n",
    "dataset_name = \"mozilla-foundation/common_voice_7_0\"\n",
    "dataset_config_name = \"sv-SE\"\n",
    "# Split expression used for the training set (train and validation together).\n",
    "train_split_name = \"train+validation\"\n",
    "# Forwarded to load_dataset as use_auth_token; presumably needed because the\n",
    "# dataset is gated on the Hub -- TODO confirm.\n",
    "use_auth_token = True"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "21fd7030",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Container for the splits loaded below, keyed by \"train\" and \"test\".\n",
    "raw_datasets = DatasetDict()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "id": "81a27912",
   "metadata": {},
   "outputs": [],
   "source": [
    "raw_datasets[\"train\"] = load_dataset(\n",
    "    dataset_name,\n",
    "    dataset_config_name,\n",
    "    split=train_split_name,\n",
    "    use_auth_token=use_auth_token,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "id": "7945cada",
   "metadata": {},
   "outputs": [],
   "source": [
    "raw_datasets[\"test\"] = load_dataset(\n",
    "    dataset_name,\n",
    "    dataset_config_name,\n",
    "    split=\"test\",\n",
    "    use_auth_token=use_auth_token,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "id": "c98cb649",
   "metadata": {},
   "outputs": [],
   "source": [
    "training_data = raw_datasets[\"train\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "id": "1aead6a1",
   "metadata": {},
   "outputs": [],
   "source": [
    "test_data = raw_datasets[\"test\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "id": "97e9a626",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Dataset({\n",
       "    features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment'],\n",
       "    num_rows: 11030\n",
       "})"
      ]
     },
     "execution_count": 37,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "training_data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "id": "fc794e39",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Dataset({\n",
       "    features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment'],\n",
       "    num_rows: 4620\n",
       "})"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "test_data"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b8e2d417",
   "metadata": {},
   "source": [
    "## Speaker overlap between splits\n",
    "\n",
    "Group clip paths by `client_id` in each split, then count the speakers that occur in both."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c4a9e6b2",
   "metadata": {},
   "outputs": [],
   "source": [
    "def group_paths_by_speaker(dataset):\n",
    "    \"\"\"Map each speaker to the clip paths recorded under that speaker.\n",
    "\n",
    "    Args:\n",
    "        dataset: Split exposing \"client_id\" and \"path\" columns.\n",
    "\n",
    "    Returns:\n",
    "        dict mapping each client_id to the list of its paths, in dataset order.\n",
    "    \"\"\"\n",
    "    paths_by_speaker = {}\n",
    "    # Read only the two columns that are needed; iterating whole records\n",
    "    # would also build every row's \"audio\" entry, which is never used here.\n",
    "    for client_id, path in zip(dataset[\"client_id\"], dataset[\"path\"]):\n",
    "        paths_by_speaker.setdefault(client_id, []).append(path)\n",
    "    return paths_by_speaker"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "31b328fd",
   "metadata": {},
   "outputs": [],
   "source": [
    "train_speakers_dict = group_paths_by_speaker(training_data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7eba5861",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Count the speakers (dict keys), not the characters of the formatted message.\n",
    "print(f\"Speakers in training set: {len(train_speakers_dict)}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "17905c39",
   "metadata": {},
   "outputs": [],
   "source": [
    "test_speakers_dict = group_paths_by_speaker(test_data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "25a25454",
   "metadata": {},
   "outputs": [],
   "source": [
    "print(f\"Speakers in test set: {len(test_speakers_dict)}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f72bdb7a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Speakers whose client_id is a key in both per-split dicts.\n",
    "shared_speakers = set(train_speakers_dict).intersection(test_speakers_dict)\n",
    "print(f\"Speakers in both training and test sets: {len(shared_speakers)}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d5b07f93",
   "metadata": {},
   "source": [
    "## Average sentence length\n",
    "\n",
    "Sentences are lower-cased, stripped of the characters in `chars_to_ignore_regex`, and split on whitespace before counting tokens.\n",
    "\n",
    "In the last recorded run the averages were about 7.24 tokens per sentence for `train+validation` and about 7.07 for `test`; re-run the cells below to refresh them."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ed6bc20b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Characters stripped from sentences before tokenising. A raw string is used\n",
    "# so the regex escapes (\\-, \\;, \\:) are not parsed as (invalid) Python\n",
    "# string escapes; the pattern text itself is unchanged.\n",
    "chars_to_ignore_regex = r'[,?.!\\-\\;\\:\"“%‘”�—’…–]'\n",
    "\n",
    "\n",
    "def clean_text(text):\n",
    "    \"\"\"Lower-case `text` and delete every character matched by `chars_to_ignore_regex`.\"\"\"\n",
    "    return re.sub(chars_to_ignore_regex, \"\", text.lower())\n",
    "\n",
    "\n",
    "def count_tokens(sentences):\n",
    "    \"\"\"Return the total number of whitespace-separated tokens in `sentences` after `clean_text`.\"\"\"\n",
    "    return sum(len(clean_text(sentence).split()) for sentence in sentences)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "16b289be",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Only the \"sentence\" column is read; the other fields are not needed here.\n",
    "num_tokens_train = count_tokens(training_data[\"sentence\"])\n",
    "avg_tokens_train = num_tokens_train / training_data.num_rows\n",
    "print(f\"Avg tokens training data: {avg_tokens_train}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "364aff29",
   "metadata": {},
   "outputs": [],
   "source": [
    "num_tokens_test = count_tokens(test_data[\"sentence\"])\n",
    "avg_tokens_test = num_tokens_test / test_data.num_rows\n",
    "print(f\"Avg tokens test data: {avg_tokens_test}\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}