{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "executionInfo": { "elapsed": 829, "status": "ok", "timestamp": 1641588786523, "user": { "displayName": "Yurii Paniv", "photoUrl": "https://lh3.googleusercontent.com/a/default-user=s64", "userId": "13095662915325887123" }, "user_tz": -120 }, "id": "YELVqGxMxnbG", "outputId": "876761c1-2e03-411b-e61b-07ac4ad61377" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Mon Dec 26 17:51:09 2022 \n", "+-----------------------------------------------------------------------------+\n", "| NVIDIA-SMI 515.86.01 Driver Version: 515.86.01 CUDA Version: 11.7 |\n", "|-------------------------------+----------------------+----------------------+\n", "| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\n", "| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\n", "| | | MIG M. |\n", "|===============================+======================+======================|\n", "| 0 NVIDIA GeForce ... 
Off | 00000000:0A:00.0 On | N/A |\n", "| 0% 34C P5 49W / 390W | 1437MiB / 24576MiB | 2% Default |\n", "| | | N/A |\n", "+-------------------------------+----------------------+----------------------+\n", " \n", "+-----------------------------------------------------------------------------+\n", "| Processes: |\n", "| GPU GI CI PID Type Process name GPU Memory |\n", "| ID ID Usage |\n", "|=============================================================================|\n", "| 0 N/A N/A 1223 G /usr/lib/xorg/Xorg 498MiB |\n", "| 0 N/A N/A 2007 G /usr/bin/kwalletd5 4MiB |\n", "| 0 N/A N/A 2174 G ...ec/xdg-desktop-portal-kde 4MiB |\n", "| 0 N/A N/A 2211 G /usr/bin/ksmserver 4MiB |\n", "| 0 N/A N/A 2213 G /usr/bin/kded5 4MiB |\n", "| 0 N/A N/A 2214 G /usr/bin/kwin_x11 96MiB |\n", "| 0 N/A N/A 2263 G /usr/bin/plasmashell 96MiB |\n", "| 0 N/A N/A 2283 G ...de-authentication-agent-1 4MiB |\n", "| 0 N/A N/A 2354 G ...x-gnu/libexec/kdeconnectd 4MiB |\n", "| 0 N/A N/A 2356 G .../usr/bin/telegram-desktop 7MiB |\n", "| 0 N/A N/A 2370 G /usr/bin/kaccess 4MiB |\n", "| 0 N/A N/A 2377 G .../libexec/DiscoverNotifier 4MiB |\n", "| 0 N/A N/A 2443 G ...1/usr/lib/firefox/firefox 96MiB |\n", "| 0 N/A N/A 2704 G /usr/bin/dolphin 4MiB |\n", "| 0 N/A N/A 2806 G /usr/bin/dolphin 4MiB |\n", "| 0 N/A N/A 2911 G /usr/bin/dolphin 4MiB |\n", "| 0 N/A N/A 6634 G ...RendererForSitePerProcess 585MiB |\n", "+-----------------------------------------------------------------------------+\n" ] } ], "source": [ "gpu_info = !nvidia-smi\n", "gpu_info = '\\n'.join(gpu_info)\n", "if gpu_info.find('failed') >= 0:\n", " print('Not connected to a GPU')\n", "else:\n", " print(gpu_info)" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "id": "c8eh87Hoee5d" }, "outputs": [], "source": [ "#%%capture\n", "#!pip install datasets==1.13.3\n", "#!pip install transformers==4.11.3\n", "#!pip install huggingface_hub==0.1\n", "#!pip install torchaudio==0.10.0+cu113 -f 
https://download.pytorch.org/whl/cu113/torch_stable.html\n", "#!pip install jiwer" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "executionInfo": { "elapsed": 5334, "status": "ok", "timestamp": 1641588811766, "user": { "displayName": "Yurii Paniv", "photoUrl": "https://lh3.googleusercontent.com/a/default-user=s64", "userId": "13095662915325887123" }, "user_tz": -120 }, "id": "2MMXcWFFgCXU", "outputId": "be9fd72e-4395-4cd0-ff87-631dad046e71" }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Reusing dataset common_voice_10_0 (/home/robinhad/.cache/huggingface/datasets/mozilla-foundation___common_voice_10_0/uk/10.0.0/27df768ab1b5cac48a7616f145b79b62599167b0ffa2e054bf4c3e74e9619e5e)\n", "Reusing dataset common_voice_10_0 (/home/robinhad/.cache/huggingface/datasets/mozilla-foundation___common_voice_10_0/uk/10.0.0/27df768ab1b5cac48a7616f145b79b62599167b0ffa2e054bf4c3e74e9619e5e)\n" ] } ], "source": [ "from datasets import load_dataset, load_metric, Audio\n", "\n", "common_voice_train = load_dataset(\"mozilla-foundation/common_voice_10_0\", \"uk\", split=\"train\", use_auth_token=True)\n", "common_voice_test = load_dataset(\"mozilla-foundation/common_voice_10_0\", \"uk\", split=\"test\", use_auth_token=True)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Dataset({\n", " features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment'],\n", " num_rows: 11463\n", "})" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "common_voice_train" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "common_voice_train.cleanup_cache_files()\n", 
"common_voice_test.cleanup_cache_files()" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "id": "kbyq6lDgQc2a" }, "outputs": [], "source": [ "common_voice_train = common_voice_train.remove_columns([\"accent\", \"age\", \"client_id\", \"down_votes\", \"gender\", \"locale\", \"segment\", \"up_votes\"])\n", "common_voice_test = common_voice_test.remove_columns([\"accent\", \"age\", \"client_id\", \"down_votes\", \"gender\", \"locale\", \"segment\", \"up_votes\"])" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "id": "72737oog2F6U" }, "outputs": [], "source": [
 "from datasets import ClassLabel\n",
 "import random\n",
 "import pandas as pd\n",
 "from IPython.display import display, HTML\n",
 "\n",
 "def show_random_elements(dataset, num_examples=10):\n",
 "    \"\"\"Display `num_examples` distinct, randomly chosen rows of `dataset` as an HTML table.\n",
 "\n",
 "    Args:\n",
 "        dataset: object supporting `len()` and indexing with a list of row\n",
 "            positions; the indexed result is handed to `pd.DataFrame`.\n",
 "        num_examples: number of distinct rows to show; must not exceed `len(dataset)`.\n",
 "\n",
 "    Raises:\n",
 "        AssertionError: if more rows are requested than the dataset holds.\n",
 "    \"\"\"\n",
 "    assert num_examples <= len(dataset), \"Can't pick more elements than there are in the dataset.\"\n",
 "    # random.sample draws without replacement in a single call, so there is no\n",
 "    # retry loop that slows down as num_examples approaches len(dataset).\n",
 "    picks = random.sample(range(len(dataset)), num_examples)\n",
 "\n",
 "    df = pd.DataFrame(dataset[picks])\n",
 "    display(HTML(df.to_html()))"
 ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 363 }, "executionInfo": { "elapsed": 39, "status": "ok", "timestamp": 1641588811771, "user": { "displayName": "Yurii Paniv", "photoUrl": "https://lh3.googleusercontent.com/a/default-user=s64", "userId": "13095662915325887123" }, "user_tz": -120 }, "id": "K_JUmf3G3b9S", "outputId": "8603c909-09e1-43ae-f7c2-b27b25d795a3" }, "outputs": [ { "data": { "text/html": [ "\n",
 " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n",
 " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n",
 " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n",
 " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n",
 " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n",
 " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
sentence
0Щоночі одна півсотня ночувала у лісі, друга — в селі.
1І зараз за тим днем твоя доля зміниться на краще.
2Оп'ять Ірисю посила:
3Рушниця впала по той бік плоту.
4Загуде вона, як гром.
5Дружина не витримує і сміється.
6Люблю, тільки боюся говорити.
7Звір заревів востаннє, сіпнувся головою назад і зник під водою.
8Про їзду риссю не могло бути й мови.
9Ми заночували в Бондуровій, — червоні — в Баландиному.
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "show_random_elements(common_voice_train.remove_columns([\"path\", \"audio\"]), num_examples=10)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "executionInfo": { "elapsed": 30, "status": "ok", "timestamp": 1641588811775, "user": { "displayName": "Yurii Paniv", "photoUrl": "https://lh3.googleusercontent.com/a/default-user=s64", "userId": "13095662915325887123" }, "user_tz": -120 }, "id": "XIHocAuTQbBR", "outputId": "e8392853-e0d1-45ba-df74-065c50565654" }, "outputs": [ { "data": { "application/json": { "ascii": false, "bar_format": null, "colour": null, "elapsed": 0.018886804580688477, "initial": 0, "n": 0, "ncols": null, "nrows": null, "postfix": null, "prefix": "", "rate": null, "total": 12, "unit": "ba", "unit_divisor": 1000, "unit_scale": false }, "application/vnd.jupyter.widget-view+json": { "model_id": "609c1f72241d438999ec5fffaf0f23f5", "version_major": 2, "version_minor": 0 }, "text/plain": [ " 0%| | 0/12 [00:00\n", " \n", " \n", " \n", " sentence\n", " \n", " \n", " \n", " \n", " 0\n", " простіть\n", " \n", " \n", " 1\n", " полковник дасть вам проїзні документи до табору\n", " \n", " \n", " 2\n", " зараз тільки про це йде мова\n", " \n", " \n", " 3\n", " у мене лопата вартий на килимі до кам'янки вернутися\n", " \n", " \n", " 4\n", " хотілося взнати про холодний яр про долю товаришів\n", " \n", " \n", " 5\n", " а той слухає уважно перепитує\n", " \n", " \n", " 6\n", " чи то образ перемінився в чоловіка чи чоловік був у тім образі\n", " \n", " \n", " 7\n", " кінь\n", " \n", " \n", " 8\n", " у самців вусики трохи довші а у самок дещо коротші за тіло\n", " \n", " \n", " 9\n", " чорний туман що сповивав усе довкола поволі сірів\n", " \n", " \n", "" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ 
"show_random_elements(common_voice_train.map(cleaner).remove_columns([\"path\", \"audio\"]), num_examples=10)" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Loading cached processed dataset at /home/robinhad/.cache/huggingface/datasets/mozilla-foundation___common_voice_10_0/uk/10.0.0/27df768ab1b5cac48a7616f145b79b62599167b0ffa2e054bf4c3e74e9619e5e/cache-890587fbc5f83609.arrow\n" ] }, { "data": { "application/json": { "ascii": false, "bar_format": null, "colour": null, "elapsed": 0.012672662734985352, "initial": 0, "n": 0, "ncols": null, "nrows": null, "postfix": null, "prefix": "", "rate": null, "total": 6783, "unit": "ex", "unit_divisor": 1000, "unit_scale": false }, "application/vnd.jupyter.widget-view+json": { "model_id": "c10e3f80cadb49a7951a2a6863af53bf", "version_major": 2, "version_minor": 0 }, "text/plain": [ " 0%| | 0/6783 [00:00\n", " \n", " \n", " \n", " sentence\n", " \n", " \n", " \n", " \n", " 0\n", " спостерігаючи за їхніми спокійними рухами пригадую роботу чекістів у льоху для розстрілів\n", " \n", " \n", " 1\n", " сірого били дрючками селяни що упіймали його з парою крадених коней\n", " \n", " \n", " 2\n", " мав лише револьвер далековид і планшет із мапою\n", " \n", " \n", " 3\n", " коли еней мене бажа\n", " \n", " \n", " 4\n", " ось і волость\n", " \n", " \n", " 5\n", " і посмоктали кісточки\n", " \n", " \n", " 6\n", " скакати високо\n", " \n", " \n", " 7\n", " я стрепенувся і відкрив повіки\n", " \n", " \n", " 8\n", " нападом на кінноту ми себе виявили\n", " \n", " \n", " 9\n", " червоні розгубилися і в безладі закрутилися на місці\n", " \n", " \n", "" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "show_random_elements(common_voice_train.remove_columns([\"path\",\"audio\"]))" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "id": "LwCshNbbeRZR" }, "outputs": [], "source": [
 "def extract_all_chars(batch):\n",
 "    \"\"\"Collect the distinct characters used across a batch of sentences.\n",
 "\n",
 "    Joins every entry of `batch[\"sentence\"]` with single spaces and returns\n",
 "    both the joined text and the list of unique characters found in it, each\n",
 "    wrapped in a one-element list.\n",
 "    \"\"\"\n",
 "    joined = \" \".join(batch[\"sentence\"])\n",
 "    unique_chars = [*set(joined)]\n",
 "    return {\"vocab\": [unique_chars], \"all_text\": [joined]}"
 ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 81, "referenced_widgets": [ "116786d9364a4a57b521cddaabeda688", "9baa2f69aa9c4387bf1086a04ed78420", "a1e2c04dc2cb45ea80bec125e3dbf56f", "b6d46d40efa14b21814f41531f5a2f41", "d8bf8dc5d6c84140a4e96c9c435b8f17", "04ec68b059df4c628839c3ac29e2ebdd", "427056895c674c428400bee0f5b43995", "d518f2c2ab6945b78a6d336dad6262bd", "77f1a51099b24831ad8b2be3d2dc833a", "5815ae1348994bfebba4a8e968489a96", "22ba979142074f1d976e1a905544fd2d", "8b6b7f28751c45c8869aa86eb2a0ab26", "445c84e1e2e541f2a54fb989def386ae", "68502fb433564eee8dfdf272ed7e4f56", "1f3abdf2e0f6459da4179a94d691c4c4", "48c60be3ca9349a295b83f65769c7f27", "6c80bd8a8fe14a5989fe27445c14650f", "5c2a7fea8c434d51ada69a0854b88baf", "414efa8a08cd491cb78af8a95a151daa", "c31a747e18df4b4aa4449a30e387448c", "3dedffa30b774426bd474072a3a0d591", "05d8496d54174ae298c319b0194fc710" ] }, "executionInfo": { "elapsed": 560, "status": "ok", "timestamp": 1641588812313, "user": { "displayName": "Yurii Paniv", "photoUrl": "https://lh3.googleusercontent.com/a/default-user=s64", "userId": "13095662915325887123" }, "user_tz": -120 }, "id": "_m6uUjjcfbjH", "outputId": "4cc94e18-9295-4414-c611-c98916fe3d4d" }, "outputs": [ { "data": { "application/json": { "ascii": false, "bar_format": null, "colour": null, "elapsed": 0.01583385467529297, "initial": 0, "n": 0, "ncols": null, "nrows": null, "postfix": null, "prefix": "", "rate": null, "total": 1, "unit": "ba", "unit_divisor": 1000, "unit_scale": false }, "application/vnd.jupyter.widget-view+json": { "model_id": "3ba1ea8f16284564b96209d4ac0fe9b2", "version_major": 2, "version_minor": 0 }, "text/plain": [ " 0%| | 0/1 [00:00\n", " \n", " Your browser does not support the audio element.\n", " \n", " " ],
"text/plain": [ "" ] }, "execution_count": 33, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "import IPython.display as ipd\n",
 "import numpy as np\n",
 "import random\n",
 "\n",
 "rand_int = random.randint(0, len(common_voice_train)-1)\n",
 "# Fetch the row once and reuse it: every `common_voice_train[i]` access goes\n",
 "# through the dataset's audio decoding again.\n",
 "sample = common_voice_train[rand_int]\n",
 "\n",
 "print(sample[\"sentence\"])\n",
 "# Play back at the rate stored with the sample instead of a hard-coded 16000,\n",
 "# so the clip is not pitch-shifted if the column is ever resampled differently.\n",
 "ipd.Audio(data=sample[\"audio\"][\"array\"], autoplay=True, rate=sample[\"audio\"][\"sampling_rate\"])"
 ] }, { "cell_type": "code", "execution_count": 34, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "executionInfo": { "elapsed": 22, "status": "ok", "timestamp": 1641588821176, "user": { "displayName": "Yurii Paniv", "photoUrl": "https://lh3.googleusercontent.com/a/default-user=s64", "userId": "13095662915325887123" }, "user_tz": -120 }, "id": "1Po2g7YPuRTx", "outputId": "ad79ec8a-ab5a-4c52-edfa-a20d0eec2282" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Target text: троє\n", "Input array shape: (26496,)\n", "Sampling rate: 16000\n" ] } ], "source": [
 "rand_int = random.randint(0, len(common_voice_train)-1)\n",
 "# Index the dataset a single time; the three prints below all read the same row.\n",
 "sample = common_voice_train[rand_int]\n",
 "\n",
 "print(\"Target text:\", sample[\"sentence\"])\n",
 "print(\"Input array shape:\", sample[\"audio\"][\"array\"].shape)\n",
 "print(\"Sampling rate:\", sample[\"audio\"][\"sampling_rate\"])"
 ] }, { "cell_type": "code", "execution_count": 35, "metadata": { "id": "eJY7I0XAwe9p" }, "outputs": [], "source": [
 "def prepare_dataset(batch):\n",
 "    \"\"\"Turn one dataset row into model inputs and label ids.\n",
 "\n",
 "    Reads `batch[\"audio\"]` (its `array` and `sampling_rate`) and\n",
 "    `batch[\"sentence\"]`, runs them through the notebook-level `processor`,\n",
 "    and stores the results on the same mapping.\n",
 "\n",
 "    Adds the keys:\n",
 "        input_values: first element of the processor's `input_values` for the audio.\n",
 "        input_length: `len()` of `input_values`.\n",
 "        labels: `input_ids` produced for the sentence while the processor is\n",
 "            switched into target mode via `as_target_processor()`.\n",
 "\n",
 "    Returns:\n",
 "        The same `batch` object, mutated in place.\n",
 "    \"\"\"\n",
 "    audio = batch[\"audio\"]\n",
 "\n",
 "    # batched output is \"un-batched\"\n",
 "    batch[\"input_values\"] = processor(audio[\"array\"], sampling_rate=audio[\"sampling_rate\"]).input_values[0]\n",
 "    batch[\"input_length\"] = len(batch[\"input_values\"])\n",
 "\n",
 "    # Inside this context the processor handles text (targets) rather than audio.\n",
 "    with processor.as_target_processor():\n",
 "        batch[\"labels\"] = processor(batch[\"sentence\"]).input_ids\n",
 "    return batch"
 ] }, { "cell_type": "code", "execution_count": 36, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 
87, "referenced_widgets": [ "a29f88f174f8499082fbb36a36c47fa4", "efc3bc0c48124ebeb79d245216eaf0fe", "d45747150d0b434593a3a7c98399599a", "ea73f7deb1c643f7b81de7fb7acaaf1b", "18bc63944343440f837cdff76db004fc", "9c875952cdd649a5bab87de9bb3f5200", "aa329cb93df44a6da6012c7cc49d7489", "b39b6e9131ca4ce3b31e84ceb04e1b83", "c5eed102ef134a4e8ca41713b82ff6a4", "e6e50da6516847878309fdc5c463edb3", "a4ae510b4f3845f891a796cf844fc2bb" ] }, "executionInfo": { "elapsed": 107521, "status": "ok", "timestamp": 1641588928679, "user": { "displayName": "Yurii Paniv", "photoUrl": "https://lh3.googleusercontent.com/a/default-user=s64", "userId": "13095662915325887123" }, "user_tz": -120 }, "id": "-np9xYK-wl8q", "outputId": "779b4637-0606-4cc8-be3c-16c1c4241e63" }, "outputs": [ { "data": { "application/json": { "ascii": false, "bar_format": null, "colour": null, "elapsed": 0.0130767822265625, "initial": 0, "n": 0, "ncols": null, "nrows": null, "postfix": null, "prefix": "", "rate": null, "total": 11463, "unit": "ex", "unit_divisor": 1000, "unit_scale": false }, "application/vnd.jupyter.widget-view+json": { "model_id": "9b61f7e540b64679b38d986c03621299", "version_major": 2, "version_minor": 0 }, "text/plain": [ " 0%| | 0/11463 [00:00