{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "executionInfo": { "elapsed": 829, "status": "ok", "timestamp": 1641588786523, "user": { "displayName": "Yurii Paniv", "photoUrl": "https://lh3.googleusercontent.com/a/default-user=s64", "userId": "13095662915325887123" }, "user_tz": -120 }, "id": "YELVqGxMxnbG", "outputId": "876761c1-2e03-411b-e61b-07ac4ad61377" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Wed Dec 28 20:57:11 2022 \n", "+-----------------------------------------------------------------------------+\n", "| NVIDIA-SMI 515.86.01 Driver Version: 515.86.01 CUDA Version: 11.7 |\n", "|-------------------------------+----------------------+----------------------+\n", "| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\n", "| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\n", "| | | MIG M. |\n", "|===============================+======================+======================|\n", "| 0 NVIDIA GeForce ... 
Off | 00000000:0A:00.0 On | N/A |\n", "| 0% 31C P8 36W / 390W | 1401MiB / 24576MiB | 3% Default |\n", "| | | N/A |\n", "+-------------------------------+----------------------+----------------------+\n", " \n", "+-----------------------------------------------------------------------------+\n", "| Processes: |\n", "| GPU GI CI PID Type Process name GPU Memory |\n", "| ID ID Usage |\n", "|=============================================================================|\n", "| 0 N/A N/A 1267 G /usr/lib/xorg/Xorg 574MiB |\n", "| 0 N/A N/A 2054 G /usr/bin/kwalletd5 4MiB |\n", "| 0 N/A N/A 2222 G ...ec/xdg-desktop-portal-kde 4MiB |\n", "| 0 N/A N/A 2259 G /usr/bin/ksmserver 4MiB |\n", "| 0 N/A N/A 2261 G /usr/bin/kded5 4MiB |\n", "| 0 N/A N/A 2262 G /usr/bin/kwin_x11 97MiB |\n", "| 0 N/A N/A 2309 G /usr/bin/plasmashell 130MiB |\n", "| 0 N/A N/A 2332 G ...de-authentication-agent-1 4MiB |\n", "| 0 N/A N/A 2399 G ...x-gnu/libexec/kdeconnectd 4MiB |\n", "| 0 N/A N/A 2401 G .../usr/bin/telegram-desktop 7MiB |\n", "| 0 N/A N/A 2415 G /usr/bin/kaccess 4MiB |\n", "| 0 N/A N/A 2421 G .../libexec/DiscoverNotifier 4MiB |\n", "| 0 N/A N/A 2438 G ...1/usr/lib/firefox/firefox 216MiB |\n", "| 0 N/A N/A 2626 G /usr/bin/dolphin 4MiB |\n", "| 0 N/A N/A 2774 G /usr/bin/dolphin 4MiB |\n", "| 0 N/A N/A 2824 G /usr/bin/dolphin 4MiB |\n", "| 0 N/A N/A 3559 G /usr/bin/dolphin 4MiB |\n", "| 0 N/A N/A 3665 G /usr/bin/dolphin 4MiB |\n", "| 0 N/A N/A 4830 G ...RendererForSitePerProcess 308MiB |\n", "+-----------------------------------------------------------------------------+\n" ] } ], "source": [ "gpu_info = !nvidia-smi\n", "gpu_info = '\\n'.join(gpu_info)\n", "if gpu_info.find('failed') >= 0:\n", " print('Not connected to a GPU')\n", "else:\n", " print(gpu_info)" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "id": "c8eh87Hoee5d" }, "outputs": [], "source": [ "#%%capture\n", "#!pip install datasets==1.13.3\n", "#!pip install transformers==4.11.3\n", "#!pip install 
from datasets import load_dataset, load_metric, Audio

# Load the "train" and "test" splits of the "uk" configuration with identical
# arguments; only the split name differs between the two calls.
# NOTE(review): use_auth_token=True presumably reflects that the dataset is
# gated on the Hub — confirm.
common_voice_train, common_voice_test = (
    load_dataset("mozilla-foundation/common_voice_10_0", "uk", split=split_name, use_auth_token=True)
    for split_name in ("train", "test")
)
def show_random_elements(dataset, num_examples=10):
    """Display a random selection of distinct rows from ``dataset`` as an HTML table.

    Args:
        dataset: A sized object that can be indexed with a list of integer
            positions; whatever that lookup returns is handed to
            ``pd.DataFrame``.
        num_examples: How many distinct rows to show (default 10).

    Raises:
        AssertionError: If ``num_examples`` is larger than ``len(dataset)``.
    """
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."

    # random.sample draws without replacement, so the indices are guaranteed
    # unique without a manual "retry while already picked" loop.
    picks = random.sample(range(len(dataset)), num_examples)

    df = pd.DataFrame(dataset[picks])
    display(HTML(df.to_html()))
\n", " \n", " \n", " \n", " \n", " \n", " \n", "
sentence
0У червоних — невдачі на фронті.
1Він нагадував недавні ночі в кам'янському парку.
2Творення займенників
3Коли бідний жениться, ніч мала.
4Чорнота і Бугай злізли з дерев і пішли назирцем.
5крик.
6Крім того, мало не завжди погода примушує його десь заночувати.
7Така вже мода тепер.
8Летить що має сили до вікна і — грим грудьми до шибки.
9Ворожа лава проминула вже балку, а Василенко не стріляв.
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "show_random_elements(common_voice_train.remove_columns([\"path\", \"audio\"]), num_examples=10)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "executionInfo": { "elapsed": 30, "status": "ok", "timestamp": 1641588811775, "user": { "displayName": "Yurii Paniv", "photoUrl": "https://lh3.googleusercontent.com/a/default-user=s64", "userId": "13095662915325887123" }, "user_tz": -120 }, "id": "XIHocAuTQbBR", "outputId": "e8392853-e0d1-45ba-df74-065c50565654" }, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "cac2ebe21a844f7c8d3699f811555e9c", "version_major": 2, "version_minor": 0 }, "text/plain": [ " 0%| | 0/12 [00:00\n", " \n", " \n", " \n", " sentence\n", " \n", " \n", " \n", " \n", " 0\n", " отаманенко почав пояснювати з наукової точки але дід перебив його\n", " \n", " \n", " 1\n", " енею глуздівно сказав\n", " \n", " \n", " 2\n", " ні розвідки вперед ні стежі до лісу\n", " \n", " \n", " 3\n", " ну ну та я нічого не кажу\n", " \n", " \n", " 4\n", " якось прийшов зв'язковий із мельників\n", " \n", " \n", " 5\n", " я хоч не з мельників так мені оповідав батько був козаком у холодному яру\n", " \n", " \n", " 6\n", " ну бо\n", " \n", " \n", " 7\n", " макітру одділив од плеч\n", " \n", " \n", " 8\n", " видно стріляла розвідка ударників що йшла із собакою попереду групи\n", " \n", " \n", " 9\n", " левко слабий лежить просить щоб зайшов\n", " \n", " \n", "" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "show_random_elements(common_voice_train.map(cleaner).remove_columns([\"path\", \"audio\"]), num_examples=10)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Loading cached processed dataset at 
def extract_all_chars(batch):
    """Collect the distinct characters used across a batch of sentences.

    Args:
        batch: Mapping whose ``"sentence"`` entry is an iterable of strings.

    Returns:
        A dict with two single-element lists:
        ``"vocab"`` wraps the sorted list of unique characters, and
        ``"all_text"`` wraps all sentences joined with single spaces.
    """
    all_text = " ".join(batch["sentence"])
    # Sort the unique characters: iteration order of a set of strings can
    # differ between interpreter runs, which would make the vocabulary order
    # (and anything indexed from it) non-reproducible.
    vocab = sorted(set(all_text))
    return {"vocab": [vocab], "all_text": [all_text]}
import IPython.display as ipd
import numpy as np
import random

# Pick one random training row and fetch it once, so the printed sentence and
# the played audio are taken from a single lookup instead of indexing the
# dataset twice.
rand_int = random.randint(0, len(common_voice_train) - 1)
preview_sample = common_voice_train[rand_int]

print(preview_sample["sentence"])
# Play back at the rate stored with the clip rather than a hard-coded 16000,
# so the preview stays correct if the audio column uses a different rate.
ipd.Audio(
    data=preview_sample["audio"]["array"],
    autoplay=True,
    rate=preview_sample["audio"]["sampling_rate"],
)
def prepare_dataset(batch):
    """Add model inputs and label ids to one example, in place.

    Reads ``batch["audio"]`` (its ``"array"`` and ``"sampling_rate"`` entries)
    and ``batch["sentence"]``, and uses the module-level ``processor``.

    Writes three keys on ``batch``:
        ``"input_values"``: first entry of the processor's ``input_values``
            for the audio array.
        ``"input_length"``: ``len()`` of that entry.
        ``"labels"``: ``input_ids`` produced by the processor for the
            sentence while inside ``processor.as_target_processor()``.

    Returns:
        The same ``batch`` object, after the keys above have been set.
    """
    clip = batch["audio"]

    # The processor returns a batched result; keep only its first entry.
    audio_features = processor(clip["array"], sampling_rate=clip["sampling_rate"])
    first_input = audio_features.input_values[0]
    batch["input_values"] = first_input
    batch["input_length"] = len(first_input)

    # Inside this context the same processor call is applied to the text side.
    with processor.as_target_processor():
        text_encoding = processor(batch["sentence"])
        batch["labels"] = text_encoding.input_ids

    return batch
"application/vnd.jupyter.widget-view+json": { "model_id": "a4013929a3b945ef9dcd3041f0cc3e91", "version_major": 2, "version_minor": 0 }, "text/plain": [ " 0%| | 0/11463 [00:00