diff --git "a/wav2vec2/wav2vec_data.ipynb" "b/wav2vec2/wav2vec_data.ipynb" --- "a/wav2vec2/wav2vec_data.ipynb" +++ "b/wav2vec2/wav2vec_data.ipynb" @@ -26,16 +26,16 @@ "name": "stdout", "output_type": "stream", "text": [ - "Fri Sep 2 01:31:23 2022 \n", + "Mon Dec 26 17:51:09 2022 \n", "+-----------------------------------------------------------------------------+\n", - "| NVIDIA-SMI 515.65.01 Driver Version: 515.65.01 CUDA Version: 11.7 |\n", + "| NVIDIA-SMI 515.86.01 Driver Version: 515.86.01 CUDA Version: 11.7 |\n", "|-------------------------------+----------------------+----------------------+\n", "| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\n", "| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\n", "| | | MIG M. |\n", "|===============================+======================+======================|\n", "| 0 NVIDIA GeForce ... Off | 00000000:0A:00.0 On | N/A |\n", - "| 0% 35C P5 52W / 390W | 1231MiB / 24576MiB | 34% Default |\n", + "| 0% 34C P5 49W / 390W | 1437MiB / 24576MiB | 2% Default |\n", "| | | N/A |\n", "+-------------------------------+----------------------+----------------------+\n", " \n", @@ -44,13 +44,23 @@ "| GPU GI CI PID Type Process name GPU Memory |\n", "| ID ID Usage |\n", "|=============================================================================|\n", - "| 0 N/A N/A 1216 G /usr/lib/xorg/Xorg 485MiB |\n", - "| 0 N/A N/A 1601 G /usr/bin/kwin_x11 97MiB |\n", - "| 0 N/A N/A 1650 G /usr/bin/plasmashell 64MiB |\n", - "| 0 N/A N/A 1747 G telegram-desktop 4MiB |\n", - "| 0 N/A N/A 4701 G ...5/usr/lib/firefox/firefox 175MiB |\n", - "| 0 N/A N/A 804722 G ...RendererForSitePerProcess 363MiB |\n", - "| 0 N/A N/A 867357 G ...996071496053229024,131072 35MiB |\n", + "| 0 N/A N/A 1223 G /usr/lib/xorg/Xorg 498MiB |\n", + "| 0 N/A N/A 2007 G /usr/bin/kwalletd5 4MiB |\n", + "| 0 N/A N/A 2174 G ...ec/xdg-desktop-portal-kde 4MiB |\n", + "| 0 N/A N/A 2211 G /usr/bin/ksmserver 4MiB |\n", + "| 0 N/A N/A 2213 G /usr/bin/kded5 4MiB |\n", + "| 0 N/A N/A 2214 G /usr/bin/kwin_x11 96MiB |\n", + "| 0 N/A N/A 2263 G /usr/bin/plasmashell 96MiB |\n", + "| 0 N/A N/A 2283 G ...de-authentication-agent-1 4MiB |\n", + "| 0 N/A N/A 2354 G ...x-gnu/libexec/kdeconnectd 4MiB |\n", + "| 0 N/A N/A 2356 G .../usr/bin/telegram-desktop 7MiB |\n", + "| 0 N/A N/A 2370 G /usr/bin/kaccess 4MiB |\n", + "| 0 N/A N/A 2377 G .../libexec/DiscoverNotifier 4MiB |\n", + "| 0 N/A N/A 2443 G ...1/usr/lib/firefox/firefox 96MiB |\n", + "| 0 N/A N/A 2704 G /usr/bin/dolphin 4MiB |\n", + "| 0 N/A N/A 2806 G /usr/bin/dolphin 4MiB |\n", + "| 0 N/A N/A 2911 G /usr/bin/dolphin 4MiB |\n", + "| 0 N/A N/A 6634 G ...RendererForSitePerProcess 585MiB |\n", "+-----------------------------------------------------------------------------+\n" ] } @@ -82,7 +92,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 5, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -120,7 +130,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -132,7 +142,7 @@ "})" ] }, - "execution_count": 4, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -143,7 +153,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -152,7 +162,7 @@ "0" ] }, - "execution_count": 5, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -164,7 +174,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 8, "metadata": { 
"id": "kbyq6lDgQc2a" }, @@ -176,7 +186,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 9, "metadata": { "id": "72737oog2F6U" }, @@ -202,7 +212,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 10, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -236,43 +246,43 @@ " \n", " \n", " 0\n", - " Вона нас не лякає.\n", + " Щоночі одна півсотня ночувала у лісі, друга — в селі.\n", " \n", " \n", " 1\n", - " Бейнбрідж затримався, готуючи екіпажі, й фактично не встиг узяти участі у війні.\n", + " І зараз за тим днем твоя доля зміниться на краще.\n", " \n", " \n", " 2\n", - " А тепер.\n", + " Оп'ять Ірисю посила:\n", " \n", " \n", " 3\n", - " Наші \"будьонівці\", ніби з цікавості, зібралися й оточили червоні шеренги.\n", + " Рушниця впала по той бік плоту.\n", " \n", " \n", " 4\n", - " Серед квітів я вмирав, Серед хмар я воскресав.\n", + " Загуде вона, як гром.\n", " \n", " \n", " 5\n", - " Сьогодні виробництво полімерів найбільша галузь хімічної промисловості.\n", + " Дружина не витримує і сміється.\n", " \n", " \n", " 6\n", - " Хмельницький заплатив за все на цілий рік наперед.\n", + " Люблю, тільки боюся говорити.\n", " \n", " \n", " 7\n", - " Соловій же залишився підпалити бікфордів шнур.\n", + " Звір заревів востаннє, сіпнувся головою назад і зник під водою.\n", " \n", " \n", " 8\n", - " Тоді його слово буде хвилювати, захоплювати, піднімати людську душу.\n", + " Про їзду риссю не могло бути й мови.\n", " \n", " \n", " 9\n", - " Тут були яблуні, сливи, вишні, — вишень найбільше.\n", + " Ми заночували в Бондуровій, — червоні — в Баландиному.\n", " \n", " \n", "" @@ -291,7 +301,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 11, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -317,7 +327,7 @@ "ascii": false, "bar_format": null, "colour": null, - "elapsed": 0.00995326042175293, + "elapsed": 0.018886804580688477, "initial": 0, "n": 0, "ncols": null, @@ -331,7 +341,7 @@ "unit_scale": false }, "application/vnd.jupyter.widget-view+json": { - "model_id": "4c0c81459dfb4ede8f0ec6fe25a0807e", + "model_id": "609c1f72241d438999ec5fffaf0f23f5", "version_major": 2, "version_minor": 0 }, @@ -348,7 +358,7 @@ "ascii": false, "bar_format": null, "colour": null, - "elapsed": 0.007294893264770508, + "elapsed": 0.011157989501953125, "initial": 0, "n": 0, "ncols": null, @@ -362,7 +372,7 @@ "unit_scale": false }, "application/vnd.jupyter.widget-view+json": { - "model_id": "66c16ae632444339ae8ec80070398586", + "model_id": "29df50a982df4687ae195c2edd1145d3", "version_major": 2, "version_minor": 0 }, @@ -382,7 +392,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 12, "metadata": { "id": "ZcVsD0ETElrR" }, @@ -393,7 +403,7 @@ "{'sentence': \"привіт як у тебе справи загалом м'якотілий друже\"}" ] }, - "execution_count": 10, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -465,7 +475,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 13, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -492,7 +502,7 @@ "ascii": false, "bar_format": null, "colour": null, - "elapsed": 0.00716710090637207, + "elapsed": 0.014832735061645508, "initial": 0, "n": 0, "ncols": null, @@ -506,7 +516,7 @@ "unit_scale": false }, "application/vnd.jupyter.widget-view+json": { - "model_id": "ca29db891c7f4d0cbd328a65477d2392", + "model_id": "0fc7a813101e45a9866279d6d06f4d4f", "version_major": 2, "version_minor": 0 }, @@ -530,43 +540,43 @@ " 
\n", " \n", " 0\n", - " так так усе на світі кінчається\n", + " простіть\n", " \n", " \n", " 1\n", - " комуністів тільки й є що воєнком та два ротні політруки\n", + " полковник дасть вам проїзні документи до табору\n", " \n", " \n", " 2\n", - " уже й убитих чимало\n", + " зараз тільки про це йде мова\n", " \n", " \n", " 3\n", - " трупів не закопували\n", + " у мене лопата вартий на килимі до кам'янки вернутися\n", " \n", " \n", " 4\n", - " до фастова дісталася з якимось польським обозом\n", + " хотілося взнати про холодний яр про долю товаришів\n", " \n", " \n", " 5\n", - " невже то ви були\n", + " а той слухає уважно перепитує\n", " \n", " \n", " 6\n", - " при отій купці отої нещасної духовної братії\n", + " чи то образ перемінився в чоловіка чи чоловік був у тім образі\n", " \n", " \n", " 7\n", - " вирішуємо напасти на бригаду по дорозі не допустивши до села\n", + " кінь\n", " \n", " \n", " 8\n", - " де то хто таке видав аби хлопи купували панські маєтки\n", + " у самців вусики трохи довші а у самок дещо коротші за тіло\n", " \n", " \n", " 9\n", - " коні пішли з коноводами в балку\n", + " чорний туман що сповивав усе довкола поволі сірів\n", " \n", " \n", "" @@ -585,7 +595,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 14, "metadata": {}, "outputs": [ { @@ -601,7 +611,7 @@ "ascii": false, "bar_format": null, "colour": null, - "elapsed": 0.007320880889892578, + "elapsed": 0.012672662734985352, "initial": 0, "n": 0, "ncols": null, @@ -615,7 +625,7 @@ "unit_scale": false }, "application/vnd.jupyter.widget-view+json": { - "model_id": "8261bf1a7bd747fb88f7e063c24273d4", + "model_id": "c10e3f80cadb49a7951a2a6863af53bf", "version_major": 2, "version_minor": 0 }, @@ -634,7 +644,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 15, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -668,43 +678,43 @@ " \n", " \n", " 0\n", - " я замилувався маневруванням тачанок на полях\n", + " спостерігаючи за їхніми спокійними рухами пригадую роботу чекістів у льоху для розстрілів\n", " \n", " \n", " 1\n", - " андрій із бугаєм вилізли на близький горб роздивилися\n", + " сірого били дрючками селяни що упіймали його з парою крадених коней\n", " \n", " \n", " 2\n", - " вона нам потрібна як щоденний хліб\n", + " мав лише револьвер далековид і планшет із мапою\n", " \n", " \n", " 3\n", - " олесеві ще більше захотілось чаю\n", + " коли еней мене бажа\n", " \n", " \n", " 4\n", - " вирішуємо тут поснідати і з годину відпочити бо люди й коні потомлені\n", + " ось і волость\n", " \n", " \n", " 5\n", - " воротилову сотню найдужче боявся він дрібного дощу який почав сіятись удосвіта й міг зашкодити\n", + " і посмоктали кісточки\n", " \n", " \n", " 6\n", - " люта злість піднялася в душі хлопця при вигляді оцього свого тирана\n", + " скакати високо\n", " \n", " \n", " 7\n", - " також цього року в столиці виникла низка профспілкових організацій і був створений робочий клуб\n", + " я стрепенувся і відкрив повіки\n", " \n", " \n", " 8\n", - " накидав того літа а вони в наших плавнях затрималися всю січ мені засмерділи\n", + " нападом на кінноту ми себе виявили\n", " \n", " \n", " 9\n", - " їздять коло нас\n", + " червоні розгубилися і в безладі закрутилися на місці\n", " \n", " \n", "" @@ -723,7 +733,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 16, "metadata": { "id": "LwCshNbbeRZR" }, @@ -737,7 +747,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 17, "metadata": { "colab": { "base_uri": 
"https://localhost:8080/", @@ -788,7 +798,7 @@ "ascii": false, "bar_format": null, "colour": null, - "elapsed": 0.009069681167602539, + "elapsed": 0.01583385467529297, "initial": 0, "n": 0, "ncols": null, @@ -802,7 +812,7 @@ "unit_scale": false }, "application/vnd.jupyter.widget-view+json": { - "model_id": "21abcee2f4f6401096ebfcc2b283f704", + "model_id": "3ba1ea8f16284564b96209d4ac0fe9b2", "version_major": 2, "version_minor": 0 }, @@ -819,7 +829,7 @@ "ascii": false, "bar_format": null, "colour": null, - "elapsed": 0.007071256637573242, + "elapsed": 0.006883859634399414, "initial": 0, "n": 0, "ncols": null, @@ -833,7 +843,7 @@ "unit_scale": false }, "application/vnd.jupyter.widget-view+json": { - "model_id": "82e2b6e9482345ba913c5800eab41275", + "model_id": "24cf52aaa5534c3d94b2c749af19dd86", "version_major": 2, "version_minor": 0 }, @@ -852,7 +862,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 18, "metadata": { "id": "aQfneNsmlJI0" }, @@ -863,7 +873,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 19, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -923,7 +933,7 @@ " 'ґ': 34}" ] }, - "execution_count": 17, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" } @@ -935,7 +945,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 20, "metadata": { "id": "npbIbBoLgaFX" }, @@ -947,7 +957,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 21, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -973,7 +983,7 @@ "37" ] }, - "execution_count": 19, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } @@ -986,7 +996,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 22, "metadata": { "id": "ehyUoh9vk191" }, @@ -999,7 +1009,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 23, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -1035,18 +1045,18 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 24, "metadata": { "id": "A1XApZBAF2zr" }, "outputs": [], "source": [ - "repo_name = \"wav2vec2-xls-r-300m-uk\"" + "repo_name = \"wav2vec2-xls-r-base-uk\"" ] }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 26, "metadata": { "id": "kAR0-2KLkopp" }, @@ -1059,7 +1069,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 27, "metadata": { "id": "KYZtoW-tlZgl" }, @@ -1072,7 +1082,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 28, "metadata": {}, "outputs": [], "source": [ @@ -1082,7 +1092,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 29, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -1109,7 +1119,7 @@ "'/home/robinhad/.cache/huggingface/datasets/downloads/extracted/ee7155196e5d51620d53e48cf58eb693b7839b8ff183604c8bb948d3e0aad92d/cv-corpus-10.0-2022-07-04/uk/clips/common_voice_uk_20907128.mp3'" ] }, - "execution_count": 26, + "execution_count": 29, "metadata": {}, "output_type": "execute_result" } @@ -1120,7 +1130,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 30, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -1149,7 +1159,7 @@ " 'sampling_rate': 48000}" ] }, - "execution_count": 27, + "execution_count": 30, "metadata": {}, "output_type": "execute_result" } @@ -1160,7 +1170,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 31, "metadata": { "id": 
"rrv65aj7G95i" }, @@ -1172,7 +1182,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 32, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -1201,7 +1211,7 @@ " 'sampling_rate': 16000}" ] }, - "execution_count": 29, + "execution_count": 32, "metadata": {}, "output_type": "execute_result" } @@ -1212,7 +1222,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 33, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -1237,7 +1247,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "от би була рада\n" + "що ж хоч утекти можна вкупі з нею\n" ] }, { @@ -1245,7 +1255,7 @@ "text/html": [ "\n", " \n", " " @@ -1254,7 +1264,7 @@ "" ] }, - "execution_count": 30, + "execution_count": 33, "metadata": {}, "output_type": "execute_result" } @@ -1272,7 +1282,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 34, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -1296,8 +1306,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "Target text: от їхало якихось із десять на конях назустріч\n", - "Input array shape: (73152,)\n", + "Target text: троє\n", + "Input array shape: (26496,)\n", "Sampling rate: 16000\n" ] } @@ -1312,7 +1322,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 35, "metadata": { "id": "eJY7I0XAwe9p" }, @@ -1332,7 +1342,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 36, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -1372,7 +1382,7 @@ "ascii": false, "bar_format": null, "colour": null, - "elapsed": 0.00739741325378418, + "elapsed": 0.0130767822265625, "initial": 0, "n": 0, "ncols": null, @@ -1386,7 +1396,7 @@ "unit_scale": false }, "application/vnd.jupyter.widget-view+json": { - "model_id": "c51a283b0cf149d7a84ade53f6eb40d9", + "model_id": "9b61f7e540b64679b38d986c03621299", "version_major": 2, "version_minor": 0 }, @@ -1403,7 +1413,7 @@ "ascii": false, "bar_format": null, "colour": null, - "elapsed": 0.010543107986450195, + "elapsed": 0.007035732269287109, "initial": 0, "n": 0, "ncols": null, @@ -1417,7 +1427,7 @@ "unit_scale": false }, "application/vnd.jupyter.widget-view+json": { - "model_id": "9666ba72472f47db816037cec309e7ed", + "model_id": "6710267a647b45c89a24800f79d88d15", "version_major": 2, "version_minor": 0 }, @@ -1446,2455 +1456,31 @@ "#common_voice_train = common_voice_train.filter(lambda x: x < max_input_length_in_sec * processor.feature_extractor.sampling_rate, input_columns=[\"input_length\"])" ] }, - { - "cell_type": "code", - "execution_count": 35, - "metadata": { - "id": "tborvC9hx88e" - }, - "outputs": [], - "source": [ - "import torch\n", - "\n", - "from dataclasses import dataclass, field\n", - "from typing import Any, Dict, List, Optional, Union\n", - "\n", - "@dataclass\n", - "class DataCollatorCTCWithPadding:\n", - " \"\"\"\n", - " Data collator that will dynamically pad the inputs received.\n", - " Args:\n", - " processor (:class:`~transformers.Wav2Vec2Processor`)\n", - " The processor used for proccessing the data.\n", - " padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):\n", - " Select a strategy to pad the returned sequences (according to the model's padding side and padding index)\n", - " among:\n", - " * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single\n", - " sequence if provided).\n", - " * 
:obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the\n", - " maximum acceptable input length for the model if that argument is not provided.\n", - " * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of\n", - " different lengths).\n", - " \"\"\"\n", - "\n", - " processor: Wav2Vec2Processor\n", - " padding: Union[bool, str] = True\n", - "\n", - " def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:\n", - " # split inputs and labels since they have to be of different lenghts and need\n", - " # different padding methods\n", - " input_features = [{\"input_values\": feature[\"input_values\"]} for feature in features]\n", - " label_features = [{\"input_ids\": feature[\"labels\"]} for feature in features]\n", - "\n", - " batch = self.processor.pad(\n", - " input_features,\n", - " padding=self.padding,\n", - " return_tensors=\"pt\",\n", - " )\n", - " with self.processor.as_target_processor():\n", - " labels_batch = self.processor.pad(\n", - " label_features,\n", - " padding=self.padding,\n", - " return_tensors=\"pt\",\n", - " )\n", - "\n", - " # replace padding with -100 to ignore loss correctly\n", - " labels = labels_batch[\"input_ids\"].masked_fill(labels_batch.attention_mask.ne(1), -100)\n", - "\n", - " batch[\"labels\"] = labels\n", - "\n", - " return batch" - ] - }, - { - "cell_type": "code", - "execution_count": 36, - "metadata": { - "id": "lbQf5GuZyQ4_" - }, - "outputs": [], - "source": [ - "data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)" - ] - }, { "cell_type": "code", "execution_count": 37, - "metadata": { - "id": "9Xsux2gmyXso" - }, + "metadata": {}, "outputs": [], "source": [ - "wer_metric = load_metric(\"wer\")\n", - "cer_metric = load_metric(\"cer\")\n", - "metrics = [wer_metric, cer_metric]" + "!mkdir cached_dataset" ] }, { "cell_type": "code", "execution_count": 38, - "metadata": { - "id": "1XZ-kjweyTy_" - }, + "metadata": {}, "outputs": [], "source": [ - "def compute_metrics(pred):\n", - " pred_logits = pred.predictions\n", - " pred_ids = np.argmax(pred_logits, axis=-1)\n", - "\n", - " pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id\n", - "\n", - " pred_str = processor.batch_decode(pred_ids)\n", - " # we do not want to group tokens when computing the metrics\n", - " label_str = processor.batch_decode(pred.label_ids, group_tokens=False)\n", - "\n", - " wer = wer_metric.compute(predictions=pred_str, references=label_str)\n", - " cer = cer_metric.compute(predictions=pred_str, references=label_str)\n", - "\n", - " return {\"wer\": wer, \"cer\": cer}" + "common_voice_train.save_to_disk(\"cached_dataset/cv_train\")" ] }, { "cell_type": "code", "execution_count": 39, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 9496, - "status": "ok", - "timestamp": 1641588938616, - "user": { - "displayName": "Yurii Paniv", - "photoUrl": "https://lh3.googleusercontent.com/a/default-user=s64", - "userId": "13095662915325887123" - }, - "user_tz": -120 - }, - "id": "e7cqAWIayn6w", - "outputId": "b7b20ce9-e1b2-473f-8032-2a75f98dfa9e" - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Some weights of the model checkpoint at facebook/wav2vec2-xls-r-300m were not used when initializing Wav2Vec2ForCTC: ['quantizer.codevectors', 'project_hid.bias', 'project_q.weight', 'quantizer.weight_proj.weight', 
'quantizer.weight_proj.bias', 'project_q.bias', 'project_hid.weight']\n", - "- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", - "- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n", - "Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-xls-r-300m and are newly initialized: ['lm_head.bias', 'lm_head.weight']\n", - "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n" - ] - } - ], - "source": [ - "from transformers import Wav2Vec2ForCTC\n", - "\n", - "model = Wav2Vec2ForCTC.from_pretrained(\n", - " \"facebook/wav2vec2-xls-r-300m\", \n", - " attention_dropout=0.3,\n", - " hidden_dropout=0.3,\n", - " feat_proj_dropout=0.3,\n", - " mask_time_prob=0.05,\n", - " layerdrop=0.3,\n", - " ctc_loss_reduction=\"mean\", \n", - " pad_token_id=processor.tokenizer.pad_token_id,\n", - " vocab_size=len(processor.tokenizer),\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 40, - "metadata": { - "id": "oGI8zObtZ3V0" - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/robinhad/Projects/Speech/wav2vec2-xls-r-ukrainian/.venv/lib/python3.9/site-packages/transformers/models/wav2vec2/modeling_wav2vec2.py:1618: FutureWarning: The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5.Please use the equivalent `freeze_feature_encoder` method instead.\n", - " warnings.warn(\n" - ] - } - ], - "source": [ - "model.freeze_feature_extractor()" - ] - }, - { - "cell_type": "code", - "execution_count": 41, - "metadata": { - "id": "KbeKSV7uzGPP" - }, + "metadata": {}, "outputs": [], "source": [ - "from transformers import TrainingArguments\n", - "\n", - "training_args = TrainingArguments(\n", - " output_dir=repo_name,\n", - " group_by_length=True,\n", - " per_device_train_batch_size=16,\n", - " gradient_accumulation_steps=6,\n", - " eval_accumulation_steps=4,\n", - " evaluation_strategy=\"steps\",\n", - " num_train_epochs=100,\n", - " gradient_checkpointing=True,\n", - " fp16=True,\n", - " save_steps=400,\n", - " eval_steps=400,\n", - " logging_steps=400,\n", - " learning_rate=3e-4,\n", - " warmup_steps=500,\n", - " save_total_limit=2,\n", - " report_to=\"tensorboard\"\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 43, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 11063, - "status": "ok", - "timestamp": 1641588949674, - "user": { - "displayName": "Yurii Paniv", - "photoUrl": "https://lh3.googleusercontent.com/a/default-user=s64", - "userId": "13095662915325887123" - }, - "user_tz": -120 - }, - "id": "rY7vBmFCPFgC", - "outputId": "2e89d5ea-5b25-44bf-8492-a6220b0b1c38" - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Using cuda_amp half precision backend\n" - ] - } - ], - "source": [ - "from transformers import Trainer\n", - "\n", - "trainer = Trainer(\n", - " model=model,\n", - " data_collator=data_collator,\n", - " args=training_args,\n", - " compute_metrics=compute_metrics,\n", - " train_dataset=common_voice_train,\n", - " 
eval_dataset=common_voice_test,\n", - " tokenizer=processor.feature_extractor,\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 44, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 409 - }, - "id": "9fRr9TG5pGBl", - "outputId": "c2a7c797-326c-4bd2-b167-9d2f41d77def" - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "The following columns in the training set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`, you can safely ignore this message.\n", - "/home/robinhad/Projects/Speech/wav2vec2-xls-r-ukrainian/.venv/lib/python3.9/site-packages/transformers/optimization.py:306: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n", - " warnings.warn(\n", - "***** Running training *****\n", - " Num examples = 11463\n", - " Num Epochs = 100\n", - " Instantaneous batch size per device = 16\n", - " Total train batch size (w. parallel, distributed & accumulation) = 96\n", - " Gradient Accumulation steps = 6\n", - " Total optimization steps = 11900\n" - ] - }, - { - "data": { - "application/json": { - "ascii": false, - "bar_format": null, - "colour": null, - "elapsed": 0.007272958755493164, - "initial": 0, - "n": 0, - "ncols": null, - "nrows": null, - "postfix": null, - "prefix": "", - "rate": null, - "total": 11900, - "unit": "it", - "unit_divisor": 1000, - "unit_scale": false - }, - "application/vnd.jupyter.widget-view+json": { - "model_id": "deebda57b25f4f95b4915d7a8d479a62", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - " 0%| | 0/11900 [00:00 to the vocabulary\n", - "Adding to the vocabulary\n" - ] - } - ], - "source": [ - "from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor\n", - "model = Wav2Vec2ForCTC.from_pretrained(repo_name + \"/checkpoint-11200\").to(\"cuda\")\n", - "processor = Wav2Vec2Processor.from_pretrained(repo_name)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "jD7TZ1YS3S_K" - }, - "source": [ - "\n", - "Now, we will just take the first example of the test set, run it through the model and take the `argmax(...)` of the logits to retrieve the predicted token ids." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "pax07TnL3WZn" - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.\n" - ] - } - ], - "source": [ - "audio_id = 10\n", - "\n", - "input_dict = processor(common_voice_test[\"input_values\"], return_tensors=\"pt\", padding=True)\n", - "\n", - "logits = model(input_dict.input_values.to(\"cuda\")).logits\n", - "\n", - "pred_ids = torch.argmax(logits, dim=-1)[audio_id]\n", - "\n", - "common_voice_test_transcription = load_dataset(\"common_voice\", \"uk\", split=\"test\")\n", - "\n", - "print(\"Prediction:\")\n", - "print(processor.decode(pred_ids))\n", - "\n", - "print(\"\\nReference:\")\n", - "print(common_voice_test_transcription[audio_id][\"sentence\"].lower())" + "common_voice_test.save_to_disk(\"cached_dataset/cv_test\")" ] } ],
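
Note: the substantive change at the end of this diff is that the training cells are removed and replaced by cells that cache the processed Common Voice splits with `save_to_disk`, turning this notebook into a data-preparation step. As a hedged illustration only (the consuming training notebook is not part of this diff, and the `cached_dataset/` paths are taken from the cells above), a companion notebook would presumably reload the cached splits like this:

```python
# Sketch only: reload the datasets cached by wav2vec_data.ipynb.
# Assumes the cached_dataset/cv_train and cached_dataset/cv_test directories
# written by the save_to_disk() cells in this diff; the actual training
# notebook that consumes them is not shown here.
from datasets import load_from_disk

common_voice_train = load_from_disk("cached_dataset/cv_train")
common_voice_test = load_from_disk("cached_dataset/cv_test")

print(common_voice_train)
print(common_voice_test)
```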