{ "cells": [ { "cell_type": "code", "execution_count": 8, "id": "072d16f1", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/workspace/xls-r-uzbek-cv8\n" ] } ], "source": [ "%cd ~/xls-r-uzbek-cv8" ] }, { "cell_type": "code", "execution_count": 10, "id": "12382315", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\u001b[33mWARNING: Ignoring invalid distribution -ransformers (/opt/conda/lib/python3.8/site-packages)\u001b[0m\u001b[33m\n", "\u001b[0m\u001b[33mWARNING: Ignoring invalid distribution -ip (/opt/conda/lib/python3.8/site-packages)\u001b[0m\u001b[33m\n", "\u001b[0m\u001b[33mWARNING: Ignoring invalid distribution - (/opt/conda/lib/python3.8/site-packages)\u001b[0m\u001b[33m\n", "\u001b[0m\u001b[33mWARNING: Ignoring invalid distribution -ransformers (/opt/conda/lib/python3.8/site-packages)\u001b[0m\u001b[33m\n", "\u001b[0m\u001b[33mWARNING: Ignoring invalid distribution -ip (/opt/conda/lib/python3.8/site-packages)\u001b[0m\u001b[33m\n", "\u001b[0m\u001b[33mWARNING: Ignoring invalid distribution - (/opt/conda/lib/python3.8/site-packages)\u001b[0m\u001b[33m\n", "\u001b[0mCollecting https://github.com/kpu/kenlm/archive/master.zip (from -r requirements.txt (line 10))\n", " Using cached https://github.com/kpu/kenlm/archive/master.zip (541 kB)\n", " Preparing metadata (setup.py) ... \u001b[?25ldone\n", "\u001b[?25hRequirement already satisfied: unidecode in /opt/conda/lib/python3.8/site-packages (from -r requirements.txt (line 1)) (1.3.2)\n", "Collecting tensorboard\n", " Using cached tensorboard-2.8.0-py3-none-any.whl (5.8 MB)\n", "Requirement already satisfied: torch in /opt/conda/lib/python3.8/site-packages (from -r requirements.txt (line 3)) (1.10.2)\n", "Requirement already satisfied: torchaudio in /opt/conda/lib/python3.8/site-packages (from -r requirements.txt (line 4)) (0.10.2)\n", "Requirement already satisfied: jiwer~=2.3.0 in /opt/conda/lib/python3.8/site-packages (from -r requirements.txt (line 5)) (2.3.0)\n", "Requirement already satisfied: soundfile~=0.10.3 in /opt/conda/lib/python3.8/site-packages (from -r requirements.txt (line 6)) (0.10.3.post1)\n", "Collecting transformers~=4.16.2\n", " Using cached transformers-4.16.2-py3-none-any.whl (3.5 MB)\n", "Collecting datasets~=1.18.3\n", " Using cached datasets-1.18.3-py3-none-any.whl (311 kB)\n", "Requirement already satisfied: pyctcdecode in /opt/conda/lib/python3.8/site-packages (from -r requirements.txt (line 9)) (0.3.0)\n", "Requirement already satisfied: protobuf>=3.6.0 in /opt/conda/lib/python3.8/site-packages (from tensorboard->-r requirements.txt (line 2)) (3.19.4)\n", "Requirement already satisfied: tensorboard-plugin-wit>=1.6.0 in /opt/conda/lib/python3.8/site-packages (from tensorboard->-r requirements.txt (line 2)) (1.8.1)\n", "Collecting google-auth-oauthlib<0.5,>=0.4.1\n", " Using cached google_auth_oauthlib-0.4.6-py2.py3-none-any.whl (18 kB)\n", "Requirement already satisfied: google-auth<3,>=1.6.3 in /opt/conda/lib/python3.8/site-packages (from tensorboard->-r requirements.txt (line 2)) (2.6.0)\n", "Requirement already satisfied: numpy>=1.12.0 in /opt/conda/lib/python3.8/site-packages (from tensorboard->-r requirements.txt (line 2)) (1.19.2)\n", "Requirement already satisfied: setuptools>=41.0.0 in /opt/conda/lib/python3.8/site-packages (from tensorboard->-r requirements.txt (line 2)) (50.3.1.post20201107)\n", "Requirement already satisfied: requests<3,>=2.21.0 in /opt/conda/lib/python3.8/site-packages (from tensorboard->-r requirements.txt (line 2)) (2.24.0)\n", "Requirement already satisfied: markdown>=2.6.8 in /opt/conda/lib/python3.8/site-packages (from tensorboard->-r requirements.txt (line 2)) (3.3.6)\n", "Requirement already satisfied: grpcio>=1.24.3 in /opt/conda/lib/python3.8/site-packages (from tensorboard->-r requirements.txt (line 2)) (1.43.0)\n", "Requirement already satisfied: wheel>=0.26 in /opt/conda/lib/python3.8/site-packages (from tensorboard->-r requirements.txt (line 2)) (0.35.1)\n", "Requirement already satisfied: absl-py>=0.4 in /opt/conda/lib/python3.8/site-packages (from tensorboard->-r requirements.txt (line 2)) (1.0.0)\n", "Requirement already satisfied: werkzeug>=0.11.15 in /opt/conda/lib/python3.8/site-packages (from tensorboard->-r requirements.txt (line 2)) (2.0.2)\n", "Requirement already satisfied: tensorboard-data-server<0.7.0,>=0.6.0 in /opt/conda/lib/python3.8/site-packages (from tensorboard->-r requirements.txt (line 2)) (0.6.1)\n", "Requirement already satisfied: typing-extensions in /opt/conda/lib/python3.8/site-packages (from torch->-r requirements.txt (line 3)) (4.0.1)\n", "Requirement already satisfied: python-Levenshtein==0.12.2 in /opt/conda/lib/python3.8/site-packages (from jiwer~=2.3.0->-r requirements.txt (line 5)) (0.12.2)\n", "Requirement already satisfied: cffi>=1.0 in /opt/conda/lib/python3.8/site-packages (from soundfile~=0.10.3->-r requirements.txt (line 6)) (1.14.3)\n", "Requirement already satisfied: filelock in /opt/conda/lib/python3.8/site-packages (from transformers~=4.16.2->-r requirements.txt (line 7)) (3.0.12)\n", "Requirement already satisfied: regex!=2019.12.17 in /opt/conda/lib/python3.8/site-packages (from transformers~=4.16.2->-r requirements.txt (line 7)) (2022.1.18)\n", "Requirement already satisfied: sacremoses in /opt/conda/lib/python3.8/site-packages (from transformers~=4.16.2->-r requirements.txt (line 7)) (0.0.47)\n", "Requirement already satisfied: packaging>=20.0 in /opt/conda/lib/python3.8/site-packages (from transformers~=4.16.2->-r requirements.txt (line 7)) (21.3)\n", "Requirement already satisfied: tokenizers!=0.11.3,>=0.10.1 in /opt/conda/lib/python3.8/site-packages (from transformers~=4.16.2->-r requirements.txt (line 7)) (0.11.4)\n", "Requirement already satisfied: huggingface-hub<1.0,>=0.1.0 in /opt/conda/lib/python3.8/site-packages (from transformers~=4.16.2->-r requirements.txt (line 7)) (0.4.0)\n", "Requirement already satisfied: pyyaml>=5.1 in /opt/conda/lib/python3.8/site-packages (from transformers~=4.16.2->-r requirements.txt (line 7)) (5.4.1)\n", "Requirement already satisfied: tqdm>=4.27 in /opt/conda/lib/python3.8/site-packages (from transformers~=4.16.2->-r requirements.txt (line 7)) (4.62.3)\n", "Requirement already satisfied: multiprocess in /opt/conda/lib/python3.8/site-packages (from datasets~=1.18.3->-r requirements.txt (line 8)) (0.70.12.2)\n", "Requirement already satisfied: dill in /opt/conda/lib/python3.8/site-packages (from datasets~=1.18.3->-r requirements.txt (line 8)) (0.3.4)\n", "Requirement already satisfied: pandas in /opt/conda/lib/python3.8/site-packages (from datasets~=1.18.3->-r requirements.txt (line 8)) (1.4.0)\n", "Requirement already satisfied: aiohttp in /opt/conda/lib/python3.8/site-packages (from datasets~=1.18.3->-r requirements.txt (line 8)) (3.8.1)\n", "Requirement already satisfied: xxhash in /opt/conda/lib/python3.8/site-packages (from datasets~=1.18.3->-r requirements.txt (line 8)) (2.0.2)\n", "Requirement already satisfied: pyarrow!=4.0.0,>=3.0.0 in /opt/conda/lib/python3.8/site-packages (from datasets~=1.18.3->-r requirements.txt (line 8)) (6.0.1)\n", "Requirement already satisfied: fsspec[http]>=2021.05.0 in /opt/conda/lib/python3.8/site-packages (from datasets~=1.18.3->-r requirements.txt (line 8)) (2022.1.0)\n", "Requirement already satisfied: pygtrie<3.0,>=2.1 in /opt/conda/lib/python3.8/site-packages (from pyctcdecode->-r requirements.txt (line 9)) (2.4.2)\n", "Requirement already satisfied: hypothesis<7,>=6.14 in /opt/conda/lib/python3.8/site-packages (from pyctcdecode->-r requirements.txt (line 9)) (6.36.1)\n", "Requirement already satisfied: six in /opt/conda/lib/python3.8/site-packages (from absl-py>=0.4->tensorboard->-r requirements.txt (line 2)) (1.15.0)\n", "Requirement already satisfied: pycparser in /opt/conda/lib/python3.8/site-packages (from cffi>=1.0->soundfile~=0.10.3->-r requirements.txt (line 6)) (2.20)\n", "Requirement already satisfied: rsa<5,>=3.1.4 in /opt/conda/lib/python3.8/site-packages (from google-auth<3,>=1.6.3->tensorboard->-r requirements.txt (line 2)) (4.8)\n", "Requirement already satisfied: cachetools<6.0,>=2.0.0 in /opt/conda/lib/python3.8/site-packages (from google-auth<3,>=1.6.3->tensorboard->-r requirements.txt (line 2)) (5.0.0)\n", "Requirement already satisfied: pyasn1-modules>=0.2.1 in /opt/conda/lib/python3.8/site-packages (from google-auth<3,>=1.6.3->tensorboard->-r requirements.txt (line 2)) (0.2.8)\n", "Requirement already satisfied: requests-oauthlib>=0.7.0 in /opt/conda/lib/python3.8/site-packages (from google-auth-oauthlib<0.5,>=0.4.1->tensorboard->-r requirements.txt (line 2)) (1.3.1)\n", "Requirement already satisfied: sortedcontainers<3.0.0,>=2.1.0 in /opt/conda/lib/python3.8/site-packages (from hypothesis<7,>=6.14->pyctcdecode->-r requirements.txt (line 9)) (2.4.0)\n", "Requirement already satisfied: attrs>=19.2.0 in /opt/conda/lib/python3.8/site-packages (from hypothesis<7,>=6.14->pyctcdecode->-r requirements.txt (line 9)) (21.4.0)\n", "Requirement already satisfied: importlib-metadata>=4.4 in /opt/conda/lib/python3.8/site-packages (from markdown>=2.6.8->tensorboard->-r requirements.txt (line 2)) (4.10.1)\n", "Requirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in /opt/conda/lib/python3.8/site-packages (from packaging>=20.0->transformers~=4.16.2->-r requirements.txt (line 7)) (3.0.7)\n", "Requirement already satisfied: chardet<4,>=3.0.2 in /opt/conda/lib/python3.8/site-packages (from requests<3,>=2.21.0->tensorboard->-r requirements.txt (line 2)) (3.0.4)\n", "Requirement already satisfied: idna<3,>=2.5 in /opt/conda/lib/python3.8/site-packages (from requests<3,>=2.21.0->tensorboard->-r requirements.txt (line 2)) (2.10)\n", "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /opt/conda/lib/python3.8/site-packages (from requests<3,>=2.21.0->tensorboard->-r requirements.txt (line 2)) (1.25.11)\n", "Requirement already satisfied: certifi>=2017.4.17 in /opt/conda/lib/python3.8/site-packages (from requests<3,>=2.21.0->tensorboard->-r requirements.txt (line 2)) (2020.12.5)\n", "Requirement already satisfied: yarl<2.0,>=1.0 in /opt/conda/lib/python3.8/site-packages (from aiohttp->datasets~=1.18.3->-r requirements.txt (line 8)) (1.7.2)\n", "Requirement already satisfied: async-timeout<5.0,>=4.0.0a3 in /opt/conda/lib/python3.8/site-packages (from aiohttp->datasets~=1.18.3->-r requirements.txt (line 8)) (4.0.2)\n", "Requirement already satisfied: aiosignal>=1.1.2 in /opt/conda/lib/python3.8/site-packages (from aiohttp->datasets~=1.18.3->-r requirements.txt (line 8)) (1.2.0)\n", "Requirement already satisfied: multidict<7.0,>=4.5 in /opt/conda/lib/python3.8/site-packages (from aiohttp->datasets~=1.18.3->-r requirements.txt (line 8)) (6.0.2)\n", "Requirement already satisfied: frozenlist>=1.1.1 in /opt/conda/lib/python3.8/site-packages (from aiohttp->datasets~=1.18.3->-r requirements.txt (line 8)) (1.3.0)\n", "Requirement already satisfied: charset-normalizer<3.0,>=2.0 in /opt/conda/lib/python3.8/site-packages (from aiohttp->datasets~=1.18.3->-r requirements.txt (line 8)) (2.0.10)\n", "Requirement already satisfied: python-dateutil>=2.8.1 in /opt/conda/lib/python3.8/site-packages (from pandas->datasets~=1.18.3->-r requirements.txt (line 8)) (2.8.2)\n", "Requirement already satisfied: pytz>=2020.1 in /opt/conda/lib/python3.8/site-packages (from pandas->datasets~=1.18.3->-r requirements.txt (line 8)) (2021.1)\n", "Requirement already satisfied: joblib in /opt/conda/lib/python3.8/site-packages (from sacremoses->transformers~=4.16.2->-r requirements.txt (line 7)) (1.1.0)\n", "Requirement already satisfied: click in /opt/conda/lib/python3.8/site-packages (from sacremoses->transformers~=4.16.2->-r requirements.txt (line 7)) (8.0.3)\n", "Requirement already satisfied: zipp>=0.5 in /opt/conda/lib/python3.8/site-packages (from importlib-metadata>=4.4->markdown>=2.6.8->tensorboard->-r requirements.txt (line 2)) (3.7.0)\n", "Requirement already satisfied: pyasn1<0.5.0,>=0.4.6 in /opt/conda/lib/python3.8/site-packages (from pyasn1-modules>=0.2.1->google-auth<3,>=1.6.3->tensorboard->-r requirements.txt (line 2)) (0.4.8)\n", "Requirement already satisfied: oauthlib>=3.0.0 in /opt/conda/lib/python3.8/site-packages (from requests-oauthlib>=0.7.0->google-auth-oauthlib<0.5,>=0.4.1->tensorboard->-r requirements.txt (line 2)) (3.2.0)\n", "Building wheels for collected packages: kenlm\n", " Building wheel for kenlm (setup.py) ... \u001b[?25ldone\n", "\u001b[?25h Created wheel for kenlm: filename=kenlm-0.0.0-cp38-cp38-linux_x86_64.whl size=2348591 sha256=d5c8e5430d89f59ddde39bc78aec471c1e66ef43b6cde792711b2e97d7b8b9dc\n", " Stored in directory: /tmp/pip-ephem-wheel-cache-hhcfnszu/wheels/ff/08/4e/a3ddc0e786e0f3c1fcd2e7a82c4324c02fc3ae2638471406d2\n", "Successfully built kenlm\n", "\u001b[33mWARNING: Ignoring invalid distribution -ransformers (/opt/conda/lib/python3.8/site-packages)\u001b[0m\u001b[33m\n", "\u001b[0m\u001b[33mWARNING: Ignoring invalid distribution -ip (/opt/conda/lib/python3.8/site-packages)\u001b[0m\u001b[33m\n", "\u001b[0m\u001b[33mWARNING: Ignoring invalid distribution - (/opt/conda/lib/python3.8/site-packages)\u001b[0m\u001b[33m\n", "\u001b[0mInstalling collected packages: kenlm, transformers, google-auth-oauthlib, tensorboard, datasets\n", "\u001b[33m WARNING: The script transformers-cli is installed in '/workspace/.local/bin' which is not on PATH.\n", " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\u001b[33m\n", "\u001b[0m\u001b[33m WARNING: The script google-oauthlib-tool is installed in '/workspace/.local/bin' which is not on PATH.\n", " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\u001b[33m\n", "\u001b[0m\u001b[33m WARNING: The script tensorboard is installed in '/workspace/.local/bin' which is not on PATH.\n", " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\u001b[33m\n", "\u001b[0m\u001b[33m WARNING: The script datasets-cli is installed in '/workspace/.local/bin' which is not on PATH.\n", " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\u001b[33m\n", "\u001b[0mSuccessfully installed datasets-1.18.3 google-auth-oauthlib-0.4.6 kenlm-0.0.0 tensorboard-2.8.0 transformers-4.16.2\n", "\u001b[33mWARNING: Ignoring invalid distribution -ransformers (/opt/conda/lib/python3.8/site-packages)\u001b[0m\u001b[33m\n", "\u001b[0m\u001b[33mWARNING: Ignoring invalid distribution -ip (/opt/conda/lib/python3.8/site-packages)\u001b[0m\u001b[33m\n", "\u001b[0m\u001b[33mWARNING: Ignoring invalid distribution - (/opt/conda/lib/python3.8/site-packages)\u001b[0m\u001b[33m\n", "\u001b[0m" ] } ], "source": [ "!python -m pip install -r requirements.txt --user" ] }, { "cell_type": "code", "execution_count": 14, "id": "3969d63a", "metadata": {}, "outputs": [], "source": [ "from transformers import AutoFeatureExtractor, AutoTokenizer, pipeline\n", "from datasets import Audio, Dataset, DatasetDict, load_dataset, load_metric\n", "\n", "import re\n", "import string\n", "import unidecode" ] }, { "cell_type": "code", "execution_count": 12, "id": "daff17fd", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Reusing dataset common_voice (/workspace/.cache/huggingface/datasets/mozilla-foundation___common_voice/uz/8.0.0/b8bc4d453193c06a43269b46cd87f075c70f152ac963b7f28f7a2760c45ec3e8)\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "a8aad37a859241ff81ac932edc204bf8", "version_major": 2, "version_minor": 0 }, "text/plain": [ " 0%| | 0/5 [00:00‘\", # after o/g indicate modified char\n", " unidecode.unidecode(batch[\"sentence\"]).lower()\n", " )\n", " )\n", " ) + \" \"\n", " return batch" ] }, { "cell_type": "code", "execution_count": 18, "id": "f28dc522", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "4b8d2f0df8ea46bdaee2c94996583c5e", "version_major": 2, "version_minor": 0 }, "text/plain": [ "0ex [00:00, ?ex/s]" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "dataset = dataset_dict[\"train\"].map(remove_special_characters, remove_columns=dataset_dict[\"train\"].column_names)" ] }, { "cell_type": "code", "execution_count": 23, "id": "38e02d29", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 0 244494 2030240 uz_cv8_train.txt\n" ] } ], "source": [ "text_data = \"uz_cv8_train.txt\"\n", "with open(text_data, \"w\") as fs:\n", " fs.write(\" \".join(dataset[\"text\"]))\n", "\n", "!wc $text_data" ] }, { "cell_type": "code", "execution_count": 26, "id": "7b3d70f0", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "--2022-02-07 03:18:36-- https://kheafield.com/code/kenlm.tar.gz\n", "Resolving kheafield.com (kheafield.com)... 35.196.63.85\n", "Connecting to kheafield.com (kheafield.com)|35.196.63.85|:443... connected.\n", "HTTP request sent, awaiting response... 200 OK\n", "Length: 491090 (480K) [application/x-gzip]\n", "Saving to: ‘STDOUT’\n", "\n", "- 100%[===================>] 479.58K 2.31MB/s in 0.2s \n", "\n", "2022-02-07 03:18:37 (2.31 MB/s) - written to stdout [491090/491090]\n", "\n", "/bin/bash: line 1: cmake: command not found\n" ] } ], "source": [ "!wget -O - https://kheafield.com/code/kenlm.tar.gz | tar xz\n", "!mkdir kenlm/build && cd kenlm/build && cmake .. && make -j2\n" ] }, { "cell_type": "code", "execution_count": null, "id": "65118a69", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.8" } }, "nbformat": 4, "nbformat_minor": 5 }