{ "cells": [ { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "b7523cd66cf343f98fd3006be918a3b6", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Downloading: 0%| | 0.00/10.1k [00:00=1.10\n", " Downloading wrapt-1.13.3-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (84 kB)\n", " |████████████████████████████████| 84 kB 12.8 MB/s \n", "\u001b[?25hBuilding wheels for collected packages: jaconv\n", " Building wheel for jaconv (setup.py) ... \u001b[?25ldone\n", "\u001b[?25h Created wheel for jaconv: filename=jaconv-0.3-py3-none-any.whl size=15553 sha256=fd764f215e4d567cb60062a7052497b66729e9e2190e2e00153e0d19734088e7\n", " Stored in directory: /workspace/.cache/pip/wheels/73/e8/fb/b4ad8117719f79ac73bc05406d1768f845688cdbeed7aad87e\n", "Successfully built jaconv\n", "Installing collected packages: wrapt, jaconv, deprecated, pykakasi\n", "Successfully installed deprecated-1.2.13 jaconv-0.3 pykakasi-2.2.1 wrapt-1.13.3\n", "\u001b[33mWARNING: You are using pip version 21.3.1; however, version 22.0.2 is available.\n", "You should consider upgrading via the '/opt/conda/bin/python -m pip install --upgrade pip' command.\u001b[0m\n" ] } ], "source": [ "!pip install pykakasi" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "にんじゃ ひらがな kana\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_2159/3076271513.py:4: DeprecationWarning: Call to deprecated method setMode. (Old API will be removed in v3.0.) -- Deprecated since version 2.1.\n", " kakasi.setMode('J', 'H') #Convert from kanji to hiragana\n", "/tmp/ipykernel_2159/3076271513.py:6: DeprecationWarning: Call to deprecated method getConverter. (Old API will be removed in v3.0.) -- Deprecated since version 2.1.\n", " conv = kakasi.getConverter()\n", "/tmp/ipykernel_2159/3076271513.py:10: DeprecationWarning: Call to deprecated method do. (Old API will be removed in v3.0.) -- Deprecated since version 2.1.\n", " print(conv.do(str))\n" ] } ], "source": [ "from pykakasi import kakasi\n", "\n", "kakasi = kakasi()\n", "kakasi.setMode('J', 'H') #Convert from kanji to hiragana\n", "# kakasi.setMode(\"K\", \"H\") #Convert from katakana to hiragana\n", "conv = kakasi.getConverter()\n", "\n", "str = 'にんじゃ 平仮名 kana'\n", "\n", "print(conv.do(str))" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "repo_name = 'https://huggingface.co/AndrewMcDowell/wav2vec2-xls-r-1B-german'\n" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "common_voice_train = common_voice_train.remove_columns([\"accent\", \"age\", \"client_id\", \"down_votes\", \"gender\", \"locale\", \"segment\", \"up_votes\"])\n", "common_voice_test = common_voice_test.remove_columns([\"accent\", \"age\", \"client_id\", \"down_votes\", \"gender\", \"locale\", \"segment\", \"up_votes\"])\n", "\n" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "ad26c4d7d02948a3bc30d86a0f3527c8", "version_major": 2, "version_minor": 0 }, "text/plain": [ "0ex [00:00, ?ex/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_2159/322450745.py:5: DeprecationWarning: Call to deprecated method do. (Old API will be removed in v3.0.) -- Deprecated since version 2.1.\n", " batch[\"sentence\"] = conv.do(re.sub(chars_to_remove_regex, '', batch[\"sentence\"]))\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "93295f1cd50f4557a96ff1bf139c9a37", "version_major": 2, "version_minor": 0 }, "text/plain": [ "0ex [00:00, ?ex/s]" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "import re\n", "chars_to_remove_regex = '[\\,\\?\\!\\-\\;\\:\\\"\\“\\%\\‘\\”\\�\\—\\’\\…\\–\\(\\,\\[\\]\\)\\(\\!]'\n", "# \\.\n", "def remove_special_characters(batch):\n", " batch[\"sentence\"] = conv.do(re.sub(chars_to_remove_regex, '', batch[\"sentence\"]))\n", " return batch\n", "\n", "common_voice_train = common_voice_train.map(remove_special_characters)\n", "common_voice_test = common_voice_test.map(remove_special_characters)" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'client_id': '02a8841a00d762472a4797b56ee01643e8d9ece5a225f2e91c007ab1f94c49c99e50d19986ff3fefb18190257323f34238828114aa607f84fbe9764ecf5aaeaa',\n", " 'path': 'cv-corpus-8.0-2022-01-19/ja/clips/common_voice_ja_25467658.mp3',\n", " 'audio': {'path': 'cv-corpus-8.0-2022-01-19/ja/clips/common_voice_ja_25467658.mp3',\n", " 'array': array([0. , 0. , 0. , ..., 0.00026336, 0.00038834,\n", " 0.00026771], dtype=float32),\n", " 'sampling_rate': 48000},\n", " 'sentence': 'ちょっとがっこうでトラブルがありまして。',\n", " 'up_votes': 2,\n", " 'down_votes': 0,\n", " 'age': 'fourties',\n", " 'gender': 'female',\n", " 'accent': '',\n", " 'locale': 'ja',\n", " 'segment': ''}" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "common_voice_train[1]" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Requirement already satisfied: datasets in /opt/conda/lib/python3.8/site-packages (1.18.2.dev0)\n", "Collecting datasets\n", " Downloading datasets-1.18.3-py3-none-any.whl (311 kB)\n", " |████████████████████████████████| 311 kB 11.0 MB/s \n", "\u001b[?25hRequirement already satisfied: aiohttp in /opt/conda/lib/python3.8/site-packages (from datasets) (3.8.1)\n", "Requirement already satisfied: huggingface-hub<1.0.0,>=0.1.0 in /opt/conda/lib/python3.8/site-packages (from datasets) (0.4.0)\n", "Requirement already satisfied: dill in /opt/conda/lib/python3.8/site-packages (from datasets) (0.3.4)\n", "Requirement already satisfied: xxhash in /opt/conda/lib/python3.8/site-packages (from datasets) (2.0.2)\n", "Requirement already satisfied: multiprocess in /opt/conda/lib/python3.8/site-packages (from datasets) (0.70.12.2)\n", "Requirement already satisfied: pandas in /opt/conda/lib/python3.8/site-packages (from datasets) (1.4.0)\n", "Requirement already satisfied: pyarrow!=4.0.0,>=3.0.0 in /opt/conda/lib/python3.8/site-packages (from datasets) (6.0.1)\n", "Requirement already satisfied: requests>=2.19.0 in /opt/conda/lib/python3.8/site-packages (from datasets) (2.24.0)\n", "Requirement already satisfied: fsspec[http]>=2021.05.0 in /opt/conda/lib/python3.8/site-packages (from datasets) (2022.1.0)\n", "Requirement already satisfied: numpy>=1.17 in /opt/conda/lib/python3.8/site-packages (from datasets) (1.19.2)\n", "Requirement already satisfied: tqdm>=4.62.1 in /opt/conda/lib/python3.8/site-packages (from datasets) (4.62.3)\n", "Requirement already satisfied: packaging in /opt/conda/lib/python3.8/site-packages (from datasets) (21.3)\n", "Requirement already satisfied: filelock in /opt/conda/lib/python3.8/site-packages (from huggingface-hub<1.0.0,>=0.1.0->datasets) (3.0.12)\n", "Requirement already satisfied: pyyaml in /opt/conda/lib/python3.8/site-packages (from huggingface-hub<1.0.0,>=0.1.0->datasets) (5.4.1)\n", "Requirement already satisfied: typing-extensions>=3.7.4.3 in /opt/conda/lib/python3.8/site-packages (from huggingface-hub<1.0.0,>=0.1.0->datasets) (4.0.1)\n", "Requirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in /opt/conda/lib/python3.8/site-packages (from packaging->datasets) (3.0.7)\n", "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /opt/conda/lib/python3.8/site-packages (from requests>=2.19.0->datasets) (1.25.11)\n", "Requirement already satisfied: certifi>=2017.4.17 in /opt/conda/lib/python3.8/site-packages (from requests>=2.19.0->datasets) (2020.12.5)\n", "Requirement already satisfied: chardet<4,>=3.0.2 in /opt/conda/lib/python3.8/site-packages (from requests>=2.19.0->datasets) (3.0.4)\n", "Requirement already satisfied: idna<3,>=2.5 in /opt/conda/lib/python3.8/site-packages (from requests>=2.19.0->datasets) (2.10)\n", "Requirement already satisfied: multidict<7.0,>=4.5 in /opt/conda/lib/python3.8/site-packages (from aiohttp->datasets) (6.0.2)\n", "Requirement already satisfied: frozenlist>=1.1.1 in /opt/conda/lib/python3.8/site-packages (from aiohttp->datasets) (1.3.0)\n", "Requirement already satisfied: charset-normalizer<3.0,>=2.0 in /opt/conda/lib/python3.8/site-packages (from aiohttp->datasets) (2.0.10)\n", "Requirement already satisfied: yarl<2.0,>=1.0 in /opt/conda/lib/python3.8/site-packages (from aiohttp->datasets) (1.7.2)\n", "Requirement already satisfied: attrs>=17.3.0 in /opt/conda/lib/python3.8/site-packages (from aiohttp->datasets) (21.4.0)\n", "Requirement already satisfied: aiosignal>=1.1.2 in /opt/conda/lib/python3.8/site-packages (from aiohttp->datasets) (1.2.0)\n", "Requirement already satisfied: async-timeout<5.0,>=4.0.0a3 in /opt/conda/lib/python3.8/site-packages (from aiohttp->datasets) (4.0.2)\n", "Requirement already satisfied: pytz>=2020.1 in /opt/conda/lib/python3.8/site-packages (from pandas->datasets) (2021.1)\n", "Requirement already satisfied: python-dateutil>=2.8.1 in /opt/conda/lib/python3.8/site-packages (from pandas->datasets) (2.8.2)\n", "Requirement already satisfied: six>=1.5 in /opt/conda/lib/python3.8/site-packages (from python-dateutil>=2.8.1->pandas->datasets) (1.15.0)\n", "Installing collected packages: datasets\n", " Attempting uninstall: datasets\n", " Found existing installation: datasets 1.18.2.dev0\n", " Uninstalling datasets-1.18.2.dev0:\n", "\u001b[31mERROR: Could not install packages due to an OSError: [Errno 13] Permission denied: 'entry_points.txt'\n", "Consider using the `--user` option or check the permissions.\n", "\u001b[0m\n", "\u001b[33mWARNING: You are using pip version 21.3.1; however, version 22.0.2 is available.\n", "You should consider upgrading via the '/opt/conda/bin/python -m pip install --upgrade pip' command.\u001b[0m\n" ] } ], "source": [ "!pip install --upgrade datasets" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Collecting num2words\n", " Downloading num2words-0.5.10-py3-none-any.whl (101 kB)\n", " |████████████████████████████████| 101 kB 7.9 MB/s \n", "\u001b[?25hCollecting docopt>=0.6.2\n", " Downloading docopt-0.6.2.tar.gz (25 kB)\n", " Preparing metadata (setup.py) ... \u001b[?25ldone\n", "\u001b[?25hBuilding wheels for collected packages: docopt\n", " Building wheel for docopt (setup.py) ... \u001b[?25ldone\n", "\u001b[?25h Created wheel for docopt: filename=docopt-0.6.2-py2.py3-none-any.whl size=13704 sha256=7cda85e4b3980668714aad8f5d706fb5b189c2804ce1d99ca2380537fdc78031\n", " Stored in directory: /workspace/.cache/pip/wheels/56/ea/58/ead137b087d9e326852a851351d1debf4ada529b6ac0ec4e8c\n", "Successfully built docopt\n", "Installing collected packages: docopt, num2words\n", "Successfully installed docopt-0.6.2 num2words-0.5.10\n", "\u001b[33mWARNING: You are using pip version 21.3.1; however, version 22.0.2 is available.\n", "You should consider upgrading via the '/opt/conda/bin/python -m pip install --upgrade pip' command.\u001b[0m\n" ] } ], "source": [ "!pip install num2words" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "0da8fd9cdae64c1fa80fbcfc412bcf9c", "version_major": 2, "version_minor": 0 }, "text/plain": [ "0ex [00:00, ?ex/s]" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "\n", "from num2words import num2words\n", "import regex as re\n", "matches = []\n", "\n", "def replace_numbers(match):\n", " match = match.group()\n", " matches.append(match)\n", " return num2words(match, lang='de')\n", "\n", "def replace_numbers_in_batch(batch):\n", " batch[\"sentence\"] = re.sub(r'\\d+(?:,\\d+)?', replace_numbers, batch[\"sentence\"])\n", " return batch\n", "\n", "common_voice_test_2 = common_voice_test.map(replace_numbers_in_batch)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "54d62ea7a0214b6abc5de1f106b330dc", "version_major": 2, "version_minor": 0 }, "text/plain": [ "0ex [00:00, ?ex/s]" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "common_voice_train_2 = common_voice_train.map(replace_numbers_in_batch)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(matches)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# def replace_accented_characters(batch):\n", "# accented_string = u'Málaga'\n", "# # accented_string is of type 'unicode'\n", "# import unidecode\n", "# unaccented_string = unidecode.unidecode(accented_string)\n", "# batch[\"sentence\"] = re.sub('[â]', 'a', batch[\"sentence\"])\n", "# batch[\"sentence\"] = re.sub('[î]', 'i', batch[\"sentence\"])\n", "# batch[\"sentence\"] = re.sub('[ô]', 'o', batch[\"sentence\"])\n", "# batch[\"sentence\"] = re.sub('[û]', 'u', batch[\"sentence\"])\n", "# return batch\n", "\n", "def strip_accents(batch):\n", " return ''.join(c for c in unicodedata.normalize('NFD', batch[\"sentence\"]) if unicodedata.category(c) != 'Mn')\n", "\n", "common_voice_train = common_voice_train.map(replace_accented_characters)\n", "common_voice_test = common_voice_test.map(replace_accented_characters)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "def extract_all_chars(batch):\n", " all_text = \" \".join(batch[\"sentence\"])\n", " vocab = list(set(all_text))\n", " return {\"vocab\": [vocab], \"all_text\": [all_text]}" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "c433125fde60482ab48e7db72a0759a0", "version_major": 2, "version_minor": 0 }, "text/plain": [ " 0%| | 0/11 [00:00\": 0, \"\": 1, \"\": 2, \"\": 3, \"|\": 4, \"'\": 5, \"-\": 6, \"A\": 7, \"B\": 8, \"C\": 9, \"D\": 10, \"E\": 11, \"F\": 12, \"G\": 13, \"H\": 14, \"I\": 15, \"J\": 16, \"K\": 17, \"L\": 18, \"M\": 19, \"N\": 20, \"O\": 21, \"P\": 22, \"Q\": 23, \"R\": 24, \"S\": 25, \"T\": 26, \"U\": 27, \"V\": 28, \"W\": 29, \"X\": 30, \"Y\": 31, \"Z\": 32, \"Ä\": 33, \"Í\": 34, \"Ó\": 35, \"Ö\": 36, \"Ü\": 37}\n" ] }, { "cell_type": "code", "execution_count": 48, "metadata": {}, "outputs": [], "source": [ "manually_kept_values = ['ß', 'ä', 'ö', 'ü']\n", "\n", "punctuation = ['.', ]" ] }, { "cell_type": "code", "execution_count": 50, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['$', '&', '(', ')', '*', '+', '.', '/', '=', '@', '[', ']', '_', '`', '¡', '§', '«', '°', '´', 'µ', '·', '»', '×', 'à', 'á', 'â', 'ã', 'å', 'æ', 'ç', 'è', 'é', 'ê', 'ë', 'ì', 'í', 'î', 'ï', 'ð', 'ñ', 'ò', 'ó', 'ô', 'õ', 'ø', 'ù', 'ú', 'û', 'ý', 'þ', 'ā', 'ă', 'ą', 'ć', 'č', 'ď', 'đ', 'ē', 'ė', 'ę', 'ě', 'ğ', 'ġ', 'ħ', 'ī', 'ı', 'ł', 'ń', 'ņ', 'ň', 'ō', 'ŏ', 'ő', 'œ', 'ř', 'ś', 'ş', 'š', 'ť', 'ū', 'ů', 'ź', 'ż', 'ž', 'ơ', 'ǐ', 'ǔ', 'ș', 'ț', 'ə', 'ʻ', 'ʾ', 'ʿ', '̆', '̇', '̥', 'а', 'в', 'е', 'и', 'к', 'м', 'о', 'р', 'с', 'ф', 'ч', 'ш', 'ѹ', 'א', 'ב', 'נ', 'ע', 'ש', '་', 'ན', 'ḫ', 'ṟ', 'ṣ', 'ṭ', 'ạ', 'ả', 'ắ', 'ằ', 'ế', 'ễ', 'ệ', 'ọ', 'ồ', 'ộ', 'ụ', 'ứ', '‑', '‚', '„', '‟', '′', '″', '‹', '›', '→', '−', '≡', '⟨', '⟩', 'カ', '东', '临', '乡', '关', '合', '城', '孙', '尣', '幺', '支', '比', '毛', '泽', '無', '生', '臣', '辶', '道', '镇', '黃']\n" ] } ], "source": [ "odd_values = []\n", "for index, value in enumerate(sorted(vocab_list)):\n", "# if :\n", " if value not in j_vocab and not (16 <= index <= 41 or value == ' ') and value not in manually_kept_values:\n", " odd_values.append(value)\n", "# print(index, value)\n", " \n", "print(odd_values)" ] }, { "cell_type": "code", "execution_count": 63, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "$ & ( ) * + . / = @ [ ] _ ` ¡ § « ° ´ µ · » × à á â ã å æ ç è é ê ë ì í î ï ð ñ ò ó ô õ ø ù ú û ý þ ā ă ą ć č ď đ ē ė ę ě ğ ġ ħ ī ı ł ń ņ ň ō ŏ ő œ ř ś ş š ť ū ů ź ż ž ơ ǐ ǔ ș ț ə ʻ ʾ ʿ ̆ ̇ ̥ а в е и к м о р с ф ч ш ѹ א ב נ ע ש ་ ན ḫ ṟ ṣ ṭ ạ ả ắ ằ ế ễ ệ ọ ồ ộ ụ ứ ‑ ‚ „ ‟ ′ ″ ‹ › → − ≡ ⟨ ⟩ カ 东 临 乡 关 合 城 孙 尣 幺 支 比 毛 泽 無 生 臣 辶 道 镇 黃\n" ] } ], "source": [ "print(\" \".join(odd_values))\n", "\n", "# for value in odd_values:\n", "# if value not in manually_kept_values:\n", "# print(value)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "$ & ( ) * + = @ [ ] _ ` ¡ § « ° ´ µ · » × à á â ã å æ ç è é ê ë ì í î ï ð ñ ò ó ô õ ø ù ú û ý þ ā ă ą ć č ď đ ē ė ę ě ğ ġ ħ ī ı ł ń ņ ň ō ŏ ő œ ř ś ş š ť ū ů ź ż ž ơ ǐ ǔ ș ț ə ʻ ʾ ʿ ̆ ̇ ̥ а в е и к м о р с ф ч ш ѹ א ב נ ע ש ་ ན ḫ ṟ ṣ ṭ ạ ả ắ ằ ế ễ ệ ọ ồ ộ ụ ứ ‑ ‚ „ ‟ ′ ″ ‹ › → − ≡ ⟨ ⟩ カ 东 临 乡 关 合 城 孙 尣 幺 支 比 毛 泽 無 生 臣 辶 道 镇 黃" ] }, { "cell_type": "code", "execution_count": 54, "metadata": {}, "outputs": [], "source": [ "filtered_vocab_list = [value for value in vocab_list if value not in odd_values]" ] }, { "cell_type": "code", "execution_count": 55, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['ß',\n", " 'j',\n", " 'r',\n", " 'h',\n", " 'd',\n", " 'l',\n", " 'z',\n", " 'n',\n", " 'm',\n", " 'c',\n", " 'ä',\n", " \"'\",\n", " 'g',\n", " 'e',\n", " 'w',\n", " 's',\n", " 'u',\n", " 'k',\n", " 'o',\n", " 'f',\n", " ' ',\n", " 'y',\n", " 'v',\n", " 'ö',\n", " 'ü',\n", " 'p',\n", " 'a',\n", " 'x',\n", " 'b',\n", " 'q',\n", " 't',\n", " 'i']" ] }, "execution_count": 55, "metadata": {}, "output_type": "execute_result" } ], "source": [ "filtered_vocab_list" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "ename": "NameError", "evalue": "name 'word_delimiter_token' is not defined", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", "Input \u001b[0;32mIn [21]\u001b[0m, in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m vocab_dict \u001b[38;5;241m=\u001b[39m {v: k \u001b[38;5;28;01mfor\u001b[39;00m k, v \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28menumerate\u001b[39m(\u001b[38;5;28msorted\u001b[39m(vocab_list))}\n\u001b[1;32m 3\u001b[0m \u001b[38;5;66;03m# replace white space with delimiter token\u001b[39;00m\n\u001b[0;32m----> 4\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[43mword_delimiter_token\u001b[49m \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 5\u001b[0m vocab_dict[word_delimiter_token] \u001b[38;5;241m=\u001b[39m vocab_dict[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m \u001b[39m\u001b[38;5;124m\"\u001b[39m]\n\u001b[1;32m 6\u001b[0m \u001b[38;5;28;01mdel\u001b[39;00m vocab_dict[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m \u001b[39m\u001b[38;5;124m\"\u001b[39m]\n", "\u001b[0;31mNameError\u001b[0m: name 'word_delimiter_token' is not defined" ] } ], "source": [ "vocab_dict = {v: k for k, v in enumerate(sorted(vocab_list))}\n", "\n", "# replace white space with delimiter token\n", "if word_delimiter_token is not None:\n", " vocab_dict[word_delimiter_token] = vocab_dict[\" \"]\n", " del vocab_dict[\" \"]\n", "\n", "# add unk and pad token\n", "if unk_token is not None:\n", " vocab_dict[unk_token] = len(vocab_dict)\n", "\n", "if pad_token is not None:\n", " vocab_dict[pad_token] = len(vocab_dict)" ] }, { "cell_type": "code", "execution_count": 58, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "59e89471ea85449ebbc709d0a9d7325c", "version_major": 2, "version_minor": 0 }, "text/plain": [ " 0%| | 0/437 [00:00\n" ] } ], "source": [ "import regex\n", "print(re.search('[a-zA-Z]', \"9a2\"))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 43, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0" ] }, "execution_count": 43, "metadata": {}, "output_type": "execute_result" } ], "source": [] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0 \n", "1 &\n", "2 '\n", "3 .\n", "4 /\n", "5 A\n", "6 B\n", "7 C\n", "8 D\n", "9 E\n", "10 F\n", "11 G\n", "12 H\n", "13 I\n", "14 J\n", "15 K\n", "16 L\n", "17 M\n", "18 N\n", "19 O\n", "20 P\n", "21 Q\n", "22 R\n", "23 S\n", "24 T\n", "25 U\n", "26 V\n", "27 W\n", "28 X\n", "29 Y\n", "30 Z\n", "31 a\n", "32 b\n", "33 c\n", "34 d\n", "35 e\n", "36 f\n", "37 g\n", "38 h\n", "39 i\n", "40 j\n", "41 k\n", "42 l\n", "43 m\n", "44 n\n", "45 o\n", "46 p\n", "47 q\n", "48 r\n", "49 s\n", "50 t\n", "51 u\n", "52 v\n", "53 w\n", "54 x\n", "55 y\n", "56 z\n", "57 ―\n", "58 、\n", "59 。\n", "60 々\n", "61 〇\n", "62 「\n", "63 」\n", "64 『\n", "65 』\n", "66 〜\n", "67 ぁ\n", "68 あ\n", "69 い\n", "70 う\n", "71 ぇ\n", "72 え\n", "73 お\n", "74 か\n", "75 が\n", "76 き\n", "77 ぎ\n", "78 く\n", "79 ぐ\n", "80 け\n", "81 げ\n", "82 こ\n", "83 ご\n", "84 さ\n", "85 ざ\n", "86 し\n", "87 じ\n", "88 す\n", "89 ず\n", "90 せ\n", "91 ぜ\n", "92 そ\n", "93 ぞ\n", "94 た\n", "95 だ\n", "96 ち\n", "97 ぢ\n", "98 っ\n", "99 つ\n", "100 づ\n", "101 て\n", "102 で\n", "103 と\n", "104 ど\n", "105 な\n", "106 に\n", "107 ぬ\n", "108 ね\n", "109 の\n", "110 は\n", "111 ば\n", "112 ぱ\n", "113 ひ\n", "114 び\n", "115 ぴ\n", "116 ふ\n", "117 ぶ\n", "118 ぷ\n", "119 へ\n", "120 べ\n", "121 ぺ\n", "122 ほ\n", "123 ぼ\n", "124 ぽ\n", "125 ま\n", "126 み\n", "127 む\n", "128 め\n", "129 も\n", "130 ゃ\n", "131 や\n", "132 ゅ\n", "133 ゆ\n", "134 ょ\n", "135 よ\n", "136 ら\n", "137 り\n", "138 る\n", "139 れ\n", "140 ろ\n", "141 わ\n", "142 を\n", "143 ん\n", "144 ァ\n", "145 ア\n", "146 ィ\n", "147 イ\n", "148 ゥ\n", "149 ウ\n", "150 ェ\n", "151 エ\n", "152 ォ\n", "153 オ\n", "154 カ\n", "155 ガ\n", "156 キ\n", "157 ギ\n", "158 ク\n", "159 グ\n", "160 ケ\n", "161 ゲ\n", "162 コ\n", "163 ゴ\n", "164 サ\n", "165 ザ\n", "166 シ\n", "167 ジ\n", "168 ス\n", "169 ズ\n", "170 セ\n", "171 ゼ\n", "172 ソ\n", "173 ゾ\n", "174 タ\n", "175 ダ\n", "176 チ\n", "177 ッ\n", "178 ツ\n", "179 ヅ\n", "180 テ\n", "181 デ\n", "182 ト\n", "183 ド\n", "184 ナ\n", "185 ニ\n", "186 ヌ\n", "187 ネ\n", "188 ノ\n", "189 ハ\n", "190 バ\n", "191 パ\n", "192 ヒ\n", "193 ビ\n", "194 ピ\n", "195 フ\n", "196 ブ\n", "197 プ\n", "198 ヘ\n", "199 ベ\n", "200 ペ\n", "201 ホ\n", "202 ボ\n", "203 ポ\n", "204 マ\n", "205 ミ\n", "206 ム\n", "207 メ\n", "208 モ\n", "209 ャ\n", "210 ヤ\n", "211 ュ\n", "212 ユ\n", "213 ョ\n", "214 ヨ\n", "215 ラ\n", "216 リ\n", "217 ル\n", "218 レ\n", "219 ロ\n", "220 ワ\n", "221 ン\n", "222 ヴ\n", "223 ヶ\n", "224 ・\n", "225 ー\n", "226 繫\n", "227 !\n", "228 &\n", "229 )\n", "230 -\n", "231 .\n", "232 :\n", "233 =\n", "234 ?\n", "235 A\n", "236 D\n", "237 F\n", "238 G\n", "239 N\n", "240 O\n", "241 P\n", "242 S\n", "243 U\n", "244 h\n", "245 j\n", "246 「\n", "247 」\n", "248 ・\n" ] } ], "source": [ "vocab_dict = {v: k for k, v in enumerate(sorted(vocab_list))}\n", "for key, value in enumerate(vocab_dict):\n", " print(key, value)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def create_vocabulary_from_data(\n", " datasets: DatasetDict,\n", " word_delimiter_token: Optional[str] = None,\n", " unk_token: Optional[str] = None,\n", " pad_token: Optional[str] = None,\n", "):\n", " # Given training and test labels create vocabulary\n", " def extract_all_chars(batch):\n", " all_text = \" \".join(batch[\"target_text\"])\n", " vocab = list(set(all_text))\n", " return {\"vocab\": [vocab], \"all_text\": [all_text]}\n", "\n", " vocabs = datasets.map(\n", " extract_all_chars,\n", " batched=True,\n", " batch_size=-1,\n", " keep_in_memory=True,\n", " remove_columns=datasets[\"train\"].column_names,\n", " )\n", "\n", " # take union of all unique characters in each dataset\n", " vocab_set = functools.reduce(\n", " lambda vocab_1, vocab_2: set(vocab_1[\"vocab\"][0]) | set(vocab_2[\"vocab\"][0]), vocabs.values()\n", " )\n", "\n", " vocab_dict = {v: k for k, v in enumerate(sorted(list(vocab_set)))}\n", "\n", " # replace white space with delimiter token\n", " if word_delimiter_token is not None:\n", " vocab_dict[word_delimiter_token] = vocab_dict[\" \"]\n", " del vocab_dict[\" \"]\n", "\n", " # add unk and pad token\n", " if unk_token is not None:\n", " vocab_dict[unk_token] = len(vocab_dict)\n", "\n", " if pad_token is not None:\n", " vocab_dict[pad_token] = len(vocab_dict)\n", "\n", " return vocab_dict" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# load processor\n", "feature_extractor = AutoFeatureExtractor.from_pretrained(repo_name)\n", "# feature_extractor = processor_with_lm.feature_extractor\n", "sampling_rate = feature_extractor.sampling_rate\n", "\n", "# resample audio\n", "dataset = dataset.cast_column(\"audio\", Audio(sampling_rate=sampling_rate))\n", "\n", "# load eval pipeline\n", "asr = pipeline(\"automatic-speech-recognition\", model=repo_name, feature_extractor=feature_extractor)\n", "\n", "# map function to decode audio\n", "def map_to_pred(batch):\n", " prediction = asr(\n", " batch[\"audio\"][\"array\"])\n", "\n", " batch[\"prediction\"] = prediction[\"text\"]\n", " batch[\"target\"] = batch[\"sentence\"]\n", " return batch\n", "\n", "# run inference on all examples\n", "result = dataset.map(map_to_pred, remove_columns=dataset.column_names)\n", "print(result[\"prediction\"])\n", "\n", "result[0]['target']" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.8" } }, "nbformat": 4, "nbformat_minor": 4 }