{ "cells": [ { "cell_type": "code", "execution_count": 51, "id": "831245a1", "metadata": {}, "outputs": [], "source": [ "import pandas as pd" ] },
{ "cell_type": "code", "execution_count": 1, "id": "2ac8a30f", "metadata": {}, "outputs": [], "source": [ "target_lang=\"ga-IE\" # change to your target lang" ] },
{ "cell_type": "code", "execution_count": 101, "id": "15710167", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Using custom data configuration ga-pl-lang1=ga,lang2=pl\n", "Reusing dataset opus_dgt (/workspace/cache/hf/datasets/opus_dgt/ga-pl-lang1=ga,lang2=pl/0.0.0/a4db75cea3712eb5d4384f0539db82abf897c6b6da5e5e81693e8fd201efc346)\n" ] } ], "source": [ "from datasets import load_dataset\n", "\n", "# dataset = load_dataset(\"mozilla-foundation/common_voice_8_0\", \n", "# \"ga-IE\", \n", "# split=\"train\", \n", "# use_auth_token = True)\n", "\n", "dataset = load_dataset(\"opus_dgt\", lang1=\"ga\", lang2=\"pl\", split = 'train')" ] },
{ "cell_type": "code", "execution_count": 102, "id": "fb20d4de", "metadata": {}, "outputs": [], "source": [ "# ga_txt = [i['ga'] for i in dataset['translation']]\n", "# ga_txt = pd.Series(ga_txt)" ] },
{ "cell_type": "code", "execution_count": 103, "id": "eeca1851", "metadata": {}, "outputs": [], "source": [ "# Raw string: the pattern contains backslashes that are regex escapes, not string escapes.\n", "chars_to_ignore_regex = r'[,?.!\\-\\;\\:\"“%‘”�—’…–]' # change to the ignored characters of your fine-tuned model" ] },
{ "cell_type": "code", "execution_count": 107, "id": "4df93c9c", "metadata": {}, "outputs": [], "source": [ "import re\n", "\n", "def extract_text(batch):\n", "    \"\"\"Add a normalised ``text`` field built from the ``ga`` side of a translation pair.\n", "\n", "    Reads ``batch[\"translation\"][\"ga\"]``, lower-cases it and removes every character\n", "    matched by ``chars_to_ignore_regex``. The batch is updated in place and returned.\n", "    \"\"\"\n", "    ga_text = batch[\"translation\"][\"ga\"]\n", "    batch[\"text\"] = re.sub(chars_to_ignore_regex, \"\", ga_text.lower())\n", "    return batch" ] },
{ "cell_type": "code", "execution_count": 108, "id": "84bedd13", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "d9a11f167bb94faa8e9f6a511407acb4", "version_major": 2, "version_minor": 0 }, "text/plain": [ "0ex [00:00, ?ex/s]" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "dataset = dataset.map(extract_text, remove_columns=dataset.column_names)" ] },
{ "cell_type": "code", "execution_count": 112, "id": "31cb3c6b", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "342d92a5d9c44c59bcb5dca143ced3b6", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Pushing dataset shards to the dataset hub: 0%| | 0/1 [00:00 \"5gram.arpa\"" ] },
{ "cell_type": "code", "execution_count": 122, "id": "0e043b87", "metadata": {}, "outputs": [], "source": [ "# Copy the ARPA file, adding an end-of-sentence </s> unigram right after the <s> entry\n", "# and bumping the declared unigram count by one to match.\n", "with open(\"5gram.arpa\", \"r\", encoding=\"utf-8\") as read_file, open(\"5gram_correct.arpa\", \"w\", encoding=\"utf-8\") as write_file:\n", "    has_added_eos = False\n", "    for line in read_file:\n", "        if not has_added_eos and \"ngram 1=\" in line:\n", "            # Header line: one more unigram is about to be added.\n", "            count = line.strip().split(\"=\")[-1]\n", "            write_file.write(line.replace(f\"{count}\", f\"{int(count)+1}\"))\n", "        elif not has_added_eos and \"<s>\" in line:\n", "            # Keep the <s> entry and write a copy of it relabelled as </s>.\n", "            write_file.write(line)\n", "            write_file.write(line.replace(\"<s>\", \"</s>\"))\n", "            has_added_eos = True\n", "        else:\n", "            write_file.write(line)" ] },
{ "cell_type": "code", "execution_count": 123, "id": "d106c7d1", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\\data\\\n", "ngram 1=70781\n", "ngram 2=652306\n", "ngram 3=1669326\n", "ngram 4=2514789\n", "ngram 5=3053088\n", "\n", "\\1-grams:\n", "-5.8501472\t<unk>\t0\n", "0\t<s>\t-0.11565505\n", "0\t</s>\t-0.11565505\n", "-5.4088216\tmiontuairisc\t-0.20133564\n", "-4.6517477\tcheartaitheach\t-0.24842946\n", "-2.1893916\tmaidir\t-1.7147961\n", "-2.1071756\tle\t-0.7007309\n", "-4.156014\tcoinbhinsiún\t-0.31064242\n", "-1.8876181\tar\t-0.9045828\n", "-4.62287\tdhlínse\t-0.24268326\n", "-1.6051095\tagus\t-0.8729715\n", "-4.1465816\taithint\t-0.21693327\n" ] } ], "source": [ "!head -20 5gram_correct.arpa" ] },
{ "cell_type": "code", "execution_count": 124, "id": "85ef4c43", "metadata": {}, "outputs": [], "source": [ "from 
transformers import AutoProcessor\n", "\n", "processor = AutoProcessor.from_pretrained(\"./\")" ] },
{ "cell_type": "code", "execution_count": 125, "id": "cb2a2768", "metadata": {}, "outputs": [], "source": [ "# Order the tokenizer vocabulary by its integer value and lower-case every token.\n", "vocab_dict = processor.tokenizer.get_vocab()\n", "vocab_items_by_id = sorted(vocab_dict.items(), key=lambda pair: pair[1])\n", "sorted_vocab_dict = {token.lower(): token_id for token, token_id in vocab_items_by_id}" ] },
{ "cell_type": "code", "execution_count": 126, "id": "d19eee6f", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Found entries of length > 1 in alphabet. This is unusual unless style is BPE, but the alphabet was not recognized as BPE type. Is this correct?\n", "Unigrams and labels don't seem to agree.\n" ] } ], "source": [ "from pyctcdecode import build_ctcdecoder\n", "\n", "# Labels keep the insertion order of sorted_vocab_dict; the LM is the corrected ARPA file.\n", "decoder_labels = list(sorted_vocab_dict)\n", "decoder = build_ctcdecoder(labels=decoder_labels, kenlm_model_path=\"5gram_correct.arpa\")" ] },
{ "cell_type": "code", "execution_count": 127, "id": "4e8031a9", "metadata": {}, "outputs": [], "source": [ "from transformers import Wav2Vec2ProcessorWithLM\n", "\n", "# Combine the loaded processor's feature extractor and tokenizer with the n-gram decoder.\n", "lm_processor_parts = {\n", "    \"feature_extractor\": processor.feature_extractor,\n", "    \"tokenizer\": processor.tokenizer,\n", "    \"decoder\": decoder,\n", "}\n", "processor_with_lm = Wav2Vec2ProcessorWithLM(**lm_processor_parts)" ] },
{ "cell_type": "code", "execution_count": 128, "id": "6f32faf4", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/workspace/wav2vec-1b-cv8-ir/./ is already a clone of https://huggingface.co/jcmc/wav2vec-1b-cv8-ir. 
Make sure you pull the latest changes with `repo.git_pull()`.\n" ] } ], "source": [ "from huggingface_hub import Repository\n", "\n", "repo = Repository(local_dir=\"./\", clone_from=\"jcmc/wav2vec-1b-cv8-ir\")" ] },
{ "cell_type": "code", "execution_count": 129, "id": "a7e91068", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'/workspace/wav2vec-1b-cv8-ir'" ] }, "execution_count": 129, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pwd" ] },
{ "cell_type": "code", "execution_count": null, "id": "0a1de336", "metadata": {}, "outputs": [], "source": [ "# Save into the cloned repository root (\"./\", the same directory the processor was loaded from)\n", "# so the decoder files sit beside the model files instead of in a nested folder.\n", "processor_with_lm.save_pretrained(\"./\")" ] } ],
"metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.8" } }, "nbformat": 4, "nbformat_minor": 5 }