{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "2e612e3a", "metadata": {}, "outputs": [], "source": [ "target_lang=\"ga-IE\" # change to your target lang" ] }, { "cell_type": "code", "execution_count": 39, "id": "7fe65d91", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Using custom data configuration ga-pl-lang1=ga,lang2=pl\n", "Reusing dataset opus_dgt (/workspace/cache/hf/datasets/opus_dgt/ga-pl-lang1=ga,lang2=pl/0.0.0/a4db75cea3712eb5d4384f0539db82abf897c6b6da5e5e81693e8fd201efc346)\n" ] } ], "source": [ "from datasets import load_dataset\n", "\n", "# dataset = load_dataset(\"mozilla-foundation/common_voice_8_0\", \n", "# \"ga-IE\", \n", "# split=\"train\", \n", "# use_auth_token = True)\n", "\n", "dataset = load_dataset(\"opus_dgt\", lang1=\"ga\", lang2=\"pl\", split = 'train')" ] }, { "cell_type": "code", "execution_count": 45, "id": "03e44482", "metadata": {}, "outputs": [], "source": [ "# Each entry of the 'translation' column is indexed by language code; keep the 'ga' side.\n", "ga_txt = [i['ga'] for i in dataset['translation']]" ] }, { "cell_type": "code", "execution_count": null, "id": "c828175b", "metadata": {}, "outputs": [], "source": [ "# Preview a few sentences rather than dumping the whole list.\n", "# (The variable is `ga_txt`; the previous `ga_text` raised NameError.)\n", "ga_txt[:5]" ] }, { "cell_type": "code", "execution_count": null, "id": "cdb72a9d", "metadata": {}, "outputs": [], "source": [ "# Raw string: '\\-', '\\;' and '\\:' are not valid Python string escapes, so a plain\n", "# literal triggers an invalid-escape warning. The regex itself is unchanged.\n", "chars_to_ignore_regex = r'[,?.!\\-\\;\\:\"“%‘”�—’…–]' # change to the ignored characters of your fine-tuned model" ] }, { "cell_type": "code", "execution_count": 20, "id": "4823df21", "metadata": {}, "outputs": [], "source": 
[ "import re\n", "\n", "def extract_text(batch, lang=\"ga\"):\n", "    \"\"\"Add a lower-cased, punctuation-stripped ``text_clean`` field to one row.\n", "\n", "    Rows that have a ``sentence`` column are read from it. Otherwise the text is\n", "    taken from ``batch[\"translation\"][lang]``, which is the layout the ``ga_txt``\n", "    cell reads from the loaded dataset.\n", "\n", "    Args:\n", "        batch: one dataset row (dict-like), as passed by ``dataset.map``.\n", "        lang: key inside ``translation`` to use when there is no ``sentence``.\n", "\n", "    Returns:\n", "        The same ``batch`` object with ``text_clean`` added.\n", "\n", "    Raises:\n", "        KeyError: if the row has neither ``sentence`` nor ``translation[lang]``.\n", "    \"\"\"\n", "    if \"sentence\" in batch:\n", "        text = batch[\"sentence\"]\n", "    else:\n", "        text = batch[\"translation\"][lang]\n", "    # chars_to_ignore_regex is defined in an earlier cell of this notebook.\n", "    batch[\"text_clean\"] = re.sub(chars_to_ignore_regex, \"\", text.lower())\n", "    return batch" ] }, { "cell_type": "code", "execution_count": null, "id": "d2b27f75", "metadata": {}, "outputs": [], "source": [ "# Replaces every original column with the single `text_clean` column.\n", "# NOTE: this rebinds `dataset`, so re-running this cell requires re-running the load cell first.\n", "dataset = dataset.map(extract_text, remove_columns=dataset.column_names)" ] }, { "cell_type": "code", "execution_count": null, "id": "91244c41", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.8" } }, "nbformat": 4, "nbformat_minor": 5 }