{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "2e612e3a",
   "metadata": {},
   "outputs": [],
   "source": [
    "target_lang=\"ga-IE\"  # change to your target lang"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "id": "7fe65d91",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Using custom data configuration ga-pl-lang1=ga,lang2=pl\n",
      "Reusing dataset opus_dgt (/workspace/cache/hf/datasets/opus_dgt/ga-pl-lang1=ga,lang2=pl/0.0.0/a4db75cea3712eb5d4384f0539db82abf897c6b6da5e5e81693e8fd201efc346)\n"
     ]
    }
   ],
   "source": [
    "from datasets import load_dataset\n",
    "\n",
    "# dataset = load_dataset(\"mozilla-foundation/common_voice_8_0\", \n",
    "#                        \"ga-IE\", \n",
    "#                        split=\"train\", \n",
    "#                        use_auth_token = True)\n",
    "\n",
    "dataset = load_dataset(\"opus_dgt\", lang1=\"ga\", lang2=\"pl\", split = 'train')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "id": "03e44482",
   "metadata": {},
   "outputs": [],
   "source": [
    "ga_txt = [i['ga'] for i in dataset['translation']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "id": "c828175b",
   "metadata": {},
   "outputs": [
    {
     "ename": "NameError",
     "evalue": "name 'ga_text' is not defined",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-46-c49fc06c912c>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mga_text\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[0;31mNameError\u001b[0m: name 'ga_text' is not defined"
     ]
    }
   ],
   "source": [
    "ga_text"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "cdb72a9d",
   "metadata": {},
   "outputs": [],
   "source": [
    "chars_to_ignore_regex = '[,?.!\\-\\;\\:\"“%‘”�—’…–]'  # change to the ignored characters of your fine-tuned model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "4823df21",
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "\n",
    "def extract_text(batch):\n",
    "  text = batch[\"sentence\"]\n",
    "  batch[\"text_clean\"] = re.sub(chars_to_ignore_regex, \"\", text.lower())\n",
    "  return batch"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "d2b27f75",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Loading cached processed dataset at /workspace/cache/hf/datasets/mozilla-foundation___common_voice/ga-IE/8.0.0/b8bc4d453193c06a43269b46cd87f075c70f152ac963b7f28f7a2760c45ec3e8/cache-f9f6dd3027923e5a.arrow\n"
     ]
    }
   ],
   "source": [
    "dataset = dataset.map(extract_text, remove_columns=dataset.column_names)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "91244c41",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}