{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"name":"ari_en_nr_JW300.ipynb","provenance":[],"collapsed_sections":["smUYbE8bGNE0","TmPMur_UAVM3"],"toc_visible":true},"kernelspec":{"name":"python3","display_name":"Python 3"},"accelerator":"GPU"},"cells":[{"cell_type":"markdown","metadata":{"id":"BdwKlib49HcY","colab_type":"text"},"source":["#
Masakhane - Machine Translation for African Languages (Using JoeyNMT)
\n","## Author : Ari Ramkilowan\n","## Language Pair : English - isiNdebele\n","## Corpus : JW300 "]},{"cell_type":"markdown","metadata":{"id":"GuU-cZx6JyWf","colab_type":"text"},"source":["
"]},{"cell_type":"markdown","metadata":{"id":"smUYbE8bGNE0","colab_type":"text"},"source":["## Install JoeyNMT"]},{"cell_type":"code","metadata":{"id":"O8NG5kJ-9SLW","colab_type":"code","outputId":"97933ae8-6c9d-4595-9d3d-fa3c673bcd01","executionInfo":{"status":"ok","timestamp":1574946362398,"user_tz":-120,"elapsed":23972,"user":{"displayName":"Ari Ramkilowan","photoUrl":"https://lh3.googleusercontent.com/a-/AAuE7mBOdxjV-rMNcCCRFbbimFCdX4yGotTEbejYVcEc=s64","userId":"04960745627161950732"}},"colab":{"base_uri":"https://localhost:8080/","height":1000}},"source":["! git clone https://github.com/joeynmt/joeynmt.git\n","! cd joeynmt; pip3 install ."],"execution_count":0,"outputs":[{"output_type":"stream","text":["Cloning into 'joeynmt'...\n","remote: Enumerating objects: 15, done.\u001b[K\n","remote: Counting objects: 100% (15/15), done.\u001b[K\n","remote: Compressing objects: 100% (12/12), done.\u001b[K\n","remote: Total 2199 (delta 4), reused 5 (delta 3), pack-reused 2184\u001b[K\n","Receiving objects: 100% (2199/2199), 2.60 MiB | 4.24 MiB/s, done.\n","Resolving deltas: 100% (1525/1525), done.\n","Processing /content/joeynmt\n","Requirement already satisfied: future in /usr/local/lib/python3.6/dist-packages (from joeynmt==0.0.1) (0.16.0)\n","Requirement already satisfied: pillow in /usr/local/lib/python3.6/dist-packages (from joeynmt==0.0.1) (4.3.0)\n","Requirement already satisfied: numpy<2.0,>=1.14.5 in /usr/local/lib/python3.6/dist-packages (from joeynmt==0.0.1) (1.17.4)\n","Requirement already satisfied: setuptools>=41.0.0 in /usr/local/lib/python3.6/dist-packages (from joeynmt==0.0.1) (41.6.0)\n","Requirement already satisfied: torch>=1.1 in /usr/local/lib/python3.6/dist-packages (from joeynmt==0.0.1) (1.3.1)\n","Requirement already satisfied: tensorflow>=1.14 in /usr/local/lib/python3.6/dist-packages (from joeynmt==0.0.1) (1.15.0)\n","Requirement already satisfied: torchtext in /usr/local/lib/python3.6/dist-packages (from joeynmt==0.0.1) (0.3.1)\n","Collecting 
sacrebleu>=1.3.6\n"," Downloading https://files.pythonhosted.org/packages/0e/e5/93d252182f7cbd4b59bb3ec5797e2ce33cfd6f5aadaf327db170cf4b7887/sacrebleu-1.4.2-py3-none-any.whl\n","Collecting subword-nmt\n"," Downloading https://files.pythonhosted.org/packages/74/60/6600a7bc09e7ab38bc53a48a20d8cae49b837f93f5842a41fe513a694912/subword_nmt-0.3.7-py2.py3-none-any.whl\n","Requirement already satisfied: matplotlib in /usr/local/lib/python3.6/dist-packages (from joeynmt==0.0.1) (3.1.1)\n","Requirement already satisfied: seaborn in /usr/local/lib/python3.6/dist-packages (from joeynmt==0.0.1) (0.9.0)\n","Collecting pyyaml>=5.1\n","\u001b[?25l Downloading https://files.pythonhosted.org/packages/e3/e8/b3212641ee2718d556df0f23f78de8303f068fe29cdaa7a91018849582fe/PyYAML-5.1.2.tar.gz (265kB)\n","\u001b[K |████████████████████████████████| 266kB 18.1MB/s \n","\u001b[?25hCollecting pylint\n","\u001b[?25l Downloading https://files.pythonhosted.org/packages/e9/59/43fc36c5ee316bb9aeb7cf5329cdbdca89e5749c34d5602753827c0aa2dc/pylint-2.4.4-py3-none-any.whl (302kB)\n","\u001b[K |████████████████████████████████| 307kB 44.5MB/s \n","\u001b[?25hRequirement already satisfied: six==1.12 in /usr/local/lib/python3.6/dist-packages (from joeynmt==0.0.1) (1.12.0)\n","Requirement already satisfied: olefile in /usr/local/lib/python3.6/dist-packages (from pillow->joeynmt==0.0.1) (0.46)\n","Requirement already satisfied: opt-einsum>=2.3.2 in /usr/local/lib/python3.6/dist-packages (from tensorflow>=1.14->joeynmt==0.0.1) (3.1.0)\n","Requirement already satisfied: google-pasta>=0.1.6 in /usr/local/lib/python3.6/dist-packages (from tensorflow>=1.14->joeynmt==0.0.1) (0.1.8)\n","Requirement already satisfied: wheel>=0.26 in /usr/local/lib/python3.6/dist-packages (from tensorflow>=1.14->joeynmt==0.0.1) (0.33.6)\n","Requirement already satisfied: tensorflow-estimator==1.15.1 in /usr/local/lib/python3.6/dist-packages (from tensorflow>=1.14->joeynmt==0.0.1) (1.15.1)\n","Requirement already satisfied: 
keras-applications>=1.0.8 in /usr/local/lib/python3.6/dist-packages (from tensorflow>=1.14->joeynmt==0.0.1) (1.0.8)\n","Requirement already satisfied: tensorboard<1.16.0,>=1.15.0 in /usr/local/lib/python3.6/dist-packages (from tensorflow>=1.14->joeynmt==0.0.1) (1.15.0)\n","Requirement already satisfied: protobuf>=3.6.1 in /usr/local/lib/python3.6/dist-packages (from tensorflow>=1.14->joeynmt==0.0.1) (3.10.0)\n","Requirement already satisfied: termcolor>=1.1.0 in /usr/local/lib/python3.6/dist-packages (from tensorflow>=1.14->joeynmt==0.0.1) (1.1.0)\n","Requirement already satisfied: keras-preprocessing>=1.0.5 in /usr/local/lib/python3.6/dist-packages (from tensorflow>=1.14->joeynmt==0.0.1) (1.1.0)\n","Requirement already satisfied: grpcio>=1.8.6 in /usr/local/lib/python3.6/dist-packages (from tensorflow>=1.14->joeynmt==0.0.1) (1.15.0)\n","Requirement already satisfied: wrapt>=1.11.1 in /usr/local/lib/python3.6/dist-packages (from tensorflow>=1.14->joeynmt==0.0.1) (1.11.2)\n","Requirement already satisfied: gast==0.2.2 in /usr/local/lib/python3.6/dist-packages (from tensorflow>=1.14->joeynmt==0.0.1) (0.2.2)\n","Requirement already satisfied: absl-py>=0.7.0 in /usr/local/lib/python3.6/dist-packages (from tensorflow>=1.14->joeynmt==0.0.1) (0.8.1)\n","Requirement already satisfied: astor>=0.6.0 in /usr/local/lib/python3.6/dist-packages (from tensorflow>=1.14->joeynmt==0.0.1) (0.8.0)\n","Requirement already satisfied: tqdm in /usr/local/lib/python3.6/dist-packages (from torchtext->joeynmt==0.0.1) (4.28.1)\n","Requirement already satisfied: requests in /usr/local/lib/python3.6/dist-packages (from torchtext->joeynmt==0.0.1) (2.21.0)\n","Requirement already satisfied: typing in /usr/local/lib/python3.6/dist-packages (from sacrebleu>=1.3.6->joeynmt==0.0.1) (3.6.6)\n","Collecting portalocker\n"," Downloading https://files.pythonhosted.org/packages/91/db/7bc703c0760df726839e0699b7f78a4d8217fdc9c7fcb1b51b39c5a22a4e/portalocker-1.5.2-py2.py3-none-any.whl\n","Requirement already 
satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.1 in /usr/local/lib/python3.6/dist-packages (from matplotlib->joeynmt==0.0.1) (2.4.5)\n","Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.6/dist-packages (from matplotlib->joeynmt==0.0.1) (0.10.0)\n","Requirement already satisfied: python-dateutil>=2.1 in /usr/local/lib/python3.6/dist-packages (from matplotlib->joeynmt==0.0.1) (2.6.1)\n","Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.6/dist-packages (from matplotlib->joeynmt==0.0.1) (1.1.0)\n","Requirement already satisfied: scipy>=0.14.0 in /usr/local/lib/python3.6/dist-packages (from seaborn->joeynmt==0.0.1) (1.3.2)\n","Requirement already satisfied: pandas>=0.15.2 in /usr/local/lib/python3.6/dist-packages (from seaborn->joeynmt==0.0.1) (0.25.3)\n","Collecting mccabe<0.7,>=0.6\n"," Downloading https://files.pythonhosted.org/packages/87/89/479dc97e18549e21354893e4ee4ef36db1d237534982482c3681ee6e7b57/mccabe-0.6.1-py2.py3-none-any.whl\n","Collecting astroid<2.4,>=2.3.0\n","\u001b[?25l Downloading https://files.pythonhosted.org/packages/ad/ae/86734823047962e7b8c8529186a1ac4a7ca19aaf1aa0c7713c022ef593fd/astroid-2.3.3-py3-none-any.whl (205kB)\n","\u001b[K |████████████████████████████████| 215kB 37.1MB/s \n","\u001b[?25hCollecting isort<5,>=4.2.5\n","\u001b[?25l Downloading https://files.pythonhosted.org/packages/e5/b0/c121fd1fa3419ea9bfd55c7f9c4fedfec5143208d8c7ad3ce3db6c623c21/isort-4.3.21-py2.py3-none-any.whl (42kB)\n","\u001b[K |████████████████████████████████| 51kB 6.9MB/s \n","\u001b[?25hRequirement already satisfied: h5py in /usr/local/lib/python3.6/dist-packages (from keras-applications>=1.0.8->tensorflow>=1.14->joeynmt==0.0.1) (2.8.0)\n","Requirement already satisfied: markdown>=2.6.8 in /usr/local/lib/python3.6/dist-packages (from tensorboard<1.16.0,>=1.15.0->tensorflow>=1.14->joeynmt==0.0.1) (3.1.1)\n","Requirement already satisfied: werkzeug>=0.11.15 in /usr/local/lib/python3.6/dist-packages (from 
tensorboard<1.16.0,>=1.15.0->tensorflow>=1.14->joeynmt==0.0.1) (0.16.0)\n","Requirement already satisfied: chardet<3.1.0,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests->torchtext->joeynmt==0.0.1) (3.0.4)\n","Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.6/dist-packages (from requests->torchtext->joeynmt==0.0.1) (2019.9.11)\n","Requirement already satisfied: urllib3<1.25,>=1.21.1 in /usr/local/lib/python3.6/dist-packages (from requests->torchtext->joeynmt==0.0.1) (1.24.3)\n","Requirement already satisfied: idna<2.9,>=2.5 in /usr/local/lib/python3.6/dist-packages (from requests->torchtext->joeynmt==0.0.1) (2.8)\n","Requirement already satisfied: pytz>=2017.2 in /usr/local/lib/python3.6/dist-packages (from pandas>=0.15.2->seaborn->joeynmt==0.0.1) (2018.9)\n","Collecting lazy-object-proxy==1.4.*\n","\u001b[?25l Downloading https://files.pythonhosted.org/packages/0b/dd/b1e3407e9e6913cf178e506cd0dee818e58694d9a5cd1984e3f6a8b9a10f/lazy_object_proxy-1.4.3-cp36-cp36m-manylinux1_x86_64.whl (55kB)\n","\u001b[K |████████████████████████████████| 61kB 8.6MB/s \n","\u001b[?25hCollecting typed-ast<1.5,>=1.4.0; implementation_name == \"cpython\" and python_version < \"3.8\"\n","\u001b[?25l Downloading https://files.pythonhosted.org/packages/31/d3/9d1802c161626d0278bafb1ffb32f76b9d01e123881bbf9d91e8ccf28e18/typed_ast-1.4.0-cp36-cp36m-manylinux1_x86_64.whl (736kB)\n","\u001b[K |████████████████████████████████| 737kB 50.2MB/s \n","\u001b[?25hBuilding wheels for collected packages: joeynmt, pyyaml\n"," Building wheel for joeynmt (setup.py) ... \u001b[?25l\u001b[?25hdone\n"," Created wheel for joeynmt: filename=joeynmt-0.0.1-cp36-none-any.whl size=72136 sha256=4fbaf089ba87f31032bfb25477698f7b9e8d56ceae62c059790beb6d8befb44e\n"," Stored in directory: /tmp/pip-ephem-wheel-cache-xdbxy0qs/wheels/db/01/db/751cc9f3e7f6faec127c43644ba250a3ea7ad200594aeda70a\n"," Building wheel for pyyaml (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n"," Created wheel for pyyaml: filename=PyYAML-5.1.2-cp36-cp36m-linux_x86_64.whl size=44104 sha256=e57993b08cb203e65a92aed43683ae2676172b2a9c40957ff475f8c0803ccc26\n"," Stored in directory: /root/.cache/pip/wheels/d9/45/dd/65f0b38450c47cf7e5312883deb97d065e030c5cca0a365030\n","Successfully built joeynmt pyyaml\n","Installing collected packages: portalocker, sacrebleu, subword-nmt, pyyaml, mccabe, lazy-object-proxy, typed-ast, astroid, isort, pylint, joeynmt\n"," Found existing installation: PyYAML 3.13\n"," Uninstalling PyYAML-3.13:\n"," Successfully uninstalled PyYAML-3.13\n","Successfully installed astroid-2.3.3 isort-4.3.21 joeynmt-0.0.1 lazy-object-proxy-1.4.3 mccabe-0.6.1 portalocker-1.5.2 pylint-2.4.4 pyyaml-5.1.2 sacrebleu-1.4.2 subword-nmt-0.3.7 typed-ast-1.4.0\n"],"name":"stdout"}]},{"cell_type":"markdown","metadata":{"id":"xyAKRE7TJ-yt","colab_type":"text"},"source":["## Mount Google Drive"]},{"cell_type":"code","metadata":{"id":"44Sz_9aV9aDX","colab_type":"code","outputId":"4d19399b-3331-47db-a204-86706b8e938a","executionInfo":{"status":"ok","timestamp":1574946438515,"user_tz":-120,"elapsed":28083,"user":{"displayName":"Ari Ramkilowan","photoUrl":"https://lh3.googleusercontent.com/a-/AAuE7mBOdxjV-rMNcCCRFbbimFCdX4yGotTEbejYVcEc=s64","userId":"04960745627161950732"}},"colab":{"base_uri":"https://localhost:8080/","height":124}},"source":["# If running on Google Colab - mount google drive\n","\n","from google.colab import drive\n","drive.mount('/content/drive')"],"execution_count":0,"outputs":[{"output_type":"stream","text":["Go to this URL in a browser: 
https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly\n","\n","Enter your authorization code:\n","··········\n","Mounted at /content/drive\n"],"name":"stdout"}]},{"cell_type":"code","metadata":{"id":"4xIVqSqWm7HT","colab_type":"code","outputId":"5bb20a3b-6ac6-49ca-baff-e3b7b73136e1","executionInfo":{"status":"ok","timestamp":1575283384304,"user_tz":-120,"elapsed":1328,"user":{"displayName":"Ari Ramkilowan","photoUrl":"https://lh3.googleusercontent.com/a-/AAuE7mBOdxjV-rMNcCCRFbbimFCdX4yGotTEbejYVcEc=s64","userId":"04960745627161950732"}},"colab":{"base_uri":"https://localhost:8080/","height":34}},"source":["import torch\n","\n","device_num = torch.cuda.current_device()\n","torch.cuda.get_device_name(device_num)\n","# torch.cuda.is_available()"],"execution_count":19,"outputs":[{"output_type":"execute_result","data":{"text/plain":["'Tesla T4'"]},"metadata":{"tags":[]},"execution_count":19}]},{"cell_type":"markdown","metadata":{"id":"YkeQhZCh_6Jn","colab_type":"text"},"source":["## Set your source and target languages"]},{"cell_type":"code","metadata":{"id":"tx8BNHwVK1jt","colab_type":"code","colab":{}},"source":["import os\n","import numpy as np\n","import pandas as pd\n","\n","source_language = \"en\"\n","target_language = \"nr\" \n","lc = True # If True, lowercase the data.\n","seed = 42 # Random seed for shuffling.\n","tag = \"baseline\" # Give a unique name to your folder - this is to ensure you don't rewrite any models you've already submitted\n","vocab_size=4000\n","corpus = \"JW300\"\n","\n","os.environ[\"src\"] = source_language # Sets them in bash as well, since we often use bash 
scripts\n","os.environ[\"tgt\"] = target_language\n","os.environ[\"tag\"] = tag\n","os.environ[\"vocab_size\"] = str(vocab_size)\n","os.environ[\"corpus\"] = corpus"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"Lq_NRbE8LJTI","colab_type":"code","outputId":"5c6fc27b-4496-4cd7-a25c-83532c4da31e","executionInfo":{"status":"ok","timestamp":1574946461448,"user_tz":-120,"elapsed":6606,"user":{"displayName":"Ari Ramkilowan","photoUrl":"https://lh3.googleusercontent.com/a-/AAuE7mBOdxjV-rMNcCCRFbbimFCdX4yGotTEbejYVcEc=s64","userId":"04960745627161950732"}},"colab":{"base_uri":"https://localhost:8080/","height":34}},"source":["# This will save it to a folder in our gdrive instead!\n","# !mkdir -p \"/content/drive/My Drive/masakhane/$src-$tgt-$tag\"\n","gdrive_path = f\"/content/drive/My Drive/masakhane/{source_language}-{target_language}-{tag}/\"\n","os.environ[\"gdrive_path\"] = gdrive_path\n","! echo $gdrive_path"],"execution_count":0,"outputs":[{"output_type":"stream","text":["/content/drive/My Drive/masakhane/en-nr-baseline/\n"],"name":"stdout"}]},{"cell_type":"code","metadata":{"id":"T4RrqHaeLNQi","colab_type":"code","outputId":"9cf1334b-8a9b-4923-b4a9-6c3cba9c5b26","executionInfo":{"status":"ok","timestamp":1574946466766,"user_tz":-120,"elapsed":10653,"user":{"displayName":"Ari Ramkilowan","photoUrl":"https://lh3.googleusercontent.com/a-/AAuE7mBOdxjV-rMNcCCRFbbimFCdX4yGotTEbejYVcEc=s64","userId":"04960745627161950732"}},"colab":{"base_uri":"https://localhost:8080/","height":156}},"source":["# create path to joeynmt executables scripts, configs etc\n","\n","joey_path = f\"/content/joeynmt\"\n","os.environ[\"joey_path\"] = joey_path\n","! 
ls $joey_path/configs"],"execution_count":0,"outputs":[{"output_type":"stream","text":["iwslt14_deen_bpe.yaml\t\t transformer_reverse.yaml\n","iwslt_deen_bahdanau.yaml\t transformer_small.yaml\n","iwslt_envi_luong.yaml\t\t transformer_wmt17_ende.yaml\n","iwslt_envi_xnmt.yaml\t\t transformer_wmt17_lven.yaml\n","reverse.yaml\t\t\t wmt_ende_best.yaml\n","small.yaml\t\t\t wmt_ende_default.yaml\n","transformer_copy.yaml\t\t wmt_lven_best.yaml\n","transformer_iwslt14_deen_bpe.yaml wmt_lven_default.yaml\n"],"name":"stdout"}]},{"cell_type":"markdown","metadata":{"id":"TmPMur_UAVM3","colab_type":"text"},"source":["## Download the global test set.\n","\n","***(This changes from time to time, do this just to make sure you have the most recent version)***"]},{"cell_type":"code","metadata":{"id":"kN-HUp87LVGB","colab_type":"code","outputId":"e8914a7e-c9b7-42c5-cc6b-b19abd4caba2","executionInfo":{"status":"ok","timestamp":1574058046727,"user_tz":-120,"elapsed":17791,"user":{"displayName":"Ari Ramkilowan","photoUrl":"https://lh3.googleusercontent.com/a-/AAuE7mBOdxjV-rMNcCCRFbbimFCdX4yGotTEbejYVcEc=s64","userId":"04960745627161950732"}},"colab":{"base_uri":"https://localhost:8080/","height":590}},"source":["! wget https://raw.githubusercontent.com/juliakreutzer/masakhane/master/jw300_utils/test/test.en-any.en\n"," \n","! wget https://raw.githubusercontent.com/juliakreutzer/masakhane/master/jw300_utils/test/test.en-$tgt.en\n","! mv test.en-$tgt.en test.en\n","\n","! wget https://raw.githubusercontent.com/juliakreutzer/masakhane/master/jw300_utils/test/test.en-$tgt.$tgt \n","! mv test.en-$tgt.$tgt test.$tgt"],"execution_count":0,"outputs":[{"output_type":"stream","text":["--2019-11-18 06:20:30-- https://raw.githubusercontent.com/juliakreutzer/masakhane/master/jw300_utils/test/test.en-any.en\n","Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 
151.101.0.133, 151.101.64.133, 151.101.128.133, ...\n","Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.\n","HTTP request sent, awaiting response... 200 OK\n","Length: 277791 (271K) [text/plain]\n","Saving to: ‘test.en-any.en’\n","\n","\rtest.en-any.en 0%[ ] 0 --.-KB/s \rtest.en-any.en 100%[===================>] 271.28K --.-KB/s in 0.02s \n","\n","2019-11-18 06:20:30 (11.6 MB/s) - ‘test.en-any.en’ saved [277791/277791]\n","\n","--2019-11-18 06:20:34-- https://raw.githubusercontent.com/juliakreutzer/masakhane/master/jw300_utils/test/test.en-nr.en\n","Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...\n","Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.\n","HTTP request sent, awaiting response... 200 OK\n","Length: 203073 (198K) [text/plain]\n","Saving to: ‘test.en-nr.en’\n","\n","test.en-nr.en 100%[===================>] 198.31K --.-KB/s in 0.02s \n","\n","2019-11-18 06:20:34 (11.3 MB/s) - ‘test.en-nr.en’ saved [203073/203073]\n","\n","--2019-11-18 06:20:40-- https://raw.githubusercontent.com/juliakreutzer/masakhane/master/jw300_utils/test/test.en-nr.nr\n","Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...\n","Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.\n","HTTP request sent, awaiting response... 
200 OK\n","Length: 214288 (209K) [text/plain]\n","Saving to: ‘test.en-nr.nr’\n","\n","test.en-nr.nr 100%[===================>] 209.27K --.-KB/s in 0.02s \n","\n","2019-11-18 06:20:41 (9.36 MB/s) - ‘test.en-nr.nr’ saved [214288/214288]\n","\n"],"name":"stdout"}]},{"cell_type":"code","metadata":{"id":"1VpW7P7U_tc0","colab_type":"code","colab":{}},"source":[""],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"Ve8qqVviAVPu","colab_type":"code","outputId":"56259906-eff0-4f72-d6e7-52e0ae337f30","executionInfo":{"status":"ok","timestamp":1574058211903,"user_tz":-120,"elapsed":1401,"user":{"displayName":"Ari Ramkilowan","photoUrl":"https://lh3.googleusercontent.com/a-/AAuE7mBOdxjV-rMNcCCRFbbimFCdX4yGotTEbejYVcEc=s64","userId":"04960745627161950732"}},"colab":{"base_uri":"https://localhost:8080/","height":52}},"source":["# Read the test data to filter from train and dev splits.\n","# Store english portion in set for quick filtering checks.\n","en_test_sents = set()\n","filter_test_sents = \"test.en-any.en\"\n","j = 0 # running line index; equals the number of lines read once the loop ends\n","blanks = [] # sometimes blank lines creep into the test set - store which lines these are\n","with open(filter_test_sents) as f:\n","    for line in f:\n","        sentence = line.strip()\n","        en_test_sents.add(sentence)\n","        # a line is blank when nothing is left after stripping whitespace\n","        if not sentence:\n","            blanks.append(j)\n","        j += 1\n","print('Loaded {} global test sentences to filter from the training/dev data.'.format(j))\n","print(f'There are {len(blanks)} blank lines in the test set')"],"execution_count":0,"outputs":[{"output_type":"stream","text":["Loaded 3571 global test sentences to filter from the training/dev data.\n","There are 0 blank lines in the test set\n"],"name":"stdout"}]},{"cell_type":"code","metadata":{"id":"RHh4MOzGAVSJ","colab_type":"code","colab":{}},"source":["# filter test set\n","\n","source_file = f\"test.{source_language}\"\n","target_file = f\"test.{target_language}\"\n","\n","source = []\n","target = []\n","\n","with open(source_file) as f:\n","    source = f.readlines()\n"," \n","with open(target_file) 
as f:\n","    target = f.readlines()\n","\n","df = pd.DataFrame(zip(source, target), columns=['source_sentence', 'target_sentence'])\n","\n","# strip trailing double quotes, spaces and newline chars\n","df['source_sentence'] = df['source_sentence'].str.rstrip('\" \\n')\n","df['target_sentence'] = df['target_sentence'].str.rstrip('\" \\n')\n","\n","# strip leading double quotes\n","df['source_sentence'] = df['source_sentence'].str.lstrip('\"')\n","df['target_sentence'] = df['target_sentence'].str.lstrip('\"')\n","\n","# remove sentence pairs where either side is really short\n","df = df[~(df['source_sentence'].str.len() <8)] # remove rows where source text len <8 characters\n","df = df[~(df['target_sentence'].str.len() <8)] # remove rows where target text len <8 characters\n","\n","# save the filtered test set as plain text, one sentence per line\n","# (to_csv would wrap sentences containing commas in quotes, and with doublequote=False\n","# and no escapechar it raises on a sentence that contains a double quote)\n","with open(source_file, \"w\", encoding=\"utf-8\") as src_out, open(target_file, \"w\", encoding=\"utf-8\") as tgt_out:\n","    for src_sent, tgt_sent in zip(df['source_sentence'], df['target_sentence']):\n","        src_out.write(src_sent + \"\\n\")\n","        tgt_out.write(tgt_sent + \"\\n\")"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"lRclKYsmAVUe","colab_type":"code","colab":{}},"source":["# copy test sets to gdrive\n","! cp test.$src \"$gdrive_path\"\n","! cp test.$tgt \"$gdrive_path\"\n","! cp test.$src-any.$src \"$gdrive_path\""],"execution_count":0,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"TJAfPZfKMGz-","colab_type":"text"},"source":["## Import prepared dataset"]},{"cell_type":"code","metadata":{"id":"Iykgv6nTAVXB","colab_type":"code","colab":{}},"source":["import pandas as pd\n","from IPython.core.interactiveshell import InteractiveShell\n","InteractiveShell.ast_node_interactivity = \"all\""],"execution_count":0,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"AXmXE_yGMTTb","colab_type":"text"},"source":[""]},{"cell_type":"code","metadata":{"id":"WgPhV9EkMSgf","colab_type":"code","colab":{}},"source":["# This csv has extra columns added but no preprocessing done. 
all preprocessing should be captured in the NMT modelling notebook\n","\n","input_file = f\"{gdrive_path}/{source_language}-{target_language}-{corpus}-new.csv\"\n","df = pd.read_csv(input_file)"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"a-rKljkHMSpT","colab_type":"code","outputId":"c2e0f1ff-4ffe-4697-83c0-1eed9dd78f05","executionInfo":{"status":"ok","timestamp":1574058308015,"user_tz":-120,"elapsed":1204,"user":{"displayName":"Ari Ramkilowan","photoUrl":"https://lh3.googleusercontent.com/a-/AAuE7mBOdxjV-rMNcCCRFbbimFCdX4yGotTEbejYVcEc=s64","userId":"04960745627161950732"}},"colab":{"base_uri":"https://localhost:8080/","height":206}},"source":["df.head()"],"execution_count":0,"outputs":[{"output_type":"execute_result","data":{"text/html":["
\n","\n","\n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n","
source_sentencetarget_sentence
0Table of ContentsOkungaphakathi
1July 1 , 2010Arhostosi 1 , 2010
2Do You Know God by Name ?Umazi Kuhle na UZimu ?
3FROM OUR COVEREZIKHAMBISANA NESIHLOKO ESINGAPHANDLE
43 Can You Know God by Name ?3 Ungamazi UZimu Ngebizo ?
\n","
"],"text/plain":[" source_sentence target_sentence\n","0 Table of Contents Okungaphakathi\n","1 July 1 , 2010 Arhostosi 1 , 2010\n","2 Do You Know God by Name ? Umazi Kuhle na UZimu ?\n","3 FROM OUR COVER EZIKHAMBISANA NESIHLOKO ESINGAPHANDLE\n","4 3 Can You Know God by Name ? 3 Ungamazi UZimu Ngebizo ?"]},"metadata":{"tags":[]},"execution_count":31}]},{"cell_type":"code","metadata":{"id":"Xw9yvWqRMSs_","colab_type":"code","outputId":"c308f1af-83b4-438a-b9a9-56c1d9e131e7","executionInfo":{"status":"ok","timestamp":1574058310480,"user_tz":-120,"elapsed":1702,"user":{"displayName":"Ari Ramkilowan","photoUrl":"https://lh3.googleusercontent.com/a-/AAuE7mBOdxjV-rMNcCCRFbbimFCdX4yGotTEbejYVcEc=s64","userId":"04960745627161950732"}},"colab":{"base_uri":"https://localhost:8080/","height":52}},"source":["# How many samples\n","size = len(df)\n","print(f\"\\n {size} samples in original text\")\n"," "],"execution_count":0,"outputs":[{"output_type":"stream","text":["\n"," 103983 samples in original text\n"],"name":"stdout"}]},{"cell_type":"markdown","metadata":{"id":"zxBcbWrhMSwy","colab_type":"text"},"source":["## Preprocess input data"]},{"cell_type":"code","metadata":{"id":"96yovwFjNfLm","colab_type":"code","outputId":"d730284a-5c05-490b-fac4-f9eab26f2982","executionInfo":{"status":"ok","timestamp":1574058404506,"user_tz":-120,"elapsed":1533,"user":{"displayName":"Ari Ramkilowan","photoUrl":"https://lh3.googleusercontent.com/a-/AAuE7mBOdxjV-rMNcCCRFbbimFCdX4yGotTEbejYVcEc=s64","userId":"04960745627161950732"}},"colab":{"base_uri":"https://localhost:8080/","height":173}},"source":["## Preprocessing - Step 1 : Drop NaNs\n","\n","df_pp = df.dropna()\n","df_pp.info(memory_usage='deep')\n","new_size = len(df_pp)\n","print(f\"\\n {size-new_size}({100*(size-new_size)/size :.2f} %) samples removed by dropping all NaNs\")\n","size = new_size"],"execution_count":0,"outputs":[{"output_type":"stream","text":["\n","Int64Index: 102127 entries, 0 to 103982\n","Data columns (total 2 
columns):\n","source_sentence 102127 non-null object\n","target_sentence 102127 non-null object\n","dtypes: object(2)\n","memory usage: 36.4 MB\n","\n"," 1856(1.78 %) samples removed by dropping all NaNs\n"],"name":"stdout"}]},{"cell_type":"code","metadata":{"id":"cMlFDy5dNk1k","colab_type":"code","outputId":"1cd269c3-0b1a-4149-8098-061c0a28ce21","executionInfo":{"status":"ok","timestamp":1574058407125,"user_tz":-120,"elapsed":1454,"user":{"displayName":"Ari Ramkilowan","photoUrl":"https://lh3.googleusercontent.com/a-/AAuE7mBOdxjV-rMNcCCRFbbimFCdX4yGotTEbejYVcEc=s64","userId":"04960745627161950732"}},"colab":{"base_uri":"https://localhost:8080/","height":173}},"source":["## Preprocessing - Step 2a : Drop all duplicates in Source (en) text\n","\n","df_pp = df_pp.drop_duplicates(subset='source_sentence')\n","df_pp.info(memory_usage='deep')\n","new_size = len(df_pp)\n","print(f\"\\n {size-new_size}({100*(size-new_size)/size :.2f} %) samples removed by dropping Source sentence duplicates\")\n","size = new_size"],"execution_count":0,"outputs":[{"output_type":"stream","text":["\n","Int64Index: 95792 entries, 0 to 103982\n","Data columns (total 2 columns):\n","source_sentence 95792 non-null object\n","target_sentence 95792 non-null object\n","dtypes: object(2)\n","memory usage: 39.2 MB\n","\n"," 6335(6.20 %) samples removed by dropping Source sentence duplicates\n"],"name":"stdout"}]},{"cell_type":"code","metadata":{"id":"2UqpPbb-Nk6n","colab_type":"code","outputId":"c24b9e76-6a28-4de0-b43b-864c5ca5d367","executionInfo":{"status":"ok","timestamp":1574058411070,"user_tz":-120,"elapsed":1619,"user":{"displayName":"Ari Ramkilowan","photoUrl":"https://lh3.googleusercontent.com/a-/AAuE7mBOdxjV-rMNcCCRFbbimFCdX4yGotTEbejYVcEc=s64","userId":"04960745627161950732"}},"colab":{"base_uri":"https://localhost:8080/","height":173}},"source":["## Preprocessing - Step 2b : Drop all duplicates in Target (nr) text\n","\n","df_pp = 
df_pp.drop_duplicates(subset='target_sentence')\n","df_pp.info(memory_usage='deep')\n","new_size = len(df_pp)\n","print(f\"\\n {size-new_size}({100*(size-new_size)/size :.2f} %) samples removed by dropping Target sentence duplicates\")\n","size = new_size"],"execution_count":0,"outputs":[{"output_type":"stream","text":["\n","Int64Index: 95382 entries, 0 to 103982\n","Data columns (total 2 columns):\n","source_sentence 95382 non-null object\n","target_sentence 95382 non-null object\n","dtypes: object(2)\n","memory usage: 41.9 MB\n","\n"," 410(0.43 %) samples removed by dropping Target sentence duplicates\n"],"name":"stdout"}]},{"cell_type":"code","metadata":{"id":"HuQT7HJoNlHW","colab_type":"code","colab":{}},"source":["## Preprocessing - Step 3 : Remove all numeric entries\n","\n","import re\n","\n","# catch integers and decimals; at least one digit is required so that a lone '.'\n","# (i.e. ordinary sentence punctuation) and the empty string are never matched\n","pattern = r\"[0-9]+(?:\\.[0-9]+)?\"\n","r = re.compile(pattern)"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"Mkn3wJkKNlLM","colab_type":"code","outputId":"722f8ccd-680b-44aa-ae1d-e02ca6a14d73","executionInfo":{"status":"ok","timestamp":1574058417001,"user_tz":-120,"elapsed":3803,"user":{"displayName":"Ari Ramkilowan","photoUrl":"https://lh3.googleusercontent.com/a-/AAuE7mBOdxjV-rMNcCCRFbbimFCdX4yGotTEbejYVcEc=s64","userId":"04960745627161950732"}},"colab":{"base_uri":"https://localhost:8080/","height":208}},"source":["%%time\n","## Preprocessing - Step 3a : Remove all numeric entries - Source text\n","\n","# regex=True is passed explicitly: newer pandas releases default to literal (non-regex) matching\n","df_pp['source_sentence'] = df_pp['source_sentence'].str.replace(pattern,\"\", regex=True)\n","# sentences that consisted only of a number are now empty - mark them as NaN so dropna removes them\n","df_pp['source_sentence'] = df_pp['source_sentence'].replace(\"\",np.nan)\n","\n","df_pp = df_pp.dropna()\n","df_pp.info(memory_usage='deep')\n","new_size = len(df_pp)\n","\n","print(f\"\\n {size-new_size}({100*(size-new_size)/size :.2f} %) samples removed by dropping nummeric entries from source text\")\n","size = new_size"],"execution_count":0,"outputs":[{"output_type":"stream","text":["\n","Int64Index: 95343 entries, 0 to 
103982\n","Data columns (total 2 columns):\n","source_sentence 95343 non-null object\n","target_sentence 95343 non-null object\n","dtypes: object(2)\n","memory usage: 37.8 MB\n","\n"," 39(0.04 %) samples removed by dropping nummeric entries from source text\n","CPU times: user 1.68 s, sys: 10.2 ms, total: 1.69 s\n","Wall time: 1.7 s\n"],"name":"stdout"}]},{"cell_type":"code","metadata":{"id":"8M5qLc8sNlD5","colab_type":"code","outputId":"20274762-6101-427d-f9af-bb885e6b2234","executionInfo":{"status":"ok","timestamp":1574058417536,"user_tz":-120,"elapsed":3351,"user":{"displayName":"Ari Ramkilowan","photoUrl":"https://lh3.googleusercontent.com/a-/AAuE7mBOdxjV-rMNcCCRFbbimFCdX4yGotTEbejYVcEc=s64","userId":"04960745627161950732"}},"colab":{"base_uri":"https://localhost:8080/","height":208}},"source":["%%time\n","## Preprocessing - Step 3b : Remove all numeric entries - Target text\n","\n","# r is a compiled pattern: regex=True must be explicit, because newer pandas releases\n","# default to regex=False and reject a compiled pattern in that mode\n","df_pp['target_sentence'] = df_pp['target_sentence'].str.replace(r,\"\", regex=True)\n","# sentences that consisted only of a number are now empty - mark them as NaN so dropna removes them\n","df_pp['target_sentence'] = df_pp['target_sentence'].replace(\"\",np.nan)\n","\n","df_pp = df_pp.dropna()\n","df_pp.info(memory_usage='deep')\n","new_size = len(df_pp)\n","\n","print(f\"\\n {size-new_size}({100*(size-new_size)/size :.2f} %) samples removed by dropping nummeric entries from target text\")\n","size = new_size"],"execution_count":0,"outputs":[{"output_type":"stream","text":["\n","Int64Index: 95343 entries, 0 to 103982\n","Data columns (total 2 columns):\n","source_sentence 95343 non-null object\n","target_sentence 95343 non-null object\n","dtypes: object(2)\n","memory usage: 34.8 MB\n","\n"," 0(0.00 %) samples removed by dropping nummeric entries from target text\n","CPU times: user 1.73 s, sys: 20.6 ms, total: 1.75 s\n","Wall time: 1.77 s\n"],"name":"stdout"}]},{"cell_type":"markdown","metadata":{"id":"cboxEtSJNlBq","colab_type":"text"},"source":["#### Preprocessing - Step 4 :Get length of sentences and then drop really short 
sentences"]},{"cell_type":"code","metadata":{"id":"1ninZmS4Nk_D","colab_type":"code","outputId":"6c816bff-1f88-4814-c1f9-c2fe6f5ad82a","executionInfo":{"status":"ok","timestamp":1574058432770,"user_tz":-120,"elapsed":1707,"user":{"displayName":"Ari Ramkilowan","photoUrl":"https://lh3.googleusercontent.com/a-/AAuE7mBOdxjV-rMNcCCRFbbimFCdX4yGotTEbejYVcEc=s64","userId":"04960745627161950732"}},"colab":{"base_uri":"https://localhost:8080/","height":243}},"source":["%%time\n","# add length columns\n","\n","\n","df_pp['source_ch_len'] = df_pp['source_sentence'].str.len()\n","df_pp['source_w_len'] = [len(text.split()) for text in df_pp['source_sentence']] \n","df_pp['target_ch_len'] = df_pp['target_sentence'].str.len()\n","df_pp['target_w_len'] = [len(text.split()) for text in df_pp['target_sentence']] \n","df_pp.info(memory_usage='deep')"],"execution_count":0,"outputs":[{"output_type":"stream","text":["\n","Int64Index: 95343 entries, 0 to 103982\n","Data columns (total 6 columns):\n","source_sentence 95343 non-null object\n","target_sentence 95343 non-null object\n","source_ch_len 95343 non-null int64\n","source_w_len 95343 non-null int64\n","target_ch_len 95343 non-null int64\n","target_w_len 95343 non-null int64\n","dtypes: int64(4), object(2)\n","memory usage: 37.7 MB\n","CPU times: user 477 ms, sys: 0 ns, total: 477 ms\n","Wall time: 477 ms\n"],"name":"stdout"}]},{"cell_type":"code","metadata":{"id":"qnBEuswUOv-r","colab_type":"code","colab":{}},"source":["# # character len distrn - source text - \n","# df_pp['source_ch_len'].value_counts().sort_index()"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"YPtXT_8AOwHt","colab_type":"code","colab":{}},"source":["# # character len distrn - target text\n","# df_pp['target_ch_len'].value_counts().sort_index()"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"pgC7fDk2OwSg","colab_type":"code","colab":{}},"source":["## how many rows with source text <=2chars and what do they 
## Preprocessing - Step 4a : keep only rows whose source text is longer than 8 characters

# Boolean mask of the rows to discard (source_ch_len of 8 or fewer characters).
source_too_short = df_pp['source_ch_len'].le(8)
df_pp = df_pp.loc[~source_too_short]

df_pp.info(memory_usage='deep')
new_size = len(df_pp)
print(f"\n {size-new_size}({100*(size-new_size)/size :.2f} %) samples removed by dropping rows with source sentences <= 8 characters")
size = new_size
sentences <= 2 characters\n"],"name":"stdout"}]},{"cell_type":"code","metadata":{"id":"ziOVG-qhO9aN","colab_type":"code","outputId":"e1e48bfc-c40a-413e-ede0-d41f2934be01","executionInfo":{"status":"ok","timestamp":1574058463704,"user_tz":-120,"elapsed":1371,"user":{"displayName":"Ari Ramkilowan","photoUrl":"https://lh3.googleusercontent.com/a-/AAuE7mBOdxjV-rMNcCCRFbbimFCdX4yGotTEbejYVcEc=s64","userId":"04960745627161950732"}},"colab":{"base_uri":"https://localhost:8080/","height":243}},"source":["## Preprocessing - Step 4b : drop everything where the ch_len <=8 in target text\n","\n","df_pp = df_pp[~(df_pp['target_ch_len'] <=8) ]\n","\n","df_pp.info(memory_usage='deep')\n","new_size = len(df_pp)\n","print(f\"\\n {size-new_size}({100*(size-new_size)/size :.2f} %) samples removed by dropping rows with target sentences <= 8 characters\")\n","size = new_size"],"execution_count":0,"outputs":[{"output_type":"stream","text":["\n","Int64Index: 93557 entries, 0 to 103982\n","Data columns (total 6 columns):\n","source_sentence 93557 non-null object\n","target_sentence 93557 non-null object\n","source_ch_len 93557 non-null int64\n","source_w_len 93557 non-null int64\n","target_ch_len 93557 non-null int64\n","target_w_len 93557 non-null int64\n","dtypes: int64(4), object(2)\n","memory usage: 37.4 MB\n","\n"," 133(0.14 %) samples removed by dropping rows with target sentences <= 8 characters\n"],"name":"stdout"}]},{"cell_type":"code","metadata":{"id":"6uAVeYx6O9kb","colab_type":"code","outputId":"c88857e3-e043-484d-fead-7fe70bfb5afe","executionInfo":{"status":"ok","timestamp":1574058476192,"user_tz":-120,"elapsed":1306,"user":{"displayName":"Ari Ramkilowan","photoUrl":"https://lh3.googleusercontent.com/a-/AAuE7mBOdxjV-rMNcCCRFbbimFCdX4yGotTEbejYVcEc=s64","userId":"04960745627161950732"}},"colab":{"base_uri":"https://localhost:8080/","height":278}},"source":["%%time\n","## Preprocessing - Step 5 : remove text from test set\n","\n","with open(f\"{gdrive_path}/test.en-any.en\") as 
f:\n"," rows = f.readlines()\n","test_set_en = [row.strip() for row in rows]\n","\n","\n","df_pp = df_pp[~df_pp['source_sentence'].str.strip().isin(test_set_en)]\n","\n","df_pp.info(memory_usage='deep')\n","new_size = len(df_pp)\n","print(f\"\\n {size-new_size}({100*(size-new_size)/size :.2f} %) samples removed by dropping rows from test set\")\n","size = new_size"],"execution_count":0,"outputs":[{"output_type":"stream","text":["\n","Int64Index: 92508 entries, 2 to 103982\n","Data columns (total 6 columns):\n","source_sentence 92508 non-null object\n","target_sentence 92508 non-null object\n","source_ch_len 92508 non-null int64\n","source_w_len 92508 non-null int64\n","target_ch_len 92508 non-null int64\n","target_w_len 92508 non-null int64\n","dtypes: int64(4), object(2)\n","memory usage: 37.1 MB\n","\n"," 1049(1.12 %) samples removed by dropping rows from test set\n","CPU times: user 212 ms, sys: 5.45 ms, total: 218 ms\n","Wall time: 224 ms\n"],"name":"stdout"}]},{"cell_type":"code","metadata":{"id":"qM6SeYWBO9iU","colab_type":"code","outputId":"2a8aeb2f-3764-4c14-83bb-38bcb8e93e31","executionInfo":{"status":"ok","timestamp":1574058483663,"user_tz":-120,"elapsed":1330,"user":{"displayName":"Ari Ramkilowan","photoUrl":"https://lh3.googleusercontent.com/a-/AAuE7mBOdxjV-rMNcCCRFbbimFCdX4yGotTEbejYVcEc=s64","userId":"04960745627161950732"}},"colab":{"base_uri":"https://localhost:8080/","height":278}},"source":["%%time\n","## Preprocessing - Step 6 : remove the extra \"\n","df_pp['source_sentence'] = df_pp['source_sentence'].map(lambda x: x.lstrip('\"').rstrip('\"'))\n","df_pp['target_sentence'] = df_pp['target_sentence'].map(lambda x: x.lstrip('\"').rstrip('\"'))\n","\n","\n","df_pp.info(memory_usage='deep')\n","new_size = len(df_pp)\n","print(f\"\\n {size-new_size}({100*(size-new_size)/size :.2f} %) samples removed by dropping rows with extra quotes\")\n","size = new_size"],"execution_count":0,"outputs":[{"output_type":"stream","text":["\n","Int64Index: 92508 
# Keep only the two text columns and shuffle the rows (fixed seed, fresh
# 0..n-1 index) to remove bias in dev set selection.
seed = 42
df_dev = (
    df_pp.loc[:, ['source_sentence', 'target_sentence']]
    .sample(frac=1, random_state=seed)
    .reset_index(drop=True)
)
df_dev.info()
Ramkilowan","photoUrl":"https://lh3.googleusercontent.com/a-/AAuE7mBOdxjV-rMNcCCRFbbimFCdX4yGotTEbejYVcEc=s64","userId":"04960745627161950732"}},"colab":{"base_uri":"https://localhost:8080/","height":52}},"source":["%%time\n","# This section does the split between train/dev for the parallel corpora then saves them as separate files\n","# We use 1000 dev test and the given test set.\n","\n","# Do the split between dev/train and create parallel corpora\n","num_dev_patterns = 1000\n","\n","# Optional: lower case the corpora - this will make it easier to generalize, but without proper casing.\n","if lc: # Julia: making lowercasing optional\n"," df_dev[\"source_sentence\"] = df_dev[\"source_sentence\"].str.lower()\n"," df_dev[\"target_sentence\"] = df_dev[\"target_sentence\"].str.lower()\n","\n","# Julia: test sets are already generated\n","dev = df_dev.tail(num_dev_patterns) # Herman: Error in original\n","stripped = df_dev.drop(df_dev.tail(num_dev_patterns).index)\n","\n","with open(f\"{gdrive_path}/train.\"+source_language, \"w\") as src_file, open(f\"{gdrive_path}/train.\"+target_language, \"w\") as tgt_file:\n"," for index, row in stripped.iterrows():\n"," src_file.write(row[\"source_sentence\"]+\"\\n\")\n"," tgt_file.write(row[\"target_sentence\"]+\"\\n\")\n"," \n","with open(f\"{gdrive_path}/dev.\"+source_language, \"w\") as src_file, open(f\"{gdrive_path}/dev.\"+target_language, \"w\") as tgt_file:\n"," for index, row in dev.iterrows():\n"," src_file.write(row[\"source_sentence\"]+\"\\n\")\n"," tgt_file.write(row[\"target_sentence\"]+\"\\n\")"],"execution_count":0,"outputs":[{"output_type":"stream","text":["CPU times: user 14.1 s, sys: 29 ms, total: 14.1 s\n","Wall time: 14.2 s\n"],"name":"stdout"}]},{"cell_type":"code","metadata":{"id":"rkh4P5tpPS8y","colab_type":"code","outputId":"4ede3fd6-a121-446c-efcd-8d1b28913b4a","executionInfo":{"status":"ok","timestamp":1574058542654,"user_tz":-120,"elapsed":20656,"user":{"displayName":"Ari 
# Doublecheck the format below. There should be no extra quotation marks or weird characters.
# Shows the first lines (head's default count) of the source-language train,
# dev and test files, separated by a ruler line so the three are easy to compare.
! head "$gdrive_path/train.$src"
! echo "================================="
! head "$gdrive_path/dev.$src"
! echo "================================="
! head "$gdrive_path/test.$src"
# Same sanity check for the target language: first lines of the train, dev and
# test files, separated by a ruler line.
! head "$gdrive_path/train.$tgt"
! echo "================================="
! head "$gdrive_path/dev.$tgt"
! echo "================================="
! head "$gdrive_path/test.$tgt"
- It was also shown that by optimizing the number of BPE codes we significantly improve results for low-resource languages [(Sennrich, 2019)](https://www.aclweb.org/anthology/P19-1021) [(Martinus, 2019)](https://arxiv.org/abs/1906.05685)
%%time
# Learn one joint BPE model from the source and target training files.
# `-s` takes the BPE size from $vocab_size, `-o` is the codes file, and
# `--write-vocabulary` writes one vocabulary file per language.
# NOTE(review): paths here are built as "$gdrive_path"<file> with no "/" in
# between, so $gdrive_path presumably ends with a slash - confirm.
! subword-nmt learn-joint-bpe-and-vocab --input "$gdrive_path"train.$src "$gdrive_path"train.$tgt -s $vocab_size -o "$gdrive_path"bpe.codes.$vocab_size --write-vocabulary "$gdrive_path"vocab.$src "$gdrive_path"vocab.$tgt

# Apply BPE splits to the train, development and test data.
# Each command reads <split>.<lang> on stdin and writes <split>.bpe.<lang>,
# using the shared codes file and that language's vocabulary.
! subword-nmt apply-bpe -c "$gdrive_path"bpe.codes.$vocab_size --vocabulary "$gdrive_path"vocab.$src < "$gdrive_path"train.$src > "$gdrive_path"train.bpe.$src
! subword-nmt apply-bpe -c "$gdrive_path"bpe.codes.$vocab_size --vocabulary "$gdrive_path"vocab.$tgt < "$gdrive_path"train.$tgt > "$gdrive_path"train.bpe.$tgt

! subword-nmt apply-bpe -c "$gdrive_path"bpe.codes.$vocab_size --vocabulary "$gdrive_path"vocab.$src < "$gdrive_path"dev.$src > "$gdrive_path"dev.bpe.$src
! subword-nmt apply-bpe -c "$gdrive_path"bpe.codes.$vocab_size --vocabulary "$gdrive_path"vocab.$tgt < "$gdrive_path"dev.$tgt > "$gdrive_path"dev.bpe.$tgt

! subword-nmt apply-bpe -c "$gdrive_path"bpe.codes.$vocab_size --vocabulary "$gdrive_path"vocab.$src < "$gdrive_path"test.$src > "$gdrive_path"test.bpe.$src
! subword-nmt apply-bpe -c "$gdrive_path"bpe.codes.$vocab_size --vocabulary "$gdrive_path"vocab.$tgt < "$gdrive_path"test.$tgt > "$gdrive_path"test.bpe.$tgt
joeynmt/scripts/build_vocab.py \"$gdrive_path\"train.bpe.\"$src\" \"$gdrive_path\"train.bpe.\"$tgt\" --output_path \"$gdrive_path\"vocab.txt"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"G8VNWCsPPTcX","colab_type":"code","outputId":"b231fafd-6f28-43d0-8ca9-cd5f0a86249d","executionInfo":{"status":"ok","timestamp":1574059276785,"user_tz":-120,"elapsed":17202,"user":{"displayName":"Ari Ramkilowan","photoUrl":"https://lh3.googleusercontent.com/a-/AAuE7mBOdxjV-rMNcCCRFbbimFCdX4yGotTEbejYVcEc=s64","userId":"04960745627161950732"}},"colab":{"base_uri":"https://localhost:8080/","height":312}},"source":["# Some output\n","! echo \"BPE Ndebele Sentences\"\n","! tail -n 5 \"$gdrive_path\"test.bpe.$tgt\n","\n","! echo \"Combined BPE Vocab\"\n","! tail -n 10 \"$gdrive_path\"vocab.txt # Herman\n"],"execution_count":0,"outputs":[{"output_type":"stream","text":["BPE Ndebele Sentences\n","L@@ okho kwenza bona ngaz@@ iwe njengom@@ untu ong@@ akath@@ embek@@ i .\n","\"@@ K@@ wathi bona ng@@ ifunde iqiniso , akhenge ngis@@ av@@ uma uku@@ r@@ aga n@@ ent@@ wel@@ ey@@ o , nanyana umb@@ ereg@@ o lo bew@@ ub@@ had@@ ela kangaka .@@ \"\n","\"@@ N@@ g@@ isibonelo esihle eb@@ as@@ any@@ an@@ eni bami ababili , begodu ng@@ ikgh@@ ona noku@@ b@@ ereg@@ iswa ebandleni .@@ \"\n","E@@ b@@ ant@@ wini abab@@ had@@ el@@ isa umth@@ elo nak@@ il@@ abo eng@@ ib@@ ereg@@ isana nabo ngaz@@ iwa njengom@@ untu oth@@ embekileko . ”\n","U@@ R@@ ute waf@@ ud@@ ukela kw@@ a - I@@ s@@ rayeli lapho ebek@@ azoku@@ kgh@@ ona uku@@ lotjha khona u@@ Z@@ imu weqiniso .\n","Combined BPE Vocab\n","wha@@\n","prophec@@\n","espe@@\n","ething\n","nex@@\n","haps\n","probl@@\n","ʺ\n","uld\n","+\n"],"name":"stdout"}]},{"cell_type":"markdown","metadata":{"id":"I6o0iIaLPThr","colab_type":"text"},"source":["## Creating the JoeyNMT Config\n","\n","JoeyNMT requires a yaml config. We provide a template below. 
- Evaluation metrics (BLEU versus ChrF)
It might seem overwhelming so we've provided a couple of useful parameters you'll need to update\n","# (You can of course play with all the parameters if you'd like!)\n","\n","name = '%s%s%s%s' % (source_language, target_language, str(vocab_size),tag)\n","gdrive_path = os.environ[\"gdrive_path\"]\n","\n","# Create the config\n","config = \"\"\"\n","name: \"{name}_transformer\"\n","\n","data:\n"," src: \"{source_language}\"\n"," trg: \"{target_language}\"\n"," train: \"{gdrive_path}train.bpe\"\n"," dev: \"{gdrive_path}dev.bpe\"\n"," test: \"{gdrive_path}test.bpe\"\n"," level: \"bpe\"\n"," lowercase: False\n"," max_sent_length: 100\n"," src_vocab: \"{gdrive_path}vocab.txt\"\n"," trg_vocab: \"{gdrive_path}vocab.txt\"\n","\n","testing:\n"," beam_size: 5\n"," alpha: 1.0\n","\n","training:\n"," load_model: \"{gdrive_path}pretrained/{name}/46000.ckpt\"\n"," random_seed: 42\n"," optimizer: \"adam\"\n"," normalization: \"tokens\"\n"," adam_betas: [0.9, 0.999] \n"," scheduling: \"plateau\" # TODO: try switching from plateau to Noam scheduling\n"," patience: 5 # For plateau: decrease learning rate by decrease_factor if validation score has not improved for this many validation rounds.\n"," learning_rate_factor: 0.5 # factor for Noam scheduler (used with Transformer)\n"," learning_rate_warmup: 1000 # warmup steps for Noam scheduler (used with Transformer)\n"," decrease_factor: 0.7\n"," loss: \"crossentropy\"\n"," learning_rate: 0.0003\n"," learning_rate_min: 0.00000001\n"," weight_decay: 0.0\n"," label_smoothing: 0.1\n"," batch_size: 4096\n"," batch_type: \"token\"\n"," eval_batch_size: 3600\n"," eval_batch_type: \"token\"\n"," batch_multiplier: 1\n"," early_stopping_metric: \"ppl\"\n"," epochs: 30 # TODO: Decrease for when playing around and checking of working. 
Around 30 is sufficient to check if its working at all\n"," validation_freq: 1000 # TODO: Set to at least once per epoch.\n"," logging_freq: 100\n"," eval_metric: \"bleu\"\n"," model_dir: \"models/{name}_transformer/{name}\"\n"," overwrite: True # TODO: Set to True if you want to overwrite possibly existing models. \n"," shuffle: True\n"," use_cuda: True\n"," max_output_length: 100\n"," print_valid_sents: [0, 1, 2, 3]\n"," keep_last_ckpts: 3\n","\n","model:\n"," initializer: \"xavier\"\n"," bias_initializer: \"zeros\"\n"," init_gain: 1.0\n"," embed_initializer: \"xavier\"\n"," embed_init_gain: 1.0\n"," tied_embeddings: True\n"," tied_softmax: True\n"," encoder:\n"," type: \"transformer\"\n"," num_layers: 6\n"," num_heads: 4 # TODO: Increase to 8 for larger data.\n"," embeddings:\n"," embedding_dim: 256 # TODO: Increase to 512 for larger data.\n"," scale: True\n"," dropout: 0.3\n"," # typically ff_size = 4 x hidden_size\n"," hidden_size: 256 # TODO: Increase to 512 for larger data.\n"," ff_size: 1024 # TODO: Increase to 2048 for larger data.\n"," dropout: 0.4\n"," decoder:\n"," type: \"transformer\"\n"," num_layers: 6\n"," num_heads: 8 # TODO: Increase to 8 for larger data.\n"," embeddings:\n"," embedding_dim: 256 # TODO: Increase to 512 for larger data.\n"," scale: True\n"," dropout: 0.3\n"," # typically ff_size = 4 x hidden_size\n"," hidden_size: 256 # TODO: Increase to 512 for larger data.\n"," ff_size: 1024 # TODO: Increase to 2048 for larger data.\n"," dropout: 0.4\n","\"\"\".format(name=name,\n"," gdrive_path=os.environ[\"gdrive_path\"],\n"," source_language=source_language,\n"," target_language=target_language\n"," )\n","\n","with open(\"joeynmt/configs/transformer_{name}.yaml\".format(name=name),'w') as f:\n"," f.write(config)"],"execution_count":0,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"6D3d3xbNPTVK","colab_type":"text"},"source":["## Train the 
Model"]},{"cell_type":"code","metadata":{"id":"hVrBHa_bPTOD","colab_type":"code","outputId":"9e1b4fbb-2e70-4dbe-dc87-de8116bdd168","executionInfo":{"status":"ok","timestamp":1574947771256,"user_tz":-120,"elapsed":7567,"user":{"displayName":"Ari Ramkilowan","photoUrl":"https://lh3.googleusercontent.com/a-/AAuE7mBOdxjV-rMNcCCRFbbimFCdX4yGotTEbejYVcEc=s64","userId":"04960745627161950732"}},"colab":{"base_uri":"https://localhost:8080/","height":173}},"source":["! cd joeynmt/configs; ls\n","# copy config to gdrive\n","! cp joeynmt/configs/transformer_$src$tgt$vocab_size$tag.yaml \"$gdrive_path/pretrained/$src$tgt$vocab_size$tag/\""],"execution_count":0,"outputs":[{"output_type":"stream","text":["iwslt14_deen_bpe.yaml\t\t transformer_reverse.yaml\n","iwslt_deen_bahdanau.yaml\t transformer_small.yaml\n","iwslt_envi_luong.yaml\t\t transformer_wmt17_ende.yaml\n","iwslt_envi_xnmt.yaml\t\t transformer_wmt17_lven.yaml\n","reverse.yaml\t\t\t wmt_ende_best.yaml\n","small.yaml\t\t\t wmt_ende_default.yaml\n","transformer_copy.yaml\t\t wmt_lven_best.yaml\n","transformer_ennr4000baseline.yaml wmt_lven_default.yaml\n","transformer_iwslt14_deen_bpe.yaml\n"],"name":"stdout"}]},{"cell_type":"code","metadata":{"id":"YJU6gERiPTG9","colab_type":"code","outputId":"b0c1178a-b8e0-410c-8984-abf7bd8f3961","executionInfo":{"status":"ok","timestamp":1574946918467,"user_tz":-120,"elapsed":303915,"user":{"displayName":"Ari Ramkilowan","photoUrl":"https://lh3.googleusercontent.com/a-/AAuE7mBOdxjV-rMNcCCRFbbimFCdX4yGotTEbejYVcEc=s64","userId":"04960745627161950732"}},"colab":{"base_uri":"https://localhost:8080/","height":1000}},"source":["%%time\n","# Train the model\n","# You can press Ctrl-C to stop. And then run the next cell to save your checkpoints! \n","! cd joeynmt; python3 -m joeynmt train configs/transformer_$src$tgt$vocab_size$tag.yaml"],"execution_count":0,"outputs":[{"output_type":"stream","text":["2019-11-28 13:10:22,179 Hello! 
This is Joey-NMT.\n","2019-11-28 13:10:23,931 Total params: 12118528\n","2019-11-28 13:10:23,933 Trainable parameters: ['decoder.layer_norm.bias', 'decoder.layer_norm.weight', 'decoder.layers.0.dec_layer_norm.bias', 'decoder.layers.0.dec_layer_norm.weight', 'decoder.layers.0.feed_forward.layer_norm.bias', 'decoder.layers.0.feed_forward.layer_norm.weight', 'decoder.layers.0.feed_forward.pwff_layer.0.bias', 'decoder.layers.0.feed_forward.pwff_layer.0.weight', 'decoder.layers.0.feed_forward.pwff_layer.3.bias', 'decoder.layers.0.feed_forward.pwff_layer.3.weight', 'decoder.layers.0.src_trg_att.k_layer.bias', 'decoder.layers.0.src_trg_att.k_layer.weight', 'decoder.layers.0.src_trg_att.output_layer.bias', 'decoder.layers.0.src_trg_att.output_layer.weight', 'decoder.layers.0.src_trg_att.q_layer.bias', 'decoder.layers.0.src_trg_att.q_layer.weight', 'decoder.layers.0.src_trg_att.v_layer.bias', 'decoder.layers.0.src_trg_att.v_layer.weight', 'decoder.layers.0.trg_trg_att.k_layer.bias', 'decoder.layers.0.trg_trg_att.k_layer.weight', 'decoder.layers.0.trg_trg_att.output_layer.bias', 'decoder.layers.0.trg_trg_att.output_layer.weight', 'decoder.layers.0.trg_trg_att.q_layer.bias', 'decoder.layers.0.trg_trg_att.q_layer.weight', 'decoder.layers.0.trg_trg_att.v_layer.bias', 'decoder.layers.0.trg_trg_att.v_layer.weight', 'decoder.layers.0.x_layer_norm.bias', 'decoder.layers.0.x_layer_norm.weight', 'decoder.layers.1.dec_layer_norm.bias', 'decoder.layers.1.dec_layer_norm.weight', 'decoder.layers.1.feed_forward.layer_norm.bias', 'decoder.layers.1.feed_forward.layer_norm.weight', 'decoder.layers.1.feed_forward.pwff_layer.0.bias', 'decoder.layers.1.feed_forward.pwff_layer.0.weight', 'decoder.layers.1.feed_forward.pwff_layer.3.bias', 'decoder.layers.1.feed_forward.pwff_layer.3.weight', 'decoder.layers.1.src_trg_att.k_layer.bias', 'decoder.layers.1.src_trg_att.k_layer.weight', 'decoder.layers.1.src_trg_att.output_layer.bias', 'decoder.layers.1.src_trg_att.output_layer.weight', 
'decoder.layers.1.src_trg_att.q_layer.bias', 'decoder.layers.1.src_trg_att.q_layer.weight', 'decoder.layers.1.src_trg_att.v_layer.bias', 'decoder.layers.1.src_trg_att.v_layer.weight', 'decoder.layers.1.trg_trg_att.k_layer.bias', 'decoder.layers.1.trg_trg_att.k_layer.weight', 'decoder.layers.1.trg_trg_att.output_layer.bias', 'decoder.layers.1.trg_trg_att.output_layer.weight', 'decoder.layers.1.trg_trg_att.q_layer.bias', 'decoder.layers.1.trg_trg_att.q_layer.weight', 'decoder.layers.1.trg_trg_att.v_layer.bias', 'decoder.layers.1.trg_trg_att.v_layer.weight', 'decoder.layers.1.x_layer_norm.bias', 'decoder.layers.1.x_layer_norm.weight', 'decoder.layers.2.dec_layer_norm.bias', 'decoder.layers.2.dec_layer_norm.weight', 'decoder.layers.2.feed_forward.layer_norm.bias', 'decoder.layers.2.feed_forward.layer_norm.weight', 'decoder.layers.2.feed_forward.pwff_layer.0.bias', 'decoder.layers.2.feed_forward.pwff_layer.0.weight', 'decoder.layers.2.feed_forward.pwff_layer.3.bias', 'decoder.layers.2.feed_forward.pwff_layer.3.weight', 'decoder.layers.2.src_trg_att.k_layer.bias', 'decoder.layers.2.src_trg_att.k_layer.weight', 'decoder.layers.2.src_trg_att.output_layer.bias', 'decoder.layers.2.src_trg_att.output_layer.weight', 'decoder.layers.2.src_trg_att.q_layer.bias', 'decoder.layers.2.src_trg_att.q_layer.weight', 'decoder.layers.2.src_trg_att.v_layer.bias', 'decoder.layers.2.src_trg_att.v_layer.weight', 'decoder.layers.2.trg_trg_att.k_layer.bias', 'decoder.layers.2.trg_trg_att.k_layer.weight', 'decoder.layers.2.trg_trg_att.output_layer.bias', 'decoder.layers.2.trg_trg_att.output_layer.weight', 'decoder.layers.2.trg_trg_att.q_layer.bias', 'decoder.layers.2.trg_trg_att.q_layer.weight', 'decoder.layers.2.trg_trg_att.v_layer.bias', 'decoder.layers.2.trg_trg_att.v_layer.weight', 'decoder.layers.2.x_layer_norm.bias', 'decoder.layers.2.x_layer_norm.weight', 'decoder.layers.3.dec_layer_norm.bias', 'decoder.layers.3.dec_layer_norm.weight', 'decoder.layers.3.feed_forward.layer_norm.bias', 
'decoder.layers.3.feed_forward.layer_norm.weight', 'decoder.layers.3.feed_forward.pwff_layer.0.bias', 'decoder.layers.3.feed_forward.pwff_layer.0.weight', 'decoder.layers.3.feed_forward.pwff_layer.3.bias', 'decoder.layers.3.feed_forward.pwff_layer.3.weight', 'decoder.layers.3.src_trg_att.k_layer.bias', 'decoder.layers.3.src_trg_att.k_layer.weight', 'decoder.layers.3.src_trg_att.output_layer.bias', 'decoder.layers.3.src_trg_att.output_layer.weight', 'decoder.layers.3.src_trg_att.q_layer.bias', 'decoder.layers.3.src_trg_att.q_layer.weight', 'decoder.layers.3.src_trg_att.v_layer.bias', 'decoder.layers.3.src_trg_att.v_layer.weight', 'decoder.layers.3.trg_trg_att.k_layer.bias', 'decoder.layers.3.trg_trg_att.k_layer.weight', 'decoder.layers.3.trg_trg_att.output_layer.bias', 'decoder.layers.3.trg_trg_att.output_layer.weight', 'decoder.layers.3.trg_trg_att.q_layer.bias', 'decoder.layers.3.trg_trg_att.q_layer.weight', 'decoder.layers.3.trg_trg_att.v_layer.bias', 'decoder.layers.3.trg_trg_att.v_layer.weight', 'decoder.layers.3.x_layer_norm.bias', 'decoder.layers.3.x_layer_norm.weight', 'decoder.layers.4.dec_layer_norm.bias', 'decoder.layers.4.dec_layer_norm.weight', 'decoder.layers.4.feed_forward.layer_norm.bias', 'decoder.layers.4.feed_forward.layer_norm.weight', 'decoder.layers.4.feed_forward.pwff_layer.0.bias', 'decoder.layers.4.feed_forward.pwff_layer.0.weight', 'decoder.layers.4.feed_forward.pwff_layer.3.bias', 'decoder.layers.4.feed_forward.pwff_layer.3.weight', 'decoder.layers.4.src_trg_att.k_layer.bias', 'decoder.layers.4.src_trg_att.k_layer.weight', 'decoder.layers.4.src_trg_att.output_layer.bias', 'decoder.layers.4.src_trg_att.output_layer.weight', 'decoder.layers.4.src_trg_att.q_layer.bias', 'decoder.layers.4.src_trg_att.q_layer.weight', 'decoder.layers.4.src_trg_att.v_layer.bias', 'decoder.layers.4.src_trg_att.v_layer.weight', 'decoder.layers.4.trg_trg_att.k_layer.bias', 'decoder.layers.4.trg_trg_att.k_layer.weight', 
'decoder.layers.4.trg_trg_att.output_layer.bias', 'decoder.layers.4.trg_trg_att.output_layer.weight', 'decoder.layers.4.trg_trg_att.q_layer.bias', 'decoder.layers.4.trg_trg_att.q_layer.weight', 'decoder.layers.4.trg_trg_att.v_layer.bias', 'decoder.layers.4.trg_trg_att.v_layer.weight', 'decoder.layers.4.x_layer_norm.bias', 'decoder.layers.4.x_layer_norm.weight', 'decoder.layers.5.dec_layer_norm.bias', 'decoder.layers.5.dec_layer_norm.weight', 'decoder.layers.5.feed_forward.layer_norm.bias', 'decoder.layers.5.feed_forward.layer_norm.weight', 'decoder.layers.5.feed_forward.pwff_layer.0.bias', 'decoder.layers.5.feed_forward.pwff_layer.0.weight', 'decoder.layers.5.feed_forward.pwff_layer.3.bias', 'decoder.layers.5.feed_forward.pwff_layer.3.weight', 'decoder.layers.5.src_trg_att.k_layer.bias', 'decoder.layers.5.src_trg_att.k_layer.weight', 'decoder.layers.5.src_trg_att.output_layer.bias', 'decoder.layers.5.src_trg_att.output_layer.weight', 'decoder.layers.5.src_trg_att.q_layer.bias', 'decoder.layers.5.src_trg_att.q_layer.weight', 'decoder.layers.5.src_trg_att.v_layer.bias', 'decoder.layers.5.src_trg_att.v_layer.weight', 'decoder.layers.5.trg_trg_att.k_layer.bias', 'decoder.layers.5.trg_trg_att.k_layer.weight', 'decoder.layers.5.trg_trg_att.output_layer.bias', 'decoder.layers.5.trg_trg_att.output_layer.weight', 'decoder.layers.5.trg_trg_att.q_layer.bias', 'decoder.layers.5.trg_trg_att.q_layer.weight', 'decoder.layers.5.trg_trg_att.v_layer.bias', 'decoder.layers.5.trg_trg_att.v_layer.weight', 'decoder.layers.5.x_layer_norm.bias', 'decoder.layers.5.x_layer_norm.weight', 'encoder.layer_norm.bias', 'encoder.layer_norm.weight', 'encoder.layers.0.feed_forward.layer_norm.bias', 'encoder.layers.0.feed_forward.layer_norm.weight', 'encoder.layers.0.feed_forward.pwff_layer.0.bias', 'encoder.layers.0.feed_forward.pwff_layer.0.weight', 'encoder.layers.0.feed_forward.pwff_layer.3.bias', 'encoder.layers.0.feed_forward.pwff_layer.3.weight', 'encoder.layers.0.layer_norm.bias', 
'encoder.layers.0.layer_norm.weight', 'encoder.layers.0.src_src_att.k_layer.bias', 'encoder.layers.0.src_src_att.k_layer.weight', 'encoder.layers.0.src_src_att.output_layer.bias', 'encoder.layers.0.src_src_att.output_layer.weight', 'encoder.layers.0.src_src_att.q_layer.bias', 'encoder.layers.0.src_src_att.q_layer.weight', 'encoder.layers.0.src_src_att.v_layer.bias', 'encoder.layers.0.src_src_att.v_layer.weight', 'encoder.layers.1.feed_forward.layer_norm.bias', 'encoder.layers.1.feed_forward.layer_norm.weight', 'encoder.layers.1.feed_forward.pwff_layer.0.bias', 'encoder.layers.1.feed_forward.pwff_layer.0.weight', 'encoder.layers.1.feed_forward.pwff_layer.3.bias', 'encoder.layers.1.feed_forward.pwff_layer.3.weight', 'encoder.layers.1.layer_norm.bias', 'encoder.layers.1.layer_norm.weight', 'encoder.layers.1.src_src_att.k_layer.bias', 'encoder.layers.1.src_src_att.k_layer.weight', 'encoder.layers.1.src_src_att.output_layer.bias', 'encoder.layers.1.src_src_att.output_layer.weight', 'encoder.layers.1.src_src_att.q_layer.bias', 'encoder.layers.1.src_src_att.q_layer.weight', 'encoder.layers.1.src_src_att.v_layer.bias', 'encoder.layers.1.src_src_att.v_layer.weight', 'encoder.layers.2.feed_forward.layer_norm.bias', 'encoder.layers.2.feed_forward.layer_norm.weight', 'encoder.layers.2.feed_forward.pwff_layer.0.bias', 'encoder.layers.2.feed_forward.pwff_layer.0.weight', 'encoder.layers.2.feed_forward.pwff_layer.3.bias', 'encoder.layers.2.feed_forward.pwff_layer.3.weight', 'encoder.layers.2.layer_norm.bias', 'encoder.layers.2.layer_norm.weight', 'encoder.layers.2.src_src_att.k_layer.bias', 'encoder.layers.2.src_src_att.k_layer.weight', 'encoder.layers.2.src_src_att.output_layer.bias', 'encoder.layers.2.src_src_att.output_layer.weight', 'encoder.layers.2.src_src_att.q_layer.bias', 'encoder.layers.2.src_src_att.q_layer.weight', 'encoder.layers.2.src_src_att.v_layer.bias', 'encoder.layers.2.src_src_att.v_layer.weight', 'encoder.layers.3.feed_forward.layer_norm.bias', 
'encoder.layers.3.feed_forward.layer_norm.weight', 'encoder.layers.3.feed_forward.pwff_layer.0.bias', 'encoder.layers.3.feed_forward.pwff_layer.0.weight', 'encoder.layers.3.feed_forward.pwff_layer.3.bias', 'encoder.layers.3.feed_forward.pwff_layer.3.weight', 'encoder.layers.3.layer_norm.bias', 'encoder.layers.3.layer_norm.weight', 'encoder.layers.3.src_src_att.k_layer.bias', 'encoder.layers.3.src_src_att.k_layer.weight', 'encoder.layers.3.src_src_att.output_layer.bias', 'encoder.layers.3.src_src_att.output_layer.weight', 'encoder.layers.3.src_src_att.q_layer.bias', 'encoder.layers.3.src_src_att.q_layer.weight', 'encoder.layers.3.src_src_att.v_layer.bias', 'encoder.layers.3.src_src_att.v_layer.weight', 'encoder.layers.4.feed_forward.layer_norm.bias', 'encoder.layers.4.feed_forward.layer_norm.weight', 'encoder.layers.4.feed_forward.pwff_layer.0.bias', 'encoder.layers.4.feed_forward.pwff_layer.0.weight', 'encoder.layers.4.feed_forward.pwff_layer.3.bias', 'encoder.layers.4.feed_forward.pwff_layer.3.weight', 'encoder.layers.4.layer_norm.bias', 'encoder.layers.4.layer_norm.weight', 'encoder.layers.4.src_src_att.k_layer.bias', 'encoder.layers.4.src_src_att.k_layer.weight', 'encoder.layers.4.src_src_att.output_layer.bias', 'encoder.layers.4.src_src_att.output_layer.weight', 'encoder.layers.4.src_src_att.q_layer.bias', 'encoder.layers.4.src_src_att.q_layer.weight', 'encoder.layers.4.src_src_att.v_layer.bias', 'encoder.layers.4.src_src_att.v_layer.weight', 'encoder.layers.5.feed_forward.layer_norm.bias', 'encoder.layers.5.feed_forward.layer_norm.weight', 'encoder.layers.5.feed_forward.pwff_layer.0.bias', 'encoder.layers.5.feed_forward.pwff_layer.0.weight', 'encoder.layers.5.feed_forward.pwff_layer.3.bias', 'encoder.layers.5.feed_forward.pwff_layer.3.weight', 'encoder.layers.5.layer_norm.bias', 'encoder.layers.5.layer_norm.weight', 'encoder.layers.5.src_src_att.k_layer.bias', 'encoder.layers.5.src_src_att.k_layer.weight', 'encoder.layers.5.src_src_att.output_layer.bias', 
'encoder.layers.5.src_src_att.output_layer.weight', 'encoder.layers.5.src_src_att.q_layer.bias', 'encoder.layers.5.src_src_att.q_layer.weight', 'encoder.layers.5.src_src_att.v_layer.bias', 'encoder.layers.5.src_src_att.v_layer.weight', 'src_embed.lut.weight']\n","2019-11-28 13:10:30,411 Loading model from /content/drive/My Drive/masakhane/en-nr-baseline/pretrained/ennr4000baseline/46000.ckpt\n","2019-11-28 13:10:32,328 cfg.name : ennr4000baseline_transformer\n","2019-11-28 13:10:32,328 cfg.data.src : en\n","2019-11-28 13:10:32,328 cfg.data.trg : nr\n","2019-11-28 13:10:32,328 cfg.data.train : /content/drive/My Drive/masakhane/en-nr-baseline/train.bpe\n","2019-11-28 13:10:32,328 cfg.data.dev : /content/drive/My Drive/masakhane/en-nr-baseline/dev.bpe\n","2019-11-28 13:10:32,328 cfg.data.test : /content/drive/My Drive/masakhane/en-nr-baseline/test.bpe\n","2019-11-28 13:10:32,329 cfg.data.level : bpe\n","2019-11-28 13:10:32,329 cfg.data.lowercase : False\n","2019-11-28 13:10:32,329 cfg.data.max_sent_length : 100\n","2019-11-28 13:10:32,329 cfg.data.src_vocab : /content/drive/My Drive/masakhane/en-nr-baseline/vocab.txt\n","2019-11-28 13:10:32,329 cfg.data.trg_vocab : /content/drive/My Drive/masakhane/en-nr-baseline/vocab.txt\n","2019-11-28 13:10:32,329 cfg.testing.beam_size : 5\n","2019-11-28 13:10:32,329 cfg.testing.alpha : 1.0\n","2019-11-28 13:10:32,329 cfg.training.load_model : /content/drive/My Drive/masakhane/en-nr-baseline/pretrained/ennr4000baseline/46000.ckpt\n","2019-11-28 13:10:32,329 cfg.training.random_seed : 42\n","2019-11-28 13:10:32,329 cfg.training.optimizer : adam\n","2019-11-28 13:10:32,330 cfg.training.normalization : tokens\n","2019-11-28 13:10:32,330 cfg.training.adam_betas : [0.9, 0.999]\n","2019-11-28 13:10:32,330 cfg.training.scheduling : plateau\n","2019-11-28 13:10:32,330 cfg.training.patience : 5\n","2019-11-28 13:10:32,330 cfg.training.learning_rate_factor : 0.5\n","2019-11-28 13:10:32,330 cfg.training.learning_rate_warmup : 
1000\n","2019-11-28 13:10:32,330 cfg.training.decrease_factor : 0.7\n","2019-11-28 13:10:32,330 cfg.training.loss : crossentropy\n","2019-11-28 13:10:32,330 cfg.training.learning_rate : 0.0003\n","2019-11-28 13:10:32,330 cfg.training.learning_rate_min : 1e-08\n","2019-11-28 13:10:32,331 cfg.training.weight_decay : 0.0\n","2019-11-28 13:10:32,331 cfg.training.label_smoothing : 0.1\n","2019-11-28 13:10:32,331 cfg.training.batch_size : 4096\n","2019-11-28 13:10:32,331 cfg.training.batch_type : token\n","2019-11-28 13:10:32,331 cfg.training.eval_batch_size : 3600\n","2019-11-28 13:10:32,331 cfg.training.eval_batch_type : token\n","2019-11-28 13:10:32,331 cfg.training.batch_multiplier : 1\n","2019-11-28 13:10:32,331 cfg.training.early_stopping_metric : ppl\n","2019-11-28 13:10:32,331 cfg.training.epochs : 30\n","2019-11-28 13:10:32,331 cfg.training.validation_freq : 1000\n","2019-11-28 13:10:32,332 cfg.training.logging_freq : 100\n","2019-11-28 13:10:32,332 cfg.training.eval_metric : bleu\n","2019-11-28 13:10:32,332 cfg.training.model_dir : models/ennr4000baseline_transformer\n","2019-11-28 13:10:32,332 cfg.training.overwrite : True\n","2019-11-28 13:10:32,332 cfg.training.shuffle : True\n","2019-11-28 13:10:32,332 cfg.training.use_cuda : True\n","2019-11-28 13:10:32,332 cfg.training.max_output_length : 100\n","2019-11-28 13:10:32,332 cfg.training.print_valid_sents : [0, 1, 2, 3]\n","2019-11-28 13:10:32,333 cfg.training.keep_last_ckpts : 3\n","2019-11-28 13:10:32,333 cfg.model.initializer : xavier\n","2019-11-28 13:10:32,333 cfg.model.bias_initializer : zeros\n","2019-11-28 13:10:32,333 cfg.model.init_gain : 1.0\n","2019-11-28 13:10:32,333 cfg.model.embed_initializer : xavier\n","2019-11-28 13:10:32,333 cfg.model.embed_init_gain : 1.0\n","2019-11-28 13:10:32,333 cfg.model.tied_embeddings : True\n","2019-11-28 13:10:32,333 cfg.model.tied_softmax : True\n","2019-11-28 13:10:32,333 cfg.model.encoder.type : transformer\n","2019-11-28 13:10:32,333 
cfg.model.encoder.num_layers : 6\n","2019-11-28 13:10:32,334 cfg.model.encoder.num_heads : 4\n","2019-11-28 13:10:32,334 cfg.model.encoder.embeddings.embedding_dim : 256\n","2019-11-28 13:10:32,334 cfg.model.encoder.embeddings.scale : True\n","2019-11-28 13:10:32,334 cfg.model.encoder.embeddings.dropout : 0.3\n","2019-11-28 13:10:32,334 cfg.model.encoder.hidden_size : 256\n","2019-11-28 13:10:32,334 cfg.model.encoder.ff_size : 1024\n","2019-11-28 13:10:32,334 cfg.model.encoder.dropout : 0.4\n","2019-11-28 13:10:32,334 cfg.model.decoder.type : transformer\n","2019-11-28 13:10:32,334 cfg.model.decoder.num_layers : 6\n","2019-11-28 13:10:32,334 cfg.model.decoder.num_heads : 8\n","2019-11-28 13:10:32,335 cfg.model.decoder.embeddings.embedding_dim : 256\n","2019-11-28 13:10:32,335 cfg.model.decoder.embeddings.scale : True\n","2019-11-28 13:10:32,335 cfg.model.decoder.embeddings.dropout : 0.3\n","2019-11-28 13:10:32,335 cfg.model.decoder.hidden_size : 256\n","2019-11-28 13:10:32,335 cfg.model.decoder.ff_size : 1024\n","2019-11-28 13:10:32,335 cfg.model.decoder.dropout : 0.4\n","2019-11-28 13:10:32,335 Data set sizes: \n","\ttrain 91454,\n","\tvalid 1000,\n","\ttest 2671\n","2019-11-28 13:10:32,335 First training example:\n","\t[SRC] we need similar courage in these last days\n","\t[TRG] s@@ itlhoga isib@@ indi esifan@@ ako emihleni yoku@@ phela le\n","2019-11-28 13:10:32,335 First 10 words (src): (0) (1) (2) (3) (4) , (5) the (6) to (7) a (8) of (9) and\n","2019-11-28 13:10:32,335 First 10 words (trg): (0) (1) (2) (3) (4) , (5) the (6) to (7) a (8) of (9) and\n","2019-11-28 13:10:32,336 Number of Src words (types): 4134\n","2019-11-28 13:10:32,336 Number of Trg words (types): 4134\n","2019-11-28 13:10:32,336 Model(\n","\tencoder=TransformerEncoder(num_layers=6, num_heads=4),\n","\tdecoder=TransformerDecoder(num_layers=6, num_heads=8),\n","\tsrc_embed=Embeddings(embedding_dim=256, vocab_size=4134),\n","\ttrg_embed=Embeddings(embedding_dim=256, 
vocab_size=4134))\n","2019-11-28 13:10:32,340 EPOCH 1\n","2019-11-28 13:10:47,336 Epoch 1 Step: 46100 Batch Loss: 2.208934 Tokens per Sec: 15514, Lr: 0.000300\n","2019-11-28 13:11:02,565 Epoch 1 Step: 46200 Batch Loss: 1.919596 Tokens per Sec: 14737, Lr: 0.000300\n","2019-11-28 13:11:17,741 Epoch 1 Step: 46300 Batch Loss: 1.797789 Tokens per Sec: 14876, Lr: 0.000300\n","2019-11-28 13:11:32,917 Epoch 1 Step: 46400 Batch Loss: 2.305316 Tokens per Sec: 14579, Lr: 0.000300\n","2019-11-28 13:11:48,192 Epoch 1 Step: 46500 Batch Loss: 1.891086 Tokens per Sec: 14905, Lr: 0.000300\n","2019-11-28 13:12:03,321 Epoch 1 Step: 46600 Batch Loss: 1.163202 Tokens per Sec: 14902, Lr: 0.000300\n","2019-11-28 13:12:18,167 Epoch 1 Step: 46700 Batch Loss: 1.642465 Tokens per Sec: 15199, Lr: 0.000300\n","2019-11-28 13:12:33,291 Epoch 1 Step: 46800 Batch Loss: 2.020858 Tokens per Sec: 15142, Lr: 0.000300\n","2019-11-28 13:12:48,477 Epoch 1 Step: 46900 Batch Loss: 2.465777 Tokens per Sec: 14943, Lr: 0.000300\n","2019-11-28 13:12:54,882 Epoch 1: total training loss 1802.47\n","2019-11-28 13:12:54,882 EPOCH 2\n","2019-11-28 13:13:03,977 Epoch 2 Step: 47000 Batch Loss: 2.068535 Tokens per Sec: 14371, Lr: 0.000300\n","2019-11-28 13:13:39,300 Example #0\n","2019-11-28 13:13:39,301 \tSource: ( a ) how does jehovah mold us today ?\n","2019-11-28 13:13:39,301 \tReference: ( a ) ujehova usibumba njani namhlanjesi ?\n","2019-11-28 13:13:39,301 \tHypothesis: ( a ) ujehova usibumba njani namhlanjesi ?\n","2019-11-28 13:13:39,301 Example #1\n","2019-11-28 13:13:39,301 \tSource: ( read thessalonians : , )\n","2019-11-28 13:13:39,301 \tReference: ( funda yoku - tesalonika : , )\n","2019-11-28 13:13:39,301 \tHypothesis: ( funda thesalonika : , )\n","2019-11-28 13:13:39,302 Example #2\n","2019-11-28 13:13:39,302 \tSource: page • songs : ,\n","2019-11-28 13:13:39,302 \tReference: ikhasi • iingoma : ,\n","2019-11-28 13:13:39,302 \tHypothesis: ikhasi • iingoma : ,\n","2019-11-28 13:13:39,302 Example 
#3\n","2019-11-28 13:13:39,302 \tSource: if that has been true in your case , what can you do to free yourself from their influence ?\n","2019-11-28 13:13:39,303 \tReference: nengabe lokho kunjalo ngawe , khuyini ongayenza bona uzokwazi ukutjhaphuluka kiyo ?\n","2019-11-28 13:13:39,303 \tHypothesis: nengabe lokho kuliqiniso endabeni yakho , khuyini ongayenza bona uzitjhaphuluke emthelwenabo ?\n","2019-11-28 13:13:39,303 Validation result (greedy) at epoch 2, step 47000: bleu: 15.03, loss: 44884.7891, ppl: 6.8823, duration: 35.3252s\n","2019-11-28 13:13:54,398 Epoch 2 Step: 47100 Batch Loss: 1.805137 Tokens per Sec: 15029, Lr: 0.000300\n","2019-11-28 13:14:09,399 Epoch 2 Step: 47200 Batch Loss: 1.646797 Tokens per Sec: 15335, Lr: 0.000300\n","2019-11-28 13:14:24,848 Epoch 2 Step: 47300 Batch Loss: 1.718542 Tokens per Sec: 14242, Lr: 0.000300\n","2019-11-28 13:14:40,001 Epoch 2 Step: 47400 Batch Loss: 1.887118 Tokens per Sec: 14991, Lr: 0.000300\n","2019-11-28 13:14:55,192 Epoch 2 Step: 47500 Batch Loss: 2.246618 Tokens per Sec: 14853, Lr: 0.000300\n","2019-11-28 13:15:10,414 Epoch 2 Step: 47600 Batch Loss: 1.978031 Tokens per Sec: 14924, Lr: 0.000300\n","Traceback (most recent call last):\n"," File \"/usr/lib/python3.6/runpy.py\", line 193, in _run_module_as_main\n"," \"__main__\", mod_spec)\n"," File \"/usr/lib/python3.6/runpy.py\", line 85, in _run_code\n"," exec(code, run_globals)\n"," File \"/content/joeynmt/joeynmt/__main__.py\", line 41, in \n"," main()\n"," File \"/content/joeynmt/joeynmt/__main__.py\", line 29, in main\n"," train(cfg_file=args.config_path)\n"," File \"/content/joeynmt/joeynmt/training.py\", line 596, in train\n"," trainer.train_and_validate(train_data=train_data, valid_data=dev_data)\n"," File \"/content/joeynmt/joeynmt/training.py\", line 296, in train_and_validate\n"," batch_loss = self._train_batch(batch, update=update)\n"," File \"/content/joeynmt/joeynmt/training.py\", line 459, in _train_batch\n"," self.optimizer.step()\n"," File 
\"/usr/local/lib/python3.6/dist-packages/torch/optim/adam.py\", line 95, in step\n"," exp_avg.mul_(beta1).add_(1 - beta1, grad)\n","KeyboardInterrupt\n","CPU times: user 902 ms, sys: 108 ms, total: 1.01 s\n","Wall time: 5min 1s\n"],"name":"stdout"}]},{"cell_type":"code","metadata":{"id":"8ThMN3ATPTEt","colab_type":"code","outputId":"620b9f67-beba-497c-e8a6-912b493e6b89","executionInfo":{"status":"ok","timestamp":1574081651553,"user_tz":-120,"elapsed":9674,"user":{"displayName":"Ari Ramkilowan","photoUrl":"https://lh3.googleusercontent.com/a-/AAuE7mBOdxjV-rMNcCCRFbbimFCdX4yGotTEbejYVcEc=s64","userId":"04960745627161950732"}},"colab":{"base_uri":"https://localhost:8080/","height":34}},"source":["# Copy the created models from the notebook storage to google drive for persistent storage \n","!cp -r joeynmt/models/${src}${tgt}${vocab_size}${tag}_transformer/* \"$gdrive_path\"\"pretrained/$src$tgt$vocab_size$tag/\"\n","!cp joeynmt/models/${src}${tgt}${vocab_size}${tag}_transformer/best.ckpt \"$gdrive_path\"\"pretrained/$src$tgt$vocab_size$tag\""],"execution_count":0,"outputs":[{"output_type":"stream","text":["cp: cannot create symbolic link '/content/drive/My Drive/masakhane/en-nr-baseline/pretrained/ennr4000baseline/best.ckpt': Function not implemented\n"],"name":"stdout"}]},{"cell_type":"code","metadata":{"id":"4Z3MiMStJ7jL","colab_type":"code","colab":{}},"source":["# Copy the created models from google drive to joeynmt folder\n","!cp -r \"$gdrive_path\"\"pretrained/$src$tgt$vocab_size$tag/\" \"joeynmt/models/$src$tgt$vocab_size$tag\"\"_transformer/\"\n","# !cp joeynmt/models/${src}${tgt}${vocab_size}${tag}_transformer/best.ckpt \"$gdrive_path\"\"pretrained/$src$tgt$vocab_size$tag\""],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"aSJUF7weWPv5","colab_type":"code","colab":{}},"source":["# copy across the config file\n","!cp joeynmt/configs/transformer_${src}${tgt}${vocab_size}${tag}.yaml 
\"$gdrive_path\""],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"HsEWaXbDWP2I","colab_type":"code","colab":{}},"source":["# Test our model\n","# ! cd joeynmt; python3 -m joeynmt test \"$gdrive_path\"\"pretrained/$src$tgt$vocab_size$tag/config.yaml\""],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"nlREWwrxWP0G","colab_type":"code","outputId":"2126f095-c562-4a6e-e09c-5de053a0129b","executionInfo":{"status":"ok","timestamp":1574948355978,"user_tz":-120,"elapsed":99970,"user":{"displayName":"Ari Ramkilowan","photoUrl":"https://lh3.googleusercontent.com/a-/AAuE7mBOdxjV-rMNcCCRFbbimFCdX4yGotTEbejYVcEc=s64","userId":"04960745627161950732"}},"colab":{"base_uri":"https://localhost:8080/","height":69}},"source":["# OR\n","! cd joeynmt; python3 -m joeynmt test \"$gdrive_path\"\"transformer_${src}${tgt}${vocab_size}${tag}.yaml\""],"execution_count":0,"outputs":[{"output_type":"stream","text":["2019-11-28 13:37:38,730 Hello! This is Joey-NMT.\n","2019-11-28 13:38:08,636 dev bleu: 14.93 [Beam search decoding with beam size = 5 and alpha = 1.0]\n","2019-11-28 13:39:12,496 test bleu: 4.01 [Beam search decoding with beam size = 5 and alpha = 1.0]\n"],"name":"stdout"}]},{"cell_type":"code","metadata":{"id":"yJUQvIenRKtN","colab_type":"code","colab":{}},"source":["!touch \"$gdrive_path\"sample_input.txt\n","!echo 'Hello there' > \"$gdrive_path\"sample_input.txt"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"bcj1AxY6QU2n","colab_type":"code","colab":{}},"source":["# BPE some bespoke input\n","! 
subword-nmt apply-bpe -c \"$gdrive_path\"bpe.codes.$vocab_size --vocabulary \"$gdrive_path\"vocab.$tgt < \"$gdrive_path\"sample_input.txt > \"$gdrive_path\"sample_input_bped.txt"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"DJuaiUXkSrL1","colab_type":"code","outputId":"3193772f-65c6-4327-e156-b7f0747031e3","executionInfo":{"status":"ok","timestamp":1574949461078,"user_tz":-120,"elapsed":7501,"user":{"displayName":"Ari Ramkilowan","photoUrl":"https://lh3.googleusercontent.com/a-/AAuE7mBOdxjV-rMNcCCRFbbimFCdX4yGotTEbejYVcEc=s64","userId":"04960745627161950732"}},"colab":{"base_uri":"https://localhost:8080/","height":34}},"source":["! more \"$gdrive_path\"sample_input_bped.txt"],"execution_count":0,"outputs":[{"output_type":"stream","text":["H@@ el@@ lo there\n"],"name":"stdout"}]},{"cell_type":"code","metadata":{"id":"LO_hXIdAWPtU","colab_type":"code","outputId":"3d8f8dd1-5435-4a82-bff5-8003d2e37054","executionInfo":{"status":"ok","timestamp":1574949656647,"user_tz":-120,"elapsed":188245,"user":{"displayName":"Ari Ramkilowan","photoUrl":"https://lh3.googleusercontent.com/a-/AAuE7mBOdxjV-rMNcCCRFbbimFCdX4yGotTEbejYVcEc=s64","userId":"04960745627161950732"}},"colab":{"base_uri":"https://localhost:8080/","height":312}},"source":["# Translate mode is more interactive but almost the same as running in test mode\n","! cd joeynmt; python3 -m joeynmt translate \"$gdrive_path\"\"transformer_${src}${tgt}${vocab_size}${tag}.yaml\""],"execution_count":0,"outputs":[{"output_type":"stream","text":["2019-11-28 13:57:52,461 Hello! 
This is Joey-NMT.\n","\n","Please enter a source sentence (pre-processed): \n","H@@ el@@ lo there\n","JoeyNMT: u - ella ukhona\n","\n","Please enter a source sentence (pre-processed): \n","H@@ el@@ lo\n","JoeyNMT: u - ella\n","\n","Please enter a source sentence (pre-processed): \n","h@@ el@@ lo\n","JoeyNMT: hella\n","\n","Please enter a source sentence (pre-processed): \n","\n","Bye.\n"],"name":"stdout"}]},{"cell_type":"code","metadata":{"id":"V6HTKX2UWPrB","colab_type":"code","outputId":"2b3d1f45-deda-4e62-c1ef-56cf1b68ef24","executionInfo":{"status":"ok","timestamp":1574081870718,"user_tz":-120,"elapsed":6161,"user":{"displayName":"Ari Ramkilowan","photoUrl":"https://lh3.googleusercontent.com/a-/AAuE7mBOdxjV-rMNcCCRFbbimFCdX4yGotTEbejYVcEc=s64","userId":"04960745627161950732"}},"colab":{"base_uri":"https://localhost:8080/","height":503}},"source":["# Output our validation accuracy\n","! cat \"$gdrive_path/pretrained/${src}${tgt}${vocab_size}${tag}/validations.txt\""],"execution_count":0,"outputs":[{"output_type":"stream","text":["Steps: 19000\tLoss: 52407.53906\tPPL: 9.50908\tbleu: 10.34544\tLR: 0.00030000\t*\n","Steps: 20000\tLoss: 51819.73047\tPPL: 9.27188\tbleu: 10.89485\tLR: 0.00030000\t*\n","Steps: 21000\tLoss: 51056.16797\tPPL: 8.97256\tbleu: 10.77206\tLR: 0.00030000\t*\n","Steps: 22000\tLoss: 50872.08984\tPPL: 8.90186\tbleu: 11.24929\tLR: 0.00030000\t*\n","Steps: 23000\tLoss: 50294.21484\tPPL: 8.68351\tbleu: 11.40370\tLR: 0.00030000\t*\n","Steps: 24000\tLoss: 49921.73047\tPPL: 8.54561\tbleu: 11.70847\tLR: 0.00030000\t*\n","Steps: 25000\tLoss: 49970.34766\tPPL: 8.56349\tbleu: 11.78933\tLR: 0.00030000\t\n","Steps: 26000\tLoss: 49318.21484\tPPL: 8.32682\tbleu: 12.43307\tLR: 0.00030000\t*\n","Steps: 27000\tLoss: 48703.39844\tPPL: 8.10969\tbleu: 12.78421\tLR: 0.00030000\t*\n","Steps: 28000\tLoss: 48488.24219\tPPL: 8.03505\tbleu: 12.43170\tLR: 0.00030000\t*\n","Steps: 29000\tLoss: 48164.97656\tPPL: 7.92419\tbleu: 12.85117\tLR: 0.00030000\t*\n","Steps: 
30000\tLoss: 47853.74219\tPPL: 7.81891\tbleu: 13.07397\tLR: 0.00030000\t*\n","Steps: 31000\tLoss: 47605.07031\tPPL: 7.73579\tbleu: 13.14769\tLR: 0.00030000\t*\n","Steps: 32000\tLoss: 47267.08594\tPPL: 7.62424\tbleu: 12.74547\tLR: 0.00030000\t*\n","Steps: 33000\tLoss: 47156.44922\tPPL: 7.58808\tbleu: 13.60611\tLR: 0.00030000\t*\n","Steps: 34000\tLoss: 46885.63281\tPPL: 7.50028\tbleu: 13.52205\tLR: 0.00030000\t*\n","Steps: 35000\tLoss: 46845.80469\tPPL: 7.48745\tbleu: 13.81222\tLR: 0.00030000\t*\n","Steps: 36000\tLoss: 46372.63672\tPPL: 7.33673\tbleu: 13.69633\tLR: 0.00030000\t*\n","Steps: 37000\tLoss: 46225.23047\tPPL: 7.29040\tbleu: 14.04278\tLR: 0.00030000\t*\n","Steps: 38000\tLoss: 45979.46484\tPPL: 7.21381\tbleu: 14.45530\tLR: 0.00030000\t*\n","Steps: 39000\tLoss: 45890.48047\tPPL: 7.18627\tbleu: 14.22063\tLR: 0.00030000\t*\n","Steps: 40000\tLoss: 45623.07031\tPPL: 7.10416\tbleu: 14.22280\tLR: 0.00030000\t*\n","Steps: 41000\tLoss: 45425.37500\tPPL: 7.04406\tbleu: 14.62090\tLR: 0.00030000\t*\n","Steps: 42000\tLoss: 45385.63672\tPPL: 7.03204\tbleu: 14.45233\tLR: 0.00030000\t*\n","Steps: 43000\tLoss: 45362.38281\tPPL: 7.02501\tbleu: 14.57730\tLR: 0.00030000\t*\n","Steps: 44000\tLoss: 45083.61719\tPPL: 6.94136\tbleu: 14.89510\tLR: 0.00030000\t*\n","Steps: 45000\tLoss: 44869.75781\tPPL: 6.87785\tbleu: 14.83738\tLR: 0.00030000\t*\n","Steps: 46000\tLoss: 44789.00000\tPPL: 6.85402\tbleu: 15.05657\tLR: 0.00030000\t*\n"],"name":"stdout"}]},{"cell_type":"code","metadata":{"id":"jjwfXFAvWPpI","colab_type":"code","colab":{}},"source":[""],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"_uXHLz9JWPma","colab_type":"code","colab":{}},"source":[""],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"43pEw3cKWPh3","colab_type":"code","colab":{}},"source":[""],"execution_count":0,"outputs":[]}]}