{ "cells": [ { "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "view-in-github" }, "source": [ "\"Open" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "FdyHSnoj7Iun", "outputId": "d0624c60-68c4-470f-9ade-c517e3296044" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/content/training\n" ] } ], "source": [ "# create a seperate folder to store everything\n", "!mkdir training\n", "%cd training" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "y55OfxBz8QeP", "outputId": "6d0ab016-0f96-4671-ddee-f06b50506dcd" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Cloning into 'indicTrans'...\n", "remote: Enumerating objects: 432, done.\u001b[K\n", "remote: Counting objects: 100% (139/139), done.\u001b[K\n", "remote: Compressing objects: 100% (34/34), done.\u001b[K\n", "remote: Total 432 (delta 122), reused 105 (delta 105), pack-reused 293\u001b[K\n", "Receiving objects: 100% (432/432), 1.43 MiB | 14.11 MiB/s, done.\n", "Resolving deltas: 100% (248/248), done.\n", "/content/training/indicTrans\n", "Cloning into 'indic_nlp_library'...\n", "remote: Enumerating objects: 1325, done.\u001b[K\n", "remote: Counting objects: 100% (147/147), done.\u001b[K\n", "remote: Compressing objects: 100% (103/103), done.\u001b[K\n", "remote: Total 1325 (delta 84), reused 89 (delta 41), pack-reused 1178\u001b[K\n", "Receiving objects: 100% (1325/1325), 9.57 MiB | 10.51 MiB/s, done.\n", "Resolving deltas: 100% (688/688), done.\n", "Cloning into 'indic_nlp_resources'...\n", "remote: Enumerating objects: 133, done.\u001b[K\n", "remote: Counting objects: 100% (7/7), done.\u001b[K\n", "remote: Compressing objects: 100% (7/7), done.\u001b[K\n", "remote: Total 133 (delta 0), reused 2 (delta 0), pack-reused 126\u001b[K\n", "Receiving objects: 100% (133/133), 149.77 MiB | 34.05 MiB/s, done.\n", 
"Resolving deltas: 100% (51/51), done.\n", "Checking out files: 100% (28/28), done.\n", "Cloning into 'subword-nmt'...\n", "remote: Enumerating objects: 580, done.\u001b[K\n", "remote: Counting objects: 100% (4/4), done.\u001b[K\n", "remote: Compressing objects: 100% (4/4), done.\u001b[K\n", "remote: Total 580 (delta 0), reused 1 (delta 0), pack-reused 576\u001b[K\n", "Receiving objects: 100% (580/580), 237.41 KiB | 5.28 MiB/s, done.\n", "Resolving deltas: 100% (349/349), done.\n", "/content/training\n" ] } ], "source": [ "# clone the repo for running finetuning\n", "!git clone https://github.com/AI4Bharat/indicTrans.git\n", "%cd indicTrans\n", "# clone requirements repositories\n", "!git clone https://github.com/anoopkunchukuttan/indic_nlp_library.git\n", "!git clone https://github.com/anoopkunchukuttan/indic_nlp_resources.git\n", "!git clone https://github.com/rsennrich/subword-nmt.git\n", "%cd .." ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "ziWWl-1a8SMw", "outputId": "d7908a62-9573-4693-e7cb-44aeeebaaa15" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Reading package lists... Done\n", "Building dependency tree \n", "Reading state information... Done\n", "The following NEW packages will be installed:\n", " tree\n", "0 upgraded, 1 newly installed, 0 to remove and 39 not upgraded.\n", "Need to get 40.7 kB of archives.\n", "After this operation, 105 kB of additional disk space will be used.\n", "Get:1 http://archive.ubuntu.com/ubuntu bionic/universe amd64 tree amd64 1.7.0-5 [40.7 kB]\n", "Fetched 40.7 kB in 0s (133 kB/s)\n", "debconf: unable to initialize frontend: Dialog\n", "debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. 
at /usr/share/perl5/Debconf/FrontEnd/Dialog.pm line 76, <> line 1.)\n", "debconf: falling back to frontend: Readline\n", "debconf: unable to initialize frontend: Readline\n", "debconf: (This frontend requires a controlling tty.)\n", "debconf: falling back to frontend: Teletype\n", "dpkg-preconfigure: unable to re-open stdin: \n", "Selecting previously unselected package tree.\n", "(Reading database ... 160772 files and directories currently installed.)\n", "Preparing to unpack .../tree_1.7.0-5_amd64.deb ...\n", "Unpacking tree (1.7.0-5) ...\n", "Setting up tree (1.7.0-5) ...\n", "Processing triggers for man-db (2.8.3-2ubuntu0.1) ...\n", "Collecting sacremoses\n", "\u001b[?25l Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)\n", "\u001b[K |████████████████████████████████| 901kB 4.0MB/s \n", "\u001b[?25hRequirement already satisfied: pandas in /usr/local/lib/python3.7/dist-packages (1.1.5)\n", "Collecting mock\n", " Downloading https://files.pythonhosted.org/packages/5c/03/b7e605db4a57c0f6fba744b11ef3ddf4ddebcada35022927a2b5fc623fdf/mock-4.0.3-py3-none-any.whl\n", "Collecting sacrebleu\n", "\u001b[?25l Downloading https://files.pythonhosted.org/packages/7e/57/0c7ca4e31a126189dab99c19951910bd081dea5bbd25f24b77107750eae7/sacrebleu-1.5.1-py3-none-any.whl (54kB)\n", "\u001b[K |████████████████████████████████| 61kB 7.4MB/s \n", "\u001b[?25hCollecting tensorboardX\n", "\u001b[?25l Downloading https://files.pythonhosted.org/packages/42/36/2b147652c40c3a858efa0afbf7b8236fae968e88ff530511a4cfa299a506/tensorboardX-2.3-py2.py3-none-any.whl (124kB)\n", "\u001b[K |████████████████████████████████| 133kB 24.0MB/s \n", "\u001b[?25hRequirement already satisfied: pyarrow in /usr/local/lib/python3.7/dist-packages (3.0.0)\n", "Collecting indic-nlp-library\n", "\u001b[?25l Downloading 
https://files.pythonhosted.org/packages/84/d4/495bb43b88a2a6d04b09c29fc5115f24872af74cd8317fe84026abd4ddb1/indic_nlp_library-0.81-py3-none-any.whl (40kB)\n", "\u001b[K |████████████████████████████████| 40kB 5.4MB/s \n", "\u001b[?25hRequirement already satisfied: six in /usr/local/lib/python3.7/dist-packages (from sacremoses) (1.15.0)\n", "Requirement already satisfied: regex in /usr/local/lib/python3.7/dist-packages (from sacremoses) (2019.12.20)\n", "Requirement already satisfied: click in /usr/local/lib/python3.7/dist-packages (from sacremoses) (7.1.2)\n", "Requirement already satisfied: tqdm in /usr/local/lib/python3.7/dist-packages (from sacremoses) (4.41.1)\n", "Requirement already satisfied: joblib in /usr/local/lib/python3.7/dist-packages (from sacremoses) (1.0.1)\n", "Requirement already satisfied: pytz>=2017.2 in /usr/local/lib/python3.7/dist-packages (from pandas) (2018.9)\n", "Requirement already satisfied: numpy>=1.15.4 in /usr/local/lib/python3.7/dist-packages (from pandas) (1.19.5)\n", "Requirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.7/dist-packages (from pandas) (2.8.1)\n", "Collecting portalocker==2.0.0\n", " Downloading https://files.pythonhosted.org/packages/89/a6/3814b7107e0788040870e8825eebf214d72166adf656ba7d4bf14759a06a/portalocker-2.0.0-py2.py3-none-any.whl\n", "Requirement already satisfied: protobuf>=3.8.0 in /usr/local/lib/python3.7/dist-packages (from tensorboardX) (3.12.4)\n", "Collecting morfessor\n", " Downloading https://files.pythonhosted.org/packages/39/e6/7afea30be2ee4d29ce9de0fa53acbb033163615f849515c0b1956ad074ee/Morfessor-2.0.6-py3-none-any.whl\n", "Collecting sphinx-argparse\n", " Downloading https://files.pythonhosted.org/packages/06/2b/dfad6a1831c3aeeae25d8d3d417224684befbf45e10c7f2141631616a6ed/sphinx-argparse-0.2.5.tar.gz\n", "Collecting sphinx-rtd-theme\n", "\u001b[?25l Downloading 
https://files.pythonhosted.org/packages/ac/24/2475e8f83519b54b2148d4a56eb1111f9cec630d088c3ffc214492c12107/sphinx_rtd_theme-0.5.2-py2.py3-none-any.whl (9.1MB)\n", "\u001b[K |████████████████████████████████| 9.2MB 21.7MB/s \n", "\u001b[?25hRequirement already satisfied: setuptools in /usr/local/lib/python3.7/dist-packages (from protobuf>=3.8.0->tensorboardX) (57.0.0)\n", "Requirement already satisfied: sphinx>=1.2.0 in /usr/local/lib/python3.7/dist-packages (from sphinx-argparse->indic-nlp-library) (1.8.5)\n", "Collecting docutils<0.17\n", "\u001b[?25l Downloading https://files.pythonhosted.org/packages/81/44/8a15e45ffa96e6cf82956dd8d7af9e666357e16b0d93b253903475ee947f/docutils-0.16-py2.py3-none-any.whl (548kB)\n", "\u001b[K |████████████████████████████████| 552kB 38.5MB/s \n", "\u001b[?25hRequirement already satisfied: packaging in /usr/local/lib/python3.7/dist-packages (from sphinx>=1.2.0->sphinx-argparse->indic-nlp-library) (20.9)\n", "Requirement already satisfied: imagesize in /usr/local/lib/python3.7/dist-packages (from sphinx>=1.2.0->sphinx-argparse->indic-nlp-library) (1.2.0)\n", "Requirement already satisfied: requests>=2.0.0 in /usr/local/lib/python3.7/dist-packages (from sphinx>=1.2.0->sphinx-argparse->indic-nlp-library) (2.23.0)\n", "Requirement already satisfied: sphinxcontrib-websupport in /usr/local/lib/python3.7/dist-packages (from sphinx>=1.2.0->sphinx-argparse->indic-nlp-library) (1.2.4)\n", "Requirement already satisfied: Pygments>=2.0 in /usr/local/lib/python3.7/dist-packages (from sphinx>=1.2.0->sphinx-argparse->indic-nlp-library) (2.6.1)\n", "Requirement already satisfied: snowballstemmer>=1.1 in /usr/local/lib/python3.7/dist-packages (from sphinx>=1.2.0->sphinx-argparse->indic-nlp-library) (2.1.0)\n", "Requirement already satisfied: babel!=2.0,>=1.3 in /usr/local/lib/python3.7/dist-packages (from sphinx>=1.2.0->sphinx-argparse->indic-nlp-library) (2.9.1)\n", "Requirement already satisfied: alabaster<0.8,>=0.7 in 
/usr/local/lib/python3.7/dist-packages (from sphinx>=1.2.0->sphinx-argparse->indic-nlp-library) (0.7.12)\n", "Requirement already satisfied: Jinja2>=2.3 in /usr/local/lib/python3.7/dist-packages (from sphinx>=1.2.0->sphinx-argparse->indic-nlp-library) (2.11.3)\n", "Requirement already satisfied: pyparsing>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from packaging->sphinx>=1.2.0->sphinx-argparse->indic-nlp-library) (2.4.7)\n", "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests>=2.0.0->sphinx>=1.2.0->sphinx-argparse->indic-nlp-library) (1.24.3)\n", "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests>=2.0.0->sphinx>=1.2.0->sphinx-argparse->indic-nlp-library) (2.10)\n", "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests>=2.0.0->sphinx>=1.2.0->sphinx-argparse->indic-nlp-library) (3.0.4)\n", "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests>=2.0.0->sphinx>=1.2.0->sphinx-argparse->indic-nlp-library) (2021.5.30)\n", "Requirement already satisfied: sphinxcontrib-serializinghtml in /usr/local/lib/python3.7/dist-packages (from sphinxcontrib-websupport->sphinx>=1.2.0->sphinx-argparse->indic-nlp-library) (1.1.5)\n", "Requirement already satisfied: MarkupSafe>=0.23 in /usr/local/lib/python3.7/dist-packages (from Jinja2>=2.3->sphinx>=1.2.0->sphinx-argparse->indic-nlp-library) (2.0.1)\n", "Building wheels for collected packages: sphinx-argparse\n", " Building wheel for sphinx-argparse (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", " Created wheel for sphinx-argparse: filename=sphinx_argparse-0.2.5-cp37-none-any.whl size=11552 sha256=0f3830a0bf7a6cfa99000091da945e9dd814b2f1e1f9ca5d773f99aaa0d3a4a5\n", " Stored in directory: /root/.cache/pip/wheels/2a/18/1b/4990a1859da4edc77ab312bc2986c08d2733fb5713d06e44f5\n", "Successfully built sphinx-argparse\n", "\u001b[31mERROR: datascience 0.10.6 has requirement folium==0.2.1, but you'll have folium 0.8.3 which is incompatible.\u001b[0m\n", "Installing collected packages: sacremoses, mock, portalocker, sacrebleu, tensorboardX, morfessor, sphinx-argparse, docutils, sphinx-rtd-theme, indic-nlp-library\n", " Found existing installation: docutils 0.17.1\n", " Uninstalling docutils-0.17.1:\n", " Successfully uninstalled docutils-0.17.1\n", "Successfully installed docutils-0.16 indic-nlp-library-0.81 mock-4.0.3 morfessor-2.0.6 portalocker-2.0.0 sacrebleu-1.5.1 sacremoses-0.0.45 sphinx-argparse-0.2.5 sphinx-rtd-theme-0.5.2 tensorboardX-2.3\n", "Cloning into 'fairseq'...\n", "remote: Enumerating objects: 28410, done.\u001b[K\n", "remote: Counting objects: 100% (229/229), done.\u001b[K\n", "remote: Compressing objects: 100% (127/127), done.\u001b[K\n", "remote: Total 28410 (delta 114), reused 187 (delta 99), pack-reused 28181\u001b[K\n", "Receiving objects: 100% (28410/28410), 11.96 MiB | 24.45 MiB/s, done.\n", "Resolving deltas: 100% (21310/21310), done.\n", "/content/training/fairseq\n", "Obtaining file:///content/training/fairseq\n", " Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n", " Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n", " Installing backend dependencies ... \u001b[?25l\u001b[?25hdone\n", " Preparing wheel metadata ... 
\u001b[?25l\u001b[?25hdone\n", "Requirement already satisfied: regex in /usr/local/lib/python3.7/dist-packages (from fairseq==1.0.0a0+f887152) (2019.12.20)\n", "Requirement already satisfied: tqdm in /usr/local/lib/python3.7/dist-packages (from fairseq==1.0.0a0+f887152) (4.41.1)\n", "Collecting omegaconf<2.1\n", " Downloading https://files.pythonhosted.org/packages/d0/eb/9d63ce09dd8aa85767c65668d5414958ea29648a0eec80a4a7d311ec2684/omegaconf-2.0.6-py3-none-any.whl\n", "Requirement already satisfied: numpy; python_version >= \"3.7\" in /usr/local/lib/python3.7/dist-packages (from fairseq==1.0.0a0+f887152) (1.19.5)\n", "Requirement already satisfied: sacrebleu>=1.4.12 in /usr/local/lib/python3.7/dist-packages (from fairseq==1.0.0a0+f887152) (1.5.1)\n", "Requirement already satisfied: cython in /usr/local/lib/python3.7/dist-packages (from fairseq==1.0.0a0+f887152) (0.29.23)\n", "Collecting hydra-core<1.1\n", "\u001b[?25l Downloading https://files.pythonhosted.org/packages/52/e3/fbd70dd0d3ce4d1d75c22d56c0c9f895cfa7ed6587a9ffb821d6812d6a60/hydra_core-1.0.6-py3-none-any.whl (123kB)\n", "\u001b[K |████████████████████████████████| 133kB 4.7MB/s \n", "\u001b[?25hRequirement already satisfied: torch in /usr/local/lib/python3.7/dist-packages (from fairseq==1.0.0a0+f887152) (1.9.0+cu102)\n", "Requirement already satisfied: cffi in /usr/local/lib/python3.7/dist-packages (from fairseq==1.0.0a0+f887152) (1.14.5)\n", "Requirement already satisfied: typing-extensions in /usr/local/lib/python3.7/dist-packages (from omegaconf<2.1->fairseq==1.0.0a0+f887152) (3.7.4.3)\n", "Collecting PyYAML>=5.1.*\n", "\u001b[?25l Downloading https://files.pythonhosted.org/packages/7a/a5/393c087efdc78091afa2af9f1378762f9821c9c1d7a22c5753fb5ac5f97a/PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636kB)\n", "\u001b[K |████████████████████████████████| 645kB 32.4MB/s \n", "\u001b[?25hRequirement already satisfied: portalocker==2.0.0 in /usr/local/lib/python3.7/dist-packages (from 
sacrebleu>=1.4.12->fairseq==1.0.0a0+f887152) (2.0.0)\n", "Requirement already satisfied: importlib-resources; python_version < \"3.9\" in /usr/local/lib/python3.7/dist-packages (from hydra-core<1.1->fairseq==1.0.0a0+f887152) (5.1.4)\n", "Collecting antlr4-python3-runtime==4.8\n", "\u001b[?25l Downloading https://files.pythonhosted.org/packages/56/02/789a0bddf9c9b31b14c3e79ec22b9656185a803dc31c15f006f9855ece0d/antlr4-python3-runtime-4.8.tar.gz (112kB)\n", "\u001b[K |████████████████████████████████| 112kB 53.0MB/s \n", "\u001b[?25hRequirement already satisfied: pycparser in /usr/local/lib/python3.7/dist-packages (from cffi->fairseq==1.0.0a0+f887152) (2.20)\n", "Requirement already satisfied: zipp>=3.1.0; python_version < \"3.10\" in /usr/local/lib/python3.7/dist-packages (from importlib-resources; python_version < \"3.9\"->hydra-core<1.1->fairseq==1.0.0a0+f887152) (3.4.1)\n", "Building wheels for collected packages: antlr4-python3-runtime\n", " Building wheel for antlr4-python3-runtime (setup.py) ... \u001b[?25l\u001b[?25hdone\n", " Created wheel for antlr4-python3-runtime: filename=antlr4_python3_runtime-4.8-cp37-none-any.whl size=141231 sha256=52f59bfe6322a04598da6960d2d5675a581273a45e4391e04cf1240c97346019\n", " Stored in directory: /root/.cache/pip/wheels/e3/e2/fa/b78480b448b8579ddf393bebd3f47ee23aa84c89b6a78285c8\n", "Successfully built antlr4-python3-runtime\n", "Installing collected packages: PyYAML, omegaconf, antlr4-python3-runtime, hydra-core, fairseq\n", " Found existing installation: PyYAML 3.13\n", " Uninstalling PyYAML-3.13:\n", " Successfully uninstalled PyYAML-3.13\n", " Running setup.py develop for fairseq\n", "Successfully installed PyYAML-5.4.1 antlr4-python3-runtime-4.8 fairseq hydra-core-1.0.6 omegaconf-2.0.6\n", "/content/training\n" ] } ], "source": [ "! 
sudo apt install tree\n", "\n", "# Install the necessary libraries\n", "!pip install sacremoses pandas mock sacrebleu tensorboardX pyarrow indic-nlp-library\n", "# Install fairseq from source\n", "!git clone https://github.com/pytorch/fairseq.git\n", "%cd fairseq\n", "# !git checkout da9eaba12d82b9bfc1442f0e2c6fc1b895f4d35d\n", "!pip install --editable ./\n", "%cd .." ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "tmfGYkd58UiO", "outputId": "3b83bcf6-bbbf-4e49-c2bb-7d0fb999297d" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "^C\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "--2021-12-18 21:31:57-- https://storage.googleapis.com/samanantar-public/benchmarks.zip\n", "Resolving storage.googleapis.com (storage.googleapis.com)... 172.217.160.144, 216.58.196.176, 142.250.71.16, ...\n", "Connecting to storage.googleapis.com (storage.googleapis.com)|172.217.160.144|:443... connected.\n", "HTTP request sent, awaiting response... 200 OK\n", "Length: 7301872 (7.0M) [application/zip]\n", "Saving to: 'benchmarks.zip'\n", "\n", " 0K .......... .......... .......... .......... .......... 0% 774K 9s\n", " 50K .......... .......... .......... .......... .......... 1% 2.10M 6s\n", " 100K .......... .......... .......... .......... .......... 2% 2.46M 5s\n", " 150K .......... .......... .......... .......... .......... 2% 2.68M 4s\n", " 200K .......... .......... .......... .......... .......... 3% 1.44M 4s\n", " 250K .......... .......... .......... .......... .......... 4% 2.48M 4s\n", " 300K .......... .......... .......... .......... .......... 4% 3.41M 4s\n", " 350K .......... .......... .......... .......... .......... 5% 2.22M 4s\n", " 400K .......... .......... .......... .......... .......... 6% 1.20M 4s\n", " 450K .......... .......... .......... .......... .......... 7% 2.65M 4s\n", " 500K .......... .......... .......... .......... .......... 
7% 2.97M 3s\n", " 550K .......... .......... .......... .......... .......... 8% 887K 4s\n", " 600K .......... .......... .......... .......... .......... 9% 2.90M 4s\n", " 650K .......... .......... .......... .......... .......... 9% 2.76M 4s\n", " 700K .......... .......... .......... .......... .......... 10% 980K 4s\n", " 750K .......... .......... .......... .......... .......... 11% 2.55M 4s\n", " 800K .......... .......... .......... .......... .......... 11% 2.86M 3s\n", " 850K .......... .......... .......... .......... .......... 12% 3.04M 3s\n", " 900K .......... .......... .......... .......... .......... 13% 1.01M 3s\n", " 950K .......... .......... .......... .......... .......... 14% 3.35M 3s\n", " 1000K .......... .......... .......... .......... .......... 14% 5.04M 3s\n", " 1050K .......... .......... .......... .......... .......... 15% 14.5M 3s\n", " 1100K .......... .......... .......... .......... .......... 16% 1.01M 3s\n", " 1150K .......... .......... .......... .......... .......... 16% 4.48M 3s\n", " 1200K .......... .......... .......... .......... .......... 17% 4.34M 3s\n", " 1250K .......... .......... .......... .......... .......... 18% 2.90M 3s\n", " 1300K .......... .......... .......... .......... .......... 18% 1.14M 3s\n", " 1350K .......... .......... .......... .......... .......... 19% 3.00M 3s\n", " 1400K .......... .......... .......... .......... .......... 20% 5.09M 3s\n", " 1450K .......... .......... .......... .......... .......... 21% 1.91M 3s\n", " 1500K .......... .......... .......... .......... .......... 21% 7.70M 3s\n", " 1550K .......... .......... .......... .......... .......... 22% 1.27M 3s\n", " 1600K .......... .......... .......... .......... .......... 23% 3.06M 3s\n", " 1650K .......... .......... .......... .......... .......... 23% 4.11M 3s\n", " 1700K .......... .......... .......... .......... .......... 24% 3.34M 3s\n", " 1750K .......... .......... .......... .......... .......... 
25% 4.13M 2s\n", " 1800K .......... .......... .......... .......... .......... 25% 7.95M 2s\n", " 1850K .......... .......... .......... .......... .......... 26% 3.69M 2s\n", " 1900K .......... .......... .......... .......... .......... 27% 4.00M 2s\n", " 1950K .......... .......... .......... .......... .......... 28% 3.50M 2s\n", " 2000K .......... .......... .......... .......... .......... 28% 4.04M 2s\n", " 2050K .......... .......... .......... .......... .......... 29% 3.31M 2s\n", " 2100K .......... .......... .......... .......... .......... 30% 2.49M 2s\n", " 2150K .......... .......... .......... .......... .......... 30% 4.19M 2s\n", " 2200K .......... .......... .......... .......... .......... 31% 5.18M 2s\n", " 2250K .......... .......... .......... .......... .......... 32% 9.49M 2s\n", " 2300K .......... .......... .......... .......... .......... 32% 8.67M 2s\n", " 2350K .......... .......... .......... .......... .......... 33% 4.88M 2s\n", " 2400K .......... .......... .......... .......... .......... 34% 4.56M 2s\n", " 2450K .......... .......... .......... .......... .......... 35% 4.94M 2s\n", " 2500K .......... .......... .......... .......... .......... 35% 4.38M 2s\n", " 2550K .......... .......... .......... .......... .......... 36% 3.78M 2s\n", " 2600K .......... .......... .......... .......... .......... 37% 4.95M 2s\n", " 2650K .......... .......... .......... .......... .......... 37% 5.50M 2s\n", " 2700K .......... .......... .......... .......... .......... 38% 5.23M 2s\n", " 2750K .......... .......... .......... .......... .......... 39% 3.77M 2s\n", " 2800K .......... .......... .......... .......... .......... 39% 10.7M 2s\n", " 2850K .......... .......... .......... .......... .......... 40% 7.16M 2s\n", " 2900K .......... .......... .......... .......... .......... 41% 5.36M 2s\n", " 2950K .......... .......... .......... .......... .......... 42% 6.80M 1s\n", " 3000K .......... .......... .......... .......... .......... 
42% 6.57M 1s\n", " 3050K .......... .......... .......... .......... .......... 43% 7.21M 1s\n", " 3100K .......... .......... .......... .......... .......... 44% 6.66M 1s\n", " 3150K .......... .......... .......... .......... .......... 44% 6.42M 1s\n", " 3200K .......... .......... .......... .......... .......... 45% 8.02M 1s\n", " 3250K .......... .......... .......... .......... .......... 46% 5.96M 1s\n", " 3300K .......... .......... .......... .......... .......... 46% 5.13M 1s\n", " 3350K .......... .......... .......... .......... .......... 47% 5.19M 1s\n", " 3400K .......... .......... .......... .......... .......... 48% 7.64M 1s\n", " 3450K .......... .......... .......... .......... .......... 49% 6.11M 1s\n", " 3500K .......... .......... .......... .......... .......... 49% 4.01M 1s\n", " 3550K .......... .......... .......... .......... .......... 50% 4.52M 1s\n", " 3600K .......... .......... .......... .......... .......... 51% 6.72M 1s\n", " 3650K .......... .......... .......... .......... .......... 51% 5.45M 1s\n", " 3700K .......... .......... .......... .......... .......... 52% 4.37M 1s\n", " 3750K .......... .......... .......... .......... .......... 53% 5.39M 1s\n", " 3800K .......... .......... .......... .......... .......... 53% 7.40M 1s\n", " 3850K .......... .......... .......... .......... .......... 54% 6.70M 1s\n", " 3900K .......... .......... .......... .......... .......... 55% 5.14M 1s\n", " 3950K .......... .......... .......... .......... .......... 56% 5.02M 1s\n", " 4000K .......... .......... .......... .......... .......... 56% 6.70M 1s\n", " 4050K .......... .......... .......... .......... .......... 57% 6.76M 1s\n", " 4100K .......... .......... .......... .......... .......... 58% 2.52M 1s\n", " 4150K .......... .......... .......... .......... .......... 58% 887K 1s\n", " 4200K .......... .......... .......... .......... .......... 59% 9.25M 1s\n", " 4250K .......... .......... .......... .......... .......... 
60% 1.27M 1s\n", " 4300K .......... .......... .......... .......... .......... 61% 5.72M 1s\n", " 4350K .......... .......... .......... .......... .......... 61% 4.48M 1s\n", " 4400K .......... .......... .......... .......... .......... 62% 5.20M 1s\n", " 4450K .......... .......... .......... .......... .......... 63% 6.21M 1s\n", " 4500K .......... .......... .......... .......... .......... 63% 7.94M 1s\n", " 4550K .......... .......... .......... .......... .......... 64% 4.76M 1s\n", " 4600K .......... .......... .......... .......... .......... 65% 4.74M 1s\n", " 4650K .......... .......... .......... .......... .......... 65% 6.94M 1s\n", " 4700K .......... .......... .......... .......... .......... 66% 5.62M 1s\n", " 4750K .......... .......... .......... .......... .......... 67% 4.44M 1s\n", " 4800K .......... .......... .......... .......... .......... 68% 6.02M 1s\n", " 4850K .......... .......... .......... .......... .......... 68% 6.61M 1s\n", " 4900K .......... .......... .......... .......... .......... 69% 3.04M 1s\n", " 4950K .......... .......... .......... .......... .......... 70% 5.34M 1s\n", " 5000K .......... .......... .......... .......... .......... 70% 3.03M 1s\n", " 5050K .......... .......... .......... .......... .......... 71% 19.8M 1s\n", " 5100K .......... .......... .......... .......... .......... 72% 6.17M 1s\n", " 5150K .......... .......... .......... .......... .......... 72% 5.58M 1s\n", " 5200K .......... .......... .......... .......... .......... 73% 7.38M 1s\n", " 5250K .......... .......... .......... .......... .......... 74% 7.11M 1s\n", " 5300K .......... .......... .......... .......... .......... 75% 6.24M 1s\n", " 5350K .......... .......... .......... .......... .......... 75% 4.62M 1s\n", " 5400K .......... .......... .......... .......... .......... 76% 7.64M 0s\n", " 5450K .......... .......... .......... .......... .......... 77% 6.06M 0s\n", " 5500K .......... .......... .......... .......... .......... 
77% 5.56M 0s\n", " 5550K .......... .......... .......... .......... .......... 78% 2.96M 0s\n", " 5600K .......... .......... .......... .......... .......... 79% 6.17M 0s\n", " 5650K .......... .......... .......... .......... .......... 79% 9.58M 0s\n", " 5700K .......... .......... .......... .......... .......... 80% 2.58M 0s\n", " 5750K .......... .......... .......... .......... .......... 81% 4.23M 0s\n", " 5800K .......... .......... .......... .......... .......... 82% 5.70M 0s\n", " 5850K .......... .......... .......... .......... .......... 82% 4.72M 0s\n", " 5900K .......... .......... .......... .......... .......... 83% 6.52M 0s\n", " 5950K .......... .......... .......... .......... .......... 84% 5.86M 0s\n", " 6000K .......... .......... .......... .......... .......... 84% 5.22M 0s\n", " 6050K .......... .......... .......... .......... .......... 85% 5.50M 0s\n", " 6100K .......... .......... .......... .......... .......... 86% 6.29M 0s\n", " 6150K .......... .......... .......... .......... .......... 86% 6.93M 0s\n", " 6200K .......... .......... .......... .......... .......... 87% 5.50M 0s\n", " 6250K .......... .......... .......... .......... .......... 88% 5.82M 0s\n", " 6300K .......... .......... .......... .......... .......... 89% 6.76M 0s\n", " 6350K .......... .......... .......... .......... .......... 89% 3.73M 0s\n", " 6400K .......... .......... .......... .......... .......... 90% 5.98M 0s\n", " 6450K .......... .......... .......... .......... .......... 91% 5.78M 0s\n", " 6500K .......... .......... .......... .......... .......... 91% 5.60M 0s\n", " 6550K .......... .......... .......... .......... .......... 92% 4.84M 0s\n", " 6600K .......... .......... .......... .......... .......... 93% 7.25M 0s\n", " 6650K .......... .......... .......... .......... .......... 93% 2.60M 0s\n", " 6700K .......... .......... .......... .......... .......... 94% 6.02M 0s\n", " 6750K .......... .......... .......... .......... .......... 
95% 6.57M 0s\n", " 6800K .......... .......... .......... .......... .......... 96% 8.30M 0s\n", " 6850K .......... .......... .......... .......... .......... 96% 14.4M 0s\n", " 6900K .......... .......... .......... .......... .......... 97% 4.58M 0s\n", " 6950K .......... .......... .......... .......... .......... 98% 3.31M 0s\n", " 7000K .......... .......... .......... .......... .......... 98% 6.88M 0s\n", " 7050K .......... .......... .......... .......... .......... 99% 4.40M 0s\n", " 7100K .......... .......... .......... 100% 15.1M=1.9s\n", "\n", "2021-12-18 21:32:01 (3.64 MB/s) - 'benchmarks.zip' saved [7301872/7301872]\n", "\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Archive: samanatar-en-indic-v0.2.zip\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ " End-of-central-directory signature not found. Either this file is not\n", " a zipfile, or it constitutes one disk of a multi-part archive. In the\n", " latter case the central directory and zipfile comment will be found on\n", " the last disk(s) of this archive.\n", "unzip: cannot find zipfile directory in one of samanatar-en-indic-v0.2.zip or\n", " samanatar-en-indic-v0.2.zip.zip, and cannot find samanatar-en-indic-v0.2.zip.ZIP, period.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Archive: benchmarks.zip\n", " creating: benchmarks/\n", " creating: benchmarks/pmi/\n", " creating: benchmarks/pmi/en-as/\n", " inflating: benchmarks/pmi/en-as/dev.as \n", " inflating: benchmarks/pmi/en-as/dev.en \n", " inflating: benchmarks/pmi/en-as/test.as \n", " inflating: benchmarks/pmi/en-as/test.en \n", " creating: benchmarks/wat2021-devtest/\n", " inflating: benchmarks/wat2021-devtest/dev.gu \n", " inflating: benchmarks/wat2021-devtest/dev.en \n", " inflating: benchmarks/wat2021-devtest/test.bn \n", " inflating: benchmarks/wat2021-devtest/dev.bn \n", " inflating: benchmarks/wat2021-devtest/test.hi \n", " inflating: benchmarks/wat2021-devtest/dev.kn \n", " 
inflating: benchmarks/wat2021-devtest/dev.ta \n", " inflating: benchmarks/wat2021-devtest/test.pa \n", " inflating: benchmarks/wat2021-devtest/test.en \n", " inflating: benchmarks/wat2021-devtest/test.mr \n", " inflating: benchmarks/wat2021-devtest/test.kn \n", " inflating: benchmarks/wat2021-devtest/dev.ml \n", " inflating: benchmarks/wat2021-devtest/test.ta \n", " inflating: benchmarks/wat2021-devtest/test.gu \n", " inflating: benchmarks/wat2021-devtest/dev.or \n", " inflating: benchmarks/wat2021-devtest/test.or \n", " inflating: benchmarks/wat2021-devtest/test.te \n", " inflating: benchmarks/wat2021-devtest/dev.mr \n", " inflating: benchmarks/wat2021-devtest/test.ml \n", " inflating: benchmarks/wat2021-devtest/dev.pa \n", " inflating: benchmarks/wat2021-devtest/dev.te \n", " inflating: benchmarks/wat2021-devtest/dev.hi \n", " creating: benchmarks/wat2020-devtest/\n", " creating: benchmarks/wat2020-devtest/en-bn/\n", " inflating: benchmarks/wat2020-devtest/en-bn/dev.en \n", " inflating: benchmarks/wat2020-devtest/en-bn/test.bn \n", " inflating: benchmarks/wat2020-devtest/en-bn/dev.bn \n", " inflating: benchmarks/wat2020-devtest/en-bn/test.en \n", " creating: benchmarks/wat2020-devtest/en-ta/\n", " inflating: benchmarks/wat2020-devtest/en-ta/dev.en \n", " inflating: benchmarks/wat2020-devtest/en-ta/dev.ta \n", " inflating: benchmarks/wat2020-devtest/en-ta/test.en \n", " inflating: benchmarks/wat2020-devtest/en-ta/test.ta \n", " creating: benchmarks/wat2020-devtest/en-mr/\n", " inflating: benchmarks/wat2020-devtest/en-mr/dev.en \n", " inflating: benchmarks/wat2020-devtest/en-mr/test.en \n", " inflating: benchmarks/wat2020-devtest/en-mr/test.mr \n", " inflating: benchmarks/wat2020-devtest/en-mr/dev.mr \n", " creating: benchmarks/wat2020-devtest/en-te/\n", " inflating: benchmarks/wat2020-devtest/en-te/dev.en \n", " inflating: benchmarks/wat2020-devtest/en-te/test.en \n", " inflating: benchmarks/wat2020-devtest/en-te/test.te \n", " inflating: 
benchmarks/wat2020-devtest/en-te/dev.te \n", " creating: benchmarks/wat2020-devtest/en-hi/\n", " inflating: benchmarks/wat2020-devtest/en-hi/dev.en \n", " inflating: benchmarks/wat2020-devtest/en-hi/test.hi \n", " inflating: benchmarks/wat2020-devtest/en-hi/test.en \n", " inflating: benchmarks/wat2020-devtest/en-hi/dev.hi \n", " creating: benchmarks/wat2020-devtest/en-gu/\n", " inflating: benchmarks/wat2020-devtest/en-gu/dev.gu \n", " inflating: benchmarks/wat2020-devtest/en-gu/dev.en \n", " inflating: benchmarks/wat2020-devtest/en-gu/test.en \n", " inflating: benchmarks/wat2020-devtest/en-gu/test.gu \n", " creating: benchmarks/wat2020-devtest/en-ml/\n", " inflating: benchmarks/wat2020-devtest/en-ml/dev.en \n", " inflating: benchmarks/wat2020-devtest/en-ml/test.en \n", " inflating: benchmarks/wat2020-devtest/en-ml/dev.ml \n", " inflating: benchmarks/wat2020-devtest/en-ml/test.ml \n", " creating: benchmarks/ufal-ta/\n", " creating: benchmarks/ufal-ta/en-ta/\n", " inflating: benchmarks/ufal-ta/en-ta/dev.en \n", " inflating: benchmarks/ufal-ta/en-ta/dev.ta \n", " inflating: benchmarks/ufal-ta/en-ta/test.en \n", " inflating: benchmarks/ufal-ta/en-ta/test.ta \n", " creating: benchmarks/wmt-news/\n", " creating: benchmarks/wmt-news/en-ta/\n", " inflating: benchmarks/wmt-news/en-ta/dev.en \n", " inflating: benchmarks/wmt-news/en-ta/dev.ta \n", " inflating: benchmarks/wmt-news/en-ta/test.en \n", " inflating: benchmarks/wmt-news/en-ta/test.ta \n", " creating: benchmarks/wmt-news/en-hi/\n", " inflating: benchmarks/wmt-news/en-hi/dev.en \n", " inflating: benchmarks/wmt-news/en-hi/test.hi \n", " inflating: benchmarks/wmt-news/en-hi/test.en \n", " inflating: benchmarks/wmt-news/en-hi/dev.hi \n", " creating: benchmarks/wmt-news/en-gu/\n", " inflating: benchmarks/wmt-news/en-gu/test.en \n", " inflating: benchmarks/wmt-news/en-gu/test.gu \n" ] } ], "source": [ "## for the latest samanantar dataset v0.3 -> please use this link: 
https://storage.googleapis.com/samanantar-public/V0.3/source_wise_splits.zip\n", "# This v0.3 dataset has source wise splits to indicate where the data has been collected from\n", "# For preprocessing simplicity we will use v0.2( which just uses raw text files without source information) in this tutorial\n", "# \n", "# \n", "# lets now download the indictrans data v0.2 dataset\n", "! wget https://storage.googleapis.com/samanantar-public/V0.2/data/en2indic/samanatar-en-indic-v0.2.zip\n", "\n", "\n", "\n", "# lets also download the benchmarks for dev and test set\n", "\n", "! wget https://storage.googleapis.com/samanantar-public/benchmarks.zip\n", "\n", "# training data is organized as en-X folders where each folder contains two text files containing parallel data for en-X lang pair.\n", "\n", "# final_data\n", "# ├── en-as\n", "# │ ├── train.as\n", "# │ └── train.en\n", "# ├── en-bn\n", "# │ ├── train.bn\n", "# │ └── train.en\n", "# ├── en-gu\n", "# │ ├── train.en\n", "# │ └── train.gu\n", "# ├── en-hi\n", "# │ ├── train.en\n", "# │ └── train.hi\n", "# ├── en-kn\n", "# │ ├── train.en\n", "# │ └── train.kn\n", "# ├── en-ml\n", "# │ ├── train.en\n", "# │ └── train.ml\n", "# ├── en-mr\n", "# │ ├── train.en\n", "# │ └── train.mr\n", "# ├── en-or\n", "# │ ├── train.en\n", "# │ └── train.or\n", "# ├── en-pa\n", "# │ ├── train.en\n", "# │ └── train.pa\n", "# ├── en-ta\n", "# │ ├── train.en\n", "# │ └── train.ta\n", "# └── en-te\n", "# ├── train.en\n", "# └── train.te\n", "\n", "\n", "! unzip samanatar-en-indic-v0.2.zip\n", "\n", "# benchmarks folder consists of all the benchmarks we report in the paper - pmi, ufal-ta, wat2020, wat2021, wmt-news\n", "\n", "! unzip benchmarks.zip" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "MR_2GQoa84Jn" }, "outputs": [], "source": [ "# create an experiment dir to store train data, devtest data. 
\n", "# This folder will also store vocabulary files (created with subword_nmt for bpe), fairseq bin files (for training), model checkpoints.\n", "\n", "# for this example we will be training indic to en translation model. We will name our exp_dir as indic-en-exp\n", "! mkdir indic-en-exp\n", "# copying all the train folders to exp_dir\n", "! cp -r final_data/* indic-en-exp\n", "\n", "! mkdir -p indic-en-exp/devtest\n", "\n", "# copying all benchmarks to devtest folder in exp_dir\n", "! cp -r benchmarks/* indic-en-exp/devtest\n", "\n", "# folder to store combined devtest data (based on the domains you want to test, you can combine multiple benchmarks dev datasets, remove duplicates)\n", "! mkdir -p indic-en-exp/devtest/all\n", "\n", "# in this tutorial, for simplicity, we will just use wat2020 devtest for dev and test set\n", "! cp -r indic-en-exp/devtest/wat2020-devtest/* indic-en-exp/devtest/all\n", "\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "lorcT8wkFPtQ" }, "outputs": [], "source": [ "% cd indicTrans" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "vhvYXUc1FaVn" }, "outputs": [], "source": [ "# prepare_data_joint_training.sh takes experiment dir, src_lang, tgt_lang as input \n", "# This does preprocessing, building vocab, binarization for joint training\n", "\n", "# The learning and applying vocabulary will take a while if the dataset is huge. To make it faster, run it on a multicore system\n", "\n", "! 
bash prepare_data_joint_training.sh '../indic-en-exp' 'indic' 'en'" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "p1i3fRQzF2-x" }, "outputs": [], "source": [ "# Training the model\n", "\n", "# please refer to fairseq documentation to know more about each of these options (https://fairseq.readthedocs.io/en/latest/command_line_tools.html)\n", "\n", "\n", "# some notable args:\n", "# --max-update -> maximum update steps the model will be trained for\n", "# --arch=transformer_4x -> we use a custom transformer model and name it transformer_4x (4 times the parameter size of transformer base)\n", "# --user-dir -> we define the custom transformer arch in model_configs folder and pass it as an argument to user_dir for fairseq to register this architecture\n", "# --lr -> learning rate. From our limited experiments, we find that lower learning rates like 3e-5 works best for finetuning.\n", "# --max-tokens -> this is max tokens per batch. You should limit to lower values if you get oom errors.\n", "# --update-freq -> gradient accumulation steps\n", "\n", "# NOTE: the values for --max-update, --wandb-project, --update-freq, --distributed-world-size and --max-tokens below are examples; tune them for your hardware and experiment\n", "\n", "!( fairseq-train ../indic-en-exp/final_bin \\\n", "--max-source-positions=210 \\\n", "--max-target-positions=210 \\\n", "--max-update=1000000 \\\n", "--save-interval=1 \\\n", "--arch=transformer_4x \\\n", "--criterion=label_smoothed_cross_entropy \\\n", "--source-lang=SRC \\\n", "--lr-scheduler=inverse_sqrt \\\n", "--target-lang=TGT \\\n", "--label-smoothing=0.1 \\\n", "--optimizer adam \\\n", "--adam-betas \"(0.9, 0.98)\" \\\n", "--clip-norm 1.0 \\\n", "--warmup-init-lr 1e-07 \\\n", "--lr 0.0005 \\\n", "--warmup-updates 4000 \\\n", "--dropout 0.2 \\\n", "--save-dir ../indic-en-exp/model \\\n", "--keep-last-epochs 5 \\\n", "--patience 5 \\\n", "--skip-invalid-size-inputs-valid-test \\\n", "--fp16 \\\n", "--user-dir model_configs \\\n", "--wandb-project indictrans-finetuning \\\n", "--update-freq=2 \\\n", "--distributed-world-size 1 \\\n", "--max-tokens 256 )" ] } ], "metadata": { "colab": { "authorship_tag": 
"ABX9TyO6AA5gXphZ5kJ6h+dgeSqb", "collapsed_sections": [], "include_colab_link": true, "name": "IndicTrans_training.ipynb", "provenance": [] }, "kernelspec": { "display_name": "Python 3", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.7" } }, "nbformat": 4, "nbformat_minor": 0 }