{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "FdyHSnoj7Iun",
"outputId": "d0624c60-68c4-470f-9ade-c517e3296044"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"/content/training\n"
]
}
],
"source": [
"# create a seperate folder to store everything\n",
"!mkdir training\n",
"%cd training"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "y55OfxBz8QeP",
"outputId": "6d0ab016-0f96-4671-ddee-f06b50506dcd"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Cloning into 'indicTrans'...\n",
"remote: Enumerating objects: 432, done.\u001b[K\n",
"remote: Counting objects: 100% (139/139), done.\u001b[K\n",
"remote: Compressing objects: 100% (34/34), done.\u001b[K\n",
"remote: Total 432 (delta 122), reused 105 (delta 105), pack-reused 293\u001b[K\n",
"Receiving objects: 100% (432/432), 1.43 MiB | 14.11 MiB/s, done.\n",
"Resolving deltas: 100% (248/248), done.\n",
"/content/training/indicTrans\n",
"Cloning into 'indic_nlp_library'...\n",
"remote: Enumerating objects: 1325, done.\u001b[K\n",
"remote: Counting objects: 100% (147/147), done.\u001b[K\n",
"remote: Compressing objects: 100% (103/103), done.\u001b[K\n",
"remote: Total 1325 (delta 84), reused 89 (delta 41), pack-reused 1178\u001b[K\n",
"Receiving objects: 100% (1325/1325), 9.57 MiB | 10.51 MiB/s, done.\n",
"Resolving deltas: 100% (688/688), done.\n",
"Cloning into 'indic_nlp_resources'...\n",
"remote: Enumerating objects: 133, done.\u001b[K\n",
"remote: Counting objects: 100% (7/7), done.\u001b[K\n",
"remote: Compressing objects: 100% (7/7), done.\u001b[K\n",
"remote: Total 133 (delta 0), reused 2 (delta 0), pack-reused 126\u001b[K\n",
"Receiving objects: 100% (133/133), 149.77 MiB | 34.05 MiB/s, done.\n",
"Resolving deltas: 100% (51/51), done.\n",
"Checking out files: 100% (28/28), done.\n",
"Cloning into 'subword-nmt'...\n",
"remote: Enumerating objects: 580, done.\u001b[K\n",
"remote: Counting objects: 100% (4/4), done.\u001b[K\n",
"remote: Compressing objects: 100% (4/4), done.\u001b[K\n",
"remote: Total 580 (delta 0), reused 1 (delta 0), pack-reused 576\u001b[K\n",
"Receiving objects: 100% (580/580), 237.41 KiB | 5.28 MiB/s, done.\n",
"Resolving deltas: 100% (349/349), done.\n",
"/content/training\n"
]
}
],
"source": [
"# clone the repo for running finetuning\n",
"!git clone https://github.com/AI4Bharat/indicTrans.git\n",
"%cd indicTrans\n",
"# clone requirements repositories\n",
"!git clone https://github.com/anoopkunchukuttan/indic_nlp_library.git\n",
"!git clone https://github.com/anoopkunchukuttan/indic_nlp_resources.git\n",
"!git clone https://github.com/rsennrich/subword-nmt.git\n",
"%cd .."
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "ziWWl-1a8SMw",
"outputId": "d7908a62-9573-4693-e7cb-44aeeebaaa15"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Reading package lists... Done\n",
"Building dependency tree \n",
"Reading state information... Done\n",
"The following NEW packages will be installed:\n",
" tree\n",
"0 upgraded, 1 newly installed, 0 to remove and 39 not upgraded.\n",
"Need to get 40.7 kB of archives.\n",
"After this operation, 105 kB of additional disk space will be used.\n",
"Get:1 http://archive.ubuntu.com/ubuntu bionic/universe amd64 tree amd64 1.7.0-5 [40.7 kB]\n",
"Fetched 40.7 kB in 0s (133 kB/s)\n",
"debconf: unable to initialize frontend: Dialog\n",
"debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/perl5/Debconf/FrontEnd/Dialog.pm line 76, <> line 1.)\n",
"debconf: falling back to frontend: Readline\n",
"debconf: unable to initialize frontend: Readline\n",
"debconf: (This frontend requires a controlling tty.)\n",
"debconf: falling back to frontend: Teletype\n",
"dpkg-preconfigure: unable to re-open stdin: \n",
"Selecting previously unselected package tree.\n",
"(Reading database ... 160772 files and directories currently installed.)\n",
"Preparing to unpack .../tree_1.7.0-5_amd64.deb ...\n",
"Unpacking tree (1.7.0-5) ...\n",
"Setting up tree (1.7.0-5) ...\n",
"Processing triggers for man-db (2.8.3-2ubuntu0.1) ...\n",
"Collecting sacremoses\n",
"\u001b[?25l Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)\n",
"\u001b[K |████████████████████████████████| 901kB 4.0MB/s \n",
"\u001b[?25hRequirement already satisfied: pandas in /usr/local/lib/python3.7/dist-packages (1.1.5)\n",
"Collecting mock\n",
" Downloading https://files.pythonhosted.org/packages/5c/03/b7e605db4a57c0f6fba744b11ef3ddf4ddebcada35022927a2b5fc623fdf/mock-4.0.3-py3-none-any.whl\n",
"Collecting sacrebleu\n",
"\u001b[?25l Downloading https://files.pythonhosted.org/packages/7e/57/0c7ca4e31a126189dab99c19951910bd081dea5bbd25f24b77107750eae7/sacrebleu-1.5.1-py3-none-any.whl (54kB)\n",
"\u001b[K |████████████████████████████████| 61kB 7.4MB/s \n",
"\u001b[?25hCollecting tensorboardX\n",
"\u001b[?25l Downloading https://files.pythonhosted.org/packages/42/36/2b147652c40c3a858efa0afbf7b8236fae968e88ff530511a4cfa299a506/tensorboardX-2.3-py2.py3-none-any.whl (124kB)\n",
"\u001b[K |████████████████████████████████| 133kB 24.0MB/s \n",
"\u001b[?25hRequirement already satisfied: pyarrow in /usr/local/lib/python3.7/dist-packages (3.0.0)\n",
"Collecting indic-nlp-library\n",
"\u001b[?25l Downloading https://files.pythonhosted.org/packages/84/d4/495bb43b88a2a6d04b09c29fc5115f24872af74cd8317fe84026abd4ddb1/indic_nlp_library-0.81-py3-none-any.whl (40kB)\n",
"\u001b[K |████████████████████████████████| 40kB 5.4MB/s \n",
"\u001b[?25hRequirement already satisfied: six in /usr/local/lib/python3.7/dist-packages (from sacremoses) (1.15.0)\n",
"Requirement already satisfied: regex in /usr/local/lib/python3.7/dist-packages (from sacremoses) (2019.12.20)\n",
"Requirement already satisfied: click in /usr/local/lib/python3.7/dist-packages (from sacremoses) (7.1.2)\n",
"Requirement already satisfied: tqdm in /usr/local/lib/python3.7/dist-packages (from sacremoses) (4.41.1)\n",
"Requirement already satisfied: joblib in /usr/local/lib/python3.7/dist-packages (from sacremoses) (1.0.1)\n",
"Requirement already satisfied: pytz>=2017.2 in /usr/local/lib/python3.7/dist-packages (from pandas) (2018.9)\n",
"Requirement already satisfied: numpy>=1.15.4 in /usr/local/lib/python3.7/dist-packages (from pandas) (1.19.5)\n",
"Requirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.7/dist-packages (from pandas) (2.8.1)\n",
"Collecting portalocker==2.0.0\n",
" Downloading https://files.pythonhosted.org/packages/89/a6/3814b7107e0788040870e8825eebf214d72166adf656ba7d4bf14759a06a/portalocker-2.0.0-py2.py3-none-any.whl\n",
"Requirement already satisfied: protobuf>=3.8.0 in /usr/local/lib/python3.7/dist-packages (from tensorboardX) (3.12.4)\n",
"Collecting morfessor\n",
" Downloading https://files.pythonhosted.org/packages/39/e6/7afea30be2ee4d29ce9de0fa53acbb033163615f849515c0b1956ad074ee/Morfessor-2.0.6-py3-none-any.whl\n",
"Collecting sphinx-argparse\n",
" Downloading https://files.pythonhosted.org/packages/06/2b/dfad6a1831c3aeeae25d8d3d417224684befbf45e10c7f2141631616a6ed/sphinx-argparse-0.2.5.tar.gz\n",
"Collecting sphinx-rtd-theme\n",
"\u001b[?25l Downloading https://files.pythonhosted.org/packages/ac/24/2475e8f83519b54b2148d4a56eb1111f9cec630d088c3ffc214492c12107/sphinx_rtd_theme-0.5.2-py2.py3-none-any.whl (9.1MB)\n",
"\u001b[K |████████████████████████████████| 9.2MB 21.7MB/s \n",
"\u001b[?25hRequirement already satisfied: setuptools in /usr/local/lib/python3.7/dist-packages (from protobuf>=3.8.0->tensorboardX) (57.0.0)\n",
"Requirement already satisfied: sphinx>=1.2.0 in /usr/local/lib/python3.7/dist-packages (from sphinx-argparse->indic-nlp-library) (1.8.5)\n",
"Collecting docutils<0.17\n",
"\u001b[?25l Downloading https://files.pythonhosted.org/packages/81/44/8a15e45ffa96e6cf82956dd8d7af9e666357e16b0d93b253903475ee947f/docutils-0.16-py2.py3-none-any.whl (548kB)\n",
"\u001b[K |████████████████████████████████| 552kB 38.5MB/s \n",
"\u001b[?25hRequirement already satisfied: packaging in /usr/local/lib/python3.7/dist-packages (from sphinx>=1.2.0->sphinx-argparse->indic-nlp-library) (20.9)\n",
"Requirement already satisfied: imagesize in /usr/local/lib/python3.7/dist-packages (from sphinx>=1.2.0->sphinx-argparse->indic-nlp-library) (1.2.0)\n",
"Requirement already satisfied: requests>=2.0.0 in /usr/local/lib/python3.7/dist-packages (from sphinx>=1.2.0->sphinx-argparse->indic-nlp-library) (2.23.0)\n",
"Requirement already satisfied: sphinxcontrib-websupport in /usr/local/lib/python3.7/dist-packages (from sphinx>=1.2.0->sphinx-argparse->indic-nlp-library) (1.2.4)\n",
"Requirement already satisfied: Pygments>=2.0 in /usr/local/lib/python3.7/dist-packages (from sphinx>=1.2.0->sphinx-argparse->indic-nlp-library) (2.6.1)\n",
"Requirement already satisfied: snowballstemmer>=1.1 in /usr/local/lib/python3.7/dist-packages (from sphinx>=1.2.0->sphinx-argparse->indic-nlp-library) (2.1.0)\n",
"Requirement already satisfied: babel!=2.0,>=1.3 in /usr/local/lib/python3.7/dist-packages (from sphinx>=1.2.0->sphinx-argparse->indic-nlp-library) (2.9.1)\n",
"Requirement already satisfied: alabaster<0.8,>=0.7 in /usr/local/lib/python3.7/dist-packages (from sphinx>=1.2.0->sphinx-argparse->indic-nlp-library) (0.7.12)\n",
"Requirement already satisfied: Jinja2>=2.3 in /usr/local/lib/python3.7/dist-packages (from sphinx>=1.2.0->sphinx-argparse->indic-nlp-library) (2.11.3)\n",
"Requirement already satisfied: pyparsing>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from packaging->sphinx>=1.2.0->sphinx-argparse->indic-nlp-library) (2.4.7)\n",
"Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests>=2.0.0->sphinx>=1.2.0->sphinx-argparse->indic-nlp-library) (1.24.3)\n",
"Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests>=2.0.0->sphinx>=1.2.0->sphinx-argparse->indic-nlp-library) (2.10)\n",
"Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests>=2.0.0->sphinx>=1.2.0->sphinx-argparse->indic-nlp-library) (3.0.4)\n",
"Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests>=2.0.0->sphinx>=1.2.0->sphinx-argparse->indic-nlp-library) (2021.5.30)\n",
"Requirement already satisfied: sphinxcontrib-serializinghtml in /usr/local/lib/python3.7/dist-packages (from sphinxcontrib-websupport->sphinx>=1.2.0->sphinx-argparse->indic-nlp-library) (1.1.5)\n",
"Requirement already satisfied: MarkupSafe>=0.23 in /usr/local/lib/python3.7/dist-packages (from Jinja2>=2.3->sphinx>=1.2.0->sphinx-argparse->indic-nlp-library) (2.0.1)\n",
"Building wheels for collected packages: sphinx-argparse\n",
" Building wheel for sphinx-argparse (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
" Created wheel for sphinx-argparse: filename=sphinx_argparse-0.2.5-cp37-none-any.whl size=11552 sha256=0f3830a0bf7a6cfa99000091da945e9dd814b2f1e1f9ca5d773f99aaa0d3a4a5\n",
" Stored in directory: /root/.cache/pip/wheels/2a/18/1b/4990a1859da4edc77ab312bc2986c08d2733fb5713d06e44f5\n",
"Successfully built sphinx-argparse\n",
"\u001b[31mERROR: datascience 0.10.6 has requirement folium==0.2.1, but you'll have folium 0.8.3 which is incompatible.\u001b[0m\n",
"Installing collected packages: sacremoses, mock, portalocker, sacrebleu, tensorboardX, morfessor, sphinx-argparse, docutils, sphinx-rtd-theme, indic-nlp-library\n",
" Found existing installation: docutils 0.17.1\n",
" Uninstalling docutils-0.17.1:\n",
" Successfully uninstalled docutils-0.17.1\n",
"Successfully installed docutils-0.16 indic-nlp-library-0.81 mock-4.0.3 morfessor-2.0.6 portalocker-2.0.0 sacrebleu-1.5.1 sacremoses-0.0.45 sphinx-argparse-0.2.5 sphinx-rtd-theme-0.5.2 tensorboardX-2.3\n",
"Cloning into 'fairseq'...\n",
"remote: Enumerating objects: 28410, done.\u001b[K\n",
"remote: Counting objects: 100% (229/229), done.\u001b[K\n",
"remote: Compressing objects: 100% (127/127), done.\u001b[K\n",
"remote: Total 28410 (delta 114), reused 187 (delta 99), pack-reused 28181\u001b[K\n",
"Receiving objects: 100% (28410/28410), 11.96 MiB | 24.45 MiB/s, done.\n",
"Resolving deltas: 100% (21310/21310), done.\n",
"/content/training/fairseq\n",
"Obtaining file:///content/training/fairseq\n",
" Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n",
" Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n",
" Installing backend dependencies ... \u001b[?25l\u001b[?25hdone\n",
" Preparing wheel metadata ... \u001b[?25l\u001b[?25hdone\n",
"Requirement already satisfied: regex in /usr/local/lib/python3.7/dist-packages (from fairseq==1.0.0a0+f887152) (2019.12.20)\n",
"Requirement already satisfied: tqdm in /usr/local/lib/python3.7/dist-packages (from fairseq==1.0.0a0+f887152) (4.41.1)\n",
"Collecting omegaconf<2.1\n",
" Downloading https://files.pythonhosted.org/packages/d0/eb/9d63ce09dd8aa85767c65668d5414958ea29648a0eec80a4a7d311ec2684/omegaconf-2.0.6-py3-none-any.whl\n",
"Requirement already satisfied: numpy; python_version >= \"3.7\" in /usr/local/lib/python3.7/dist-packages (from fairseq==1.0.0a0+f887152) (1.19.5)\n",
"Requirement already satisfied: sacrebleu>=1.4.12 in /usr/local/lib/python3.7/dist-packages (from fairseq==1.0.0a0+f887152) (1.5.1)\n",
"Requirement already satisfied: cython in /usr/local/lib/python3.7/dist-packages (from fairseq==1.0.0a0+f887152) (0.29.23)\n",
"Collecting hydra-core<1.1\n",
"\u001b[?25l Downloading https://files.pythonhosted.org/packages/52/e3/fbd70dd0d3ce4d1d75c22d56c0c9f895cfa7ed6587a9ffb821d6812d6a60/hydra_core-1.0.6-py3-none-any.whl (123kB)\n",
"\u001b[K |████████████████████████████████| 133kB 4.7MB/s \n",
"\u001b[?25hRequirement already satisfied: torch in /usr/local/lib/python3.7/dist-packages (from fairseq==1.0.0a0+f887152) (1.9.0+cu102)\n",
"Requirement already satisfied: cffi in /usr/local/lib/python3.7/dist-packages (from fairseq==1.0.0a0+f887152) (1.14.5)\n",
"Requirement already satisfied: typing-extensions in /usr/local/lib/python3.7/dist-packages (from omegaconf<2.1->fairseq==1.0.0a0+f887152) (3.7.4.3)\n",
"Collecting PyYAML>=5.1.*\n",
"\u001b[?25l Downloading https://files.pythonhosted.org/packages/7a/a5/393c087efdc78091afa2af9f1378762f9821c9c1d7a22c5753fb5ac5f97a/PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636kB)\n",
"\u001b[K |████████████████████████████████| 645kB 32.4MB/s \n",
"\u001b[?25hRequirement already satisfied: portalocker==2.0.0 in /usr/local/lib/python3.7/dist-packages (from sacrebleu>=1.4.12->fairseq==1.0.0a0+f887152) (2.0.0)\n",
"Requirement already satisfied: importlib-resources; python_version < \"3.9\" in /usr/local/lib/python3.7/dist-packages (from hydra-core<1.1->fairseq==1.0.0a0+f887152) (5.1.4)\n",
"Collecting antlr4-python3-runtime==4.8\n",
"\u001b[?25l Downloading https://files.pythonhosted.org/packages/56/02/789a0bddf9c9b31b14c3e79ec22b9656185a803dc31c15f006f9855ece0d/antlr4-python3-runtime-4.8.tar.gz (112kB)\n",
"\u001b[K |████████████████████████████████| 112kB 53.0MB/s \n",
"\u001b[?25hRequirement already satisfied: pycparser in /usr/local/lib/python3.7/dist-packages (from cffi->fairseq==1.0.0a0+f887152) (2.20)\n",
"Requirement already satisfied: zipp>=3.1.0; python_version < \"3.10\" in /usr/local/lib/python3.7/dist-packages (from importlib-resources; python_version < \"3.9\"->hydra-core<1.1->fairseq==1.0.0a0+f887152) (3.4.1)\n",
"Building wheels for collected packages: antlr4-python3-runtime\n",
" Building wheel for antlr4-python3-runtime (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
" Created wheel for antlr4-python3-runtime: filename=antlr4_python3_runtime-4.8-cp37-none-any.whl size=141231 sha256=52f59bfe6322a04598da6960d2d5675a581273a45e4391e04cf1240c97346019\n",
" Stored in directory: /root/.cache/pip/wheels/e3/e2/fa/b78480b448b8579ddf393bebd3f47ee23aa84c89b6a78285c8\n",
"Successfully built antlr4-python3-runtime\n",
"Installing collected packages: PyYAML, omegaconf, antlr4-python3-runtime, hydra-core, fairseq\n",
" Found existing installation: PyYAML 3.13\n",
" Uninstalling PyYAML-3.13:\n",
" Successfully uninstalled PyYAML-3.13\n",
" Running setup.py develop for fairseq\n",
"Successfully installed PyYAML-5.4.1 antlr4-python3-runtime-4.8 fairseq hydra-core-1.0.6 omegaconf-2.0.6\n",
"/content/training\n"
]
}
],
"source": [
"! sudo apt install tree\n",
"\n",
"# Install the necessary libraries\n",
"!pip install sacremoses pandas mock sacrebleu tensorboardX pyarrow indic-nlp-library\n",
"# Install fairseq from source\n",
"!git clone https://github.com/pytorch/fairseq.git\n",
"%cd fairseq\n",
"# !git checkout da9eaba12d82b9bfc1442f0e2c6fc1b895f4d35d\n",
"!pip install --editable ./\n",
"%cd .."
]
},
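{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional sanity check that the installs above succeeded. The imports run in a\n",
"# fresh subprocess because the editable fairseq install is only picked up at\n",
"# interpreter startup (restart the runtime if you want to import it in this kernel).\n",
"!python -c \"import fairseq; print('fairseq', fairseq.__version__)\"\n",
"!python -c \"import sacrebleu; print('sacrebleu', sacrebleu.__version__)\"\n",
"!python -c \"import indicnlp; print('indic_nlp_library OK')\"\n",
"!python -c \"import torch; print('torch', torch.__version__, '| CUDA:', torch.cuda.is_available())\""
]
},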
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "tmfGYkd58UiO",
"outputId": "3b83bcf6-bbbf-4e49-c2bb-7d0fb999297d"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"^C\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"--2021-12-18 21:31:57-- https://storage.googleapis.com/samanantar-public/benchmarks.zip\n",
"Resolving storage.googleapis.com (storage.googleapis.com)... 172.217.160.144, 216.58.196.176, 142.250.71.16, ...\n",
"Connecting to storage.googleapis.com (storage.googleapis.com)|172.217.160.144|:443... connected.\n",
"HTTP request sent, awaiting response... 200 OK\n",
"Length: 7301872 (7.0M) [application/zip]\n",
"Saving to: 'benchmarks.zip'\n",
"\n",
" 0K .......... .......... .......... .......... .......... 0% 774K 9s\n",
" 50K .......... .......... .......... .......... .......... 1% 2.10M 6s\n",
" 100K .......... .......... .......... .......... .......... 2% 2.46M 5s\n",
" 150K .......... .......... .......... .......... .......... 2% 2.68M 4s\n",
" 200K .......... .......... .......... .......... .......... 3% 1.44M 4s\n",
" 250K .......... .......... .......... .......... .......... 4% 2.48M 4s\n",
" 300K .......... .......... .......... .......... .......... 4% 3.41M 4s\n",
" 350K .......... .......... .......... .......... .......... 5% 2.22M 4s\n",
" 400K .......... .......... .......... .......... .......... 6% 1.20M 4s\n",
" 450K .......... .......... .......... .......... .......... 7% 2.65M 4s\n",
" 500K .......... .......... .......... .......... .......... 7% 2.97M 3s\n",
" 550K .......... .......... .......... .......... .......... 8% 887K 4s\n",
" 600K .......... .......... .......... .......... .......... 9% 2.90M 4s\n",
" 650K .......... .......... .......... .......... .......... 9% 2.76M 4s\n",
" 700K .......... .......... .......... .......... .......... 10% 980K 4s\n",
" 750K .......... .......... .......... .......... .......... 11% 2.55M 4s\n",
" 800K .......... .......... .......... .......... .......... 11% 2.86M 3s\n",
" 850K .......... .......... .......... .......... .......... 12% 3.04M 3s\n",
" 900K .......... .......... .......... .......... .......... 13% 1.01M 3s\n",
" 950K .......... .......... .......... .......... .......... 14% 3.35M 3s\n",
" 1000K .......... .......... .......... .......... .......... 14% 5.04M 3s\n",
" 1050K .......... .......... .......... .......... .......... 15% 14.5M 3s\n",
" 1100K .......... .......... .......... .......... .......... 16% 1.01M 3s\n",
" 1150K .......... .......... .......... .......... .......... 16% 4.48M 3s\n",
" 1200K .......... .......... .......... .......... .......... 17% 4.34M 3s\n",
" 1250K .......... .......... .......... .......... .......... 18% 2.90M 3s\n",
" 1300K .......... .......... .......... .......... .......... 18% 1.14M 3s\n",
" 1350K .......... .......... .......... .......... .......... 19% 3.00M 3s\n",
" 1400K .......... .......... .......... .......... .......... 20% 5.09M 3s\n",
" 1450K .......... .......... .......... .......... .......... 21% 1.91M 3s\n",
" 1500K .......... .......... .......... .......... .......... 21% 7.70M 3s\n",
" 1550K .......... .......... .......... .......... .......... 22% 1.27M 3s\n",
" 1600K .......... .......... .......... .......... .......... 23% 3.06M 3s\n",
" 1650K .......... .......... .......... .......... .......... 23% 4.11M 3s\n",
" 1700K .......... .......... .......... .......... .......... 24% 3.34M 3s\n",
" 1750K .......... .......... .......... .......... .......... 25% 4.13M 2s\n",
" 1800K .......... .......... .......... .......... .......... 25% 7.95M 2s\n",
" 1850K .......... .......... .......... .......... .......... 26% 3.69M 2s\n",
" 1900K .......... .......... .......... .......... .......... 27% 4.00M 2s\n",
" 1950K .......... .......... .......... .......... .......... 28% 3.50M 2s\n",
" 2000K .......... .......... .......... .......... .......... 28% 4.04M 2s\n",
" 2050K .......... .......... .......... .......... .......... 29% 3.31M 2s\n",
" 2100K .......... .......... .......... .......... .......... 30% 2.49M 2s\n",
" 2150K .......... .......... .......... .......... .......... 30% 4.19M 2s\n",
" 2200K .......... .......... .......... .......... .......... 31% 5.18M 2s\n",
" 2250K .......... .......... .......... .......... .......... 32% 9.49M 2s\n",
" 2300K .......... .......... .......... .......... .......... 32% 8.67M 2s\n",
" 2350K .......... .......... .......... .......... .......... 33% 4.88M 2s\n",
" 2400K .......... .......... .......... .......... .......... 34% 4.56M 2s\n",
" 2450K .......... .......... .......... .......... .......... 35% 4.94M 2s\n",
" 2500K .......... .......... .......... .......... .......... 35% 4.38M 2s\n",
" 2550K .......... .......... .......... .......... .......... 36% 3.78M 2s\n",
" 2600K .......... .......... .......... .......... .......... 37% 4.95M 2s\n",
" 2650K .......... .......... .......... .......... .......... 37% 5.50M 2s\n",
" 2700K .......... .......... .......... .......... .......... 38% 5.23M 2s\n",
" 2750K .......... .......... .......... .......... .......... 39% 3.77M 2s\n",
" 2800K .......... .......... .......... .......... .......... 39% 10.7M 2s\n",
" 2850K .......... .......... .......... .......... .......... 40% 7.16M 2s\n",
" 2900K .......... .......... .......... .......... .......... 41% 5.36M 2s\n",
" 2950K .......... .......... .......... .......... .......... 42% 6.80M 1s\n",
" 3000K .......... .......... .......... .......... .......... 42% 6.57M 1s\n",
" 3050K .......... .......... .......... .......... .......... 43% 7.21M 1s\n",
" 3100K .......... .......... .......... .......... .......... 44% 6.66M 1s\n",
" 3150K .......... .......... .......... .......... .......... 44% 6.42M 1s\n",
" 3200K .......... .......... .......... .......... .......... 45% 8.02M 1s\n",
" 3250K .......... .......... .......... .......... .......... 46% 5.96M 1s\n",
" 3300K .......... .......... .......... .......... .......... 46% 5.13M 1s\n",
" 3350K .......... .......... .......... .......... .......... 47% 5.19M 1s\n",
" 3400K .......... .......... .......... .......... .......... 48% 7.64M 1s\n",
" 3450K .......... .......... .......... .......... .......... 49% 6.11M 1s\n",
" 3500K .......... .......... .......... .......... .......... 49% 4.01M 1s\n",
" 3550K .......... .......... .......... .......... .......... 50% 4.52M 1s\n",
" 3600K .......... .......... .......... .......... .......... 51% 6.72M 1s\n",
" 3650K .......... .......... .......... .......... .......... 51% 5.45M 1s\n",
" 3700K .......... .......... .......... .......... .......... 52% 4.37M 1s\n",
" 3750K .......... .......... .......... .......... .......... 53% 5.39M 1s\n",
" 3800K .......... .......... .......... .......... .......... 53% 7.40M 1s\n",
" 3850K .......... .......... .......... .......... .......... 54% 6.70M 1s\n",
" 3900K .......... .......... .......... .......... .......... 55% 5.14M 1s\n",
" 3950K .......... .......... .......... .......... .......... 56% 5.02M 1s\n",
" 4000K .......... .......... .......... .......... .......... 56% 6.70M 1s\n",
" 4050K .......... .......... .......... .......... .......... 57% 6.76M 1s\n",
" 4100K .......... .......... .......... .......... .......... 58% 2.52M 1s\n",
" 4150K .......... .......... .......... .......... .......... 58% 887K 1s\n",
" 4200K .......... .......... .......... .......... .......... 59% 9.25M 1s\n",
" 4250K .......... .......... .......... .......... .......... 60% 1.27M 1s\n",
" 4300K .......... .......... .......... .......... .......... 61% 5.72M 1s\n",
" 4350K .......... .......... .......... .......... .......... 61% 4.48M 1s\n",
" 4400K .......... .......... .......... .......... .......... 62% 5.20M 1s\n",
" 4450K .......... .......... .......... .......... .......... 63% 6.21M 1s\n",
" 4500K .......... .......... .......... .......... .......... 63% 7.94M 1s\n",
" 4550K .......... .......... .......... .......... .......... 64% 4.76M 1s\n",
" 4600K .......... .......... .......... .......... .......... 65% 4.74M 1s\n",
" 4650K .......... .......... .......... .......... .......... 65% 6.94M 1s\n",
" 4700K .......... .......... .......... .......... .......... 66% 5.62M 1s\n",
" 4750K .......... .......... .......... .......... .......... 67% 4.44M 1s\n",
" 4800K .......... .......... .......... .......... .......... 68% 6.02M 1s\n",
" 4850K .......... .......... .......... .......... .......... 68% 6.61M 1s\n",
" 4900K .......... .......... .......... .......... .......... 69% 3.04M 1s\n",
" 4950K .......... .......... .......... .......... .......... 70% 5.34M 1s\n",
" 5000K .......... .......... .......... .......... .......... 70% 3.03M 1s\n",
" 5050K .......... .......... .......... .......... .......... 71% 19.8M 1s\n",
" 5100K .......... .......... .......... .......... .......... 72% 6.17M 1s\n",
" 5150K .......... .......... .......... .......... .......... 72% 5.58M 1s\n",
" 5200K .......... .......... .......... .......... .......... 73% 7.38M 1s\n",
" 5250K .......... .......... .......... .......... .......... 74% 7.11M 1s\n",
" 5300K .......... .......... .......... .......... .......... 75% 6.24M 1s\n",
" 5350K .......... .......... .......... .......... .......... 75% 4.62M 1s\n",
" 5400K .......... .......... .......... .......... .......... 76% 7.64M 0s\n",
" 5450K .......... .......... .......... .......... .......... 77% 6.06M 0s\n",
" 5500K .......... .......... .......... .......... .......... 77% 5.56M 0s\n",
" 5550K .......... .......... .......... .......... .......... 78% 2.96M 0s\n",
" 5600K .......... .......... .......... .......... .......... 79% 6.17M 0s\n",
" 5650K .......... .......... .......... .......... .......... 79% 9.58M 0s\n",
" 5700K .......... .......... .......... .......... .......... 80% 2.58M 0s\n",
" 5750K .......... .......... .......... .......... .......... 81% 4.23M 0s\n",
" 5800K .......... .......... .......... .......... .......... 82% 5.70M 0s\n",
" 5850K .......... .......... .......... .......... .......... 82% 4.72M 0s\n",
" 5900K .......... .......... .......... .......... .......... 83% 6.52M 0s\n",
" 5950K .......... .......... .......... .......... .......... 84% 5.86M 0s\n",
" 6000K .......... .......... .......... .......... .......... 84% 5.22M 0s\n",
" 6050K .......... .......... .......... .......... .......... 85% 5.50M 0s\n",
" 6100K .......... .......... .......... .......... .......... 86% 6.29M 0s\n",
" 6150K .......... .......... .......... .......... .......... 86% 6.93M 0s\n",
" 6200K .......... .......... .......... .......... .......... 87% 5.50M 0s\n",
" 6250K .......... .......... .......... .......... .......... 88% 5.82M 0s\n",
" 6300K .......... .......... .......... .......... .......... 89% 6.76M 0s\n",
" 6350K .......... .......... .......... .......... .......... 89% 3.73M 0s\n",
" 6400K .......... .......... .......... .......... .......... 90% 5.98M 0s\n",
" 6450K .......... .......... .......... .......... .......... 91% 5.78M 0s\n",
" 6500K .......... .......... .......... .......... .......... 91% 5.60M 0s\n",
" 6550K .......... .......... .......... .......... .......... 92% 4.84M 0s\n",
" 6600K .......... .......... .......... .......... .......... 93% 7.25M 0s\n",
" 6650K .......... .......... .......... .......... .......... 93% 2.60M 0s\n",
" 6700K .......... .......... .......... .......... .......... 94% 6.02M 0s\n",
" 6750K .......... .......... .......... .......... .......... 95% 6.57M 0s\n",
" 6800K .......... .......... .......... .......... .......... 96% 8.30M 0s\n",
" 6850K .......... .......... .......... .......... .......... 96% 14.4M 0s\n",
" 6900K .......... .......... .......... .......... .......... 97% 4.58M 0s\n",
" 6950K .......... .......... .......... .......... .......... 98% 3.31M 0s\n",
" 7000K .......... .......... .......... .......... .......... 98% 6.88M 0s\n",
" 7050K .......... .......... .......... .......... .......... 99% 4.40M 0s\n",
" 7100K .......... .......... .......... 100% 15.1M=1.9s\n",
"\n",
"2021-12-18 21:32:01 (3.64 MB/s) - 'benchmarks.zip' saved [7301872/7301872]\n",
"\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Archive: samanatar-en-indic-v0.2.zip\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" End-of-central-directory signature not found. Either this file is not\n",
" a zipfile, or it constitutes one disk of a multi-part archive. In the\n",
" latter case the central directory and zipfile comment will be found on\n",
" the last disk(s) of this archive.\n",
"unzip: cannot find zipfile directory in one of samanatar-en-indic-v0.2.zip or\n",
" samanatar-en-indic-v0.2.zip.zip, and cannot find samanatar-en-indic-v0.2.zip.ZIP, period.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Archive: benchmarks.zip\n",
" creating: benchmarks/\n",
" creating: benchmarks/pmi/\n",
" creating: benchmarks/pmi/en-as/\n",
" inflating: benchmarks/pmi/en-as/dev.as \n",
" inflating: benchmarks/pmi/en-as/dev.en \n",
" inflating: benchmarks/pmi/en-as/test.as \n",
" inflating: benchmarks/pmi/en-as/test.en \n",
" creating: benchmarks/wat2021-devtest/\n",
" inflating: benchmarks/wat2021-devtest/dev.gu \n",
" inflating: benchmarks/wat2021-devtest/dev.en \n",
" inflating: benchmarks/wat2021-devtest/test.bn \n",
" inflating: benchmarks/wat2021-devtest/dev.bn \n",
" inflating: benchmarks/wat2021-devtest/test.hi \n",
" inflating: benchmarks/wat2021-devtest/dev.kn \n",
" inflating: benchmarks/wat2021-devtest/dev.ta \n",
" inflating: benchmarks/wat2021-devtest/test.pa \n",
" inflating: benchmarks/wat2021-devtest/test.en \n",
" inflating: benchmarks/wat2021-devtest/test.mr \n",
" inflating: benchmarks/wat2021-devtest/test.kn \n",
" inflating: benchmarks/wat2021-devtest/dev.ml \n",
" inflating: benchmarks/wat2021-devtest/test.ta \n",
" inflating: benchmarks/wat2021-devtest/test.gu \n",
" inflating: benchmarks/wat2021-devtest/dev.or \n",
" inflating: benchmarks/wat2021-devtest/test.or \n",
" inflating: benchmarks/wat2021-devtest/test.te \n",
" inflating: benchmarks/wat2021-devtest/dev.mr \n",
" inflating: benchmarks/wat2021-devtest/test.ml \n",
" inflating: benchmarks/wat2021-devtest/dev.pa \n",
" inflating: benchmarks/wat2021-devtest/dev.te \n",
" inflating: benchmarks/wat2021-devtest/dev.hi \n",
" creating: benchmarks/wat2020-devtest/\n",
" creating: benchmarks/wat2020-devtest/en-bn/\n",
" inflating: benchmarks/wat2020-devtest/en-bn/dev.en \n",
" inflating: benchmarks/wat2020-devtest/en-bn/test.bn \n",
" inflating: benchmarks/wat2020-devtest/en-bn/dev.bn \n",
" inflating: benchmarks/wat2020-devtest/en-bn/test.en \n",
" creating: benchmarks/wat2020-devtest/en-ta/\n",
" inflating: benchmarks/wat2020-devtest/en-ta/dev.en \n",
" inflating: benchmarks/wat2020-devtest/en-ta/dev.ta \n",
" inflating: benchmarks/wat2020-devtest/en-ta/test.en \n",
" inflating: benchmarks/wat2020-devtest/en-ta/test.ta \n",
" creating: benchmarks/wat2020-devtest/en-mr/\n",
" inflating: benchmarks/wat2020-devtest/en-mr/dev.en \n",
" inflating: benchmarks/wat2020-devtest/en-mr/test.en \n",
" inflating: benchmarks/wat2020-devtest/en-mr/test.mr \n",
" inflating: benchmarks/wat2020-devtest/en-mr/dev.mr \n",
" creating: benchmarks/wat2020-devtest/en-te/\n",
" inflating: benchmarks/wat2020-devtest/en-te/dev.en \n",
" inflating: benchmarks/wat2020-devtest/en-te/test.en \n",
" inflating: benchmarks/wat2020-devtest/en-te/test.te \n",
" inflating: benchmarks/wat2020-devtest/en-te/dev.te \n",
" creating: benchmarks/wat2020-devtest/en-hi/\n",
" inflating: benchmarks/wat2020-devtest/en-hi/dev.en \n",
" inflating: benchmarks/wat2020-devtest/en-hi/test.hi \n",
" inflating: benchmarks/wat2020-devtest/en-hi/test.en \n",
" inflating: benchmarks/wat2020-devtest/en-hi/dev.hi \n",
" creating: benchmarks/wat2020-devtest/en-gu/\n",
" inflating: benchmarks/wat2020-devtest/en-gu/dev.gu \n",
" inflating: benchmarks/wat2020-devtest/en-gu/dev.en \n",
" inflating: benchmarks/wat2020-devtest/en-gu/test.en \n",
" inflating: benchmarks/wat2020-devtest/en-gu/test.gu \n",
" creating: benchmarks/wat2020-devtest/en-ml/\n",
" inflating: benchmarks/wat2020-devtest/en-ml/dev.en \n",
" inflating: benchmarks/wat2020-devtest/en-ml/test.en \n",
" inflating: benchmarks/wat2020-devtest/en-ml/dev.ml \n",
" inflating: benchmarks/wat2020-devtest/en-ml/test.ml \n",
" creating: benchmarks/ufal-ta/\n",
" creating: benchmarks/ufal-ta/en-ta/\n",
" inflating: benchmarks/ufal-ta/en-ta/dev.en \n",
" inflating: benchmarks/ufal-ta/en-ta/dev.ta \n",
" inflating: benchmarks/ufal-ta/en-ta/test.en \n",
" inflating: benchmarks/ufal-ta/en-ta/test.ta \n",
" creating: benchmarks/wmt-news/\n",
" creating: benchmarks/wmt-news/en-ta/\n",
" inflating: benchmarks/wmt-news/en-ta/dev.en \n",
" inflating: benchmarks/wmt-news/en-ta/dev.ta \n",
" inflating: benchmarks/wmt-news/en-ta/test.en \n",
" inflating: benchmarks/wmt-news/en-ta/test.ta \n",
" creating: benchmarks/wmt-news/en-hi/\n",
" inflating: benchmarks/wmt-news/en-hi/dev.en \n",
" inflating: benchmarks/wmt-news/en-hi/test.hi \n",
" inflating: benchmarks/wmt-news/en-hi/test.en \n",
" inflating: benchmarks/wmt-news/en-hi/dev.hi \n",
" creating: benchmarks/wmt-news/en-gu/\n",
" inflating: benchmarks/wmt-news/en-gu/test.en \n",
" inflating: benchmarks/wmt-news/en-gu/test.gu \n"
]
}
],
"source": [
"## for the latest samanantar dataset v0.3 -> please use this link: https://storage.googleapis.com/samanantar-public/V0.3/source_wise_splits.zip\n",
"# This v0.3 dataset has source wise splits to indicate where the data has been collected from\n",
"# For preprocessing simplicity we will use v0.2( which just uses raw text files without source information) in this tutorial\n",
"# \n",
"# \n",
"# lets now download the indictrans data v0.2 dataset\n",
"! wget https://storage.googleapis.com/samanantar-public/V0.2/data/en2indic/samanatar-en-indic-v0.2.zip\n",
"\n",
"\n",
"\n",
"# lets also download the benchmarks for dev and test set\n",
"\n",
"! wget https://storage.googleapis.com/samanantar-public/benchmarks.zip\n",
"\n",
"# training data is organized as en-X folders where each folder contains two text files containing parallel data for en-X lang pair.\n",
"\n",
"# final_data\n",
"# ├── en-as\n",
"# │ ├── train.as\n",
"# │ └── train.en\n",
"# ├── en-bn\n",
"# │ ├── train.bn\n",
"# │ └── train.en\n",
"# ├── en-gu\n",
"# │ ├── train.en\n",
"# │ └── train.gu\n",
"# ├── en-hi\n",
"# │ ├── train.en\n",
"# │ └── train.hi\n",
"# ├── en-kn\n",
"# │ ├── train.en\n",
"# │ └── train.kn\n",
"# ├── en-ml\n",
"# │ ├── train.en\n",
"# │ └── train.ml\n",
"# ├── en-mr\n",
"# │ ├── train.en\n",
"# │ └── train.mr\n",
"# ├── en-or\n",
"# │ ├── train.en\n",
"# │ └── train.or\n",
"# ├── en-pa\n",
"# │ ├── train.en\n",
"# │ └── train.pa\n",
"# ├── en-ta\n",
"# │ ├── train.en\n",
"# │ └── train.ta\n",
"# └── en-te\n",
"# ├── train.en\n",
"# └── train.te\n",
"\n",
"\n",
"! unzip samanatar-en-indic-v0.2.zip\n",
"\n",
"# benchmarks folder consists of all the benchmarks we report in the paper - pmi, ufal-ta, wat2020, wat2021, wmt-news\n",
"\n",
"! unzip benchmarks.zip"
]
},
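{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional: a small sketch to verify the extracted training data. It assumes the\n",
"# v0.2 zip unpacked into final_data/ as shown in the tree above, and checks that\n",
"# the two sides of each en-X pair have the same number of lines.\n",
"import os\n",
"\n",
"root = 'final_data'\n",
"if os.path.isdir(root):\n",
"    for pair in sorted(os.listdir(root)):\n",
"        if not pair.startswith('en-'):\n",
"            continue\n",
"        lang = pair.split('-')[1]  # 'en-as' -> 'as'\n",
"        with open(os.path.join(root, pair, 'train.en'), encoding='utf-8') as f:\n",
"            n_en = sum(1 for _ in f)\n",
"        with open(os.path.join(root, pair, f'train.{lang}'), encoding='utf-8') as f:\n",
"            n_xx = sum(1 for _ in f)\n",
"        status = 'OK' if n_en == n_xx else 'MISMATCH'\n",
"        print(f'{pair}: {n_en} en / {n_xx} {lang} lines [{status}]')\n",
"else:\n",
"    print('final_data/ not found -- did the download and unzip above complete?')"
]
},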
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "MR_2GQoa84Jn"
},
"outputs": [],
"source": [
"# create an experiment dir to store train data, devtest data. \n",
"# This folder will also store vocabulary files (created with subword_nmt for bpe), fairseq bin files (for training), model checkpoints.\n",
"\n",
"# for this example we will be training indic to en translation model. We will name our exp_dir as indic-en-exp\n",
"! mkdir indic-en-exp\n",
"# copying all the train folders to exp_dir\n",
"! cp -r final_data/* indic-en-exp\n",
"\n",
"! mkdir -p indic-en-exp/devtest\n",
"\n",
"# copying all benchmarks to devtest folder in exp_dir\n",
"! cp -r benchmarks/* indic-en-exp/devtest\n",
"\n",
"# folder to store combined devtest data (based on the domains you want to test, you can combine multiple benchmarks dev datasets, remove duplicates)\n",
"! mkdir -p indic-en-exp/devtest/all\n",
"\n",
"# in this tutorial, for simplicity, we will just use wat2020 devtest for dev and test set\n",
"! cp -r indic-en-exp/devtest/wat2020-devtest/* indic-en-exp/devtest/all\n",
"\n"
]
},
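{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional sketch for the deduplication step mentioned above: if you merge the dev\n",
"# sets of several benchmarks into devtest/all, you will want to drop duplicate\n",
"# sentence pairs. This is a minimal example for one language pair; the paths in\n",
"# the usage line are illustrative.\n",
"def dedup_parallel(src_path, tgt_path, out_src, out_tgt):\n",
"    \"\"\"Keep only the first occurrence of each (src, tgt) sentence pair.\"\"\"\n",
"    seen = set()\n",
"    with open(src_path, encoding='utf-8') as fs, open(tgt_path, encoding='utf-8') as ft, \\\n",
"         open(out_src, 'w', encoding='utf-8') as fo_s, open(out_tgt, 'w', encoding='utf-8') as fo_t:\n",
"        for s, t in zip(fs, ft):\n",
"            key = (s.strip(), t.strip())\n",
"            if key not in seen:\n",
"                seen.add(key)\n",
"                fo_s.write(s)\n",
"                fo_t.write(t)\n",
"\n",
"# Example usage (hypothetical paths):\n",
"# dedup_parallel('combined/dev.hi', 'combined/dev.en', 'deduped/dev.hi', 'deduped/dev.en')"
]
},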
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "lorcT8wkFPtQ"
},
"outputs": [],
"source": [
"% cd indicTrans"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "vhvYXUc1FaVn"
},
"outputs": [],
"source": [
"# prepare_data_joint_training.sh takes experiment dir, src_lang, tgt_lang as input \n",
"# This does preprocessing, building vocab, binarization for joint training\n",
"\n",
"# The learning and applying vocabulary will take a while if the dataset is huge. To make it faster, run it on a multicore system\n",
"\n",
"! bash prepare_data_joint_training.sh '../indic-en-exp' 'indic' 'en'"
]
},
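{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional: a toy illustration of the BPE step the script performs internally, using\n",
"# the subword-nmt repo cloned into indicTrans/ above. This is only a sketch on a tiny\n",
"# in-memory corpus; the real script learns joint BPE codes over the full training data.\n",
"import io, sys\n",
"sys.path.append('subword-nmt')  # make the cloned package importable\n",
"from subword_nmt.learn_bpe import learn_bpe\n",
"from subword_nmt.apply_bpe import BPE\n",
"\n",
"corpus = ['this is a tiny corpus', 'learning bpe on a tiny corpus']\n",
"codes = io.StringIO()\n",
"learn_bpe(iter(corpus), codes, num_symbols=10, min_frequency=1)  # learn merge operations\n",
"codes.seek(0)\n",
"bpe = BPE(codes)  # build a segmenter from the learned codes\n",
"print(bpe.process_line('learning a tiny model'))  # subword units joined with '@@'"
]
},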
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "p1i3fRQzF2-x"
},
"outputs": [],
"source": [
"# Training the model\n",
"\n",
"# pls refer to fairseq documentaion to know more about each of these options (https://fairseq.readthedocs.io/en/latest/command_line_tools.html)\n",
"\n",
"\n",
"# some notable args:\n",
"# --max-updates -> maximum update steps the model will be trained for\n",
"# --arch=transformer_4x -> we use a custom transformer model and name it transformer_4x (4 times the parameter size of transformer base)\n",
"# --user_dir -> we define the custom transformer arch in model_configs folder and pass it as an argument to user_dir for fairseq to register this architechture\n",
"# --lr -> learning rate. From our limited experiments, we find that lower learning rates like 3e-5 works best for finetuning.\n",
"# --max_tokens -> this is max tokens per batch. You should limit to lower values if you get oom errors.\n",
"# --update-freq -> gradient accumulation steps\n",
"\n",
"\n",
"!( fairseq-train ../indic-en-exp/final_bin \\\n",
"--max-source-positions=210 \\\n",
"--max-target-positions=210 \\\n",
"--max-update= \\\n",
"--save-interval=1 \\\n",
"--arch=transformer_4x \\\n",
"--criterion=label_smoothed_cross_entropy \\\n",
"--source-lang=SRC \\\n",
"--lr-scheduler=inverse_sqrt \\\n",
"--target-lang=TGT \\\n",
"--label-smoothing=0.1 \\\n",
"--optimizer adam \\\n",
"--adam-betas \"(0.9, 0.98)\" \\\n",
"--clip-norm 1.0 \\\n",
"--warmup-init-lr 1e-07 \\\n",
"--lr 0.0005 \\\n",
"--warmup-updates 4000 \\\n",
"--dropout 0.2 \\\n",
"--save-dir ../indic-en-exp/model \\\n",
"--keep-last-epochs 5 \\\n",
"--patience 5 \\\n",
"--skip-invalid-size-inputs-valid-test \\\n",
"--fp16 \\\n",
"--user-dir model_configs \\\n",
"--wandb-project \\\n",
"--update-freq= \\\n",
"--distributed-world-size \\\n",
"--max-tokens )"
]
}
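,
{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick note on batch size, assuming the example values above: the effective number of tokens per optimizer step is roughly `max-tokens × update-freq × distributed-world-size`, e.g. 4096 × 2 × 1 = 8192 tokens. If you lower `--max-tokens` to avoid OOM errors, raise `--update-freq` proportionally to keep the effective batch size constant."
]
}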
],
"metadata": {
"colab": {
"authorship_tag": "ABX9TyO6AA5gXphZ5kJ6h+dgeSqb",
"collapsed_sections": [],
"include_colab_link": true,
"name": "IndicTrans_training.ipynb",
"provenance": []
},
"kernelspec": {
"display_name": "Python 3",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.7"
}
},
"nbformat": 4,
"nbformat_minor": 0
}