harveen committed on
Commit
c4f6e1f
1 Parent(s): 88aa560

Harveen | Deleting irrelevant

IndicTrans_training.ipynb DELETED
@@ -1,752 +0,0 @@
1
- {
2
- "cells": [
3
- {
4
- "cell_type": "markdown",
5
- "metadata": {
6
- "colab_type": "text",
7
- "id": "view-in-github"
8
- },
9
- "source": [
10
- "<a href=\"https://colab.research.google.com/github/gowtham1997/indicTrans-1/blob/main/IndicTrans_training.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
11
- ]
12
- },
13
- {
14
- "cell_type": "code",
15
- "execution_count": 1,
16
- "metadata": {
17
- "colab": {
18
- "base_uri": "https://localhost:8080/"
19
- },
20
- "id": "FdyHSnoj7Iun",
21
- "outputId": "d0624c60-68c4-470f-9ade-c517e3296044"
22
- },
23
- "outputs": [
24
- {
25
- "name": "stdout",
26
- "output_type": "stream",
27
- "text": [
28
- "/content/training\n"
29
- ]
30
- }
31
- ],
32
- "source": [
33
- "# create a separate folder to store everything\n",
34
- "!mkdir training\n",
35
- "%cd training"
36
- ]
37
- },
38
- {
39
- "cell_type": "code",
40
- "execution_count": 2,
41
- "metadata": {
42
- "colab": {
43
- "base_uri": "https://localhost:8080/"
44
- },
45
- "id": "y55OfxBz8QeP",
46
- "outputId": "6d0ab016-0f96-4671-ddee-f06b50506dcd"
47
- },
48
- "outputs": [
49
- {
50
- "name": "stdout",
51
- "output_type": "stream",
52
- "text": [
53
- "Cloning into 'indicTrans'...\n",
54
- "remote: Enumerating objects: 432, done.\u001b[K\n",
55
- "remote: Counting objects: 100% (139/139), done.\u001b[K\n",
56
- "remote: Compressing objects: 100% (34/34), done.\u001b[K\n",
57
- "remote: Total 432 (delta 122), reused 105 (delta 105), pack-reused 293\u001b[K\n",
58
- "Receiving objects: 100% (432/432), 1.43 MiB | 14.11 MiB/s, done.\n",
59
- "Resolving deltas: 100% (248/248), done.\n",
60
- "/content/training/indicTrans\n",
61
- "Cloning into 'indic_nlp_library'...\n",
62
- "remote: Enumerating objects: 1325, done.\u001b[K\n",
63
- "remote: Counting objects: 100% (147/147), done.\u001b[K\n",
64
- "remote: Compressing objects: 100% (103/103), done.\u001b[K\n",
65
- "remote: Total 1325 (delta 84), reused 89 (delta 41), pack-reused 1178\u001b[K\n",
66
- "Receiving objects: 100% (1325/1325), 9.57 MiB | 10.51 MiB/s, done.\n",
67
- "Resolving deltas: 100% (688/688), done.\n",
68
- "Cloning into 'indic_nlp_resources'...\n",
69
- "remote: Enumerating objects: 133, done.\u001b[K\n",
70
- "remote: Counting objects: 100% (7/7), done.\u001b[K\n",
71
- "remote: Compressing objects: 100% (7/7), done.\u001b[K\n",
72
- "remote: Total 133 (delta 0), reused 2 (delta 0), pack-reused 126\u001b[K\n",
73
- "Receiving objects: 100% (133/133), 149.77 MiB | 34.05 MiB/s, done.\n",
74
- "Resolving deltas: 100% (51/51), done.\n",
75
- "Checking out files: 100% (28/28), done.\n",
76
- "Cloning into 'subword-nmt'...\n",
77
- "remote: Enumerating objects: 580, done.\u001b[K\n",
78
- "remote: Counting objects: 100% (4/4), done.\u001b[K\n",
79
- "remote: Compressing objects: 100% (4/4), done.\u001b[K\n",
80
- "remote: Total 580 (delta 0), reused 1 (delta 0), pack-reused 576\u001b[K\n",
81
- "Receiving objects: 100% (580/580), 237.41 KiB | 5.28 MiB/s, done.\n",
82
- "Resolving deltas: 100% (349/349), done.\n",
83
- "/content/training\n"
84
- ]
85
- }
86
- ],
87
- "source": [
88
- "# clone the repo for running finetuning\n",
89
- "!git clone https://github.com/AI4Bharat/indicTrans.git\n",
90
- "%cd indicTrans\n",
91
- "# clone requirements repositories\n",
92
- "!git clone https://github.com/anoopkunchukuttan/indic_nlp_library.git\n",
93
- "!git clone https://github.com/anoopkunchukuttan/indic_nlp_resources.git\n",
94
- "!git clone https://github.com/rsennrich/subword-nmt.git\n",
95
- "%cd .."
96
- ]
97
- },
98
- {
99
- "cell_type": "code",
100
- "execution_count": 3,
101
- "metadata": {
102
- "colab": {
103
- "base_uri": "https://localhost:8080/"
104
- },
105
- "id": "ziWWl-1a8SMw",
106
- "outputId": "d7908a62-9573-4693-e7cb-44aeeebaaa15"
107
- },
108
- "outputs": [
109
- {
110
- "name": "stdout",
111
- "output_type": "stream",
112
- "text": [
113
- "Reading package lists... Done\n",
114
- "Building dependency tree \n",
115
- "Reading state information... Done\n",
116
- "The following NEW packages will be installed:\n",
117
- " tree\n",
118
- "0 upgraded, 1 newly installed, 0 to remove and 39 not upgraded.\n",
119
- "Need to get 40.7 kB of archives.\n",
120
- "After this operation, 105 kB of additional disk space will be used.\n",
121
- "Get:1 http://archive.ubuntu.com/ubuntu bionic/universe amd64 tree amd64 1.7.0-5 [40.7 kB]\n",
122
- "Fetched 40.7 kB in 0s (133 kB/s)\n",
123
- "debconf: unable to initialize frontend: Dialog\n",
124
- "debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/perl5/Debconf/FrontEnd/Dialog.pm line 76, <> line 1.)\n",
125
- "debconf: falling back to frontend: Readline\n",
126
- "debconf: unable to initialize frontend: Readline\n",
127
- "debconf: (This frontend requires a controlling tty.)\n",
128
- "debconf: falling back to frontend: Teletype\n",
129
- "dpkg-preconfigure: unable to re-open stdin: \n",
130
- "Selecting previously unselected package tree.\n",
131
- "(Reading database ... 160772 files and directories currently installed.)\n",
132
- "Preparing to unpack .../tree_1.7.0-5_amd64.deb ...\n",
133
- "Unpacking tree (1.7.0-5) ...\n",
134
- "Setting up tree (1.7.0-5) ...\n",
135
- "Processing triggers for man-db (2.8.3-2ubuntu0.1) ...\n",
136
- "Collecting sacremoses\n",
137
- "\u001b[?25l Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)\n",
138
- "\u001b[K |████████████████████████████████| 901kB 4.0MB/s \n",
139
- "\u001b[?25hRequirement already satisfied: pandas in /usr/local/lib/python3.7/dist-packages (1.1.5)\n",
140
- "Collecting mock\n",
141
- " Downloading https://files.pythonhosted.org/packages/5c/03/b7e605db4a57c0f6fba744b11ef3ddf4ddebcada35022927a2b5fc623fdf/mock-4.0.3-py3-none-any.whl\n",
142
- "Collecting sacrebleu\n",
143
- "\u001b[?25l Downloading https://files.pythonhosted.org/packages/7e/57/0c7ca4e31a126189dab99c19951910bd081dea5bbd25f24b77107750eae7/sacrebleu-1.5.1-py3-none-any.whl (54kB)\n",
144
- "\u001b[K |████████████████████████████████| 61kB 7.4MB/s \n",
145
- "\u001b[?25hCollecting tensorboardX\n",
146
- "\u001b[?25l Downloading https://files.pythonhosted.org/packages/42/36/2b147652c40c3a858efa0afbf7b8236fae968e88ff530511a4cfa299a506/tensorboardX-2.3-py2.py3-none-any.whl (124kB)\n",
147
- "\u001b[K |████████████████████████████████| 133kB 24.0MB/s \n",
148
- "\u001b[?25hRequirement already satisfied: pyarrow in /usr/local/lib/python3.7/dist-packages (3.0.0)\n",
149
- "Collecting indic-nlp-library\n",
150
- "\u001b[?25l Downloading https://files.pythonhosted.org/packages/84/d4/495bb43b88a2a6d04b09c29fc5115f24872af74cd8317fe84026abd4ddb1/indic_nlp_library-0.81-py3-none-any.whl (40kB)\n",
151
- "\u001b[K |████████████████████████████████| 40kB 5.4MB/s \n",
152
- "\u001b[?25hRequirement already satisfied: six in /usr/local/lib/python3.7/dist-packages (from sacremoses) (1.15.0)\n",
153
- "Requirement already satisfied: regex in /usr/local/lib/python3.7/dist-packages (from sacremoses) (2019.12.20)\n",
154
- "Requirement already satisfied: click in /usr/local/lib/python3.7/dist-packages (from sacremoses) (7.1.2)\n",
155
- "Requirement already satisfied: tqdm in /usr/local/lib/python3.7/dist-packages (from sacremoses) (4.41.1)\n",
156
- "Requirement already satisfied: joblib in /usr/local/lib/python3.7/dist-packages (from sacremoses) (1.0.1)\n",
157
- "Requirement already satisfied: pytz>=2017.2 in /usr/local/lib/python3.7/dist-packages (from pandas) (2018.9)\n",
158
- "Requirement already satisfied: numpy>=1.15.4 in /usr/local/lib/python3.7/dist-packages (from pandas) (1.19.5)\n",
159
- "Requirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.7/dist-packages (from pandas) (2.8.1)\n",
160
- "Collecting portalocker==2.0.0\n",
161
- " Downloading https://files.pythonhosted.org/packages/89/a6/3814b7107e0788040870e8825eebf214d72166adf656ba7d4bf14759a06a/portalocker-2.0.0-py2.py3-none-any.whl\n",
162
- "Requirement already satisfied: protobuf>=3.8.0 in /usr/local/lib/python3.7/dist-packages (from tensorboardX) (3.12.4)\n",
163
- "Collecting morfessor\n",
164
- " Downloading https://files.pythonhosted.org/packages/39/e6/7afea30be2ee4d29ce9de0fa53acbb033163615f849515c0b1956ad074ee/Morfessor-2.0.6-py3-none-any.whl\n",
165
- "Collecting sphinx-argparse\n",
166
- " Downloading https://files.pythonhosted.org/packages/06/2b/dfad6a1831c3aeeae25d8d3d417224684befbf45e10c7f2141631616a6ed/sphinx-argparse-0.2.5.tar.gz\n",
167
- "Collecting sphinx-rtd-theme\n",
168
- "\u001b[?25l Downloading https://files.pythonhosted.org/packages/ac/24/2475e8f83519b54b2148d4a56eb1111f9cec630d088c3ffc214492c12107/sphinx_rtd_theme-0.5.2-py2.py3-none-any.whl (9.1MB)\n",
169
- "\u001b[K |████████████████████████████████| 9.2MB 21.7MB/s \n",
170
- "\u001b[?25hRequirement already satisfied: setuptools in /usr/local/lib/python3.7/dist-packages (from protobuf>=3.8.0->tensorboardX) (57.0.0)\n",
171
- "Requirement already satisfied: sphinx>=1.2.0 in /usr/local/lib/python3.7/dist-packages (from sphinx-argparse->indic-nlp-library) (1.8.5)\n",
172
- "Collecting docutils<0.17\n",
173
- "\u001b[?25l Downloading https://files.pythonhosted.org/packages/81/44/8a15e45ffa96e6cf82956dd8d7af9e666357e16b0d93b253903475ee947f/docutils-0.16-py2.py3-none-any.whl (548kB)\n",
174
- "\u001b[K |████████████████████████████████| 552kB 38.5MB/s \n",
175
- "\u001b[?25hRequirement already satisfied: packaging in /usr/local/lib/python3.7/dist-packages (from sphinx>=1.2.0->sphinx-argparse->indic-nlp-library) (20.9)\n",
176
- "Requirement already satisfied: imagesize in /usr/local/lib/python3.7/dist-packages (from sphinx>=1.2.0->sphinx-argparse->indic-nlp-library) (1.2.0)\n",
177
- "Requirement already satisfied: requests>=2.0.0 in /usr/local/lib/python3.7/dist-packages (from sphinx>=1.2.0->sphinx-argparse->indic-nlp-library) (2.23.0)\n",
178
- "Requirement already satisfied: sphinxcontrib-websupport in /usr/local/lib/python3.7/dist-packages (from sphinx>=1.2.0->sphinx-argparse->indic-nlp-library) (1.2.4)\n",
179
- "Requirement already satisfied: Pygments>=2.0 in /usr/local/lib/python3.7/dist-packages (from sphinx>=1.2.0->sphinx-argparse->indic-nlp-library) (2.6.1)\n",
180
- "Requirement already satisfied: snowballstemmer>=1.1 in /usr/local/lib/python3.7/dist-packages (from sphinx>=1.2.0->sphinx-argparse->indic-nlp-library) (2.1.0)\n",
181
- "Requirement already satisfied: babel!=2.0,>=1.3 in /usr/local/lib/python3.7/dist-packages (from sphinx>=1.2.0->sphinx-argparse->indic-nlp-library) (2.9.1)\n",
182
- "Requirement already satisfied: alabaster<0.8,>=0.7 in /usr/local/lib/python3.7/dist-packages (from sphinx>=1.2.0->sphinx-argparse->indic-nlp-library) (0.7.12)\n",
183
- "Requirement already satisfied: Jinja2>=2.3 in /usr/local/lib/python3.7/dist-packages (from sphinx>=1.2.0->sphinx-argparse->indic-nlp-library) (2.11.3)\n",
184
- "Requirement already satisfied: pyparsing>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from packaging->sphinx>=1.2.0->sphinx-argparse->indic-nlp-library) (2.4.7)\n",
185
- "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests>=2.0.0->sphinx>=1.2.0->sphinx-argparse->indic-nlp-library) (1.24.3)\n",
186
- "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests>=2.0.0->sphinx>=1.2.0->sphinx-argparse->indic-nlp-library) (2.10)\n",
187
- "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests>=2.0.0->sphinx>=1.2.0->sphinx-argparse->indic-nlp-library) (3.0.4)\n",
188
- "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests>=2.0.0->sphinx>=1.2.0->sphinx-argparse->indic-nlp-library) (2021.5.30)\n",
189
- "Requirement already satisfied: sphinxcontrib-serializinghtml in /usr/local/lib/python3.7/dist-packages (from sphinxcontrib-websupport->sphinx>=1.2.0->sphinx-argparse->indic-nlp-library) (1.1.5)\n",
190
- "Requirement already satisfied: MarkupSafe>=0.23 in /usr/local/lib/python3.7/dist-packages (from Jinja2>=2.3->sphinx>=1.2.0->sphinx-argparse->indic-nlp-library) (2.0.1)\n",
191
- "Building wheels for collected packages: sphinx-argparse\n",
192
- " Building wheel for sphinx-argparse (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
193
- " Created wheel for sphinx-argparse: filename=sphinx_argparse-0.2.5-cp37-none-any.whl size=11552 sha256=0f3830a0bf7a6cfa99000091da945e9dd814b2f1e1f9ca5d773f99aaa0d3a4a5\n",
194
- " Stored in directory: /root/.cache/pip/wheels/2a/18/1b/4990a1859da4edc77ab312bc2986c08d2733fb5713d06e44f5\n",
195
- "Successfully built sphinx-argparse\n",
196
- "\u001b[31mERROR: datascience 0.10.6 has requirement folium==0.2.1, but you'll have folium 0.8.3 which is incompatible.\u001b[0m\n",
197
- "Installing collected packages: sacremoses, mock, portalocker, sacrebleu, tensorboardX, morfessor, sphinx-argparse, docutils, sphinx-rtd-theme, indic-nlp-library\n",
198
- " Found existing installation: docutils 0.17.1\n",
199
- " Uninstalling docutils-0.17.1:\n",
200
- " Successfully uninstalled docutils-0.17.1\n",
201
- "Successfully installed docutils-0.16 indic-nlp-library-0.81 mock-4.0.3 morfessor-2.0.6 portalocker-2.0.0 sacrebleu-1.5.1 sacremoses-0.0.45 sphinx-argparse-0.2.5 sphinx-rtd-theme-0.5.2 tensorboardX-2.3\n",
202
- "Cloning into 'fairseq'...\n",
203
- "remote: Enumerating objects: 28410, done.\u001b[K\n",
204
- "remote: Counting objects: 100% (229/229), done.\u001b[K\n",
205
- "remote: Compressing objects: 100% (127/127), done.\u001b[K\n",
206
- "remote: Total 28410 (delta 114), reused 187 (delta 99), pack-reused 28181\u001b[K\n",
207
- "Receiving objects: 100% (28410/28410), 11.96 MiB | 24.45 MiB/s, done.\n",
208
- "Resolving deltas: 100% (21310/21310), done.\n",
209
- "/content/training/fairseq\n",
210
- "Obtaining file:///content/training/fairseq\n",
211
- " Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n",
212
- " Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n",
213
- " Installing backend dependencies ... \u001b[?25l\u001b[?25hdone\n",
214
- " Preparing wheel metadata ... \u001b[?25l\u001b[?25hdone\n",
215
- "Requirement already satisfied: regex in /usr/local/lib/python3.7/dist-packages (from fairseq==1.0.0a0+f887152) (2019.12.20)\n",
216
- "Requirement already satisfied: tqdm in /usr/local/lib/python3.7/dist-packages (from fairseq==1.0.0a0+f887152) (4.41.1)\n",
217
- "Collecting omegaconf<2.1\n",
218
- " Downloading https://files.pythonhosted.org/packages/d0/eb/9d63ce09dd8aa85767c65668d5414958ea29648a0eec80a4a7d311ec2684/omegaconf-2.0.6-py3-none-any.whl\n",
219
- "Requirement already satisfied: numpy; python_version >= \"3.7\" in /usr/local/lib/python3.7/dist-packages (from fairseq==1.0.0a0+f887152) (1.19.5)\n",
220
- "Requirement already satisfied: sacrebleu>=1.4.12 in /usr/local/lib/python3.7/dist-packages (from fairseq==1.0.0a0+f887152) (1.5.1)\n",
221
- "Requirement already satisfied: cython in /usr/local/lib/python3.7/dist-packages (from fairseq==1.0.0a0+f887152) (0.29.23)\n",
222
- "Collecting hydra-core<1.1\n",
223
- "\u001b[?25l Downloading https://files.pythonhosted.org/packages/52/e3/fbd70dd0d3ce4d1d75c22d56c0c9f895cfa7ed6587a9ffb821d6812d6a60/hydra_core-1.0.6-py3-none-any.whl (123kB)\n",
224
- "\u001b[K |████████████████████████████████| 133kB 4.7MB/s \n",
225
- "\u001b[?25hRequirement already satisfied: torch in /usr/local/lib/python3.7/dist-packages (from fairseq==1.0.0a0+f887152) (1.9.0+cu102)\n",
226
- "Requirement already satisfied: cffi in /usr/local/lib/python3.7/dist-packages (from fairseq==1.0.0a0+f887152) (1.14.5)\n",
227
- "Requirement already satisfied: typing-extensions in /usr/local/lib/python3.7/dist-packages (from omegaconf<2.1->fairseq==1.0.0a0+f887152) (3.7.4.3)\n",
228
- "Collecting PyYAML>=5.1.*\n",
229
- "\u001b[?25l Downloading https://files.pythonhosted.org/packages/7a/a5/393c087efdc78091afa2af9f1378762f9821c9c1d7a22c5753fb5ac5f97a/PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636kB)\n",
230
- "\u001b[K |████████████████████████████████| 645kB 32.4MB/s \n",
231
- "\u001b[?25hRequirement already satisfied: portalocker==2.0.0 in /usr/local/lib/python3.7/dist-packages (from sacrebleu>=1.4.12->fairseq==1.0.0a0+f887152) (2.0.0)\n",
232
- "Requirement already satisfied: importlib-resources; python_version < \"3.9\" in /usr/local/lib/python3.7/dist-packages (from hydra-core<1.1->fairseq==1.0.0a0+f887152) (5.1.4)\n",
233
- "Collecting antlr4-python3-runtime==4.8\n",
234
- "\u001b[?25l Downloading https://files.pythonhosted.org/packages/56/02/789a0bddf9c9b31b14c3e79ec22b9656185a803dc31c15f006f9855ece0d/antlr4-python3-runtime-4.8.tar.gz (112kB)\n",
235
- "\u001b[K |████████████████████████████████| 112kB 53.0MB/s \n",
236
- "\u001b[?25hRequirement already satisfied: pycparser in /usr/local/lib/python3.7/dist-packages (from cffi->fairseq==1.0.0a0+f887152) (2.20)\n",
237
- "Requirement already satisfied: zipp>=3.1.0; python_version < \"3.10\" in /usr/local/lib/python3.7/dist-packages (from importlib-resources; python_version < \"3.9\"->hydra-core<1.1->fairseq==1.0.0a0+f887152) (3.4.1)\n",
238
- "Building wheels for collected packages: antlr4-python3-runtime\n",
239
- " Building wheel for antlr4-python3-runtime (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
240
- " Created wheel for antlr4-python3-runtime: filename=antlr4_python3_runtime-4.8-cp37-none-any.whl size=141231 sha256=52f59bfe6322a04598da6960d2d5675a581273a45e4391e04cf1240c97346019\n",
241
- " Stored in directory: /root/.cache/pip/wheels/e3/e2/fa/b78480b448b8579ddf393bebd3f47ee23aa84c89b6a78285c8\n",
242
- "Successfully built antlr4-python3-runtime\n",
243
- "Installing collected packages: PyYAML, omegaconf, antlr4-python3-runtime, hydra-core, fairseq\n",
244
- " Found existing installation: PyYAML 3.13\n",
245
- " Uninstalling PyYAML-3.13:\n",
246
- " Successfully uninstalled PyYAML-3.13\n",
247
- " Running setup.py develop for fairseq\n",
248
- "Successfully installed PyYAML-5.4.1 antlr4-python3-runtime-4.8 fairseq hydra-core-1.0.6 omegaconf-2.0.6\n",
249
- "/content/training\n"
250
- ]
251
- }
252
- ],
253
- "source": [
254
- "! sudo apt install tree\n",
255
- "\n",
256
- "# Install the necessary libraries\n",
257
- "!pip install sacremoses pandas mock sacrebleu tensorboardX pyarrow indic-nlp-library\n",
258
- "# Install fairseq from source\n",
259
- "!git clone https://github.com/pytorch/fairseq.git\n",
260
- "%cd fairseq\n",
261
- "# !git checkout da9eaba12d82b9bfc1442f0e2c6fc1b895f4d35d\n",
262
- "!pip install --editable ./\n",
263
- "%cd .."
264
- ]
265
- },
266
- {
267
- "cell_type": "code",
268
- "execution_count": 1,
269
- "metadata": {
270
- "colab": {
271
- "base_uri": "https://localhost:8080/"
272
- },
273
- "id": "tmfGYkd58UiO",
274
- "outputId": "3b83bcf6-bbbf-4e49-c2bb-7d0fb999297d"
275
- },
276
- "outputs": [
277
- {
278
- "name": "stdout",
279
- "output_type": "stream",
280
- "text": [
281
- "^C\n"
282
- ]
283
- },
284
- {
285
- "name": "stderr",
286
- "output_type": "stream",
287
- "text": [
288
- "--2021-12-18 21:31:57-- https://storage.googleapis.com/samanantar-public/benchmarks.zip\n",
289
- "Resolving storage.googleapis.com (storage.googleapis.com)... 172.217.160.144, 216.58.196.176, 142.250.71.16, ...\n",
290
- "Connecting to storage.googleapis.com (storage.googleapis.com)|172.217.160.144|:443... connected.\n",
291
- "HTTP request sent, awaiting response... 200 OK\n",
292
- "Length: 7301872 (7.0M) [application/zip]\n",
293
- "Saving to: 'benchmarks.zip'\n",
294
- "\n",
295
- " 0K .......... .......... .......... .......... .......... 0% 774K 9s\n",
296
- " 50K .......... .......... .......... .......... .......... 1% 2.10M 6s\n",
297
- " 100K .......... .......... .......... .......... .......... 2% 2.46M 5s\n",
298
- " 150K .......... .......... .......... .......... .......... 2% 2.68M 4s\n",
299
- " 200K .......... .......... .......... .......... .......... 3% 1.44M 4s\n",
300
- " 250K .......... .......... .......... .......... .......... 4% 2.48M 4s\n",
301
- " 300K .......... .......... .......... .......... .......... 4% 3.41M 4s\n",
302
- " 350K .......... .......... .......... .......... .......... 5% 2.22M 4s\n",
303
- " 400K .......... .......... .......... .......... .......... 6% 1.20M 4s\n",
304
- " 450K .......... .......... .......... .......... .......... 7% 2.65M 4s\n",
305
- " 500K .......... .......... .......... .......... .......... 7% 2.97M 3s\n",
306
- " 550K .......... .......... .......... .......... .......... 8% 887K 4s\n",
307
- " 600K .......... .......... .......... .......... .......... 9% 2.90M 4s\n",
308
- " 650K .......... .......... .......... .......... .......... 9% 2.76M 4s\n",
309
- " 700K .......... .......... .......... .......... .......... 10% 980K 4s\n",
310
- " 750K .......... .......... .......... .......... .......... 11% 2.55M 4s\n",
311
- " 800K .......... .......... .......... .......... .......... 11% 2.86M 3s\n",
312
- " 850K .......... .......... .......... .......... .......... 12% 3.04M 3s\n",
313
- " 900K .......... .......... .......... .......... .......... 13% 1.01M 3s\n",
314
- " 950K .......... .......... .......... .......... .......... 14% 3.35M 3s\n",
315
- " 1000K .......... .......... .......... .......... .......... 14% 5.04M 3s\n",
316
- " 1050K .......... .......... .......... .......... .......... 15% 14.5M 3s\n",
317
- " 1100K .......... .......... .......... .......... .......... 16% 1.01M 3s\n",
318
- " 1150K .......... .......... .......... .......... .......... 16% 4.48M 3s\n",
319
- " 1200K .......... .......... .......... .......... .......... 17% 4.34M 3s\n",
320
- " 1250K .......... .......... .......... .......... .......... 18% 2.90M 3s\n",
321
- " 1300K .......... .......... .......... .......... .......... 18% 1.14M 3s\n",
322
- " 1350K .......... .......... .......... .......... .......... 19% 3.00M 3s\n",
323
- " 1400K .......... .......... .......... .......... .......... 20% 5.09M 3s\n",
324
- " 1450K .......... .......... .......... .......... .......... 21% 1.91M 3s\n",
325
- " 1500K .......... .......... .......... .......... .......... 21% 7.70M 3s\n",
326
- " 1550K .......... .......... .......... .......... .......... 22% 1.27M 3s\n",
327
- " 1600K .......... .......... .......... .......... .......... 23% 3.06M 3s\n",
328
- " 1650K .......... .......... .......... .......... .......... 23% 4.11M 3s\n",
329
- " 1700K .......... .......... .......... .......... .......... 24% 3.34M 3s\n",
330
- " 1750K .......... .......... .......... .......... .......... 25% 4.13M 2s\n",
331
- " 1800K .......... .......... .......... .......... .......... 25% 7.95M 2s\n",
332
- " 1850K .......... .......... .......... .......... .......... 26% 3.69M 2s\n",
333
- " 1900K .......... .......... .......... .......... .......... 27% 4.00M 2s\n",
334
- " 1950K .......... .......... .......... .......... .......... 28% 3.50M 2s\n",
335
- " 2000K .......... .......... .......... .......... .......... 28% 4.04M 2s\n",
336
- " 2050K .......... .......... .......... .......... .......... 29% 3.31M 2s\n",
337
- " 2100K .......... .......... .......... .......... .......... 30% 2.49M 2s\n",
338
- " 2150K .......... .......... .......... .......... .......... 30% 4.19M 2s\n",
339
- " 2200K .......... .......... .......... .......... .......... 31% 5.18M 2s\n",
340
- " 2250K .......... .......... .......... .......... .......... 32% 9.49M 2s\n",
341
- " 2300K .......... .......... .......... .......... .......... 32% 8.67M 2s\n",
342
- " 2350K .......... .......... .......... .......... .......... 33% 4.88M 2s\n",
343
- " 2400K .......... .......... .......... .......... .......... 34% 4.56M 2s\n",
344
- " 2450K .......... .......... .......... .......... .......... 35% 4.94M 2s\n",
345
- " 2500K .......... .......... .......... .......... .......... 35% 4.38M 2s\n",
346
- " 2550K .......... .......... .......... .......... .......... 36% 3.78M 2s\n",
347
- " 2600K .......... .......... .......... .......... .......... 37% 4.95M 2s\n",
348
- " 2650K .......... .......... .......... .......... .......... 37% 5.50M 2s\n",
349
- " 2700K .......... .......... .......... .......... .......... 38% 5.23M 2s\n",
350
- " 2750K .......... .......... .......... .......... .......... 39% 3.77M 2s\n",
351
- " 2800K .......... .......... .......... .......... .......... 39% 10.7M 2s\n",
352
- " 2850K .......... .......... .......... .......... .......... 40% 7.16M 2s\n",
353
- " 2900K .......... .......... .......... .......... .......... 41% 5.36M 2s\n",
354
- " 2950K .......... .......... .......... .......... .......... 42% 6.80M 1s\n",
355
- " 3000K .......... .......... .......... .......... .......... 42% 6.57M 1s\n",
356
- " 3050K .......... .......... .......... .......... .......... 43% 7.21M 1s\n",
357
- " 3100K .......... .......... .......... .......... .......... 44% 6.66M 1s\n",
358
- " 3150K .......... .......... .......... .......... .......... 44% 6.42M 1s\n",
359
- " 3200K .......... .......... .......... .......... .......... 45% 8.02M 1s\n",
360
- " 3250K .......... .......... .......... .......... .......... 46% 5.96M 1s\n",
361
- " 3300K .......... .......... .......... .......... .......... 46% 5.13M 1s\n",
362
- " 3350K .......... .......... .......... .......... .......... 47% 5.19M 1s\n",
363
- " 3400K .......... .......... .......... .......... .......... 48% 7.64M 1s\n",
364
- " 3450K .......... .......... .......... .......... .......... 49% 6.11M 1s\n",
365
- " 3500K .......... .......... .......... .......... .......... 49% 4.01M 1s\n",
366
- " 3550K .......... .......... .......... .......... .......... 50% 4.52M 1s\n",
367
- " 3600K .......... .......... .......... .......... .......... 51% 6.72M 1s\n",
368
- " 3650K .......... .......... .......... .......... .......... 51% 5.45M 1s\n",
369
- " 3700K .......... .......... .......... .......... .......... 52% 4.37M 1s\n",
370
- " 3750K .......... .......... .......... .......... .......... 53% 5.39M 1s\n",
371
- " 3800K .......... .......... .......... .......... .......... 53% 7.40M 1s\n",
372
- " 3850K .......... .......... .......... .......... .......... 54% 6.70M 1s\n",
373
- " 3900K .......... .......... .......... .......... .......... 55% 5.14M 1s\n",
374
- " 3950K .......... .......... .......... .......... .......... 56% 5.02M 1s\n",
375
- " 4000K .......... .......... .......... .......... .......... 56% 6.70M 1s\n",
376
- " 4050K .......... .......... .......... .......... .......... 57% 6.76M 1s\n",
377
- " 4100K .......... .......... .......... .......... .......... 58% 2.52M 1s\n",
378
- " 4150K .......... .......... .......... .......... .......... 58% 887K 1s\n",
379
- " 4200K .......... .......... .......... .......... .......... 59% 9.25M 1s\n",
380
- " 4250K .......... .......... .......... .......... .......... 60% 1.27M 1s\n",
381
- " 4300K .......... .......... .......... .......... .......... 61% 5.72M 1s\n",
382
- " 4350K .......... .......... .......... .......... .......... 61% 4.48M 1s\n",
383
- " 4400K .......... .......... .......... .......... .......... 62% 5.20M 1s\n",
384
- " 4450K .......... .......... .......... .......... .......... 63% 6.21M 1s\n",
385
- " 4500K .......... .......... .......... .......... .......... 63% 7.94M 1s\n",
386
- " 4550K .......... .......... .......... .......... .......... 64% 4.76M 1s\n",
387
- " 4600K .......... .......... .......... .......... .......... 65% 4.74M 1s\n",
388
- " 4650K .......... .......... .......... .......... .......... 65% 6.94M 1s\n",
389
- " 4700K .......... .......... .......... .......... .......... 66% 5.62M 1s\n",
390
- " 4750K .......... .......... .......... .......... .......... 67% 4.44M 1s\n",
391
- " 4800K .......... .......... .......... .......... .......... 68% 6.02M 1s\n",
392
- " 4850K .......... .......... .......... .......... .......... 68% 6.61M 1s\n",
393
- " 4900K .......... .......... .......... .......... .......... 69% 3.04M 1s\n",
394
- " 4950K .......... .......... .......... .......... .......... 70% 5.34M 1s\n",
395
- " 5000K .......... .......... .......... .......... .......... 70% 3.03M 1s\n",
396
- " 5050K .......... .......... .......... .......... .......... 71% 19.8M 1s\n",
397
- " 5100K .......... .......... .......... .......... .......... 72% 6.17M 1s\n",
398
- " 5150K .......... .......... .......... .......... .......... 72% 5.58M 1s\n",
399
- " 5200K .......... .......... .......... .......... .......... 73% 7.38M 1s\n",
400
- " 5250K .......... .......... .......... .......... .......... 74% 7.11M 1s\n",
401
- " 5300K .......... .......... .......... .......... .......... 75% 6.24M 1s\n",
402
- " 5350K .......... .......... .......... .......... .......... 75% 4.62M 1s\n",
403
- " 5400K .......... .......... .......... .......... .......... 76% 7.64M 0s\n",
404
- " 5450K .......... .......... .......... .......... .......... 77% 6.06M 0s\n",
405
- " 5500K .......... .......... .......... .......... .......... 77% 5.56M 0s\n",
406
- " 5550K .......... .......... .......... .......... .......... 78% 2.96M 0s\n",
407
- " 5600K .......... .......... .......... .......... .......... 79% 6.17M 0s\n",
408
- " 5650K .......... .......... .......... .......... .......... 79% 9.58M 0s\n",
409
- " 5700K .......... .......... .......... .......... .......... 80% 2.58M 0s\n",
410
- " 5750K .......... .......... .......... .......... .......... 81% 4.23M 0s\n",
411
- " 5800K .......... .......... .......... .......... .......... 82% 5.70M 0s\n",
412
- " 5850K .......... .......... .......... .......... .......... 82% 4.72M 0s\n",
413
- " 5900K .......... .......... .......... .......... .......... 83% 6.52M 0s\n",
414
- " 5950K .......... .......... .......... .......... .......... 84% 5.86M 0s\n",
415
- " 6000K .......... .......... .......... .......... .......... 84% 5.22M 0s\n",
416
- " 6050K .......... .......... .......... .......... .......... 85% 5.50M 0s\n",
417
- " 6100K .......... .......... .......... .......... .......... 86% 6.29M 0s\n",
418
- " 6150K .......... .......... .......... .......... .......... 86% 6.93M 0s\n",
419
- " 6200K .......... .......... .......... .......... .......... 87% 5.50M 0s\n",
420
- " 6250K .......... .......... .......... .......... .......... 88% 5.82M 0s\n",
421
- " 6300K .......... .......... .......... .......... .......... 89% 6.76M 0s\n",
422
- " 6350K .......... .......... .......... .......... .......... 89% 3.73M 0s\n",
423
- " 6400K .......... .......... .......... .......... .......... 90% 5.98M 0s\n",
424
- " 6450K .......... .......... .......... .......... .......... 91% 5.78M 0s\n",
425
- " 6500K .......... .......... .......... .......... .......... 91% 5.60M 0s\n",
426
- " 6550K .......... .......... .......... .......... .......... 92% 4.84M 0s\n",
427
- " 6600K .......... .......... .......... .......... .......... 93% 7.25M 0s\n",
428
- " 6650K .......... .......... .......... .......... .......... 93% 2.60M 0s\n",
429
- " 6700K .......... .......... .......... .......... .......... 94% 6.02M 0s\n",
430
- " 6750K .......... .......... .......... .......... .......... 95% 6.57M 0s\n",
431
- " 6800K .......... .......... .......... .......... .......... 96% 8.30M 0s\n",
432
- " 6850K .......... .......... .......... .......... .......... 96% 14.4M 0s\n",
433
- " 6900K .......... .......... .......... .......... .......... 97% 4.58M 0s\n",
434
- " 6950K .......... .......... .......... .......... .......... 98% 3.31M 0s\n",
435
- " 7000K .......... .......... .......... .......... .......... 98% 6.88M 0s\n",
436
- " 7050K .......... .......... .......... .......... .......... 99% 4.40M 0s\n",
437
- " 7100K .......... .......... .......... 100% 15.1M=1.9s\n",
438
- "\n",
439
- "2021-12-18 21:32:01 (3.64 MB/s) - 'benchmarks.zip' saved [7301872/7301872]\n",
440
- "\n"
441
- ]
442
- },
443
- {
444
- "name": "stdout",
445
- "output_type": "stream",
446
- "text": [
447
- "Archive: samanatar-en-indic-v0.2.zip\n"
448
- ]
449
- },
450
- {
451
- "name": "stderr",
452
- "output_type": "stream",
453
- "text": [
454
- " End-of-central-directory signature not found. Either this file is not\n",
455
- " a zipfile, or it constitutes one disk of a multi-part archive. In the\n",
456
- " latter case the central directory and zipfile comment will be found on\n",
457
- " the last disk(s) of this archive.\n",
458
- "unzip: cannot find zipfile directory in one of samanatar-en-indic-v0.2.zip or\n",
459
- " samanatar-en-indic-v0.2.zip.zip, and cannot find samanatar-en-indic-v0.2.zip.ZIP, period.\n"
460
- ]
461
- },
462
- {
463
- "name": "stdout",
464
- "output_type": "stream",
465
- "text": [
466
- "Archive: benchmarks.zip\n",
467
- " creating: benchmarks/\n",
468
- " creating: benchmarks/pmi/\n",
469
- " creating: benchmarks/pmi/en-as/\n",
470
- " inflating: benchmarks/pmi/en-as/dev.as \n",
471
- " inflating: benchmarks/pmi/en-as/dev.en \n",
472
- " inflating: benchmarks/pmi/en-as/test.as \n",
473
- " inflating: benchmarks/pmi/en-as/test.en \n",
474
- " creating: benchmarks/wat2021-devtest/\n",
475
- " inflating: benchmarks/wat2021-devtest/dev.gu \n",
476
- " inflating: benchmarks/wat2021-devtest/dev.en \n",
477
- " inflating: benchmarks/wat2021-devtest/test.bn \n",
478
- " inflating: benchmarks/wat2021-devtest/dev.bn \n",
479
- " inflating: benchmarks/wat2021-devtest/test.hi \n",
480
- " inflating: benchmarks/wat2021-devtest/dev.kn \n",
481
- " inflating: benchmarks/wat2021-devtest/dev.ta \n",
482
- " inflating: benchmarks/wat2021-devtest/test.pa \n",
483
- " inflating: benchmarks/wat2021-devtest/test.en \n",
484
- " inflating: benchmarks/wat2021-devtest/test.mr \n",
485
- " inflating: benchmarks/wat2021-devtest/test.kn \n",
486
- " inflating: benchmarks/wat2021-devtest/dev.ml \n",
487
- " inflating: benchmarks/wat2021-devtest/test.ta \n",
488
- " inflating: benchmarks/wat2021-devtest/test.gu \n",
489
- " inflating: benchmarks/wat2021-devtest/dev.or \n",
490
- " inflating: benchmarks/wat2021-devtest/test.or \n",
491
- " inflating: benchmarks/wat2021-devtest/test.te \n",
492
- " inflating: benchmarks/wat2021-devtest/dev.mr \n",
493
- " inflating: benchmarks/wat2021-devtest/test.ml \n",
494
- " inflating: benchmarks/wat2021-devtest/dev.pa \n",
495
- " inflating: benchmarks/wat2021-devtest/dev.te \n",
496
- " inflating: benchmarks/wat2021-devtest/dev.hi \n",
497
- " creating: benchmarks/wat2020-devtest/\n",
498
- " creating: benchmarks/wat2020-devtest/en-bn/\n",
499
- " inflating: benchmarks/wat2020-devtest/en-bn/dev.en \n",
500
- " inflating: benchmarks/wat2020-devtest/en-bn/test.bn \n",
501
- " inflating: benchmarks/wat2020-devtest/en-bn/dev.bn \n",
502
- " inflating: benchmarks/wat2020-devtest/en-bn/test.en \n",
503
- " creating: benchmarks/wat2020-devtest/en-ta/\n",
504
- " inflating: benchmarks/wat2020-devtest/en-ta/dev.en \n",
505
- " inflating: benchmarks/wat2020-devtest/en-ta/dev.ta \n",
506
- " inflating: benchmarks/wat2020-devtest/en-ta/test.en \n",
507
- " inflating: benchmarks/wat2020-devtest/en-ta/test.ta \n",
508
- " creating: benchmarks/wat2020-devtest/en-mr/\n",
509
- " inflating: benchmarks/wat2020-devtest/en-mr/dev.en \n",
510
- " inflating: benchmarks/wat2020-devtest/en-mr/test.en \n",
511
- " inflating: benchmarks/wat2020-devtest/en-mr/test.mr \n",
512
- " inflating: benchmarks/wat2020-devtest/en-mr/dev.mr \n",
513
- " creating: benchmarks/wat2020-devtest/en-te/\n",
514
- " inflating: benchmarks/wat2020-devtest/en-te/dev.en \n",
515
- " inflating: benchmarks/wat2020-devtest/en-te/test.en \n",
516
- " inflating: benchmarks/wat2020-devtest/en-te/test.te \n",
517
- " inflating: benchmarks/wat2020-devtest/en-te/dev.te \n",
518
- " creating: benchmarks/wat2020-devtest/en-hi/\n",
519
- " inflating: benchmarks/wat2020-devtest/en-hi/dev.en \n",
520
- " inflating: benchmarks/wat2020-devtest/en-hi/test.hi \n",
521
- " inflating: benchmarks/wat2020-devtest/en-hi/test.en \n",
522
- " inflating: benchmarks/wat2020-devtest/en-hi/dev.hi \n",
523
- " creating: benchmarks/wat2020-devtest/en-gu/\n",
524
- " inflating: benchmarks/wat2020-devtest/en-gu/dev.gu \n",
525
- " inflating: benchmarks/wat2020-devtest/en-gu/dev.en \n",
526
- " inflating: benchmarks/wat2020-devtest/en-gu/test.en \n",
527
- " inflating: benchmarks/wat2020-devtest/en-gu/test.gu \n",
528
- " creating: benchmarks/wat2020-devtest/en-ml/\n",
529
- " inflating: benchmarks/wat2020-devtest/en-ml/dev.en \n",
530
- " inflating: benchmarks/wat2020-devtest/en-ml/test.en \n",
531
- " inflating: benchmarks/wat2020-devtest/en-ml/dev.ml \n",
532
- " inflating: benchmarks/wat2020-devtest/en-ml/test.ml \n",
533
- " creating: benchmarks/ufal-ta/\n",
534
- " creating: benchmarks/ufal-ta/en-ta/\n",
535
- " inflating: benchmarks/ufal-ta/en-ta/dev.en \n",
536
- " inflating: benchmarks/ufal-ta/en-ta/dev.ta \n",
537
- " inflating: benchmarks/ufal-ta/en-ta/test.en \n",
538
- " inflating: benchmarks/ufal-ta/en-ta/test.ta \n",
539
- " creating: benchmarks/wmt-news/\n",
540
- " creating: benchmarks/wmt-news/en-ta/\n",
541
- " inflating: benchmarks/wmt-news/en-ta/dev.en \n",
542
- " inflating: benchmarks/wmt-news/en-ta/dev.ta \n",
543
- " inflating: benchmarks/wmt-news/en-ta/test.en \n",
544
- " inflating: benchmarks/wmt-news/en-ta/test.ta \n",
545
- " creating: benchmarks/wmt-news/en-hi/\n",
546
- " inflating: benchmarks/wmt-news/en-hi/dev.en \n",
547
- " inflating: benchmarks/wmt-news/en-hi/test.hi \n",
548
- " inflating: benchmarks/wmt-news/en-hi/test.en \n",
549
- " inflating: benchmarks/wmt-news/en-hi/dev.hi \n",
550
- " creating: benchmarks/wmt-news/en-gu/\n",
551
- " inflating: benchmarks/wmt-news/en-gu/test.en \n",
552
- " inflating: benchmarks/wmt-news/en-gu/test.gu \n"
553
- ]
554
- }
555
- ],
556
- "source": [
557
- "## for the latest samanantar dataset v0.3 -> please use this link: https://storage.googleapis.com/samanantar-public/V0.3/source_wise_splits.zip\n",
558
- "# This v0.3 dataset has source-wise splits to indicate where the data was collected from\n",
559
- "# For preprocessing simplicity, we will use v0.2 (which just uses raw text files without source information) in this tutorial\n",
560
- "# \n",
561
- "# \n",
562
- "# let's now download the Samanantar en-indic v0.2 dataset\n",
563
- "! wget https://storage.googleapis.com/samanantar-public/V0.2/data/en2indic/samanatar-en-indic-v0.2.zip\n",
564
- "\n",
565
- "\n",
566
- "\n",
567
- "# let's also download the benchmarks for the dev and test sets\n",
568
- "\n",
569
- "! wget https://storage.googleapis.com/samanantar-public/benchmarks.zip\n",
570
- "\n",
571
- "# training data is organized into en-X folders, each containing two text files with parallel data for the en-X language pair.\n",
572
- "\n",
573
- "# final_data\n",
574
- "# ├── en-as\n",
575
- "# │ ├── train.as\n",
576
- "# │ └── train.en\n",
577
- "# ├── en-bn\n",
578
- "# │ ├── train.bn\n",
579
- "# │ └── train.en\n",
580
- "# ├── en-gu\n",
581
- "# │ ├── train.en\n",
582
- "# │ └── train.gu\n",
583
- "# ├── en-hi\n",
584
- "# │ ├── train.en\n",
585
- "# │ └── train.hi\n",
586
- "# ├── en-kn\n",
587
- "# │ ├── train.en\n",
588
- "# │ └── train.kn\n",
589
- "# ├── en-ml\n",
590
- "# │ ├── train.en\n",
591
- "# │ └── train.ml\n",
592
- "# ├── en-mr\n",
593
- "# │ ├── train.en\n",
594
- "# │ └── train.mr\n",
595
- "# ├── en-or\n",
596
- "# │ ├── train.en\n",
597
- "# │ └── train.or\n",
598
- "# ├── en-pa\n",
599
- "# │ ├── train.en\n",
600
- "# │ └── train.pa\n",
601
- "# ├── en-ta\n",
602
- "# │ ├── train.en\n",
603
- "# │ └── train.ta\n",
604
- "# └── en-te\n",
605
- "# ├── train.en\n",
606
- "# └── train.te\n",
607
- "\n",
608
- "\n",
609
- "! unzip samanatar-en-indic-v0.2.zip\n",
610
- "\n",
611
- "# the benchmarks folder contains all the benchmarks we report in the paper - pmi, ufal-ta, wat2020, wat2021, wmt-news\n",
612
- "\n",
613
- "! unzip benchmarks.zip"
614
- ]
615
- },
616
- {
617
- "cell_type": "code",
618
- "execution_count": null,
619
- "metadata": {
620
- "id": "MR_2GQoa84Jn"
621
- },
622
- "outputs": [],
623
- "source": [
624
- "# create an experiment dir to store the train data and devtest data.\n",
625
- "# This folder will also store vocabulary files (created with subword_nmt for bpe), fairseq bin files (for training), model checkpoints.\n",
626
- "\n",
627
- "# for this example we will train an indic-to-en translation model. We will name our exp_dir indic-en-exp\n",
628
- "! mkdir indic-en-exp\n",
629
- "# copying all the train folders to exp_dir\n",
630
- "! cp -r final_data/* indic-en-exp\n",
631
- "\n",
632
- "! mkdir -p indic-en-exp/devtest\n",
633
- "\n",
634
- "# copying all benchmarks to devtest folder in exp_dir\n",
635
- "! cp -r benchmarks/* indic-en-exp/devtest\n",
636
- "\n",
637
- "# folder to store combined devtest data (based on the domains you want to test, you can combine the dev sets of multiple benchmarks and remove duplicates)\n",
638
- "! mkdir -p indic-en-exp/devtest/all\n",
639
- "\n",
640
- "# in this tutorial, for simplicity, we will just use wat2020-devtest for the dev and test sets\n",
641
- "! cp -r indic-en-exp/devtest/wat2020-devtest/* indic-en-exp/devtest/all\n",
642
- "\n"
643
- ]
644
- },
645
- {
646
- "cell_type": "code",
647
- "execution_count": null,
648
- "metadata": {
649
- "id": "lorcT8wkFPtQ"
650
- },
651
- "outputs": [],
652
- "source": [
653
- "% cd indicTrans"
654
- ]
655
- },
656
- {
657
- "cell_type": "code",
658
- "execution_count": null,
659
- "metadata": {
660
- "id": "vhvYXUc1FaVn"
661
- },
662
- "outputs": [],
663
- "source": [
664
- "# prepare_data_joint_training.sh takes experiment dir, src_lang, tgt_lang as input \n",
665
- "# This handles preprocessing, vocabulary building, and binarization for joint training\n",
666
- "\n",
667
- "# Learning and applying the vocabulary will take a while if the dataset is huge. To speed this up, run it on a multicore system\n",
668
- "\n",
669
- "! bash prepare_data_joint_training.sh '../indic-en-exp' 'indic' 'en'"
670
- ]
671
- },
672
- {
673
- "cell_type": "code",
674
- "execution_count": null,
675
- "metadata": {
676
- "id": "p1i3fRQzF2-x"
677
- },
678
- "outputs": [],
679
- "source": [
680
- "# Training the model\n",
681
- "\n",
682
- "# please refer to the fairseq documentation to learn more about each of these options (https://fairseq.readthedocs.io/en/latest/command_line_tools.html)\n",
683
- "\n",
684
- "\n",
685
- "# some notable args:\n",
686
- "# --max-update -> maximum number of update steps the model will be trained for\n",
687
- "# --arch=transformer_4x -> we use a custom transformer model and name it transformer_4x (4 times the parameter size of transformer base)\n",
688
- "# --user-dir -> we define the custom transformer arch in the model_configs folder and pass it via --user-dir so that fairseq registers this architecture\n",
689
- "# --lr -> learning rate. From our limited experiments, we find that lower learning rates like 3e-5 work best for finetuning.\n",
690
- "# --max-tokens -> maximum tokens per batch. Lower this value if you get OOM errors.\n",
691
- "# --update-freq -> gradient accumulation steps\n",
692
- "\n",
693
- "\n",
694
- "!( fairseq-train ../indic-en-exp/final_bin \\\n",
695
- "--max-source-positions=210 \\\n",
696
- "--max-target-positions=210 \\\n",
697
- "--max-update=<max_updates> \\\n",
698
- "--save-interval=1 \\\n",
699
- "--arch=transformer_4x \\\n",
700
- "--criterion=label_smoothed_cross_entropy \\\n",
701
- "--source-lang=SRC \\\n",
702
- "--lr-scheduler=inverse_sqrt \\\n",
703
- "--target-lang=TGT \\\n",
704
- "--label-smoothing=0.1 \\\n",
705
- "--optimizer adam \\\n",
706
- "--adam-betas \"(0.9, 0.98)\" \\\n",
707
- "--clip-norm 1.0 \\\n",
708
- "--warmup-init-lr 1e-07 \\\n",
709
- "--lr 0.0005 \\\n",
710
- "--warmup-updates 4000 \\\n",
711
- "--dropout 0.2 \\\n",
712
- "--save-dir ../indic-en-exp/model \\\n",
713
- "--keep-last-epochs 5 \\\n",
714
- "--patience 5 \\\n",
715
- "--skip-invalid-size-inputs-valid-test \\\n",
716
- "--fp16 \\\n",
717
- "--user-dir model_configs \\\n",
718
- "--wandb-project <wandb_project_name> \\\n",
719
- "--update-freq=<grad_accumulation_steps> \\\n",
720
- "--distributed-world-size <num_gpus> \\\n",
721
- "--max-tokens <max_tokens_in_a_batch> )"
722
- ]
723
- }
724
- ],
725
- "metadata": {
726
- "colab": {
727
- "authorship_tag": "ABX9TyO6AA5gXphZ5kJ6h+dgeSqb",
728
- "collapsed_sections": [],
729
- "include_colab_link": true,
730
- "name": "IndicTrans_training.ipynb",
731
- "provenance": []
732
- },
733
- "kernelspec": {
734
- "display_name": "Python 3",
735
- "name": "python3"
736
- },
737
- "language_info": {
738
- "codemirror_mode": {
739
- "name": "ipython",
740
- "version": 3
741
- },
742
- "file_extension": ".py",
743
- "mimetype": "text/x-python",
744
- "name": "python",
745
- "nbconvert_exporter": "python",
746
- "pygments_lexer": "ipython3",
747
- "version": "3.7.7"
748
- }
749
- },
750
- "nbformat": 4,
751
- "nbformat_minor": 0
752
- }
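The deleted notebook's final cell leaves several placeholders in the fairseq-train command (<max_updates>, <wandb_project_name>, <grad_accumulation_steps>, <num_gpus>, <max_tokens_in_a_batch>). For reference, a minimal sketch of a filled-in invocation, assuming a single-GPU run; the numeric values below are illustrative assumptions, not values from the original notebook, and the optional --wandb-project flag is omitted:

    # Hypothetical single-GPU run; every numeric value below is an assumption.
    fairseq-train ../indic-en-exp/final_bin \
        --max-source-positions=210 --max-target-positions=210 \
        --max-update=300000 \
        --arch=transformer_4x --user-dir model_configs \
        --criterion=label_smoothed_cross_entropy --label-smoothing=0.1 \
        --source-lang=SRC --target-lang=TGT \
        --optimizer adam --adam-betas "(0.9, 0.98)" --clip-norm 1.0 \
        --lr-scheduler=inverse_sqrt --lr 0.0005 --warmup-init-lr 1e-07 --warmup-updates 4000 \
        --dropout 0.2 --fp16 \
        --save-dir ../indic-en-exp/model --save-interval=1 --keep-last-epochs 5 --patience 5 \
        --skip-invalid-size-inputs-valid-test \
        --update-freq=2 \
        --distributed-world-size 1 \
        --max-tokens 4096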
apply_bpe_traindevtest_notag.sh DELETED
@@ -1,41 +0,0 @@
- #!/bin/bash
-
- expdir=$1 # EXPDIR
-
- SUBWORD_NMT_DIR="subword-nmt"
-
- data_dir="$expdir/data"
- mkdir -p $expdir/bpe
-
- for dset in `echo train dev test`
- do
- echo $dset
- in_dset_dir="$data_dir/$dset"
- out_dset_dir="$expdir/bpe/$dset"
- # out_dset_dir="$expdir/final/$dset"
- echo "Apply joint vocab to SRC corpus"
- # for very large datasets, use gnu-parallel to speed up applying bpe
- # uncomment the below line if the apply bpe is slow
-
- # parallel --pipe --keep-order \
- python $SUBWORD_NMT_DIR/subword_nmt/apply_bpe.py \
- -c $expdir/vocab/bpe_codes.32k.SRC_TGT \
- --vocabulary $expdir/vocab/vocab.SRC \
- --vocabulary-threshold 5 \
- --num-workers "-1" \
- < $in_dset_dir.SRC \
- > $out_dset_dir.SRC
- echo "Apply joint vocab to TGT corpus"
-
- # for very large datasets, use gnu-parallel to speed up applying bpe
- # uncomment the below line if the apply bpe is slow
-
- # parallel --pipe --keep-order \
- python $SUBWORD_NMT_DIR/subword_nmt/apply_bpe.py \
- -c $expdir/vocab/bpe_codes.32k.SRC_TGT \
- --vocabulary $expdir/vocab/vocab.TGT \
- --vocabulary-threshold 5 \
- --num-workers "-1" \
- < $in_dset_dir.TGT \
- > $out_dset_dir.TGT
- done
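The comments in the deleted script above point to GNU parallel for very large corpora ("uncomment the below line if the apply bpe is slow"). A hedged sketch of what the uncommented SRC-side call would look like, assuming gnu-parallel is installed; since parallel already fans the corpus out to separate apply_bpe.py processes, the --num-workers flag is dropped here:

    # Sketch only: stream the corpus through apply_bpe.py in parallel chunks,
    # preserving the original line order with --keep-order.
    parallel --pipe --keep-order \
        python $SUBWORD_NMT_DIR/subword_nmt/apply_bpe.py \
        -c $expdir/vocab/bpe_codes.32k.SRC_TGT \
        --vocabulary $expdir/vocab/vocab.SRC \
        --vocabulary-threshold 5 \
        < $in_dset_dir.SRC \
        > $out_dset_dir.SRC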
apply_single_bpe_traindevtest_notag.sh DELETED
@@ -1,40 +0,0 @@
- #!/bin/bash
-
- expdir=$1 # EXPDIR
-
- SUBWORD_NMT_DIR="subword-nmt"
-
- data_dir="$expdir/data"
- mkdir -p $expdir/bpe
-
- for dset in `echo train dev test`
- do
- echo $dset
- in_dset_dir="$data_dir/$dset"
- out_dset_dir="$expdir/bpe/$dset"
- # out_dset_dir="$expdir/final/$dset"
- echo "Apply to SRC corpus"
- # for very large datasets, use gnu-parallel to speed up applying bpe
- # uncomment the below line if the apply bpe is slow
-
- # parallel --pipe --keep-order \
- python $SUBWORD_NMT_DIR/subword_nmt/apply_bpe.py \
- -c $expdir/vocab/bpe_codes.32k.SRC \
- --vocabulary $expdir/vocab/vocab.SRC \
- --vocabulary-threshold 5 \
- --num-workers "-1" \
- < $in_dset_dir.SRC \
- > $out_dset_dir.SRC
- echo "Apply to TGT corpus"
- # for very large datasets, use gnu-parallel to speed up applying bpe
- # uncomment the below line if the apply bpe is slow
-
- # parallel --pipe --keep-order \
- python $SUBWORD_NMT_DIR/subword_nmt/apply_bpe.py \
- -c $expdir/vocab/bpe_codes.32k.TGT \
- --vocabulary $expdir/vocab/vocab.TGT \
- --vocabulary-threshold 5 \
- --num-workers "-1" \
- < $in_dset_dir.TGT \
- > $out_dset_dir.TGT
- done
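A hedged usage sketch for this separate-vocabulary variant; the experiment-directory argument is an assumed example, not a documented default:

    # Expects $expdir/data/{train,dev,test}.SRC and .TGT plus separate
    # bpe_codes.32k.SRC and bpe_codes.32k.TGT under $expdir/vocab;
    # BPE-segmented output lands in $expdir/bpe/.
    bash apply_single_bpe_traindevtest_notag.sh ../indic-en-exp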
binarize_training_exp.sh DELETED
@@ -1,24 +0,0 @@
- #!/bin/bash
-
- exp_dir=$1
- src_lang=$2
- tgt_lang=$3
-
- # use cpu_count to get num_workers instead of setting it manually when running on different
- # instances
- num_workers=`python -c "import multiprocessing; print(multiprocessing.cpu_count())"`
-
- data_dir=$exp_dir/final
- out_data_dir=$exp_dir/final_bin
-
- rm -rf $out_data_dir
-
- fairseq-preprocess \
- --source-lang $src_lang --target-lang $tgt_lang \
- --trainpref $data_dir/train \
- --validpref $data_dir/dev \
- --testpref $data_dir/test \
- --destdir $out_data_dir \
- --workers $num_workers \
- --thresholdtgt 5 \
- --thresholdsrc 5
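A hedged usage sketch for the deleted binarization script; the SRC/TGT arguments and the ../indic-en-exp path mirror the joint-training setup in the deleted training notebook and are assumptions for illustration:

    # Expects $exp_dir/final/{train,dev,test} prefixes produced by the
    # preprocessing step; writes fairseq-preprocess output to $exp_dir/final_bin.
    bash binarize_training_exp.sh ../indic-en-exp SRC TGT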
compute_bleu.sh DELETED
@@ -1,28 +0,0 @@
- pred_fname=$1
- ref_fname=$2
- src_lang=$3
- tgt_lang=$4
-
- # we compute and report tokenized BLEU scores.
- # For computing BLEU scores, systems should produce detokenized output. Your MT system might already do this out of the box if you are using SentencePiece - nothing to do in that case.
- # If you are using BPE, then:
- # 1. For English, you can use MosesDetokenizer (either the scripts in moses or the sacremoses python package)
- # 2. For Indian languages, you can use the IndicNLP library detokenizer (note: please don't skip this step, since detokenization/tokenization are not guaranteed to be reversible).
- # ^ both 1. and 2. are handled by scripts/postprocess_translate.py
-
-
- # For computing BLEU, we use sacrebleu:
- # For English output: sacrebleu reffile < outputfile. This internally tokenizes using mteval-v13a
- # For Indian language output, we need tokenized output and reference since we don't know how well the sacrebleu tokenizer works for Indic input.
- # Hence we tokenize both the prediction and reference files with the IndicNLP tokenizer and then run: sacrebleu --tokenize none reffile < outputfile
- if [ $tgt_lang == 'en' ]; then
- # indic to en models
- sacrebleu $ref_fname < $pred_fname
- else
- # indicnlp tokenize predictions and reference files before evaluation
- input_size=`python scripts/preprocess_translate.py $ref_fname $ref_fname.tok $tgt_lang`
- input_size=`python scripts/preprocess_translate.py $pred_fname $pred_fname.tok $tgt_lang`
-
- # since we are tokenizing with indicnlp separately, we set tokenize to none here
- sacrebleu --tokenize none $ref_fname.tok < $pred_fname.tok
- fi
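A hedged usage sketch for the deleted evaluation script, run from the indicTrans repo root (it calls scripts/preprocess_translate.py by relative path); the prediction file names are hypothetical, and the reference paths assume the wat2020-devtest benchmark layout shown in the training notebook:

    # Indic->En: sacrebleu scores the detokenized output with its default tokenizer
    bash compute_bleu.sh predictions/test.pred.en ../indic-en-exp/devtest/all/en-hi/test.en hi en
    # En->Indic: both files are IndicNLP-tokenized first, then scored with --tokenize none
    bash compute_bleu.sh predictions/test.pred.hi ../indic-en-exp/devtest/all/en-hi/test.hi en hi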
indicTrans_Finetuning.ipynb DELETED
The diff for this file is too large to render. See raw diff
 
indicTrans_python_interface.ipynb DELETED
@@ -1,462 +0,0 @@
1
- {
2
- "cells": [
3
- {
4
- "cell_type": "markdown",
5
- "metadata": {
6
- "colab_type": "text",
7
- "id": "view-in-github"
8
- },
9
- "source": [
10
- "<a href=\"https://colab.research.google.com/github/gowtham1997/indicTrans-1/blob/main/indicTrans_python_interface.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
11
- ]
12
- },
13
- {
14
- "cell_type": "code",
15
- "execution_count": 1,
16
- "metadata": {
17
- "colab": {
18
- "base_uri": "https://localhost:8080/"
19
- },
20
- "id": "CjfzxXZLHed_",
21
- "outputId": "69a66b95-41b2-4413-82d1-0caacbddb3f3"
22
- },
23
- "outputs": [
24
- {
25
- "name": "stdout",
26
- "output_type": "stream",
27
- "text": [
28
- "Cloning into 'indicTrans-1'...\n",
29
- "remote: Enumerating objects: 486, done.\u001b[K\n",
30
- "remote: Counting objects: 100% (189/189), done.\u001b[K\n",
31
- "remote: Compressing objects: 100% (67/67), done.\u001b[K\n",
32
- "remote: Total 486 (delta 154), reused 134 (delta 121), pack-reused 297\u001b[K\n",
33
- "Receiving objects: 100% (486/486), 1.48 MiB | 17.61 MiB/s, done.\n",
34
- "Resolving deltas: 100% (281/281), done.\n",
35
- "/content/indicTrans\n",
36
- "Cloning into 'indic_nlp_library'...\n",
37
- "remote: Enumerating objects: 1325, done.\u001b[K\n",
38
- "remote: Counting objects: 100% (147/147), done.\u001b[K\n",
39
- "remote: Compressing objects: 100% (103/103), done.\u001b[K\n",
40
- "remote: Total 1325 (delta 84), reused 89 (delta 41), pack-reused 1178\u001b[K\n",
41
- "Receiving objects: 100% (1325/1325), 9.57 MiB | 13.55 MiB/s, done.\n",
42
- "Resolving deltas: 100% (688/688), done.\n",
43
- "Cloning into 'indic_nlp_resources'...\n",
44
- "remote: Enumerating objects: 133, done.\u001b[K\n",
45
- "remote: Counting objects: 100% (7/7), done.\u001b[K\n",
46
- "remote: Compressing objects: 100% (7/7), done.\u001b[K\n",
47
- "remote: Total 133 (delta 0), reused 2 (delta 0), pack-reused 126\u001b[K\n",
48
- "Receiving objects: 100% (133/133), 149.77 MiB | 33.48 MiB/s, done.\n",
49
- "Resolving deltas: 100% (51/51), done.\n",
50
- "Checking out files: 100% (28/28), done.\n",
51
- "Cloning into 'subword-nmt'...\n",
52
- "remote: Enumerating objects: 580, done.\u001b[K\n",
53
- "remote: Counting objects: 100% (4/4), done.\u001b[K\n",
54
- "remote: Compressing objects: 100% (4/4), done.\u001b[K\n",
55
- "remote: Total 580 (delta 0), reused 1 (delta 0), pack-reused 576\u001b[K\n",
56
- "Receiving objects: 100% (580/580), 237.41 KiB | 18.26 MiB/s, done.\n",
57
- "Resolving deltas: 100% (349/349), done.\n",
58
- "/content\n"
59
- ]
60
- }
61
- ],
62
- "source": [
63
- "# clone the repo for running evaluation\n",
64
- "!git clone https://github.com/AI4Bharat/indicTrans.git\n",
65
- "%cd indicTrans\n",
66
- "# clone requirements repositories\n",
67
- "!git clone https://github.com/anoopkunchukuttan/indic_nlp_library.git\n",
68
- "!git clone https://github.com/anoopkunchukuttan/indic_nlp_resources.git\n",
69
- "!git clone https://github.com/rsennrich/subword-nmt.git\n",
70
- "%cd .."
71
- ]
72
- },
73
- {
74
- "cell_type": "code",
75
- "execution_count": 2,
76
- "metadata": {
77
- "colab": {
78
- "base_uri": "https://localhost:8080/"
79
- },
80
- "id": "IeYW2BJhlJvx",
81
- "outputId": "3357bc85-44d8-43b0-8c64-eef9f18be716"
82
- },
83
- "outputs": [
84
- {
85
- "name": "stdout",
86
- "output_type": "stream",
87
- "text": [
88
- "Collecting sacremoses\n",
89
- "\u001b[?25l Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)\n",
90
- "\r\u001b[K |▍ | 10kB 14.0MB/s eta 0:00:01\r\u001b[K |▊ | 20kB 18.8MB/s eta 0:00:01\r\u001b[K |█ | 30kB 22.5MB/s eta 0:00:01\r\u001b[K |█▌ | 40kB 25.7MB/s eta 0:00:01\r\u001b[K |█▉ | 51kB 27.6MB/s eta 0:00:01\r\u001b[K |██▏ | 61kB 29.2MB/s eta 0:00:01\r\u001b[K |██▋ | 71kB 27.3MB/s eta 0:00:01\r\u001b[K |███ | 81kB 27.7MB/s eta 0:00:01\r\u001b[K |███▎ | 92kB 28.8MB/s eta 0:00:01\r\u001b[K |███▋ | 102kB 29.9MB/s eta 0:00:01\r\u001b[K |████ | 112kB 29.9MB/s eta 0:00:01\r\u001b[K |████▍ | 122kB 29.9MB/s eta 0:00:01\r\u001b[K |████▊ | 133kB 29.9MB/s eta 0:00:01\r\u001b[K |█████▏ | 143kB 29.9MB/s eta 0:00:01\r\u001b[K |█████▌ | 153kB 29.9MB/s eta 0:00:01\r\u001b[K |█████▉ | 163kB 29.9MB/s eta 0:00:01\r\u001b[K |██████▎ | 174kB 29.9MB/s eta 0:00:01\r\u001b[K |██████▋ | 184kB 29.9MB/s eta 0:00:01\r\u001b[K |███████ | 194kB 29.9MB/s eta 0:00:01\r\u001b[K |███████▎ | 204kB 29.9MB/s eta 0:00:01\r\u001b[K |███████▊ | 215kB 29.9MB/s eta 0:00:01\r\u001b[K |████████ | 225kB 29.9MB/s eta 0:00:01\r\u001b[K |████████▍ | 235kB 29.9MB/s eta 0:00:01\r\u001b[K |████████▉ | 245kB 29.9MB/s eta 0:00:01\r\u001b[K |█████████▏ | 256kB 29.9MB/s eta 0:00:01\r\u001b[K |█████████▌ | 266kB 29.9MB/s eta 0:00:01\r\u001b[K |█████████▉ | 276kB 29.9MB/s eta 0:00:01\r\u001b[K |██████████▎ | 286kB 29.9MB/s eta 0:00:01\r\u001b[K |██████████▋ | 296kB 29.9MB/s eta 0:00:01\r\u001b[K |███████████ | 307kB 29.9MB/s eta 0:00:01\r\u001b[K |███████████▍ | 317kB 29.9MB/s eta 0:00:01\r\u001b[K |███████████▊ | 327kB 29.9MB/s eta 0:00:01\r\u001b[K |████████████ | 337kB 29.9MB/s eta 0:00:01\r\u001b[K |████████████▌ | 348kB 29.9MB/s eta 0:00:01\r\u001b[K |████████████▉ | 358kB 29.9MB/s eta 0:00:01\r\u001b[K |█████████████▏ | 368kB 29.9MB/s eta 0:00:01\r\u001b[K |█████████████▌ | 378kB 29.9MB/s eta 0:00:01\r\u001b[K |██████████████ | 389kB 29.9MB/s eta 0:00:01\r\u001b[K |██████████████▎ | 399kB 29.9MB/s eta 0:00:01\r\u001b[K |██████████████▋ | 409kB 29.9MB/s eta 0:00:01\r\u001b[K |███████████████ | 419kB 29.9MB/s eta 0:00:01\r\u001b[K |███████████████▍ | 430kB 29.9MB/s eta 0:00:01\r\u001b[K |███████████████▊ | 440kB 29.9MB/s eta 0:00:01\r\u001b[K |████████████████ | 450kB 29.9MB/s eta 0:00:01\r\u001b[K |████████████████▌ | 460kB 29.9MB/s eta 0:00:01\r\u001b[K |████████████████▉ | 471kB 29.9MB/s eta 0:00:01\r\u001b[K |█████████████████▏ | 481kB 29.9MB/s eta 0:00:01\r\u001b[K |█████████████████▋ | 491kB 29.9MB/s eta 0:00:01\r\u001b[K |██████████████████ | 501kB 29.9MB/s eta 0:00:01\r\u001b[K |██████████████████▎ | 512kB 29.9MB/s eta 0:00:01\r\u001b[K |██████████████████▊ | 522kB 29.9MB/s eta 0:00:01\r\u001b[K |███████████████████ | 532kB 29.9MB/s eta 0:00:01\r\u001b[K |███████████████████▍ | 542kB 29.9MB/s eta 0:00:01\r\u001b[K |███████████████████▊ | 552kB 29.9MB/s eta 0:00:01\r\u001b[K |████████████████████▏ | 563kB 29.9MB/s eta 0:00:01\r\u001b[K |████████████████████▌ | 573kB 29.9MB/s eta 0:00:01\r\u001b[K |████████████████████▉ | 583kB 29.9MB/s eta 0:00:01\r\u001b[K |█████████████████████▎ | 593kB 29.9MB/s eta 0:00:01\r\u001b[K |█████████████████████▋ | 604kB 29.9MB/s eta 0:00:01\r\u001b[K |██████████████████████ | 614kB 29.9MB/s eta 0:00:01\r\u001b[K |██████████████████████▎ | 624kB 29.9MB/s eta 0:00:01\r\u001b[K |██████████████████████▊ | 634kB 29.9MB/s eta 0:00:01\r\u001b[K |███████████████████████ | 645kB 29.9MB/s eta 0:00:01\r\u001b[K |███████████████████████▍ | 655kB 29.9MB/s eta 0:00:01\r\u001b[K |███████████████████████▉ | 665kB 29.9MB/s eta 0:00:01\r\u001b[K |████████████████████████▏ | 675kB 29.9MB/s eta 0:00:01\r\u001b[K 
|████████████████████████▌ | 686kB 29.9MB/s eta 0:00:01\r\u001b[K |█████████████████████████ | 696kB 29.9MB/s eta 0:00:01\r\u001b[K |█████████████████████████▎ | 706kB 29.9MB/s eta 0:00:01\r\u001b[K |█████████████████████████▋ | 716kB 29.9MB/s eta 0:00:01\r\u001b[K |██████████████████████████ | 727kB 29.9MB/s eta 0:00:01\r\u001b[K |██████████████████████████▍ | 737kB 29.9MB/s eta 0:00:01\r\u001b[K |██████████████████████████▊ | 747kB 29.9MB/s eta 0:00:01\r\u001b[K |███████████████████████████ | 757kB 29.9MB/s eta 0:00:01\r\u001b[K |███████████████████████████▌ | 768kB 29.9MB/s eta 0:00:01\r\u001b[K |███████████████████████████▉ | 778kB 29.9MB/s eta 0:00:01\r\u001b[K |████████████████████████████▏ | 788kB 29.9MB/s eta 0:00:01\r\u001b[K |████████████████████████████▌ | 798kB 29.9MB/s eta 0:00:01\r\u001b[K |█████████████████████████████ | 808kB 29.9MB/s eta 0:00:01\r\u001b[K |█████████████████████████████▎ | 819kB 29.9MB/s eta 0:00:01\r\u001b[K |█████████████████████████████▋ | 829kB 29.9MB/s eta 0:00:01\r\u001b[K |██████████████████████████████ | 839kB 29.9MB/s eta 0:00:01\r\u001b[K |██████████████████████████████▍ | 849kB 29.9MB/s eta 0:00:01\r\u001b[K |██████████████████████████████▊ | 860kB 29.9MB/s eta 0:00:01\r\u001b[K |███████████████████████████████▏| 870kB 29.9MB/s eta 0:00:01\r\u001b[K |███████████████████████████████▌| 880kB 29.9MB/s eta 0:00:01\r\u001b[K |███████████████████████████████▉| 890kB 29.9MB/s eta 0:00:01\r\u001b[K |████████████████████████████████| 901kB 29.9MB/s \n",
91
- "\u001b[?25hRequirement already satisfied: pandas in /usr/local/lib/python3.7/dist-packages (1.1.5)\n",
92
- "Collecting mock\n",
93
- " Downloading https://files.pythonhosted.org/packages/5c/03/b7e605db4a57c0f6fba744b11ef3ddf4ddebcada35022927a2b5fc623fdf/mock-4.0.3-py3-none-any.whl\n",
94
- "Collecting sacrebleu\n",
95
- "\u001b[?25l Downloading https://files.pythonhosted.org/packages/7e/57/0c7ca4e31a126189dab99c19951910bd081dea5bbd25f24b77107750eae7/sacrebleu-1.5.1-py3-none-any.whl (54kB)\n",
96
- "\u001b[K |████████████████████████████████| 61kB 7.5MB/s \n",
97
- "\u001b[?25hCollecting tensorboardX\n",
98
- "\u001b[?25l Downloading https://files.pythonhosted.org/packages/42/36/2b147652c40c3a858efa0afbf7b8236fae968e88ff530511a4cfa299a506/tensorboardX-2.3-py2.py3-none-any.whl (124kB)\n",
99
- "\u001b[K |████████████████████████████████| 133kB 47.5MB/s \n",
100
- "\u001b[?25hRequirement already satisfied: pyarrow in /usr/local/lib/python3.7/dist-packages (3.0.0)\n",
101
- "Collecting indic-nlp-library\n",
102
- "\u001b[?25l Downloading https://files.pythonhosted.org/packages/84/d4/495bb43b88a2a6d04b09c29fc5115f24872af74cd8317fe84026abd4ddb1/indic_nlp_library-0.81-py3-none-any.whl (40kB)\n",
103
- "\u001b[K |████████████████████████████████| 40kB 5.2MB/s \n",
104
- "\u001b[?25hRequirement already satisfied: tqdm in /usr/local/lib/python3.7/dist-packages (from sacremoses) (4.41.1)\n",
105
- "Requirement already satisfied: six in /usr/local/lib/python3.7/dist-packages (from sacremoses) (1.15.0)\n",
106
- "Requirement already satisfied: joblib in /usr/local/lib/python3.7/dist-packages (from sacremoses) (1.0.1)\n",
107
- "Requirement already satisfied: regex in /usr/local/lib/python3.7/dist-packages (from sacremoses) (2019.12.20)\n",
108
- "Requirement already satisfied: click in /usr/local/lib/python3.7/dist-packages (from sacremoses) (7.1.2)\n",
109
- "Requirement already satisfied: numpy>=1.15.4 in /usr/local/lib/python3.7/dist-packages (from pandas) (1.19.5)\n",
110
- "Requirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.7/dist-packages (from pandas) (2.8.1)\n",
111
- "Requirement already satisfied: pytz>=2017.2 in /usr/local/lib/python3.7/dist-packages (from pandas) (2018.9)\n",
112
- "Collecting portalocker==2.0.0\n",
113
- " Downloading https://files.pythonhosted.org/packages/89/a6/3814b7107e0788040870e8825eebf214d72166adf656ba7d4bf14759a06a/portalocker-2.0.0-py2.py3-none-any.whl\n",
114
- "Requirement already satisfied: protobuf>=3.8.0 in /usr/local/lib/python3.7/dist-packages (from tensorboardX) (3.12.4)\n",
115
- "Collecting sphinx-rtd-theme\n",
116
- "\u001b[?25l Downloading https://files.pythonhosted.org/packages/ac/24/2475e8f83519b54b2148d4a56eb1111f9cec630d088c3ffc214492c12107/sphinx_rtd_theme-0.5.2-py2.py3-none-any.whl (9.1MB)\n",
117
- "\u001b[K |████████████████████████████████| 9.2MB 42.0MB/s \n",
118
- "\u001b[?25hCollecting morfessor\n",
119
- " Downloading https://files.pythonhosted.org/packages/39/e6/7afea30be2ee4d29ce9de0fa53acbb033163615f849515c0b1956ad074ee/Morfessor-2.0.6-py3-none-any.whl\n",
120
- "Collecting sphinx-argparse\n",
121
- " Downloading https://files.pythonhosted.org/packages/06/2b/dfad6a1831c3aeeae25d8d3d417224684befbf45e10c7f2141631616a6ed/sphinx-argparse-0.2.5.tar.gz\n",
122
- "Requirement already satisfied: setuptools in /usr/local/lib/python3.7/dist-packages (from protobuf>=3.8.0->tensorboardX) (57.0.0)\n",
123
- "Requirement already satisfied: sphinx in /usr/local/lib/python3.7/dist-packages (from sphinx-rtd-theme->indic-nlp-library) (1.8.5)\n",
124
- "Collecting docutils<0.17\n",
125
- "\u001b[?25l Downloading https://files.pythonhosted.org/packages/81/44/8a15e45ffa96e6cf82956dd8d7af9e666357e16b0d93b253903475ee947f/docutils-0.16-py2.py3-none-any.whl (548kB)\n",
126
- "\u001b[K |████████████████████████████████| 552kB 31.5MB/s \n",
127
- "\u001b[?25hRequirement already satisfied: sphinxcontrib-websupport in /usr/local/lib/python3.7/dist-packages (from sphinx->sphinx-rtd-theme->indic-nlp-library) (1.2.4)\n",
128
- "Requirement already satisfied: snowballstemmer>=1.1 in /usr/local/lib/python3.7/dist-packages (from sphinx->sphinx-rtd-theme->indic-nlp-library) (2.1.0)\n",
129
- "Requirement already satisfied: Jinja2>=2.3 in /usr/local/lib/python3.7/dist-packages (from sphinx->sphinx-rtd-theme->indic-nlp-library) (2.11.3)\n",
130
- "Requirement already satisfied: packaging in /usr/local/lib/python3.7/dist-packages (from sphinx->sphinx-rtd-theme->indic-nlp-library) (20.9)\n",
131
- "Requirement already satisfied: alabaster<0.8,>=0.7 in /usr/local/lib/python3.7/dist-packages (from sphinx->sphinx-rtd-theme->indic-nlp-library) (0.7.12)\n",
132
- "Requirement already satisfied: imagesize in /usr/local/lib/python3.7/dist-packages (from sphinx->sphinx-rtd-theme->indic-nlp-library) (1.2.0)\n",
133
- "Requirement already satisfied: Pygments>=2.0 in /usr/local/lib/python3.7/dist-packages (from sphinx->sphinx-rtd-theme->indic-nlp-library) (2.6.1)\n",
134
- "Requirement already satisfied: requests>=2.0.0 in /usr/local/lib/python3.7/dist-packages (from sphinx->sphinx-rtd-theme->indic-nlp-library) (2.23.0)\n",
135
- "Requirement already satisfied: babel!=2.0,>=1.3 in /usr/local/lib/python3.7/dist-packages (from sphinx->sphinx-rtd-theme->indic-nlp-library) (2.9.1)\n",
136
- "Requirement already satisfied: sphinxcontrib-serializinghtml in /usr/local/lib/python3.7/dist-packages (from sphinxcontrib-websupport->sphinx->sphinx-rtd-theme->indic-nlp-library) (1.1.5)\n",
137
- "Requirement already satisfied: MarkupSafe>=0.23 in /usr/local/lib/python3.7/dist-packages (from Jinja2>=2.3->sphinx->sphinx-rtd-theme->indic-nlp-library) (2.0.1)\n",
138
- "Requirement already satisfied: pyparsing>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from packaging->sphinx->sphinx-rtd-theme->indic-nlp-library) (2.4.7)\n",
139
- "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests>=2.0.0->sphinx->sphinx-rtd-theme->indic-nlp-library) (2.10)\n",
140
- "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests>=2.0.0->sphinx->sphinx-rtd-theme->indic-nlp-library) (3.0.4)\n",
141
- "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests>=2.0.0->sphinx->sphinx-rtd-theme->indic-nlp-library) (1.24.3)\n",
142
- "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests>=2.0.0->sphinx->sphinx-rtd-theme->indic-nlp-library) (2021.5.30)\n",
143
- "Building wheels for collected packages: sphinx-argparse\n",
144
- " Building wheel for sphinx-argparse (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
145
- " Created wheel for sphinx-argparse: filename=sphinx_argparse-0.2.5-cp37-none-any.whl size=11552 sha256=d8cbdca000085e2e2c122c305bb21aa76a9600012ded8e06c300e03d1c4d1e32\n",
146
- " Stored in directory: /root/.cache/pip/wheels/2a/18/1b/4990a1859da4edc77ab312bc2986c08d2733fb5713d06e44f5\n",
147
- "Successfully built sphinx-argparse\n",
148
- "\u001b[31mERROR: datascience 0.10.6 has requirement folium==0.2.1, but you'll have folium 0.8.3 which is incompatible.\u001b[0m\n",
149
- "Installing collected packages: sacremoses, mock, portalocker, sacrebleu, tensorboardX, docutils, sphinx-rtd-theme, morfessor, sphinx-argparse, indic-nlp-library\n",
150
- " Found existing installation: docutils 0.17.1\n",
151
- " Uninstalling docutils-0.17.1:\n",
152
- " Successfully uninstalled docutils-0.17.1\n",
153
- "Successfully installed docutils-0.16 indic-nlp-library-0.81 mock-4.0.3 morfessor-2.0.6 portalocker-2.0.0 sacrebleu-1.5.1 sacremoses-0.0.45 sphinx-argparse-0.2.5 sphinx-rtd-theme-0.5.2 tensorboardX-2.3\n",
154
- "Collecting mosestokenizer\n",
155
- " Downloading https://files.pythonhosted.org/packages/4b/b3/c0af235b16c4f44a2828ef017f7947d1262b2646e440f85c6a2ff26a8c6f/mosestokenizer-1.1.0.tar.gz\n",
156
- "Collecting subword-nmt\n",
157
- " Downloading https://files.pythonhosted.org/packages/74/60/6600a7bc09e7ab38bc53a48a20d8cae49b837f93f5842a41fe513a694912/subword_nmt-0.3.7-py2.py3-none-any.whl\n",
158
- "Requirement already satisfied: docopt in /usr/local/lib/python3.7/dist-packages (from mosestokenizer) (0.6.2)\n",
159
- "Collecting openfile\n",
160
- " Downloading https://files.pythonhosted.org/packages/93/e6/805db6867faacb488b44ba8e0829ef4de151dd0499f3c5da5f4ad11698a7/openfile-0.0.7-py3-none-any.whl\n",
161
- "Collecting uctools\n",
162
- " Downloading https://files.pythonhosted.org/packages/04/cb/70ed842d9a43460eedaa11f7503b4ab6537b43b63f0d854d59d8e150fac1/uctools-1.3.0.tar.gz\n",
163
- "Collecting toolwrapper\n",
164
- " Downloading https://files.pythonhosted.org/packages/41/7b/34bf8fb69426d8a18bcc61081e9d126f4fcd41c3c832072bef39af1602cd/toolwrapper-2.1.0.tar.gz\n",
165
- "Building wheels for collected packages: mosestokenizer, uctools, toolwrapper\n",
166
- " Building wheel for mosestokenizer (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
167
- " Created wheel for mosestokenizer: filename=mosestokenizer-1.1.0-cp37-none-any.whl size=49120 sha256=4fc04046040e73bd5d13c606ebbfc65ac38c7d073f7fc0b0e4cc1d4215b595f3\n",
168
- " Stored in directory: /root/.cache/pip/wheels/a2/e7/48/48d5e0f9c0cd5def2dfd7cb8543945f906448ed1313de24a29\n",
169
- " Building wheel for uctools (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
170
- " Created wheel for uctools: filename=uctools-1.3.0-cp37-none-any.whl size=6163 sha256=c5a865107c59f98c4da5d18ddc754fa141ab494574187281de1502561c6a004e\n",
171
- " Stored in directory: /root/.cache/pip/wheels/06/b6/8f/935d5bf5bca85d47c6f5ec31641879bba057d336ab36b1e773\n",
172
- " Building wheel for toolwrapper (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
173
- " Created wheel for toolwrapper: filename=toolwrapper-2.1.0-cp37-none-any.whl size=3356 sha256=41a3e12078d5681e8467701735208d880ba588b0f5dbfb3b99c4e04bc643eccc\n",
174
- " Stored in directory: /root/.cache/pip/wheels/84/ea/29/e02f3b855bf19344972092873a1091b329309bbc3d3d0cbaef\n",
175
- "Successfully built mosestokenizer uctools toolwrapper\n",
176
- "Installing collected packages: openfile, uctools, toolwrapper, mosestokenizer, subword-nmt\n",
177
- "Successfully installed mosestokenizer-1.1.0 openfile-0.0.7 subword-nmt-0.3.7 toolwrapper-2.1.0 uctools-1.3.0\n",
178
- "Cloning into 'fairseq'...\n",
179
- "remote: Enumerating objects: 28410, done.\u001b[K\n",
180
- "remote: Counting objects: 100% (229/229), done.\u001b[K\n",
181
- "remote: Compressing objects: 100% (127/127), done.\u001b[K\n",
182
- "remote: Total 28410 (delta 114), reused 187 (delta 99), pack-reused 28181\u001b[K\n",
183
- "Receiving objects: 100% (28410/28410), 11.96 MiB | 24.16 MiB/s, done.\n",
184
- "Resolving deltas: 100% (21310/21310), done.\n",
185
- "/content/fairseq\n",
186
- "Obtaining file:///content/fairseq\n",
187
- " Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n",
188
- " Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n",
189
- " Installing backend dependencies ... \u001b[?25l\u001b[?25hdone\n",
190
- " Preparing wheel metadata ... \u001b[?25l\u001b[?25hdone\n",
191
- "Requirement already satisfied: cffi in /usr/local/lib/python3.7/dist-packages (from fairseq==1.0.0a0+f887152) (1.14.5)\n",
192
- "Collecting hydra-core<1.1\n",
193
- "\u001b[?25l Downloading https://files.pythonhosted.org/packages/52/e3/fbd70dd0d3ce4d1d75c22d56c0c9f895cfa7ed6587a9ffb821d6812d6a60/hydra_core-1.0.6-py3-none-any.whl (123kB)\n",
194
- "\u001b[K |████████████████████████████████| 133kB 11.6MB/s \n",
195
- "\u001b[?25hRequirement already satisfied: tqdm in /usr/local/lib/python3.7/dist-packages (from fairseq==1.0.0a0+f887152) (4.41.1)\n",
196
- "Collecting omegaconf<2.1\n",
197
- " Downloading https://files.pythonhosted.org/packages/d0/eb/9d63ce09dd8aa85767c65668d5414958ea29648a0eec80a4a7d311ec2684/omegaconf-2.0.6-py3-none-any.whl\n",
198
- "Requirement already satisfied: regex in /usr/local/lib/python3.7/dist-packages (from fairseq==1.0.0a0+f887152) (2019.12.20)\n",
199
- "Requirement already satisfied: numpy; python_version >= \"3.7\" in /usr/local/lib/python3.7/dist-packages (from fairseq==1.0.0a0+f887152) (1.19.5)\n",
200
- "Requirement already satisfied: torch in /usr/local/lib/python3.7/dist-packages (from fairseq==1.0.0a0+f887152) (1.9.0+cu102)\n",
201
- "Requirement already satisfied: sacrebleu>=1.4.12 in /usr/local/lib/python3.7/dist-packages (from fairseq==1.0.0a0+f887152) (1.5.1)\n",
202
- "Requirement already satisfied: cython in /usr/local/lib/python3.7/dist-packages (from fairseq==1.0.0a0+f887152) (0.29.23)\n",
203
- "Requirement already satisfied: pycparser in /usr/local/lib/python3.7/dist-packages (from cffi->fairseq==1.0.0a0+f887152) (2.20)\n",
204
- "Collecting antlr4-python3-runtime==4.8\n",
205
- "\u001b[?25l Downloading https://files.pythonhosted.org/packages/56/02/789a0bddf9c9b31b14c3e79ec22b9656185a803dc31c15f006f9855ece0d/antlr4-python3-runtime-4.8.tar.gz (112kB)\n",
206
- "\u001b[K |████████████████████████████████| 112kB 33.5MB/s \n",
207
- "\u001b[?25hRequirement already satisfied: importlib-resources; python_version < \"3.9\" in /usr/local/lib/python3.7/dist-packages (from hydra-core<1.1->fairseq==1.0.0a0+f887152) (5.1.4)\n",
208
- "Collecting PyYAML>=5.1.*\n",
209
- "\u001b[?25l Downloading https://files.pythonhosted.org/packages/7a/a5/393c087efdc78091afa2af9f1378762f9821c9c1d7a22c5753fb5ac5f97a/PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636kB)\n",
210
- "\u001b[K |████████████████████████████████| 645kB 30.2MB/s \n",
211
- "\u001b[?25hRequirement already satisfied: typing-extensions in /usr/local/lib/python3.7/dist-packages (from omegaconf<2.1->fairseq==1.0.0a0+f887152) (3.7.4.3)\n",
212
- "Requirement already satisfied: portalocker==2.0.0 in /usr/local/lib/python3.7/dist-packages (from sacrebleu>=1.4.12->fairseq==1.0.0a0+f887152) (2.0.0)\n",
213
- "Requirement already satisfied: zipp>=3.1.0; python_version < \"3.10\" in /usr/local/lib/python3.7/dist-packages (from importlib-resources; python_version < \"3.9\"->hydra-core<1.1->fairseq==1.0.0a0+f887152) (3.4.1)\n",
214
- "Building wheels for collected packages: antlr4-python3-runtime\n",
215
- " Building wheel for antlr4-python3-runtime (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
216
- " Created wheel for antlr4-python3-runtime: filename=antlr4_python3_runtime-4.8-cp37-none-any.whl size=141231 sha256=69960f774a6fdb385fed1a63fb02ae50b57299408cfd6fb33be60d686be878b7\n",
217
- " Stored in directory: /root/.cache/pip/wheels/e3/e2/fa/b78480b448b8579ddf393bebd3f47ee23aa84c89b6a78285c8\n",
218
- "Successfully built antlr4-python3-runtime\n",
219
- "Installing collected packages: antlr4-python3-runtime, PyYAML, omegaconf, hydra-core, fairseq\n",
220
- " Found existing installation: PyYAML 3.13\n",
221
- " Uninstalling PyYAML-3.13:\n",
222
- " Successfully uninstalled PyYAML-3.13\n",
223
- " Running setup.py develop for fairseq\n",
224
- "Successfully installed PyYAML-5.4.1 antlr4-python3-runtime-4.8 fairseq hydra-core-1.0.6 omegaconf-2.0.6\n",
225
- "/content\n"
226
- ]
227
- }
228
- ],
229
- "source": [
230
- "# Install the necessary libraries\n",
231
- "!pip install sacremoses pandas mock sacrebleu tensorboardX pyarrow indic-nlp-library\n",
232
- "! pip install mosestokenizer subword-nmt\n",
233
- "# Install fairseq from source\n",
234
- "!git clone https://github.com/pytorch/fairseq.git\n",
235
- "%cd fairseq\n",
236
- "# !git checkout da9eaba12d82b9bfc1442f0e2c6fc1b895f4d35d\n",
237
- "!pip install --editable ./\n",
238
- "\n",
239
- "%cd .."
240
- ]
241
- },
242
- {
243
- "cell_type": "code",
244
- "execution_count": 1,
245
- "metadata": {
246
- "id": "TktUu9NW_PLq"
247
- },
248
- "outputs": [],
249
- "source": [
250
- "# this step is only required if you are running the code on colab\n",
251
- "# restart the runtime after running prev cell (to update). See this -> https://stackoverflow.com/questions/57838013/modulenotfounderror-after-successful-pip-install-in-google-colaboratory\n",
252
- "\n",
253
- "# this import will not work without restarting runtime\n",
254
- "from fairseq import checkpoint_utils, distributed_utils, options, tasks, utils"
255
- ]
256
- },
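A quick sketch of how that restart can also be forced from code instead of the Runtime menu, in case that is more convenient; killing the kernel process makes Colab start a fresh runtime, after which the fairseq import above works (this helper is illustrative and not part of the original cell):

# force a Colab runtime restart so the editable fairseq install is picked up
import os

def restart_runtime():
    # Colab automatically starts a fresh kernel after the current process is killed
    os.kill(os.getpid(), 9)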
257
- {
258
- "cell_type": "code",
259
- "execution_count": 9,
260
- "metadata": {
261
- "colab": {
262
- "base_uri": "https://localhost:8080/"
263
- },
264
- "id": "E_4JxNdRlPQB",
265
- "outputId": "82ab5e2f-d560-4f4e-bf3f-f1ca0a8d31b8"
266
- },
267
- "outputs": [
268
- {
269
- "name": "stdout",
270
- "output_type": "stream",
271
- "text": [
272
- "--2021-06-27 12:43:16-- https://storage.googleapis.com/samanantar-public/V0.2/models/indic-en.zip\n",
273
- "Resolving storage.googleapis.com (storage.googleapis.com)... 172.217.13.240, 172.217.15.80, 142.251.33.208, ...\n",
274
- "Connecting to storage.googleapis.com (storage.googleapis.com)|172.217.13.240|:443... connected.\n",
275
- "HTTP request sent, awaiting response... 200 OK\n",
276
- "Length: 4551079075 (4.2G) [application/zip]\n",
277
- "Saving to: ‘indic-en.zip’\n",
278
- "\n",
279
- "indic-en.zip 100%[===================>] 4.24G 28.8MB/s in 83s \n",
280
- "\n",
281
- "2021-06-27 12:44:39 (52.1 MB/s) - ‘indic-en.zip’ saved [4551079075/4551079075]\n",
282
- "\n",
283
- "Archive: indic-en.zip\n",
284
- " creating: indic-en/\n",
285
- " creating: indic-en/vocab/\n",
286
- " inflating: indic-en/vocab/bpe_codes.32k.SRC \n",
287
- " inflating: indic-en/vocab/vocab.SRC \n",
288
- " inflating: indic-en/vocab/vocab.TGT \n",
289
- " inflating: indic-en/vocab/bpe_codes.32k.TGT \n",
290
- " creating: indic-en/final_bin/\n",
291
- " inflating: indic-en/final_bin/dict.TGT.txt \n",
292
- " inflating: indic-en/final_bin/dict.SRC.txt \n",
293
- " creating: indic-en/model/\n",
294
- " inflating: indic-en/model/checkpoint_best.pt \n",
295
- "/content/indicTrans\n"
296
- ]
297
- }
298
- ],
299
- "source": [
300
- "# download the indictrans model\n",
301
- "\n",
302
- "\n",
303
- "# downloading the indic-en model\n",
304
- "!wget https://storage.googleapis.com/samanantar-public/V0.3/models/indic-en.zip\n",
305
- "!unzip indic-en.zip\n",
306
- "\n",
307
- "# downloading the en-indic model\n",
308
- "# !wget https://storage.googleapis.com/samanantar-public/V0.3/models/en-indic.zip\n",
309
- "# !unzip en-indic.zip\n",
310
- "\n",
311
- "# # downloading the indic-indic model\n",
312
- "# !wget https://storage.googleapis.com/samanantar-public/V0.3/models/m2m.zip\n",
313
- "# !unzip m2m.zip\n",
314
- "\n",
315
- "%cd indicTrans"
316
- ]
317
- },
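Before initializing the model, a quick sanity check that the unzip above produced the layout the inference code expects can save a confusing error later; a minimal sketch, with the paths taken from the unzip listing (run from /content):

# verify the unpacked indic-en directory structure
import os

expected = [
    'indic-en/vocab/bpe_codes.32k.SRC',
    'indic-en/vocab/bpe_codes.32k.TGT',
    'indic-en/final_bin/dict.SRC.txt',
    'indic-en/final_bin/dict.TGT.txt',
    'indic-en/model/checkpoint_best.pt',
]
missing = [p for p in expected if not os.path.exists(p)]
print('missing files:', missing or 'none')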
318
- {
319
- "cell_type": "code",
320
- "execution_count": 10,
321
- "metadata": {
322
- "colab": {
323
- "base_uri": "https://localhost:8080/"
324
- },
325
- "id": "yTnWbHqY01-B",
326
- "outputId": "0d075f51-097b-46ad-aade-407a4437aa62"
327
- },
328
- "outputs": [
329
- {
330
- "name": "stdout",
331
- "output_type": "stream",
332
- "text": [
333
- "Initializing vocab and bpe\n",
334
- "Initializing model for translation\n"
335
- ]
336
- }
337
- ],
338
- "source": [
339
- "from indicTrans.inference.engine import Model\n",
340
- "\n",
341
- "indic2en_model = Model(expdir='../indic-en')"
342
- ]
343
- },
344
- {
345
- "cell_type": "code",
346
- "execution_count": 11,
347
- "metadata": {
348
- "colab": {
349
- "base_uri": "https://localhost:8080/"
350
- },
351
- "id": "QTp2NOgQ__sB",
352
- "outputId": "e015a71e-8206-4e1d-cb3e-11ecb4d44f76"
353
- },
354
- "outputs": [
355
- {
356
- "name": "stderr",
357
- "output_type": "stream",
358
- "text": [
359
- "100%|██████████| 3/3 [00:00<00:00, 1225.21it/s]\n",
360
- "/usr/local/lib/python3.7/dist-packages/torch/_tensor.py:575: UserWarning: floor_divide is deprecated, and will be removed in a future version of pytorch. It currently rounds toward 0 (like the 'trunc' function NOT 'floor'). This results in incorrect rounding for negative values.\n",
361
- "To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor'). (Triggered internally at /pytorch/aten/src/ATen/native/BinaryOps.cpp:467.)\n",
362
- " return torch.floor_divide(self, other)\n"
363
- ]
364
- },
365
- {
366
- "data": {
367
- "text/plain": [
368
- "['He seems to know us.',\n",
369
- " 'I couldnt find it anywhere.',\n",
370
- " 'If someone in your neighbourhood develops these symptoms, staying at home can help prevent the spread of the coronavirus infection.']"
371
- ]
372
- },
373
- "execution_count": 11,
374
- "metadata": {
375
- "tags": []
376
- },
377
- "output_type": "execute_result"
378
- }
379
- ],
380
- "source": [
381
- "ta_sents = ['அவனுக்கு நம்மைப் தெரியும் என்று தோன்றுகிறது',\n",
382
- " \"இது எங்கே இருக்கு என்று என்னால் கண்டுபிடிக்க முடியவில்லை.\",\n",
383
- " 'உங்களுக்கு உங்கள் அருகில் இருக்கும் ஒருவருக்கோ இத்தகைய அறிகுறிகள் தென்பட்டால், வீட்டிலேயே இருப்பது, கொரோனா வைரஸ் தொற்று பிறருக்கு வராமல் தடுக்க உதவும்.']\n",
384
- "\n",
385
- "\n",
386
- "indic2en_model.batch_translate(ta_sents, 'ta', 'en')\n",
387
- "\n"
388
- ]
389
- },
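batch_translate takes the list of sentences plus source and target language codes, so the same model instance can be reused for any supported source language; a small illustrative sketch with Hindi inputs (the sentences are the same test sentences used later in this commit):

# reuse the indic-en model for a Hindi batch
hi_sents = ['तुम आज सुबह यहाँ क्यों आए?',
            'मेरे परिवार में हर कोई जल्दी उठता है।']
print(indic2en_model.batch_translate(hi_sents, 'hi', 'en'))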
390
- {
391
- "cell_type": "code",
392
- "execution_count": 13,
393
- "metadata": {
394
- "colab": {
395
- "base_uri": "https://localhost:8080/",
396
- "height": 68
397
- },
398
- "id": "VFXrCNZGEN7Z",
399
- "outputId": "f72aad17-1cc0-4774-a7ee-5b3a5d954de3"
400
- },
401
- "outputs": [
402
- {
403
- "name": "stderr",
404
- "output_type": "stream",
405
- "text": [
406
- "100%|██████████| 4/4 [00:00<00:00, 1496.76it/s]\n"
407
- ]
408
- },
409
- {
410
- "data": {
411
- "application/vnd.google.colaboratory.intrinsic+json": {
412
- "type": "string"
413
- },
414
- "text/plain": [
415
- "'The pandemic has resulted in worldwide social and economic disruption. The world is facing the worst recession since the global financial crisis. This led to the postponement or cancellation of sporting, religious, political and cultural events. Due to the fear, there was shortage of supply as more people purchased items like masks, sanitizers etc.'"
416
- ]
417
- },
418
- "execution_count": 13,
419
- "metadata": {
420
- "tags": []
421
- },
422
- "output_type": "execute_result"
423
- }
424
- ],
425
- "source": [
426
- "\n",
427
- "ta_paragraph = \"\"\"இத்தொற்றுநோய் உலகளாவிய சமூக மற்றும் பொருளாதார சீர்குலைவை ஏற்படுத்தியுள்ளது.இதனால் பெரும் பொருளாதார மந்தநிலைக்குப் பின்னர் உலகளவில் மிகப்பெரிய மந்தநிலை ஏற்பட்டுள்ளது. இது விளையாட்டு,மத, அரசியல் மற்றும் கலாச்சார நிகழ்வுகளை ஒத்திவைக்க அல்லது ரத்து செய்ய வழிவகுத்தது.\n",
428
- "அச்சம் காரணமாக முகக்கவசம், கிருமிநாசினி உள்ளிட்ட பொருட்களை அதிக நபர்கள் வாங்கியதால் விநியோகப் பற்றாக்குறை ஏற்பட்டது.\"\"\"\n",
429
- "\n",
430
- "indic2en_model.translate_paragraph(ta_paragraph, 'ta', 'en')"
431
- ]
432
- },
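translate_paragraph operates on one block of text at a time; for a longer document, one option is to split on blank lines and translate paragraph by paragraph. A minimal sketch, assuming only the translate_paragraph signature used above:

# translate a multi-paragraph document paragraph by paragraph (illustrative helper)
def translate_document(model, text, src_lang, tgt_lang):
    paragraphs = [p.strip() for p in text.split('\n\n') if p.strip()]
    return '\n\n'.join(model.translate_paragraph(p, src_lang, tgt_lang)
                       for p in paragraphs)

# english_doc = translate_document(indic2en_model, ta_paragraph, 'ta', 'en')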
433
- {
434
- "cell_type": "code",
435
- "execution_count": null,
436
- "metadata": {
437
- "id": "Hi_D7s_VIjis"
438
- },
439
- "outputs": [],
440
- "source": []
441
- }
442
- ],
443
- "metadata": {
444
- "accelerator": "GPU",
445
- "colab": {
446
- "authorship_tag": "ABX9TyM3t8oQYMhBUuq4/Pyhcr0+",
447
- "collapsed_sections": [],
448
- "include_colab_link": true,
449
- "name": "indicTrans_python_interface.ipynb",
450
- "provenance": []
451
- },
452
- "kernelspec": {
453
- "display_name": "Python 3",
454
- "name": "python3"
455
- },
456
- "language_info": {
457
- "name": "python"
458
- }
459
- },
460
- "nbformat": 4,
461
- "nbformat_minor": 0
462
- }
 
indictrans_fairseq_inference.ipynb DELETED
@@ -1,843 +0,0 @@
1
- {
2
- "cells": [
3
- {
4
- "cell_type": "markdown",
5
- "metadata": {
6
- "colab_type": "text",
7
- "id": "view-in-github"
8
- },
9
- "source": [
10
- "<a href=\"https://colab.research.google.com/github/gowtham1997/indicTrans-1/blob/main/indictrans_fairseq_inference.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
11
- ]
12
- },
13
- {
14
- "cell_type": "code",
15
- "execution_count": null,
16
- "metadata": {
17
- "colab": {
18
- "base_uri": "https://localhost:8080/"
19
- },
20
- "id": "P0uptOB6U7GW",
21
- "outputId": "988c867e-76ee-4a54-a232-e69abbc5c3db"
22
- },
23
- "outputs": [
24
- {
25
- "name": "stdout",
26
- "output_type": "stream",
27
- "text": [
28
- "/content/testing\n"
29
- ]
30
- }
31
- ],
32
- "source": [
33
- "# create a seperate folder to store everything\n",
34
- "!mkdir testing\n",
35
- "%cd testing"
36
- ]
37
- },
38
- {
39
- "cell_type": "code",
40
- "execution_count": null,
41
- "metadata": {
42
- "colab": {
43
- "base_uri": "https://localhost:8080/"
44
- },
45
- "id": "kQFRiLtSalzt",
46
- "outputId": "03070c7c-8299-46bf-de56-df09c3213a3f"
47
- },
48
- "outputs": [
49
- {
50
- "name": "stdout",
51
- "output_type": "stream",
52
- "text": [
53
- "Cloning into 'indicTrans'...\n",
54
- "remote: Enumerating objects: 398, done.\u001b[K\n",
55
- "remote: Counting objects: 100% (398/398), done.\u001b[K\n",
56
- "remote: Compressing objects: 100% (267/267), done.\u001b[K\n",
57
- "remote: Total 398 (delta 231), reused 251 (delta 126), pack-reused 0\u001b[K\n",
58
- "Receiving objects: 100% (398/398), 1.41 MiB | 6.82 MiB/s, done.\n",
59
- "Resolving deltas: 100% (231/231), done.\n",
60
- "/content/testing/indicTrans\n",
61
- "Cloning into 'indic_nlp_library'...\n",
62
- "remote: Enumerating objects: 1325, done.\u001b[K\n",
63
- "remote: Counting objects: 100% (147/147), done.\u001b[K\n",
64
- "remote: Compressing objects: 100% (103/103), done.\u001b[K\n",
65
- "remote: Total 1325 (delta 84), reused 89 (delta 41), pack-reused 1178\u001b[K\n",
66
- "Receiving objects: 100% (1325/1325), 9.57 MiB | 7.40 MiB/s, done.\n",
67
- "Resolving deltas: 100% (688/688), done.\n",
68
- "Cloning into 'indic_nlp_resources'...\n",
69
- "remote: Enumerating objects: 133, done.\u001b[K\n",
70
- "remote: Counting objects: 100% (7/7), done.\u001b[K\n",
71
- "remote: Compressing objects: 100% (7/7), done.\u001b[K\n",
72
- "remote: Total 133 (delta 0), reused 2 (delta 0), pack-reused 126\u001b[K\n",
73
- "Receiving objects: 100% (133/133), 149.77 MiB | 23.46 MiB/s, done.\n",
74
- "Resolving deltas: 100% (51/51), done.\n",
75
- "Cloning into 'subword-nmt'...\n",
76
- "remote: Enumerating objects: 580, done.\u001b[K\n",
77
- "remote: Counting objects: 100% (4/4), done.\u001b[K\n",
78
- "remote: Compressing objects: 100% (4/4), done.\u001b[K\n",
79
- "remote: Total 580 (delta 0), reused 0 (delta 0), pack-reused 576\u001b[K\n",
80
- "Receiving objects: 100% (580/580), 237.41 KiB | 1.57 MiB/s, done.\n",
81
- "Resolving deltas: 100% (349/349), done.\n",
82
- "/content/testing\n"
83
- ]
84
- }
85
- ],
86
- "source": [
87
- "# clone the repo for running evaluation\n",
88
- "!git clone https://github.com/AI4Bharat/indicTrans.git\n",
89
- "%cd indicTrans\n",
90
- "# clone requirements repositories\n",
91
- "!git clone https://github.com/anoopkunchukuttan/indic_nlp_library.git\n",
92
- "!git clone https://github.com/anoopkunchukuttan/indic_nlp_resources.git\n",
93
- "!git clone https://github.com/rsennrich/subword-nmt.git\n",
94
- "%cd .."
95
- ]
96
- },
97
- {
98
- "cell_type": "code",
99
- "execution_count": null,
100
- "metadata": {
101
- "colab": {
102
- "base_uri": "https://localhost:8080/"
103
- },
104
- "id": "FHUQGCACVvVf",
105
- "outputId": "67c7c3a0-f8bf-46a2-8214-e36556df989b"
106
- },
107
- "outputs": [
108
- {
109
- "name": "stdout",
110
- "output_type": "stream",
111
- "text": [
112
- "Collecting sacremoses\n",
113
- "\u001b[?25l Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)\n",
114
- "\u001b[K |████████████████████████████████| 901kB 3.9MB/s \n",
115
- "\u001b[?25hRequirement already satisfied: pandas in /usr/local/lib/python3.7/dist-packages (1.1.5)\n",
116
- "Collecting mock\n",
117
- " Downloading https://files.pythonhosted.org/packages/5c/03/b7e605db4a57c0f6fba744b11ef3ddf4ddebcada35022927a2b5fc623fdf/mock-4.0.3-py3-none-any.whl\n",
118
- "Collecting sacrebleu\n",
119
- "\u001b[?25l Downloading https://files.pythonhosted.org/packages/7e/57/0c7ca4e31a126189dab99c19951910bd081dea5bbd25f24b77107750eae7/sacrebleu-1.5.1-py3-none-any.whl (54kB)\n",
120
- "\u001b[K |████████████████████████████████| 61kB 8.3MB/s \n",
121
- "\u001b[?25hCollecting tensorboardX\n",
122
- "\u001b[?25l Downloading https://files.pythonhosted.org/packages/07/84/46421bd3e0e89a92682b1a38b40efc22dafb6d8e3d947e4ceefd4a5fabc7/tensorboardX-2.2-py2.py3-none-any.whl (120kB)\n",
123
- "\u001b[K |████████████████████████████████| 122kB 35.5MB/s \n",
124
- "\u001b[?25hRequirement already satisfied: pyarrow in /usr/local/lib/python3.7/dist-packages (3.0.0)\n",
125
- "Collecting indic-nlp-library\n",
126
- "\u001b[?25l Downloading https://files.pythonhosted.org/packages/84/d4/495bb43b88a2a6d04b09c29fc5115f24872af74cd8317fe84026abd4ddb1/indic_nlp_library-0.81-py3-none-any.whl (40kB)\n",
127
- "\u001b[K |████████████████████████████████| 40kB 5.8MB/s \n",
128
- "\u001b[?25hRequirement already satisfied: tqdm in /usr/local/lib/python3.7/dist-packages (from sacremoses) (4.41.1)\n",
129
- "Requirement already satisfied: six in /usr/local/lib/python3.7/dist-packages (from sacremoses) (1.15.0)\n",
130
- "Requirement already satisfied: click in /usr/local/lib/python3.7/dist-packages (from sacremoses) (7.1.2)\n",
131
- "Requirement already satisfied: regex in /usr/local/lib/python3.7/dist-packages (from sacremoses) (2019.12.20)\n",
132
- "Requirement already satisfied: joblib in /usr/local/lib/python3.7/dist-packages (from sacremoses) (1.0.1)\n",
133
- "Requirement already satisfied: numpy>=1.15.4 in /usr/local/lib/python3.7/dist-packages (from pandas) (1.19.5)\n",
134
- "Requirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.7/dist-packages (from pandas) (2.8.1)\n",
135
- "Requirement already satisfied: pytz>=2017.2 in /usr/local/lib/python3.7/dist-packages (from pandas) (2018.9)\n",
136
- "Collecting portalocker==2.0.0\n",
137
- " Downloading https://files.pythonhosted.org/packages/89/a6/3814b7107e0788040870e8825eebf214d72166adf656ba7d4bf14759a06a/portalocker-2.0.0-py2.py3-none-any.whl\n",
138
- "Requirement already satisfied: protobuf>=3.8.0 in /usr/local/lib/python3.7/dist-packages (from tensorboardX) (3.12.4)\n",
139
- "Collecting morfessor\n",
140
- " Downloading https://files.pythonhosted.org/packages/39/e6/7afea30be2ee4d29ce9de0fa53acbb033163615f849515c0b1956ad074ee/Morfessor-2.0.6-py3-none-any.whl\n",
141
- "Collecting sphinx-rtd-theme\n",
142
- "\u001b[?25l Downloading https://files.pythonhosted.org/packages/ac/24/2475e8f83519b54b2148d4a56eb1111f9cec630d088c3ffc214492c12107/sphinx_rtd_theme-0.5.2-py2.py3-none-any.whl (9.1MB)\n",
143
- "\u001b[K |████████████████████████████████| 9.2MB 28.0MB/s \n",
144
- "\u001b[?25hCollecting sphinx-argparse\n",
145
- " Downloading https://files.pythonhosted.org/packages/06/2b/dfad6a1831c3aeeae25d8d3d417224684befbf45e10c7f2141631616a6ed/sphinx-argparse-0.2.5.tar.gz\n",
146
- "Requirement already satisfied: setuptools in /usr/local/lib/python3.7/dist-packages (from protobuf>=3.8.0->tensorboardX) (57.0.0)\n",
147
- "Collecting docutils<0.17\n",
148
- "\u001b[?25l Downloading https://files.pythonhosted.org/packages/81/44/8a15e45ffa96e6cf82956dd8d7af9e666357e16b0d93b253903475ee947f/docutils-0.16-py2.py3-none-any.whl (548kB)\n",
149
- "\u001b[K |████████████████████████████████| 552kB 30.6MB/s \n",
150
- "\u001b[?25hRequirement already satisfied: sphinx in /usr/local/lib/python3.7/dist-packages (from sphinx-rtd-theme->indic-nlp-library) (1.8.5)\n",
151
- "Requirement already satisfied: Pygments>=2.0 in /usr/local/lib/python3.7/dist-packages (from sphinx->sphinx-rtd-theme->indic-nlp-library) (2.6.1)\n",
152
- "Requirement already satisfied: requests>=2.0.0 in /usr/local/lib/python3.7/dist-packages (from sphinx->sphinx-rtd-theme->indic-nlp-library) (2.23.0)\n",
153
- "Requirement already satisfied: snowballstemmer>=1.1 in /usr/local/lib/python3.7/dist-packages (from sphinx->sphinx-rtd-theme->indic-nlp-library) (2.1.0)\n",
154
- "Requirement already satisfied: packaging in /usr/local/lib/python3.7/dist-packages (from sphinx->sphinx-rtd-theme->indic-nlp-library) (20.9)\n",
155
- "Requirement already satisfied: imagesize in /usr/local/lib/python3.7/dist-packages (from sphinx->sphinx-rtd-theme->indic-nlp-library) (1.2.0)\n",
156
- "Requirement already satisfied: alabaster<0.8,>=0.7 in /usr/local/lib/python3.7/dist-packages (from sphinx->sphinx-rtd-theme->indic-nlp-library) (0.7.12)\n",
157
- "Requirement already satisfied: sphinxcontrib-websupport in /usr/local/lib/python3.7/dist-packages (from sphinx->sphinx-rtd-theme->indic-nlp-library) (1.2.4)\n",
158
- "Requirement already satisfied: babel!=2.0,>=1.3 in /usr/local/lib/python3.7/dist-packages (from sphinx->sphinx-rtd-theme->indic-nlp-library) (2.9.1)\n",
159
- "Requirement already satisfied: Jinja2>=2.3 in /usr/local/lib/python3.7/dist-packages (from sphinx->sphinx-rtd-theme->indic-nlp-library) (2.11.3)\n",
160
- "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests>=2.0.0->sphinx->sphinx-rtd-theme->indic-nlp-library) (2020.12.5)\n",
161
- "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests>=2.0.0->sphinx->sphinx-rtd-theme->indic-nlp-library) (3.0.4)\n",
162
- "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests>=2.0.0->sphinx->sphinx-rtd-theme->indic-nlp-library) (1.24.3)\n",
163
- "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests>=2.0.0->sphinx->sphinx-rtd-theme->indic-nlp-library) (2.10)\n",
164
- "Requirement already satisfied: pyparsing>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from packaging->sphinx->sphinx-rtd-theme->indic-nlp-library) (2.4.7)\n",
165
- "Requirement already satisfied: sphinxcontrib-serializinghtml in /usr/local/lib/python3.7/dist-packages (from sphinxcontrib-websupport->sphinx->sphinx-rtd-theme->indic-nlp-library) (1.1.4)\n",
166
- "Requirement already satisfied: MarkupSafe>=0.23 in /usr/local/lib/python3.7/dist-packages (from Jinja2>=2.3->sphinx->sphinx-rtd-theme->indic-nlp-library) (2.0.1)\n",
167
- "Building wheels for collected packages: sphinx-argparse\n",
168
- " Building wheel for sphinx-argparse (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
169
- " Created wheel for sphinx-argparse: filename=sphinx_argparse-0.2.5-cp37-none-any.whl size=11552 sha256=d8804d903bcf829240052e806acb7c6051e0c240bddf22ef8bd4e4bd2abdfbac\n",
170
- " Stored in directory: /root/.cache/pip/wheels/2a/18/1b/4990a1859da4edc77ab312bc2986c08d2733fb5713d06e44f5\n",
171
- "Successfully built sphinx-argparse\n",
172
- "\u001b[31mERROR: datascience 0.10.6 has requirement folium==0.2.1, but you'll have folium 0.8.3 which is incompatible.\u001b[0m\n",
173
- "Installing collected packages: sacremoses, mock, portalocker, sacrebleu, tensorboardX, morfessor, docutils, sphinx-rtd-theme, sphinx-argparse, indic-nlp-library\n",
174
- " Found existing installation: docutils 0.17.1\n",
175
- " Uninstalling docutils-0.17.1:\n",
176
- " Successfully uninstalled docutils-0.17.1\n",
177
- "Successfully installed docutils-0.16 indic-nlp-library-0.81 mock-4.0.3 morfessor-2.0.6 portalocker-2.0.0 sacrebleu-1.5.1 sacremoses-0.0.45 sphinx-argparse-0.2.5 sphinx-rtd-theme-0.5.2 tensorboardX-2.2\n",
178
- "Cloning into 'fairseq'...\n",
179
- "remote: Enumerating objects: 28243, done.\u001b[K\n",
180
- "remote: Counting objects: 100% (62/62), done.\u001b[K\n",
181
- "remote: Compressing objects: 100% (39/39), done.\u001b[K\n",
182
- "remote: Total 28243 (delta 29), reused 44 (delta 22), pack-reused 28181\u001b[K\n",
183
- "Receiving objects: 100% (28243/28243), 11.83 MiB | 8.53 MiB/s, done.\n",
184
- "Resolving deltas: 100% (21233/21233), done.\n",
185
- "/content/testing/fairseq\n",
186
- "Obtaining file:///content/testing/fairseq\n",
187
- " Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n",
188
- " Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n",
189
- " Installing backend dependencies ... \u001b[?25l\u001b[?25hdone\n",
190
- " Preparing wheel metadata ... \u001b[?25l\u001b[?25hdone\n",
191
- "Requirement already satisfied: sacrebleu>=1.4.12 in /usr/local/lib/python3.7/dist-packages (from fairseq==1.0.0a0+2fd9d8a) (1.5.1)\n",
192
- "Collecting hydra-core<1.1\n",
193
- "\u001b[?25l Downloading https://files.pythonhosted.org/packages/52/e3/fbd70dd0d3ce4d1d75c22d56c0c9f895cfa7ed6587a9ffb821d6812d6a60/hydra_core-1.0.6-py3-none-any.whl (123kB)\n",
194
- "\u001b[K |████████████████████████████████| 133kB 4.1MB/s \n",
195
- "\u001b[?25hRequirement already satisfied: regex in /usr/local/lib/python3.7/dist-packages (from fairseq==1.0.0a0+2fd9d8a) (2019.12.20)\n",
196
- "Requirement already satisfied: cython in /usr/local/lib/python3.7/dist-packages (from fairseq==1.0.0a0+2fd9d8a) (0.29.23)\n",
197
- "Requirement already satisfied: cffi in /usr/local/lib/python3.7/dist-packages (from fairseq==1.0.0a0+2fd9d8a) (1.14.5)\n",
198
- "Collecting omegaconf<2.1\n",
199
- " Downloading https://files.pythonhosted.org/packages/d0/eb/9d63ce09dd8aa85767c65668d5414958ea29648a0eec80a4a7d311ec2684/omegaconf-2.0.6-py3-none-any.whl\n",
200
- "Requirement already satisfied: torch in /usr/local/lib/python3.7/dist-packages (from fairseq==1.0.0a0+2fd9d8a) (1.8.1+cu101)\n",
201
- "Requirement already satisfied: tqdm in /usr/local/lib/python3.7/dist-packages (from fairseq==1.0.0a0+2fd9d8a) (4.41.1)\n",
202
- "Requirement already satisfied: numpy; python_version >= \"3.7\" in /usr/local/lib/python3.7/dist-packages (from fairseq==1.0.0a0+2fd9d8a) (1.19.5)\n",
203
- "Requirement already satisfied: portalocker==2.0.0 in /usr/local/lib/python3.7/dist-packages (from sacrebleu>=1.4.12->fairseq==1.0.0a0+2fd9d8a) (2.0.0)\n",
204
- "Requirement already satisfied: importlib-resources; python_version < \"3.9\" in /usr/local/lib/python3.7/dist-packages (from hydra-core<1.1->fairseq==1.0.0a0+2fd9d8a) (5.1.3)\n",
205
- "Collecting antlr4-python3-runtime==4.8\n",
206
- "\u001b[?25l Downloading https://files.pythonhosted.org/packages/56/02/789a0bddf9c9b31b14c3e79ec22b9656185a803dc31c15f006f9855ece0d/antlr4-python3-runtime-4.8.tar.gz (112kB)\n",
207
- "\u001b[K |████████████████████████████████| 112kB 17.0MB/s \n",
208
- "\u001b[?25hRequirement already satisfied: pycparser in /usr/local/lib/python3.7/dist-packages (from cffi->fairseq==1.0.0a0+2fd9d8a) (2.20)\n",
209
- "Collecting PyYAML>=5.1.*\n",
210
- "\u001b[?25l Downloading https://files.pythonhosted.org/packages/7a/a5/393c087efdc78091afa2af9f1378762f9821c9c1d7a22c5753fb5ac5f97a/PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636kB)\n",
211
- "\u001b[K |████████████████████████████████| 645kB 14.1MB/s \n",
212
- "\u001b[?25hRequirement already satisfied: typing-extensions in /usr/local/lib/python3.7/dist-packages (from omegaconf<2.1->fairseq==1.0.0a0+2fd9d8a) (3.7.4.3)\n",
213
- "Requirement already satisfied: zipp>=0.4; python_version < \"3.8\" in /usr/local/lib/python3.7/dist-packages (from importlib-resources; python_version < \"3.9\"->hydra-core<1.1->fairseq==1.0.0a0+2fd9d8a) (3.4.1)\n",
214
- "Building wheels for collected packages: antlr4-python3-runtime\n",
215
- " Building wheel for antlr4-python3-runtime (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
216
- " Created wheel for antlr4-python3-runtime: filename=antlr4_python3_runtime-4.8-cp37-none-any.whl size=141231 sha256=f9207fa94682c5ba5daa722d4103f4c9eb131c8dd86870ae9cf43f7df7a90154\n",
217
- " Stored in directory: /root/.cache/pip/wheels/e3/e2/fa/b78480b448b8579ddf393bebd3f47ee23aa84c89b6a78285c8\n",
218
- "Successfully built antlr4-python3-runtime\n",
219
- "Installing collected packages: PyYAML, omegaconf, antlr4-python3-runtime, hydra-core, fairseq\n",
220
- " Found existing installation: PyYAML 3.13\n",
221
- " Uninstalling PyYAML-3.13:\n",
222
- " Successfully uninstalled PyYAML-3.13\n",
223
- " Running setup.py develop for fairseq\n",
224
- "Successfully installed PyYAML-5.4.1 antlr4-python3-runtime-4.8 fairseq hydra-core-1.0.6 omegaconf-2.0.6\n",
225
- "/content/testing\n"
226
- ]
227
- }
228
- ],
229
- "source": [
230
- "# Install the necessary libraries\n",
231
- "!pip install sacremoses pandas mock sacrebleu tensorboardX pyarrow indic-nlp-library\n",
232
- "# Install fairseq from source\n",
233
- "!git clone https://github.com/pytorch/fairseq.git\n",
234
- "%cd fairseq\n",
235
- "# !git checkout da9eaba12d82b9bfc1442f0e2c6fc1b895f4d35d\n",
236
- "!pip install --editable ./\n",
237
- "%cd .."
238
- ]
239
- },
240
- {
241
- "cell_type": "code",
242
- "execution_count": null,
243
- "metadata": {
244
- "colab": {
245
- "base_uri": "https://localhost:8080/"
246
- },
247
- "id": "kKA8afhBawO5",
248
- "outputId": "d346f462-d5d4-43a0-c29b-90aaab2fb4d2"
249
- },
250
- "outputs": [
251
- {
252
- "name": "stdout",
253
- "output_type": "stream",
254
- "text": [
255
- "--2021-06-09 15:06:00-- https://storage.googleapis.com/samanantar-public/V0.2/models/indic-en.zip\n",
256
- "Resolving storage.googleapis.com (storage.googleapis.com)... 64.233.188.128, 64.233.189.128, 108.177.97.128, ...\n",
257
- "Connecting to storage.googleapis.com (storage.googleapis.com)|64.233.188.128|:443... connected.\n",
258
- "HTTP request sent, awaiting response... 200 OK\n",
259
- "Length: 4551079075 (4.2G) [application/zip]\n",
260
- "Saving to: ‘indic-en.zip’\n",
261
- "\n",
262
- "indic-en.zip 100%[===================>] 4.24G 49.9MB/s in 1m 47s \n",
263
- "\n",
264
- "2021-06-09 15:07:48 (40.5 MB/s) - ‘indic-en.zip’ saved [4551079075/4551079075]\n",
265
- "\n",
266
- "Archive: indic-en.zip\n",
267
- " creating: indic-en/\n",
268
- " creating: indic-en/vocab/\n",
269
- " inflating: indic-en/vocab/bpe_codes.32k.SRC \n",
270
- " inflating: indic-en/vocab/vocab.SRC \n",
271
- " inflating: indic-en/vocab/vocab.TGT \n",
272
- " inflating: indic-en/vocab/bpe_codes.32k.TGT \n",
273
- " creating: indic-en/final_bin/\n",
274
- " inflating: indic-en/final_bin/dict.TGT.txt \n",
275
- " inflating: indic-en/final_bin/dict.SRC.txt \n",
276
- " creating: indic-en/model/\n",
277
- " inflating: indic-en/model/checkpoint_best.pt \n",
278
- "--2021-06-09 15:09:51-- https://storage.googleapis.com/samanantar-public/V0.2/models/en-indic.zip\n",
279
- "Resolving storage.googleapis.com (storage.googleapis.com)... 74.125.204.128, 64.233.188.128, 64.233.189.128, ...\n",
280
- "Connecting to storage.googleapis.com (storage.googleapis.com)|74.125.204.128|:443... connected.\n",
281
- "HTTP request sent, awaiting response... 200 OK\n",
282
- "Length: 4609212103 (4.3G) [application/zip]\n",
283
- "Saving to: ‘en-indic.zip’\n",
284
- "\n",
285
- "en-indic.zip 100%[===================>] 4.29G 33.8MB/s in 1m 51s \n",
286
- "\n",
287
- "2021-06-09 15:11:44 (39.5 MB/s) - ‘en-indic.zip’ saved [4609212103/4609212103]\n",
288
- "\n",
289
- "Archive: en-indic.zip\n",
290
- " creating: en-indic/\n",
291
- " creating: en-indic/vocab/\n",
292
- " inflating: en-indic/vocab/bpe_codes.32k.SRC \n",
293
- " inflating: en-indic/vocab/vocab.SRC \n",
294
- " inflating: en-indic/vocab/vocab.TGT \n",
295
- " inflating: en-indic/vocab/bpe_codes.32k.TGT \n",
296
- " creating: en-indic/final_bin/\n",
297
- " inflating: en-indic/final_bin/dict.TGT.txt \n",
298
- " inflating: en-indic/final_bin/dict.SRC.txt \n",
299
- " creating: en-indic/model/\n",
300
- " inflating: en-indic/model/checkpoint_best.pt \n",
301
- "--2021-06-09 15:14:11-- https://storage.googleapis.com/samanantar-public/models/m2m.zip\n",
302
- "Resolving storage.googleapis.com (storage.googleapis.com)... 74.125.23.128, 74.125.203.128, 74.125.204.128, ...\n",
303
- "Connecting to storage.googleapis.com (storage.googleapis.com)|74.125.23.128|:443... connected.\n",
304
- "HTTP request sent, awaiting response... 200 OK\n",
305
- "Length: 4081990185 (3.8G) [application/zip]\n",
306
- "Saving to: ‘m2m.zip’\n",
307
- "\n",
308
- "m2m.zip 100%[===================>] 3.80G 41.5MB/s in 96s \n",
309
- "\n",
310
- "2021-06-09 15:15:48 (40.4 MB/s) - ‘m2m.zip’ saved [4081990185/4081990185]\n",
311
- "\n",
312
- "Archive: m2m.zip\n",
313
- " creating: m2m/\n",
314
- " creating: m2m/vocab/\n",
315
- " inflating: m2m/vocab/vocab.SRC \n",
316
- " inflating: m2m/vocab/vocab.TGT \n",
317
- " inflating: m2m/vocab/bpe_codes.32k.SRC_TGT \n",
318
- " creating: m2m/final_bin/\n",
319
- " inflating: m2m/final_bin/dict.TGT.txt \n",
320
- " inflating: m2m/final_bin/dict.SRC.txt \n",
321
- " creating: m2m/model/\n",
322
- " inflating: m2m/model/checkpoint_best.pt \n",
323
- "/content/testing/indicTrans\n"
324
- ]
325
- }
326
- ],
327
- "source": [
328
- "# download the indictrans model\n",
329
- "\n",
330
- "\n",
331
- "# downloading the indic-en model\n",
332
- "!wget https://storage.googleapis.com/samanantar-public/V0.3/models/indic-en.zip\n",
333
- "!unzip indic-en.zip\n",
334
- "\n",
335
- "# downloading the en-indic model\n",
336
- "!wget https://storage.googleapis.com/samanantar-public/V0.3/models/en-indic.zip\n",
337
- "!unzip en-indic.zip\n",
338
- "\n",
339
- "# downloading the indic-indic model\n",
340
- "!wget https://storage.googleapis.com/samanantar-public/V0.3/models/m2m.zip\n",
341
- "!unzip m2m.zip\n",
342
- "\n",
343
- "%cd indicTrans/"
344
- ]
345
- },
346
- {
347
- "cell_type": "code",
348
- "execution_count": null,
349
- "metadata": {
350
- "id": "Lg1sQFfyWJli"
351
- },
352
- "outputs": [],
353
- "source": [
354
- "# creating a text file and adding en sentences we can use for testing the model\n",
355
- "!touch en_sentences.txt\n",
356
- "!echo 'This bicycle is too small for you !!' >> en_sentences.txt\n",
357
- "!echo \"I will directly meet you at the airport.\" >> en_sentences.txt\n",
358
- "!echo 'If COVID-19 is spreading in your community, stay safe by taking some simple precautions, such as physical distancing, wearing a mask, keeping rooms well ventilated, avoiding crowds, cleaning your hands, and coughing into a bent elbow or tissue' >> en_sentences.txt"
359
- ]
360
- },
361
- {
362
- "cell_type": "code",
363
- "execution_count": null,
364
- "metadata": {
365
- "colab": {
366
- "base_uri": "https://localhost:8080/"
367
- },
368
- "id": "fLg9BWAGWvLU",
369
- "outputId": "f3ca6f65-9a39-4d80-c25d-88806daf3e7b"
370
- },
371
- "outputs": [
372
- {
373
- "name": "stdout",
374
- "output_type": "stream",
375
- "text": [
376
- "Wed Jun 9 15:18:01 UTC 2021\n",
377
- "Applying normalization and script conversion\n",
378
- "100% 3/3 [00:00<00:00, 71.78it/s]\n",
379
- "Number of sentences in input: 3\n",
380
- "Applying BPE\n",
381
- "Decoding\n",
382
- "Extracting translations, script conversion and detokenization\n",
383
- "Translation completed\n"
384
- ]
385
- }
386
- ],
387
- "source": [
388
- "# joint_translate takes src_file, output_fname, src_lang, tgt_lang, model_folder as inputs\n",
389
- "# src_file -> input text file to be translated\n",
390
- "# output_fname -> name of the output file (will get created) containing the model predictions\n",
391
- "# src_lang -> source lang code of the input text ( in this case we are using en-indic model and hence src_lang would be 'en')\n",
392
- "# tgt_lang -> target lang code of the input text ( tgt lang for en-indic model would be any of the 11 indic langs we trained on:\n",
393
- "# as, bn, hi, gu, kn, ml, mr, or, pa, ta, te)\n",
394
- "# supported languages are:\n",
395
- "# as - assamese, bn - bengali, gu - gujarathi, hi - hindi, kn - kannada, \n",
396
- "# ml - malayalam, mr - marathi, or - oriya, pa - punjabi, ta - tamil, te - telugu\n",
397
- "\n",
398
- "# model_dir -> the directory containing the model and the vocab files\n",
399
- "\n",
400
- "# Note: if the translation is taking a lot of time, please tune the buffer_size and batch_size parameter for fairseq-interactive defined inside this joint_translate script\n",
401
- "\n",
402
- "\n",
403
- "# here we are translating the english sentences to tamil\n",
404
- "!bash joint_translate.sh en_sentences.txt ta_outputs.txt 'en' 'ta' '../en-indic'"
405
- ]
406
- },
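Because joint_translate.sh takes the source file, output file, language codes and model directory as positional arguments, it is straightforward to batch over several target languages from Python; a sketch assuming the en-indic model directory downloaded above and the indicTrans folder as the working directory:

# run joint_translate.sh for several target languages (illustrative loop)
import subprocess

for tgt in ['hi', 'ta', 'bn']:  # any of the 11 supported Indic language codes
    subprocess.run(
        ['bash', 'joint_translate.sh', 'en_sentences.txt', f'{tgt}_outputs.txt',
         'en', tgt, '../en-indic'],
        check=True,
    )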
407
- {
408
- "cell_type": "code",
409
- "execution_count": null,
410
- "metadata": {
411
- "colab": {
412
- "base_uri": "https://localhost:8080/"
413
- },
414
- "id": "8QzkBCgeGZiH",
415
- "outputId": "c150360c-6d01-4689-8c2e-9bdd0eba1504"
416
- },
417
- "outputs": [
418
- {
419
- "name": "stdout",
420
- "output_type": "stream",
421
- "text": [
422
- "இந்த சைக்கிள் உங்களுக்கு மிகவும் சிறியது!\n",
423
- "விமான நிலையத்தில் உங்களை நேரில் சந்திக்கிறேன்.\n",
424
- "உங்கள் சமூகத்தில் கோவிட்-19 பரவுகிறது என்றால், சில எளிய முன்னெச்சரிக்கை நடவடிக்கைகளான, தனி நபர் இடைவெளி, முகக்கவசம் அணிதல், அறைகளை நன்கு காற்றோட்டமாக வைத்திருத்தல், கூட்டத்தைத் தவிர்த்தல், கைகளைக் கழுவுதல், முழங்கை அல்லது திசுக்களில் இருமல் போன்றவற்றை மேற்கொள்வதன் மூலம் பாதுகாப்பாக இருங்கள்.\n"
425
- ]
426
- }
427
- ],
428
- "source": [
429
- "!cat ta_outputs.txt"
430
- ]
431
- },
432
- {
433
- "cell_type": "code",
434
- "execution_count": null,
435
- "metadata": {
436
- "colab": {
437
- "base_uri": "https://localhost:8080/"
438
- },
439
- "id": "c4v9BmbZao5d",
440
- "outputId": "6efac2a3-5f79-4e72-821b-bc80702a7fa8"
441
- },
442
- "outputs": [
443
- {
444
- "name": "stdout",
445
- "output_type": "stream",
446
- "text": [
447
- "Wed Jun 9 15:21:31 UTC 2021\n",
448
- "Applying normalization and script conversion\n",
449
- "100% 3/3 [00:00<00:00, 88.59it/s]\n",
450
- "Number of sentences in input: 3\n",
451
- "Applying BPE\n",
452
- "Decoding\n",
453
- "Extracting translations, script conversion and detokenization\n",
454
- "Translation completed\n"
455
- ]
456
- }
457
- ],
458
- "source": [
459
- "# Similarly, we can translate the english sentences to hindi\n",
460
- "!bash joint_translate.sh en_sentences.txt hi_outputs.txt 'en' 'hi' '../en-indic'"
461
- ]
462
- },
463
- {
464
- "cell_type": "code",
465
- "execution_count": null,
466
- "metadata": {
467
- "colab": {
468
- "base_uri": "https://localhost:8080/"
469
- },
470
- "id": "pNNzyR_LfqIr",
471
- "outputId": "095b9532-e76a-4451-dec9-4862566a4288"
472
- },
473
- "outputs": [
474
- {
475
- "name": "stdout",
476
- "output_type": "stream",
477
- "text": [
478
- "यह साइकिल तुम्हारे लिए बहुत छोटी है!\n",
479
- "मैं आपसे एयरपोर्ट पर ही मिलने वाला हूं।\n",
480
- "यदि आपके समुदाय में कोविड-19 फैल रहा है, तो कुछ सरल सावधानियां बरतें, जैसे शारीरिक दूरी बनाए रखना, मास्क पहनना, कमरों को अच्छी तरह से हवादार रखना, भीड़ से बचना, अपने हाथों को साफ करना और कोहनी या ऊतक को मोड़कर खांसते हुए सुरक्षित रहें\n"
481
- ]
482
- }
483
- ],
484
- "source": [
485
- "!cat hi_outputs.txt"
486
- ]
487
- },
488
- {
489
- "cell_type": "code",
490
- "execution_count": null,
491
- "metadata": {
492
- "id": "PzjbDLBtaol9"
493
- },
494
- "outputs": [],
495
- "source": [
496
- "# creating a text file and adding hi sentences we can use for testing the model\n",
497
- "!touch hi_sentences.txt\n",
498
- "!echo 'तुम आज सुबह यहाँ क्यों आए?' >> hi_sentences.txt\n",
499
- "!echo \"मेरे परिवार में हर कोई जल्दी उठता है।\" >> hi_sentences.txt\n",
500
- "!echo ' स्वास्थ्य और परिवार कल्याण मंत्रालय द्वारा प्रदान की गई जानकारी और सलाह को सावधानी व सही तरीके से पालन कर वायरस के स्थानीय प्रसार को रोका जा सकता है।' >> hi_sentences.txt\n",
501
- "\n",
502
- "!touch ta_sentences.txt\n",
503
- "!echo 'அவனுக்கு நம்மைப் தெரியும் என்று தோன்றுகிறது' >> ta_sentences.txt\n",
504
- "!echo \"இது எங்கே இருக்கு என்று என்னால் கண்டுபிடிக்க முடியவில்லை.\" >> ta_sentences.txt\n",
505
- "!echo 'உங்களுக்கு உங்கள் அருகில் இருக்கும் ஒருவருக்கோ இத்தகைய அறிகுறிகள் தென்பட்டால், வீட்டிலேயே இருப்பது, கொரோனா வைரஸ் தொற்று பிறருக்கு வராமல் தடுக்க உதவும்.' >> ta_sentences.txt"
506
- ]
507
- },
508
- {
509
- "cell_type": "code",
510
- "execution_count": null,
511
- "metadata": {
512
- "colab": {
513
- "base_uri": "https://localhost:8080/"
514
- },
515
- "id": "5uaOmKb8gmeN",
516
- "outputId": "951bbdf9-61d0-4703-a8df-0c3fcb4e5bb3"
517
- },
518
- "outputs": [
519
- {
520
- "name": "stdout",
521
- "output_type": "stream",
522
- "text": [
523
- "Wed Jun 9 15:24:43 UTC 2021\n",
524
- "Applying normalization and script conversion\n",
525
- "100% 3/3 [00:00<00:00, 74.90it/s]\n",
526
- "Number of sentences in input: 3\n",
527
- "Applying BPE\n",
528
- "Decoding\n",
529
- "Extracting translations, script conversion and detokenization\n",
530
- "Translation completed\n"
531
- ]
532
- }
533
- ],
534
- "source": [
535
- "# here we are translating the english sentences to hindi\n",
536
- "!bash joint_translate.sh hi_sentences.txt en_outputs.txt 'hi' 'en' '../indic-en'"
537
- ]
538
- },
539
- {
540
- "cell_type": "code",
541
- "execution_count": null,
542
- "metadata": {
543
- "colab": {
544
- "base_uri": "https://localhost:8080/"
545
- },
546
- "id": "iLD7WPqmlSnC",
547
- "outputId": "359050fa-6d35-4055-a9c5-13a15322c59e"
548
- },
549
- "outputs": [
550
- {
551
- "name": "stdout",
552
- "output_type": "stream",
553
- "text": [
554
- "Why did you come here this morning?\n",
555
- "Everyone in my family gets up early.\n",
556
- "The local spread of the virus can be curbed by following the information and advice provided by the Ministry of Health and Family Welfare in a careful and correct manner.\n"
557
- ]
558
- }
559
- ],
560
- "source": [
561
- "! cat en_outputs.txt"
562
- ]
563
- },
564
- {
565
- "cell_type": "code",
566
- "execution_count": null,
567
- "metadata": {
568
- "colab": {
569
- "base_uri": "https://localhost:8080/"
570
- },
571
- "id": "O3mJyj-QljWz",
572
- "outputId": "1c0420e5-4b80-41d9-f09e-2fdff79bc7bd"
573
- },
574
- "outputs": [
575
- {
576
- "name": "stdout",
577
- "output_type": "stream",
578
- "text": [
579
- "Wed Jun 9 15:28:05 UTC 2021\n",
580
- "Applying normalization and script conversion\n",
581
- "100% 3/3 [00:00<00:00, 72.92it/s]\n",
582
- "Number of sentences in input: 3\n",
583
- "Applying BPE\n",
584
- "Decoding\n",
585
- "Extracting translations, script conversion and detokenization\n",
586
- "Translation completed\n"
587
- ]
588
- }
589
- ],
590
- "source": [
591
- "# here we are translating the english sentences to tamil\n",
592
- "!bash joint_translate.sh ta_sentences.txt en_outputs.txt 'ta' 'en' '../indic-en'"
593
- ]
594
- },
595
- {
596
- "cell_type": "code",
597
- "execution_count": null,
598
- "metadata": {
599
- "colab": {
600
- "base_uri": "https://localhost:8080/"
601
- },
602
- "id": "GapEJESiloD8",
603
- "outputId": "dc8b2a8c-4f36-4bf9-d517-6826aa65da57"
604
- },
605
- "outputs": [
606
- {
607
- "name": "stdout",
608
- "output_type": "stream",
609
- "text": [
610
- "He seems to know us.\n",
611
- "I couldnt find it anywhere.\n",
612
- "If someone in your neighbourhood develops these symptoms, staying at home can help prevent the spread of the coronavirus infection.\n"
613
- ]
614
- }
615
- ],
616
- "source": [
617
- "! cat en_outputs.txt"
618
- ]
619
- },
620
- {
621
- "cell_type": "code",
622
- "execution_count": null,
623
- "metadata": {
624
- "id": "ckfW2P6abcB3"
625
- },
626
- "outputs": [],
627
- "source": [
628
- "# we just rename the m2m_joint_vocab file here as joint_translate uses bpe_codes.32k.SRC\n",
629
- "mv ../m2m/vocab/bpe_codes.32k.SRC_TGT ../m2m/vocab/bpe_codes.32k.SRC"
630
- ]
631
- },
632
- {
633
- "cell_type": "code",
634
- "execution_count": null,
635
- "metadata": {
636
- "colab": {
637
- "base_uri": "https://localhost:8080/"
638
- },
639
- "id": "H-3vPdCqSWoK",
640
- "outputId": "d5a80c59-cc89-4910-a9ce-7317fac6bf8d"
641
- },
642
- "outputs": [
643
- {
644
- "name": "stdout",
645
- "output_type": "stream",
646
- "text": [
647
- "Wed Jun 9 15:39:26 UTC 2021\n",
648
- "Applying normalization and script conversion\n",
649
- "100% 3/3 [00:00<00:00, 63.53it/s]\n",
650
- "Number of sentences in input: 3\n",
651
- "Applying BPE\n",
652
- "Decoding\n",
653
- "Extracting translations, script conversion and detokenization\n",
654
- "Translation completed\n"
655
- ]
656
- }
657
- ],
658
- "source": [
659
- "# here we are using the indic2indic model for translating the hindi sentences to tamil\n",
660
- "!bash joint_translate.sh hi_sentences.txt ta_outputs.txt 'hi' 'ta' '../m2m'"
661
- ]
662
- },
663
- {
664
- "cell_type": "code",
665
- "execution_count": null,
666
- "metadata": {
667
- "colab": {
668
- "base_uri": "https://localhost:8080/"
669
- },
670
- "id": "22yPo78Zb_oR",
671
- "outputId": "4df17e93-9029-4020-8deb-0dbaf8bb0b27"
672
- },
673
- "outputs": [
674
- {
675
- "name": "stdout",
676
- "output_type": "stream",
677
- "text": [
678
- "तुम आज सुबह यहाँ क्यों आए?\n",
679
- "मेरे परिवार में हर कोई जल्दी उठता है।\n",
680
- " स्वास्थ्य और परिवार कल्याण मंत्रालय द्वारा प्रदान की गई जानकारी और सलाह को सावधानी व सही तरीके से पालन कर वायरस के स्थानीय प्रसार को रोका जा सकता है।\n"
681
- ]
682
- }
683
- ],
684
- "source": [
685
- " ! cat hi_sentences.txt # the hindi inputs"
686
- ]
687
- },
688
- {
689
- "cell_type": "code",
690
- "execution_count": null,
691
- "metadata": {
692
- "colab": {
693
- "base_uri": "https://localhost:8080/"
694
- },
695
- "id": "onnfzTDESg2I",
696
- "outputId": "1bc600d4-d3ff-40fa-d258-7d1c876bd49c"
697
- },
698
- "outputs": [
699
- {
700
- "name": "stdout",
701
- "output_type": "stream",
702
- "text": [
703
- "ஏன் இன்று காலையில் வந்தீர்கள்?\n",
704
- "எனது குடும்பத்தில் உள்ள ஒவ்வொருவரும் விரைவில் எழுவார்கள்.\n",
705
- "மத்திய சுகாதாரம் மற்றும் குடும்ப நல அமைச்சகத்தின் அறிவுறுத்தல்கள் மற்றும் தகவல்களைப் பின்பற்றுவதன் மூலம், உள்ளூர் அளவில் வைரஸ் பரவுவதைத் தடுக்க முடியும்.\n"
706
- ]
707
- }
708
- ],
709
- "source": [
710
- "! cat ta_outputs.txt # the tamil outputs"
711
- ]
712
- },
713
- {
714
- "cell_type": "code",
715
- "execution_count": null,
716
- "metadata": {
717
- "colab": {
718
- "base_uri": "https://localhost:8080/"
719
- },
720
- "id": "5klOcwi8SjGS",
721
- "outputId": "bc4e47fa-ee1d-4da2-85ea-f7900cae7b48"
722
- },
723
- "outputs": [
724
- {
725
- "name": "stdout",
726
- "output_type": "stream",
727
- "text": [
728
- "Wed Jun 9 15:45:53 UTC 2021\n",
729
- "Applying normalization and script conversion\n",
730
- "100% 3/3 [00:00<00:00, 82.25it/s]\n",
731
- "Number of sentences in input: 3\n",
732
- "Applying BPE\n",
733
- "Decoding\n",
734
- "Extracting translations, script conversion and detokenization\n",
735
- "Translation completed\n"
736
- ]
737
- }
738
- ],
739
- "source": [
740
- "# here we are using the indic2indic model for translating the hindi sentences to tamil (same as above with reversing the direction)\n",
741
- "!bash joint_translate.sh ta_sentences.txt hi_outputs.txt 'ta' 'hi' '../m2m'"
742
- ]
743
- },
744
- {
745
- "cell_type": "code",
746
- "execution_count": null,
747
- "metadata": {
748
- "colab": {
749
- "base_uri": "https://localhost:8080/"
750
- },
751
- "id": "4ifZhGkKc6oo",
752
- "outputId": "a0112e2b-a54b-48ad-e3ae-a3d84c6d097e"
753
- },
754
- "outputs": [
755
- {
756
- "name": "stdout",
757
- "output_type": "stream",
758
- "text": [
759
- "அவனுக்கு நம்மைப் தெரியும் என்று தோன்றுகிறது\n",
760
- "இது எங்கே இருக்கு என்று என்னால் கண்டுபிடிக்க முடியவில்லை.\n",
761
- "உங்களுக்கு உங்கள் அருகில் இருக்கும் ஒருவருக்கோ இத்தகைய அறிகுறிகள் தென்பட்டால், வீட்டிலேயே இருப்பது, கொரோனா வைரஸ் தொற்று பிறருக்கு வராமல் தடுக்க உதவும்.\n"
762
- ]
763
- }
764
- ],
765
- "source": [
766
- "! cat ta_sentences.txt # the tamil inputs"
767
- ]
768
- },
769
- {
770
- "cell_type": "code",
771
- "execution_count": null,
772
- "metadata": {
773
- "colab": {
774
- "base_uri": "https://localhost:8080/"
775
- },
776
- "id": "v0x0YrWYSwwK",
777
- "outputId": "4c37d699-5b8e-4ae7-9724-953d7e165035"
778
- },
779
- "outputs": [
780
- {
781
- "name": "stdout",
782
- "output_type": "stream",
783
- "text": [
784
- "ऐसा लगता है कि वह हमें जानता है।\n",
785
- "मुझे पता नहीं था कि यह कहां है।\n",
786
- "अगर आपके आस-पास के किसी व्यक्ति में ऐसे लक्षण दिखाई देते हैं, तो घर पर रहने से कोरोना वायरस को फैलने से रोकने में मदद मिलेगी।\n"
787
- ]
788
- }
789
- ],
790
- "source": [
791
- "! cat hi_outputs.txt # the hi outputs"
792
- ]
793
- },
794
- {
795
- "cell_type": "code",
796
- "execution_count": null,
797
- "metadata": {
798
- "id": "-xcnDOc4gNKC"
799
- },
800
- "outputs": [],
801
- "source": [
802
- "# to compute bleu scores for the predicitions with a reference file, use the following command\n",
803
- "\n",
804
- "# bash compute_bleu.sh pred_fname ref_fname src_lang tgt_lang\n",
805
- "# arguments:\n",
806
- "# pred_fname: file that contains model predictions\n",
807
- "# ref_fname: file that contains references\n",
808
- "# src_lang and tgt_lang : the source and target language"
809
- ]
810
- },
811
- {
812
- "cell_type": "code",
813
- "execution_count": null,
814
- "metadata": {
815
- "id": "9YK2BdwvrUgI"
816
- },
817
- "outputs": [],
818
- "source": []
819
- }
820
- ],
821
- "metadata": {
822
- "accelerator": "GPU",
823
- "colab": {
824
- "collapsed_sections": [],
825
- "include_colab_link": true,
826
- "name": "indictrans_fairseq_inference.ipynb",
827
- "provenance": []
828
- },
829
- "interpreter": {
830
- "hash": "3c7d4130300118f0c7487d576c6841c0dbbdeec039e1e658ac9b107412a09af0"
831
- },
832
- "kernelspec": {
833
- "display_name": "Python 3.7.7 64-bit",
834
- "name": "python3"
835
- },
836
- "language_info": {
837
- "name": "python",
838
- "version": ""
839
- }
840
- },
841
- "nbformat": 4,
842
- "nbformat_minor": 0
843
- }
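A minimal usage sketch for the BLEU step described in the last cell of the notebook above. en_outputs.txt is the prediction file produced earlier; en_references.txt is a hypothetical reference file you would have to supply yourself:

    # hypothetical reference file; adjust the paths and language codes to your data
    bash compute_bleu.sh en_outputs.txt en_references.txt hi en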
joint_translate.sh DELETED
@@ -1,69 +0,0 @@
1
- #!/bin/bash
2
- echo `date`
3
- infname=$1
4
- outfname=$2
5
- src_lang=$3
6
- tgt_lang=$4
7
- exp_dir=$5
8
- ref_fname=$6
9
-
10
- SRC_PREFIX='SRC'
11
- TGT_PREFIX='TGT'
12
-
13
- #`dirname $0`/env.sh
14
- SUBWORD_NMT_DIR='subword-nmt'
15
- model_dir=$exp_dir/model
16
- data_bin_dir=$exp_dir/final_bin
17
-
18
- ### normalization and script conversion
19
-
20
- echo "Applying normalization and script conversion"
21
- input_size=`python scripts/preprocess_translate.py $infname $outfname.norm $src_lang true`
22
- echo "Number of sentences in input: $input_size"
23
-
24
- ### apply BPE to input file
25
-
26
- echo "Applying BPE"
27
- python $SUBWORD_NMT_DIR/subword_nmt/apply_bpe.py \
28
- -c $exp_dir/vocab/bpe_codes.32k.${SRC_PREFIX} \
29
- --vocabulary $exp_dir/vocab/vocab.$SRC_PREFIX \
30
- --vocabulary-threshold 5 \
31
- < $outfname.norm \
32
- > $outfname._bpe
33
-
34
- # not needed for joint training
35
- # echo "Adding language tags"
36
- python scripts/add_tags_translate.py $outfname._bpe $outfname.bpe $src_lang $tgt_lang
37
-
38
- ### run decoder
39
-
40
- echo "Decoding"
41
-
42
- src_input_bpe_fname=$outfname.bpe
43
- tgt_output_fname=$outfname
44
- fairseq-interactive $data_bin_dir \
45
- -s $SRC_PREFIX -t $TGT_PREFIX \
46
- --distributed-world-size 1 \
47
- --path $model_dir/checkpoint_best.pt \
48
- --batch-size 64 --buffer-size 2500 --beam 5 --remove-bpe \
49
- --skip-invalid-size-inputs-valid-test \
50
- --user-dir model_configs \
51
- --input $src_input_bpe_fname > $tgt_output_fname.log 2>&1
52
-
53
-
54
- echo "Extracting translations, script conversion and detokenization"
55
- # this part reverses the transliteration from Devanagari script to the target language and then detokenizes it.
56
- python scripts/postprocess_translate.py $tgt_output_fname.log $tgt_output_fname $input_size $tgt_lang true
57
-
58
- # This block is now moved to compute_bleu.sh for release with more documentation.
59
- # if [ $src_lang == 'en' ]; then
60
- # # indicnlp tokenize the output files before evaluation
61
- # input_size=`python scripts/preprocess_translate.py $ref_fname $ref_fname.tok $tgt_lang`
62
- # input_size=`python scripts/preprocess_translate.py $tgt_output_fname $tgt_output_fname.tok $tgt_lang`
63
- # sacrebleu --tokenize none $ref_fname.tok < $tgt_output_fname.tok
64
- # else
65
- # # indic to en models
66
- # sacrebleu $ref_fname < $tgt_output_fname
67
- # fi
68
- # echo `date`
69
- echo "Translation completed"
learn_bpe.sh DELETED
@@ -1,44 +0,0 @@
1
- #!/bin/bash
2
-
3
- expdir=$1 # EXPDIR
4
- num_operations=${2:-32000}
5
-
6
- #`dirname $0`/env.sh
7
- SUBWORD_NMT_DIR="subword-nmt"
8
- data_dir="$expdir/data"
9
- train_file=$data_dir/train
10
- # num_operations=32000
11
-
12
- echo Input file: $train_file
13
-
14
- mkdir -p $expdir/vocab
15
-
16
- echo "learning joint BPE"
17
- cat $train_file.SRC $train_file.TGT > $train_file.ALL
18
- python $SUBWORD_NMT_DIR/subword_nmt/learn_bpe.py \
19
- --input $train_file.ALL \
20
- -s $num_operations \
21
- -o $expdir/vocab/bpe_codes.32k.SRC_TGT \
22
- --num-workers -1
23
-
24
- echo "computing SRC vocab"
25
- python $SUBWORD_NMT_DIR/subword_nmt/apply_bpe.py \
26
- -c $expdir/vocab/bpe_codes.32k.SRC_TGT \
27
- --num-workers -1 \
28
- -i $train_file.SRC | \
29
- python $SUBWORD_NMT_DIR/subword_nmt/get_vocab.py \
30
- > $expdir/vocab/vocab.tmp.SRC
31
- python scripts/clean_vocab.py $expdir/vocab/vocab.tmp.SRC $expdir/vocab/vocab.SRC
32
- #rm $expdir/vocab/vocab.tmp.SRC
33
-
34
- echo "computing TGT vocab"
35
- python $SUBWORD_NMT_DIR/subword_nmt/apply_bpe.py \
36
- -c $expdir/vocab/bpe_codes.32k.SRC_TGT \
37
- --num-workers -1 \
38
- -i $train_file.TGT | \
39
- python $SUBWORD_NMT_DIR/subword_nmt/get_vocab.py \
40
- > $expdir/vocab/vocab.tmp.TGT
41
- python scripts/clean_vocab.py $expdir/vocab/vocab.tmp.TGT $expdir/vocab/vocab.TGT
42
- #rm $expdir/vocab/vocab.tmp.TGT
43
-
44
- rm $train_file.ALL
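A hedged usage sketch: the script expects $expdir/data/train.SRC and $expdir/data/train.TGT to already exist (the prepare_data*.sh scripts below produce them) and writes the joint BPE codes and vocabularies under $expdir/vocab. The experiment directory name here is only an example:

    # 32000 merge operations is also the default when the second argument is omitted
    bash learn_bpe.sh ../indic-en 32000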
learn_single_bpe.sh DELETED
@@ -1,49 +0,0 @@
1
- #!/bin/bash
2
-
3
- expdir=$1 # EXPDIR
4
- num_operations=${2:-32000}
5
-
6
- #`dirname $0`/env.sh
7
- SUBWORD_NMT_DIR="subword-nmt"
8
- data_dir="$expdir/data"
9
- train_file=$data_dir/train
10
- # num_operations=32000
11
-
12
- echo Input file: $train_file
13
-
14
- mkdir -p $expdir/vocab
15
-
16
- echo "learning source BPE"
17
-
18
- python $SUBWORD_NMT_DIR/subword_nmt/learn_bpe.py \
19
- --input $train_file.SRC \
20
- -s $num_operations \
21
- -o $expdir/vocab/bpe_codes.32k.SRC\
22
- --num-workers -1
23
-
24
- echo "learning target BPE"
25
- python $SUBWORD_NMT_DIR/subword_nmt/learn_bpe.py \
26
- --input $train_file.TGT \
27
- -s $num_operations \
28
- -o $expdir/vocab/bpe_codes.32k.TGT\
29
- --num-workers -1
30
-
31
- echo "computing SRC vocab"
32
- python $SUBWORD_NMT_DIR/subword_nmt/apply_bpe.py \
33
- -c $expdir/vocab/bpe_codes.32k.SRC \
34
- --num-workers -1 \
35
- -i $train_file.SRC | \
36
- python $SUBWORD_NMT_DIR/subword_nmt/get_vocab.py \
37
- > $expdir/vocab/vocab.tmp.SRC
38
- python scripts/clean_vocab.py $expdir/vocab/vocab.tmp.SRC $expdir/vocab/vocab.SRC
39
- rm $expdir/vocab/vocab.tmp.SRC
40
-
41
- echo "computing TGT vocab"
42
- python $SUBWORD_NMT_DIR/subword_nmt/apply_bpe.py \
43
- -c $expdir/vocab/bpe_codes.32k.TGT \
44
- --num-workers -1 \
45
- -i $train_file.TGT | \
46
- python $SUBWORD_NMT_DIR/subword_nmt/get_vocab.py \
47
- > $expdir/vocab/vocab.tmp.TGT
48
- python scripts/clean_vocab.py $expdir/vocab/vocab.tmp.TGT $expdir/vocab/vocab.TGT
49
- rm $expdir/vocab/vocab.tmp.TGT
prepare_data.sh DELETED
@@ -1,71 +0,0 @@
1
- exp_dir=$1
2
- src_lang=$2
3
- tgt_lang=$3
4
- train_data_dir=${4:-"$exp_dir/$src_lang-$tgt_lang"}
5
- devtest_data_dir=${5:-"$exp_dir/devtest/all/$src_lang-$tgt_lang"}
6
-
7
- echo "Running experiment ${exp_dir} on ${src_lang} to ${tgt_lang}"
8
-
9
- train_processed_dir=$exp_dir/data
10
- devtest_processed_dir=$exp_dir/data
11
-
12
- out_data_dir=$exp_dir/final_bin
13
-
14
- mkdir -p $train_processed_dir
15
- mkdir -p $devtest_processed_dir
16
- mkdir -p $out_data_dir
17
-
18
- # train preprocessing
19
- train_infname_src=$train_data_dir/train.$src_lang
20
- train_infname_tgt=$train_data_dir/train.$tgt_lang
21
- train_outfname_src=$train_processed_dir/train.SRC
22
- train_outfname_tgt=$train_processed_dir/train.TGT
23
- echo "Applying normalization and script conversion for train"
24
- input_size=`python scripts/preprocess_translate.py $train_infname_src $train_outfname_src $src_lang`
25
- input_size=`python scripts/preprocess_translate.py $train_infname_tgt $train_outfname_tgt $tgt_lang`
26
- echo "Number of sentences in train: $input_size"
27
-
28
- # dev preprocessing
29
- dev_infname_src=$devtest_data_dir/dev.$src_lang
30
- dev_infname_tgt=$devtest_data_dir/dev.$tgt_lang
31
- dev_outfname_src=$devtest_processed_dir/dev.SRC
32
- dev_outfname_tgt=$devtest_processed_dir/dev.TGT
33
- echo "Applying normalization and script conversion for dev"
34
- input_size=`python scripts/preprocess_translate.py $dev_infname_src $dev_outfname_src $src_lang`
35
- input_size=`python scripts/preprocess_translate.py $dev_infname_tgt $dev_outfname_tgt $tgt_lang`
36
- echo "Number of sentences in dev: $input_size"
37
-
38
- # test preprocessing
39
- test_infname_src=$devtest_data_dir/test.$src_lang
40
- test_infname_tgt=$devtest_data_dir/test.$tgt_lang
41
- test_outfname_src=$devtest_processed_dir/test.SRC
42
- test_outfname_tgt=$devtest_processed_dir/test.TGT
43
- echo "Applying normalization and script conversion for test"
44
- input_size=`python scripts/preprocess_translate.py $test_infname_src $test_outfname_src $src_lang`
45
- input_size=`python scripts/preprocess_translate.py $test_infname_tgt $test_outfname_tgt $tgt_lang`
46
- echo "Number of sentences in test: $input_size"
47
-
48
- echo "Learning bpe. This will take a very long time depending on the size of the dataset"
49
- echo `date`
50
- # learn bpe for preprocessed_train files
51
- bash learn_bpe.sh $exp_dir
52
- echo `date`
53
-
54
- echo "Applying bpe"
55
- bash apply_bpe_traindevtest_notag.sh $exp_dir
56
-
57
- mkdir -p $exp_dir/final
58
-
59
- # this is only required for joint training
60
- # echo "Adding language tags"
61
- # python scripts/add_tags_translate.py $outfname._bpe $outfname.bpe $src_lang $tgt_lang
62
-
63
- # this is imporatnt step if you are training with tpu and using num_batch_buckets
64
- # the currnet implementation does not remove outliers before bucketing and hence
65
- # removing these large sentences ourselves helps with getting better buckets
66
- python scripts/remove_large_sentences.py $exp_dir/bpe/train.SRC $exp_dir/bpe/train.TGT $exp_dir/final/train.SRC $exp_dir/final/train.TGT
67
- python scripts/remove_large_sentences.py $exp_dir/bpe/dev.SRC $exp_dir/bpe/dev.TGT $exp_dir/final/dev.SRC $exp_dir/final/dev.TGT
68
- python scripts/remove_large_sentences.py $exp_dir/bpe/test.SRC $exp_dir/bpe/test.TGT $exp_dir/final/test.SRC $exp_dir/final/test.TGT
69
-
70
- echo "Binarizing data"
71
- bash binarize_training_exp.sh $exp_dir SRC TGT
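A hedged example of how this script might be invoked; '../en-hi_exp' is a hypothetical experiment directory laid out as the defaults above expect:

    # expects ../en-hi_exp/en-hi/train.{en,hi} and
    # ../en-hi_exp/devtest/all/en-hi/{dev,test}.{en,hi} by default
    bash prepare_data.sh ../en-hi_exp en hi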
prepare_data_joint_training.sh DELETED
@@ -1,110 +0,0 @@
1
- exp_dir=$1
2
- src_lang=$2
3
- tgt_lang=$3
4
- train_data_dir=${4:-"$exp_dir"}
5
- devtest_data_dir=${5:-"$exp_dir/devtest/all"}
6
-
7
- echo "Running experiment ${exp_dir} on ${src_lang} to ${tgt_lang}"
8
-
9
-
10
- train_processed_dir=$exp_dir/data
11
- devtest_processed_dir=$exp_dir/data
12
-
13
- out_data_dir=$exp_dir/final_bin
14
-
15
- mkdir -p $train_processed_dir
16
- mkdir -p $devtest_processed_dir
17
- mkdir -p $out_data_dir
18
- langs=(as bn hi gu kn ml mr or pa ta te)
19
-
20
- for lang in ${langs[@]};do
21
- if [ $src_lang == en ]; then
22
- tgt_lang=$lang
23
- else
24
- src_lang=$lang
25
- fi
26
-
27
- train_norm_dir=$exp_dir/norm/$src_lang-$tgt_lang
28
- devtest_norm_dir=$exp_dir/norm/$src_lang-$tgt_lang
29
- mkdir -p $train_norm_dir
30
- mkdir -p $devtest_norm_dir
31
-
32
- # train preprocessing
33
- train_infname_src=$train_data_dir/en-${lang}/train.$src_lang
34
- train_infname_tgt=$train_data_dir/en-${lang}/train.$tgt_lang
35
- train_outfname_src=$train_norm_dir/train.$src_lang
36
- train_outfname_tgt=$train_norm_dir/train.$tgt_lang
37
- echo "Applying normalization and script conversion for train"
38
- # this is for preprocessing text and in for indic langs, we convert all scripts to devnagiri
39
- input_size=`python scripts/preprocess_translate.py $train_infname_src $train_outfname_src $src_lang true`
40
- input_size=`python scripts/preprocess_translate.py $train_infname_tgt $train_outfname_tgt $tgt_lang true`
41
- echo "Number of sentences in train: $input_size"
42
-
43
- # dev preprocessing
44
- dev_infname_src=$devtest_data_dir/en-${lang}/dev.$src_lang
45
- dev_infname_tgt=$devtest_data_dir/en-${lang}/dev.$tgt_lang
46
- dev_outfname_src=$devtest_norm_dir/dev.$src_lang
47
- dev_outfname_tgt=$devtest_norm_dir/dev.$tgt_lang
48
- echo "Applying normalization and script conversion for dev"
49
- input_size=`python scripts/preprocess_translate.py $dev_infname_src $dev_outfname_src $src_lang true`
50
- input_size=`python scripts/preprocess_translate.py $dev_infname_tgt $dev_outfname_tgt $tgt_lang true`
51
- echo "Number of sentences in dev: $input_size"
52
-
53
- # test preprocessing
54
- test_infname_src=$devtest_data_dir/en-${lang}/test.$src_lang
55
- test_infname_tgt=$devtest_data_dir/en-${lang}/test.$tgt_lang
56
- test_outfname_src=$devtest_norm_dir/test.$src_lang
57
- test_outfname_tgt=$devtest_norm_dir/test.$tgt_lang
58
- echo "Applying normalization and script conversion for test"
59
- input_size=`python scripts/preprocess_translate.py $test_infname_src $test_outfname_src $src_lang true`
60
- input_size=`python scripts/preprocess_translate.py $test_infname_tgt $test_outfname_tgt $tgt_lang true`
61
- echo "Number of sentences in test: $input_size"
62
- done
63
- # this concatenates lang pair data and creates text files to keep track of number of lines in each lang pair.
64
- # this is imp as for joint training, we will merge all the lang pairs and the indivitual lang lines info
65
- # would be required for adding specific lang tags later.
66
-
67
- # the outputs of these scripts will be text file like this:
68
- # <lang1> <lang2> <number of lines>
69
- # lang1-lang2 n1
70
- # lang1-lang3 n2
71
-
72
- python scripts/concat_joint_data.py $exp_dir/norm $exp_dir/data $src_lang $tgt_lang 'train'
73
- python scripts/concat_joint_data.py $exp_dir/norm $exp_dir/data $src_lang $tgt_lang 'dev'
74
- python scripts/concat_joint_data.py $exp_dir/norm $exp_dir/data $src_lang $tgt_lang 'test'
75
-
76
- # echo "Learning bpe. This will take a very long time depending on the size of the dataset"
77
- echo `date`
78
- # # learn bpe for preprocessed_train files
79
- # for creating joint_vocab use this
80
- # bash learn_bpe.sh $exp_dir
81
-
82
- # for sep vocab use this
83
- bash learn_single_bpe.sh $exp_dir
84
- echo `date`
85
-
86
- # echo "Applying bpe"
87
- # apply the learnt bpe to the data
88
- bash apply_bpe_traindevtest_notag.sh $exp_dir
89
-
90
- mkdir -p $exp_dir/final
91
-
92
- # # this is only required for joint training
93
- # we apply language tags to the bpe segmented data
94
- #
95
- # if we are translating lang1 to lang2 then <lang1 line> will become __src__ <lang1> __tgt__ <lang2> <lang1 line>
96
- echo "Adding language tags"
97
- python scripts/add_joint_tags_translate.py $exp_dir 'train'
98
- python scripts/add_joint_tags_translate.py $exp_dir 'dev'
99
- python scripts/add_joint_tags_translate.py $exp_dir 'test'
100
-
101
- # # this is important step if you are training with tpu and using num_batch_buckets
102
- # # the currnet implementation does not remove outliers before bucketing and hence
103
- # # removing these large sentences ourselves helps with getting better buckets
104
- # python scripts/remove_large_sentences.py $exp_dir/bpe/train.SRC $exp_dir/bpe/train.TGT $exp_dir/final/train.SRC $exp_dir/final/train.TGT
105
- # python scripts/remove_large_sentences.py $exp_dir/bpe/dev.SRC $exp_dir/bpe/dev.TGT $exp_dir/final/dev.SRC $exp_dir/final/dev.TGT
106
- # python scripts/remove_large_sentences.py $exp_dir/bpe/test.SRC $exp_dir/bpe/test.TGT $exp_dir/final/test.SRC $exp_dir/final/test.TGT
107
-
108
- # echo "Binarizing data"
109
- # Binarize the training data for using with fairseq train
110
- bash binarize_training_exp.sh $exp_dir SRC TGT
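A hedged sketch of a joint-training invocation; '../en-indic_exp' is a hypothetical experiment directory that must hold en-<lang> subfolders with train files, and devtest/all/en-<lang> folders with dev and test files, for each language listed in the script:

    # the language arguments only fix the translation direction (en -> Indic here);
    # the loop inside the script then iterates over all Indic languages
    bash prepare_data_joint_training.sh ../en-indic_exp en hi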