harveen commited on
Commit
e50fe35
1 Parent(s): 9e65735

Adding code

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. indicTrans/.gitignore +143 -0
  2. indicTrans/IndicTrans_training.ipynb +752 -0
  3. indicTrans/LICENSE +21 -0
  4. indicTrans/README.md +296 -0
  5. indicTrans/api.py +86 -0
  6. indicTrans/apply_bpe_traindevtest_notag.sh +41 -0
  7. indicTrans/apply_single_bpe_traindevtest_notag.sh +40 -0
  8. indicTrans/binarize_training_exp.sh +24 -0
  9. indicTrans/compute_bleu.sh +28 -0
  10. indicTrans/indicTrans_Finetuning.ipynb +0 -0
  11. indicTrans/indicTrans_python_interface.ipynb +462 -0
  12. indicTrans/indic_nlp_library/LICENSE +9 -0
  13. indicTrans/indic_nlp_library/README.md +142 -0
  14. indicTrans/indic_nlp_library/contrib/README.md +7 -0
  15. indicTrans/indic_nlp_library/contrib/correct_moses_tokenizer.py +29 -0
  16. indicTrans/indic_nlp_library/contrib/hindi_to_kannada_transliterator.py +62 -0
  17. indicTrans/indic_nlp_library/contrib/indic_scraper_project_sample.ipynb +569 -0
  18. indicTrans/indic_nlp_library/docs/Makefile +153 -0
  19. indicTrans/indic_nlp_library/docs/cmd.rst +8 -0
  20. indicTrans/indic_nlp_library/docs/code.rst +5 -0
  21. indicTrans/indic_nlp_library/docs/conf.py +242 -0
  22. indicTrans/indic_nlp_library/docs/index.rst +22 -0
  23. indicTrans/indic_nlp_library/docs/indicnlp.MD +122 -0
  24. indicTrans/indic_nlp_library/docs/indicnlp.cli.rst +11 -0
  25. indicTrans/indic_nlp_library/docs/indicnlp.morph.rst +11 -0
  26. indicTrans/indic_nlp_library/docs/indicnlp.normalize.rst +15 -0
  27. indicTrans/indic_nlp_library/docs/indicnlp.pdf +0 -0
  28. indicTrans/indic_nlp_library/docs/indicnlp.rst +47 -0
  29. indicTrans/indic_nlp_library/docs/indicnlp.script.rst +26 -0
  30. indicTrans/indic_nlp_library/docs/indicnlp.syllable.rst +11 -0
  31. indicTrans/indic_nlp_library/docs/indicnlp.tokenize.rst +26 -0
  32. indicTrans/indic_nlp_library/docs/indicnlp.transliterate.rst +34 -0
  33. indicTrans/indic_nlp_library/docs/make.bat +35 -0
  34. indicTrans/indic_nlp_library/docs/modules.rst +7 -0
  35. indicTrans/indic_nlp_library/indicnlp/__init__.py +10 -0
  36. indicTrans/indic_nlp_library/indicnlp/cli/__init__.py +0 -0
  37. indicTrans/indic_nlp_library/indicnlp/cli/cliparser.py +266 -0
  38. indicTrans/indic_nlp_library/indicnlp/common.py +58 -0
  39. indicTrans/indic_nlp_library/indicnlp/langinfo.py +488 -0
  40. indicTrans/indic_nlp_library/indicnlp/loader.py +35 -0
  41. indicTrans/indic_nlp_library/indicnlp/morph/__init__.py +0 -0
  42. indicTrans/indic_nlp_library/indicnlp/morph/unsupervised_morph.py +142 -0
  43. indicTrans/indic_nlp_library/indicnlp/normalize/__init__.py +0 -0
  44. indicTrans/indic_nlp_library/indicnlp/normalize/indic_normalize.py +984 -0
  45. indicTrans/indic_nlp_library/indicnlp/script/__init__.py +0 -0
  46. indicTrans/indic_nlp_library/indicnlp/script/english_script.py +154 -0
  47. indicTrans/indic_nlp_library/indicnlp/script/indic_scripts.py +301 -0
  48. indicTrans/indic_nlp_library/indicnlp/script/phonetic_sim.py +59 -0
  49. indicTrans/indic_nlp_library/indicnlp/syllable/__init__.py +0 -0
  50. indicTrans/indic_nlp_library/indicnlp/syllable/syllabifier.py +302 -0
indicTrans/.gitignore ADDED
@@ -0,0 +1,143 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #ignore libs folder we use
2
+ indic_nlp_library
3
+ indic_nlp_resources
4
+ subword-nmt
5
+
6
+ # Byte-compiled / optimized / DLL files
7
+ __pycache__/
8
+ *.py[cod]
9
+ *$py.class
10
+
11
+ # C extensions
12
+ *.so
13
+
14
+ # Distribution / packaging
15
+ .Python
16
+ build/
17
+ develop-eggs/
18
+ dist/
19
+ downloads/
20
+ eggs/
21
+ .eggs/
22
+ lib/
23
+ lib64/
24
+ parts/
25
+ sdist/
26
+ var/
27
+ wheels/
28
+ share/python-wheels/
29
+ *.egg-info/
30
+ .installed.cfg
31
+ *.egg
32
+ MANIFEST
33
+
34
+ # PyInstaller
35
+ # Usually these files are written by a python script from a template
36
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
37
+ *.manifest
38
+ *.spec
39
+
40
+ # Installer logs
41
+ pip-log.txt
42
+ pip-delete-this-directory.txt
43
+
44
+ # Unit test / coverage reports
45
+ htmlcov/
46
+ .tox/
47
+ .nox/
48
+ .coverage
49
+ .coverage.*
50
+ .cache
51
+ nosetests.xml
52
+ coverage.xml
53
+ *.cover
54
+ *.py,cover
55
+ .hypothesis/
56
+ .pytest_cache/
57
+ cover/
58
+
59
+ # Translations
60
+ *.mo
61
+ *.pot
62
+
63
+ # Django stuff:
64
+ *.log
65
+ local_settings.py
66
+ db.sqlite3
67
+ db.sqlite3-journal
68
+
69
+ # Flask stuff:
70
+ instance/
71
+ .webassets-cache
72
+
73
+ # Scrapy stuff:
74
+ .scrapy
75
+
76
+ # Sphinx documentation
77
+ docs/_build/
78
+
79
+ # PyBuilder
80
+ .pybuilder/
81
+ target/
82
+
83
+ # Jupyter Notebook
84
+ .ipynb_checkpoints
85
+
86
+ # IPython
87
+ profile_default/
88
+ ipython_config.py
89
+
90
+ # pyenv
91
+ # For a library or package, you might want to ignore these files since the code is
92
+ # intended to run in multiple environments; otherwise, check them in:
93
+ # .python-version
94
+
95
+ # pipenv
96
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
97
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
98
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
99
+ # install all needed dependencies.
100
+ #Pipfile.lock
101
+
102
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow
103
+ __pypackages__/
104
+
105
+ # Celery stuff
106
+ celerybeat-schedule
107
+ celerybeat.pid
108
+
109
+ # SageMath parsed files
110
+ *.sage.py
111
+
112
+ # Environments
113
+ .env
114
+ .venv
115
+ env/
116
+ venv/
117
+ ENV/
118
+ env.bak/
119
+ venv.bak/
120
+
121
+ # Spyder project settings
122
+ .spyderproject
123
+ .spyproject
124
+
125
+ # Rope project settings
126
+ .ropeproject
127
+
128
+ # mkdocs documentation
129
+ /site
130
+
131
+ # mypy
132
+ .mypy_cache/
133
+ .dmypy.json
134
+ dmypy.json
135
+
136
+ # Pyre type checker
137
+ .pyre/
138
+
139
+ # pytype static type analyzer
140
+ .pytype/
141
+
142
+ # Cython debug symbols
143
+ cython_debug/
indicTrans/IndicTrans_training.ipynb ADDED
@@ -0,0 +1,752 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {
6
+ "colab_type": "text",
7
+ "id": "view-in-github"
8
+ },
9
+ "source": [
10
+ "<a href=\"https://colab.research.google.com/github/gowtham1997/indicTrans-1/blob/main/IndicTrans_training.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
11
+ ]
12
+ },
13
+ {
14
+ "cell_type": "code",
15
+ "execution_count": 1,
16
+ "metadata": {
17
+ "colab": {
18
+ "base_uri": "https://localhost:8080/"
19
+ },
20
+ "id": "FdyHSnoj7Iun",
21
+ "outputId": "d0624c60-68c4-470f-9ade-c517e3296044"
22
+ },
23
+ "outputs": [
24
+ {
25
+ "name": "stdout",
26
+ "output_type": "stream",
27
+ "text": [
28
+ "/content/training\n"
29
+ ]
30
+ }
31
+ ],
32
+ "source": [
33
+ "# create a seperate folder to store everything\n",
34
+ "!mkdir training\n",
35
+ "%cd training"
36
+ ]
37
+ },
38
+ {
39
+ "cell_type": "code",
40
+ "execution_count": 2,
41
+ "metadata": {
42
+ "colab": {
43
+ "base_uri": "https://localhost:8080/"
44
+ },
45
+ "id": "y55OfxBz8QeP",
46
+ "outputId": "6d0ab016-0f96-4671-ddee-f06b50506dcd"
47
+ },
48
+ "outputs": [
49
+ {
50
+ "name": "stdout",
51
+ "output_type": "stream",
52
+ "text": [
53
+ "Cloning into 'indicTrans'...\n",
54
+ "remote: Enumerating objects: 432, done.\u001b[K\n",
55
+ "remote: Counting objects: 100% (139/139), done.\u001b[K\n",
56
+ "remote: Compressing objects: 100% (34/34), done.\u001b[K\n",
57
+ "remote: Total 432 (delta 122), reused 105 (delta 105), pack-reused 293\u001b[K\n",
58
+ "Receiving objects: 100% (432/432), 1.43 MiB | 14.11 MiB/s, done.\n",
59
+ "Resolving deltas: 100% (248/248), done.\n",
60
+ "/content/training/indicTrans\n",
61
+ "Cloning into 'indic_nlp_library'...\n",
62
+ "remote: Enumerating objects: 1325, done.\u001b[K\n",
63
+ "remote: Counting objects: 100% (147/147), done.\u001b[K\n",
64
+ "remote: Compressing objects: 100% (103/103), done.\u001b[K\n",
65
+ "remote: Total 1325 (delta 84), reused 89 (delta 41), pack-reused 1178\u001b[K\n",
66
+ "Receiving objects: 100% (1325/1325), 9.57 MiB | 10.51 MiB/s, done.\n",
67
+ "Resolving deltas: 100% (688/688), done.\n",
68
+ "Cloning into 'indic_nlp_resources'...\n",
69
+ "remote: Enumerating objects: 133, done.\u001b[K\n",
70
+ "remote: Counting objects: 100% (7/7), done.\u001b[K\n",
71
+ "remote: Compressing objects: 100% (7/7), done.\u001b[K\n",
72
+ "remote: Total 133 (delta 0), reused 2 (delta 0), pack-reused 126\u001b[K\n",
73
+ "Receiving objects: 100% (133/133), 149.77 MiB | 34.05 MiB/s, done.\n",
74
+ "Resolving deltas: 100% (51/51), done.\n",
75
+ "Checking out files: 100% (28/28), done.\n",
76
+ "Cloning into 'subword-nmt'...\n",
77
+ "remote: Enumerating objects: 580, done.\u001b[K\n",
78
+ "remote: Counting objects: 100% (4/4), done.\u001b[K\n",
79
+ "remote: Compressing objects: 100% (4/4), done.\u001b[K\n",
80
+ "remote: Total 580 (delta 0), reused 1 (delta 0), pack-reused 576\u001b[K\n",
81
+ "Receiving objects: 100% (580/580), 237.41 KiB | 5.28 MiB/s, done.\n",
82
+ "Resolving deltas: 100% (349/349), done.\n",
83
+ "/content/training\n"
84
+ ]
85
+ }
86
+ ],
87
+ "source": [
88
+ "# clone the repo for running finetuning\n",
89
+ "!git clone https://github.com/AI4Bharat/indicTrans.git\n",
90
+ "%cd indicTrans\n",
91
+ "# clone requirements repositories\n",
92
+ "!git clone https://github.com/anoopkunchukuttan/indic_nlp_library.git\n",
93
+ "!git clone https://github.com/anoopkunchukuttan/indic_nlp_resources.git\n",
94
+ "!git clone https://github.com/rsennrich/subword-nmt.git\n",
95
+ "%cd .."
96
+ ]
97
+ },
98
+ {
99
+ "cell_type": "code",
100
+ "execution_count": 3,
101
+ "metadata": {
102
+ "colab": {
103
+ "base_uri": "https://localhost:8080/"
104
+ },
105
+ "id": "ziWWl-1a8SMw",
106
+ "outputId": "d7908a62-9573-4693-e7cb-44aeeebaaa15"
107
+ },
108
+ "outputs": [
109
+ {
110
+ "name": "stdout",
111
+ "output_type": "stream",
112
+ "text": [
113
+ "Reading package lists... Done\n",
114
+ "Building dependency tree \n",
115
+ "Reading state information... Done\n",
116
+ "The following NEW packages will be installed:\n",
117
+ " tree\n",
118
+ "0 upgraded, 1 newly installed, 0 to remove and 39 not upgraded.\n",
119
+ "Need to get 40.7 kB of archives.\n",
120
+ "After this operation, 105 kB of additional disk space will be used.\n",
121
+ "Get:1 http://archive.ubuntu.com/ubuntu bionic/universe amd64 tree amd64 1.7.0-5 [40.7 kB]\n",
122
+ "Fetched 40.7 kB in 0s (133 kB/s)\n",
123
+ "debconf: unable to initialize frontend: Dialog\n",
124
+ "debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/perl5/Debconf/FrontEnd/Dialog.pm line 76, <> line 1.)\n",
125
+ "debconf: falling back to frontend: Readline\n",
126
+ "debconf: unable to initialize frontend: Readline\n",
127
+ "debconf: (This frontend requires a controlling tty.)\n",
128
+ "debconf: falling back to frontend: Teletype\n",
129
+ "dpkg-preconfigure: unable to re-open stdin: \n",
130
+ "Selecting previously unselected package tree.\n",
131
+ "(Reading database ... 160772 files and directories currently installed.)\n",
132
+ "Preparing to unpack .../tree_1.7.0-5_amd64.deb ...\n",
133
+ "Unpacking tree (1.7.0-5) ...\n",
134
+ "Setting up tree (1.7.0-5) ...\n",
135
+ "Processing triggers for man-db (2.8.3-2ubuntu0.1) ...\n",
136
+ "Collecting sacremoses\n",
137
+ "\u001b[?25l Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)\n",
138
+ "\u001b[K |████████████████████████████████| 901kB 4.0MB/s \n",
139
+ "\u001b[?25hRequirement already satisfied: pandas in /usr/local/lib/python3.7/dist-packages (1.1.5)\n",
140
+ "Collecting mock\n",
141
+ " Downloading https://files.pythonhosted.org/packages/5c/03/b7e605db4a57c0f6fba744b11ef3ddf4ddebcada35022927a2b5fc623fdf/mock-4.0.3-py3-none-any.whl\n",
142
+ "Collecting sacrebleu\n",
143
+ "\u001b[?25l Downloading https://files.pythonhosted.org/packages/7e/57/0c7ca4e31a126189dab99c19951910bd081dea5bbd25f24b77107750eae7/sacrebleu-1.5.1-py3-none-any.whl (54kB)\n",
144
+ "\u001b[K |████████████████████████████████| 61kB 7.4MB/s \n",
145
+ "\u001b[?25hCollecting tensorboardX\n",
146
+ "\u001b[?25l Downloading https://files.pythonhosted.org/packages/42/36/2b147652c40c3a858efa0afbf7b8236fae968e88ff530511a4cfa299a506/tensorboardX-2.3-py2.py3-none-any.whl (124kB)\n",
147
+ "\u001b[K |████████████████████████████████| 133kB 24.0MB/s \n",
148
+ "\u001b[?25hRequirement already satisfied: pyarrow in /usr/local/lib/python3.7/dist-packages (3.0.0)\n",
149
+ "Collecting indic-nlp-library\n",
150
+ "\u001b[?25l Downloading https://files.pythonhosted.org/packages/84/d4/495bb43b88a2a6d04b09c29fc5115f24872af74cd8317fe84026abd4ddb1/indic_nlp_library-0.81-py3-none-any.whl (40kB)\n",
151
+ "\u001b[K |████████████████████████████████| 40kB 5.4MB/s \n",
152
+ "\u001b[?25hRequirement already satisfied: six in /usr/local/lib/python3.7/dist-packages (from sacremoses) (1.15.0)\n",
153
+ "Requirement already satisfied: regex in /usr/local/lib/python3.7/dist-packages (from sacremoses) (2019.12.20)\n",
154
+ "Requirement already satisfied: click in /usr/local/lib/python3.7/dist-packages (from sacremoses) (7.1.2)\n",
155
+ "Requirement already satisfied: tqdm in /usr/local/lib/python3.7/dist-packages (from sacremoses) (4.41.1)\n",
156
+ "Requirement already satisfied: joblib in /usr/local/lib/python3.7/dist-packages (from sacremoses) (1.0.1)\n",
157
+ "Requirement already satisfied: pytz>=2017.2 in /usr/local/lib/python3.7/dist-packages (from pandas) (2018.9)\n",
158
+ "Requirement already satisfied: numpy>=1.15.4 in /usr/local/lib/python3.7/dist-packages (from pandas) (1.19.5)\n",
159
+ "Requirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.7/dist-packages (from pandas) (2.8.1)\n",
160
+ "Collecting portalocker==2.0.0\n",
161
+ " Downloading https://files.pythonhosted.org/packages/89/a6/3814b7107e0788040870e8825eebf214d72166adf656ba7d4bf14759a06a/portalocker-2.0.0-py2.py3-none-any.whl\n",
162
+ "Requirement already satisfied: protobuf>=3.8.0 in /usr/local/lib/python3.7/dist-packages (from tensorboardX) (3.12.4)\n",
163
+ "Collecting morfessor\n",
164
+ " Downloading https://files.pythonhosted.org/packages/39/e6/7afea30be2ee4d29ce9de0fa53acbb033163615f849515c0b1956ad074ee/Morfessor-2.0.6-py3-none-any.whl\n",
165
+ "Collecting sphinx-argparse\n",
166
+ " Downloading https://files.pythonhosted.org/packages/06/2b/dfad6a1831c3aeeae25d8d3d417224684befbf45e10c7f2141631616a6ed/sphinx-argparse-0.2.5.tar.gz\n",
167
+ "Collecting sphinx-rtd-theme\n",
168
+ "\u001b[?25l Downloading https://files.pythonhosted.org/packages/ac/24/2475e8f83519b54b2148d4a56eb1111f9cec630d088c3ffc214492c12107/sphinx_rtd_theme-0.5.2-py2.py3-none-any.whl (9.1MB)\n",
169
+ "\u001b[K |████████████████████████████████| 9.2MB 21.7MB/s \n",
170
+ "\u001b[?25hRequirement already satisfied: setuptools in /usr/local/lib/python3.7/dist-packages (from protobuf>=3.8.0->tensorboardX) (57.0.0)\n",
171
+ "Requirement already satisfied: sphinx>=1.2.0 in /usr/local/lib/python3.7/dist-packages (from sphinx-argparse->indic-nlp-library) (1.8.5)\n",
172
+ "Collecting docutils<0.17\n",
173
+ "\u001b[?25l Downloading https://files.pythonhosted.org/packages/81/44/8a15e45ffa96e6cf82956dd8d7af9e666357e16b0d93b253903475ee947f/docutils-0.16-py2.py3-none-any.whl (548kB)\n",
174
+ "\u001b[K |████████████████████████████████| 552kB 38.5MB/s \n",
175
+ "\u001b[?25hRequirement already satisfied: packaging in /usr/local/lib/python3.7/dist-packages (from sphinx>=1.2.0->sphinx-argparse->indic-nlp-library) (20.9)\n",
176
+ "Requirement already satisfied: imagesize in /usr/local/lib/python3.7/dist-packages (from sphinx>=1.2.0->sphinx-argparse->indic-nlp-library) (1.2.0)\n",
177
+ "Requirement already satisfied: requests>=2.0.0 in /usr/local/lib/python3.7/dist-packages (from sphinx>=1.2.0->sphinx-argparse->indic-nlp-library) (2.23.0)\n",
178
+ "Requirement already satisfied: sphinxcontrib-websupport in /usr/local/lib/python3.7/dist-packages (from sphinx>=1.2.0->sphinx-argparse->indic-nlp-library) (1.2.4)\n",
179
+ "Requirement already satisfied: Pygments>=2.0 in /usr/local/lib/python3.7/dist-packages (from sphinx>=1.2.0->sphinx-argparse->indic-nlp-library) (2.6.1)\n",
180
+ "Requirement already satisfied: snowballstemmer>=1.1 in /usr/local/lib/python3.7/dist-packages (from sphinx>=1.2.0->sphinx-argparse->indic-nlp-library) (2.1.0)\n",
181
+ "Requirement already satisfied: babel!=2.0,>=1.3 in /usr/local/lib/python3.7/dist-packages (from sphinx>=1.2.0->sphinx-argparse->indic-nlp-library) (2.9.1)\n",
182
+ "Requirement already satisfied: alabaster<0.8,>=0.7 in /usr/local/lib/python3.7/dist-packages (from sphinx>=1.2.0->sphinx-argparse->indic-nlp-library) (0.7.12)\n",
183
+ "Requirement already satisfied: Jinja2>=2.3 in /usr/local/lib/python3.7/dist-packages (from sphinx>=1.2.0->sphinx-argparse->indic-nlp-library) (2.11.3)\n",
184
+ "Requirement already satisfied: pyparsing>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from packaging->sphinx>=1.2.0->sphinx-argparse->indic-nlp-library) (2.4.7)\n",
185
+ "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests>=2.0.0->sphinx>=1.2.0->sphinx-argparse->indic-nlp-library) (1.24.3)\n",
186
+ "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests>=2.0.0->sphinx>=1.2.0->sphinx-argparse->indic-nlp-library) (2.10)\n",
187
+ "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests>=2.0.0->sphinx>=1.2.0->sphinx-argparse->indic-nlp-library) (3.0.4)\n",
188
+ "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests>=2.0.0->sphinx>=1.2.0->sphinx-argparse->indic-nlp-library) (2021.5.30)\n",
189
+ "Requirement already satisfied: sphinxcontrib-serializinghtml in /usr/local/lib/python3.7/dist-packages (from sphinxcontrib-websupport->sphinx>=1.2.0->sphinx-argparse->indic-nlp-library) (1.1.5)\n",
190
+ "Requirement already satisfied: MarkupSafe>=0.23 in /usr/local/lib/python3.7/dist-packages (from Jinja2>=2.3->sphinx>=1.2.0->sphinx-argparse->indic-nlp-library) (2.0.1)\n",
191
+ "Building wheels for collected packages: sphinx-argparse\n",
192
+ " Building wheel for sphinx-argparse (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
193
+ " Created wheel for sphinx-argparse: filename=sphinx_argparse-0.2.5-cp37-none-any.whl size=11552 sha256=0f3830a0bf7a6cfa99000091da945e9dd814b2f1e1f9ca5d773f99aaa0d3a4a5\n",
194
+ " Stored in directory: /root/.cache/pip/wheels/2a/18/1b/4990a1859da4edc77ab312bc2986c08d2733fb5713d06e44f5\n",
195
+ "Successfully built sphinx-argparse\n",
196
+ "\u001b[31mERROR: datascience 0.10.6 has requirement folium==0.2.1, but you'll have folium 0.8.3 which is incompatible.\u001b[0m\n",
197
+ "Installing collected packages: sacremoses, mock, portalocker, sacrebleu, tensorboardX, morfessor, sphinx-argparse, docutils, sphinx-rtd-theme, indic-nlp-library\n",
198
+ " Found existing installation: docutils 0.17.1\n",
199
+ " Uninstalling docutils-0.17.1:\n",
200
+ " Successfully uninstalled docutils-0.17.1\n",
201
+ "Successfully installed docutils-0.16 indic-nlp-library-0.81 mock-4.0.3 morfessor-2.0.6 portalocker-2.0.0 sacrebleu-1.5.1 sacremoses-0.0.45 sphinx-argparse-0.2.5 sphinx-rtd-theme-0.5.2 tensorboardX-2.3\n",
202
+ "Cloning into 'fairseq'...\n",
203
+ "remote: Enumerating objects: 28410, done.\u001b[K\n",
204
+ "remote: Counting objects: 100% (229/229), done.\u001b[K\n",
205
+ "remote: Compressing objects: 100% (127/127), done.\u001b[K\n",
206
+ "remote: Total 28410 (delta 114), reused 187 (delta 99), pack-reused 28181\u001b[K\n",
207
+ "Receiving objects: 100% (28410/28410), 11.96 MiB | 24.45 MiB/s, done.\n",
208
+ "Resolving deltas: 100% (21310/21310), done.\n",
209
+ "/content/training/fairseq\n",
210
+ "Obtaining file:///content/training/fairseq\n",
211
+ " Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n",
212
+ " Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n",
213
+ " Installing backend dependencies ... \u001b[?25l\u001b[?25hdone\n",
214
+ " Preparing wheel metadata ... \u001b[?25l\u001b[?25hdone\n",
215
+ "Requirement already satisfied: regex in /usr/local/lib/python3.7/dist-packages (from fairseq==1.0.0a0+f887152) (2019.12.20)\n",
216
+ "Requirement already satisfied: tqdm in /usr/local/lib/python3.7/dist-packages (from fairseq==1.0.0a0+f887152) (4.41.1)\n",
217
+ "Collecting omegaconf<2.1\n",
218
+ " Downloading https://files.pythonhosted.org/packages/d0/eb/9d63ce09dd8aa85767c65668d5414958ea29648a0eec80a4a7d311ec2684/omegaconf-2.0.6-py3-none-any.whl\n",
219
+ "Requirement already satisfied: numpy; python_version >= \"3.7\" in /usr/local/lib/python3.7/dist-packages (from fairseq==1.0.0a0+f887152) (1.19.5)\n",
220
+ "Requirement already satisfied: sacrebleu>=1.4.12 in /usr/local/lib/python3.7/dist-packages (from fairseq==1.0.0a0+f887152) (1.5.1)\n",
221
+ "Requirement already satisfied: cython in /usr/local/lib/python3.7/dist-packages (from fairseq==1.0.0a0+f887152) (0.29.23)\n",
222
+ "Collecting hydra-core<1.1\n",
223
+ "\u001b[?25l Downloading https://files.pythonhosted.org/packages/52/e3/fbd70dd0d3ce4d1d75c22d56c0c9f895cfa7ed6587a9ffb821d6812d6a60/hydra_core-1.0.6-py3-none-any.whl (123kB)\n",
224
+ "\u001b[K |████████████████████████████████| 133kB 4.7MB/s \n",
225
+ "\u001b[?25hRequirement already satisfied: torch in /usr/local/lib/python3.7/dist-packages (from fairseq==1.0.0a0+f887152) (1.9.0+cu102)\n",
226
+ "Requirement already satisfied: cffi in /usr/local/lib/python3.7/dist-packages (from fairseq==1.0.0a0+f887152) (1.14.5)\n",
227
+ "Requirement already satisfied: typing-extensions in /usr/local/lib/python3.7/dist-packages (from omegaconf<2.1->fairseq==1.0.0a0+f887152) (3.7.4.3)\n",
228
+ "Collecting PyYAML>=5.1.*\n",
229
+ "\u001b[?25l Downloading https://files.pythonhosted.org/packages/7a/a5/393c087efdc78091afa2af9f1378762f9821c9c1d7a22c5753fb5ac5f97a/PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636kB)\n",
230
+ "\u001b[K |████████████████████████████████| 645kB 32.4MB/s \n",
231
+ "\u001b[?25hRequirement already satisfied: portalocker==2.0.0 in /usr/local/lib/python3.7/dist-packages (from sacrebleu>=1.4.12->fairseq==1.0.0a0+f887152) (2.0.0)\n",
232
+ "Requirement already satisfied: importlib-resources; python_version < \"3.9\" in /usr/local/lib/python3.7/dist-packages (from hydra-core<1.1->fairseq==1.0.0a0+f887152) (5.1.4)\n",
233
+ "Collecting antlr4-python3-runtime==4.8\n",
234
+ "\u001b[?25l Downloading https://files.pythonhosted.org/packages/56/02/789a0bddf9c9b31b14c3e79ec22b9656185a803dc31c15f006f9855ece0d/antlr4-python3-runtime-4.8.tar.gz (112kB)\n",
235
+ "\u001b[K |████████████████████████████████| 112kB 53.0MB/s \n",
236
+ "\u001b[?25hRequirement already satisfied: pycparser in /usr/local/lib/python3.7/dist-packages (from cffi->fairseq==1.0.0a0+f887152) (2.20)\n",
237
+ "Requirement already satisfied: zipp>=3.1.0; python_version < \"3.10\" in /usr/local/lib/python3.7/dist-packages (from importlib-resources; python_version < \"3.9\"->hydra-core<1.1->fairseq==1.0.0a0+f887152) (3.4.1)\n",
238
+ "Building wheels for collected packages: antlr4-python3-runtime\n",
239
+ " Building wheel for antlr4-python3-runtime (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
240
+ " Created wheel for antlr4-python3-runtime: filename=antlr4_python3_runtime-4.8-cp37-none-any.whl size=141231 sha256=52f59bfe6322a04598da6960d2d5675a581273a45e4391e04cf1240c97346019\n",
241
+ " Stored in directory: /root/.cache/pip/wheels/e3/e2/fa/b78480b448b8579ddf393bebd3f47ee23aa84c89b6a78285c8\n",
242
+ "Successfully built antlr4-python3-runtime\n",
243
+ "Installing collected packages: PyYAML, omegaconf, antlr4-python3-runtime, hydra-core, fairseq\n",
244
+ " Found existing installation: PyYAML 3.13\n",
245
+ " Uninstalling PyYAML-3.13:\n",
246
+ " Successfully uninstalled PyYAML-3.13\n",
247
+ " Running setup.py develop for fairseq\n",
248
+ "Successfully installed PyYAML-5.4.1 antlr4-python3-runtime-4.8 fairseq hydra-core-1.0.6 omegaconf-2.0.6\n",
249
+ "/content/training\n"
250
+ ]
251
+ }
252
+ ],
253
+ "source": [
254
+ "! sudo apt install tree\n",
255
+ "\n",
256
+ "# Install the necessary libraries\n",
257
+ "!pip install sacremoses pandas mock sacrebleu tensorboardX pyarrow indic-nlp-library\n",
258
+ "# Install fairseq from source\n",
259
+ "!git clone https://github.com/pytorch/fairseq.git\n",
260
+ "%cd fairseq\n",
261
+ "# !git checkout da9eaba12d82b9bfc1442f0e2c6fc1b895f4d35d\n",
262
+ "!pip install --editable ./\n",
263
+ "%cd .."
264
+ ]
265
+ },
266
+ {
267
+ "cell_type": "code",
268
+ "execution_count": 1,
269
+ "metadata": {
270
+ "colab": {
271
+ "base_uri": "https://localhost:8080/"
272
+ },
273
+ "id": "tmfGYkd58UiO",
274
+ "outputId": "3b83bcf6-bbbf-4e49-c2bb-7d0fb999297d"
275
+ },
276
+ "outputs": [
277
+ {
278
+ "name": "stdout",
279
+ "output_type": "stream",
280
+ "text": [
281
+ "^C\n"
282
+ ]
283
+ },
284
+ {
285
+ "name": "stderr",
286
+ "output_type": "stream",
287
+ "text": [
288
+ "--2021-12-18 21:31:57-- https://storage.googleapis.com/samanantar-public/benchmarks.zip\n",
289
+ "Resolving storage.googleapis.com (storage.googleapis.com)... 172.217.160.144, 216.58.196.176, 142.250.71.16, ...\n",
290
+ "Connecting to storage.googleapis.com (storage.googleapis.com)|172.217.160.144|:443... connected.\n",
291
+ "HTTP request sent, awaiting response... 200 OK\n",
292
+ "Length: 7301872 (7.0M) [application/zip]\n",
293
+ "Saving to: 'benchmarks.zip'\n",
294
+ "\n",
295
+ " 0K .......... .......... .......... .......... .......... 0% 774K 9s\n",
296
+ " 50K .......... .......... .......... .......... .......... 1% 2.10M 6s\n",
297
+ " 100K .......... .......... .......... .......... .......... 2% 2.46M 5s\n",
298
+ " 150K .......... .......... .......... .......... .......... 2% 2.68M 4s\n",
299
+ " 200K .......... .......... .......... .......... .......... 3% 1.44M 4s\n",
300
+ " 250K .......... .......... .......... .......... .......... 4% 2.48M 4s\n",
301
+ " 300K .......... .......... .......... .......... .......... 4% 3.41M 4s\n",
302
+ " 350K .......... .......... .......... .......... .......... 5% 2.22M 4s\n",
303
+ " 400K .......... .......... .......... .......... .......... 6% 1.20M 4s\n",
304
+ " 450K .......... .......... .......... .......... .......... 7% 2.65M 4s\n",
305
+ " 500K .......... .......... .......... .......... .......... 7% 2.97M 3s\n",
306
+ " 550K .......... .......... .......... .......... .......... 8% 887K 4s\n",
307
+ " 600K .......... .......... .......... .......... .......... 9% 2.90M 4s\n",
308
+ " 650K .......... .......... .......... .......... .......... 9% 2.76M 4s\n",
309
+ " 700K .......... .......... .......... .......... .......... 10% 980K 4s\n",
310
+ " 750K .......... .......... .......... .......... .......... 11% 2.55M 4s\n",
311
+ " 800K .......... .......... .......... .......... .......... 11% 2.86M 3s\n",
312
+ " 850K .......... .......... .......... .......... .......... 12% 3.04M 3s\n",
313
+ " 900K .......... .......... .......... .......... .......... 13% 1.01M 3s\n",
314
+ " 950K .......... .......... .......... .......... .......... 14% 3.35M 3s\n",
315
+ " 1000K .......... .......... .......... .......... .......... 14% 5.04M 3s\n",
316
+ " 1050K .......... .......... .......... .......... .......... 15% 14.5M 3s\n",
317
+ " 1100K .......... .......... .......... .......... .......... 16% 1.01M 3s\n",
318
+ " 1150K .......... .......... .......... .......... .......... 16% 4.48M 3s\n",
319
+ " 1200K .......... .......... .......... .......... .......... 17% 4.34M 3s\n",
320
+ " 1250K .......... .......... .......... .......... .......... 18% 2.90M 3s\n",
321
+ " 1300K .......... .......... .......... .......... .......... 18% 1.14M 3s\n",
322
+ " 1350K .......... .......... .......... .......... .......... 19% 3.00M 3s\n",
323
+ " 1400K .......... .......... .......... .......... .......... 20% 5.09M 3s\n",
324
+ " 1450K .......... .......... .......... .......... .......... 21% 1.91M 3s\n",
325
+ " 1500K .......... .......... .......... .......... .......... 21% 7.70M 3s\n",
326
+ " 1550K .......... .......... .......... .......... .......... 22% 1.27M 3s\n",
327
+ " 1600K .......... .......... .......... .......... .......... 23% 3.06M 3s\n",
328
+ " 1650K .......... .......... .......... .......... .......... 23% 4.11M 3s\n",
329
+ " 1700K .......... .......... .......... .......... .......... 24% 3.34M 3s\n",
330
+ " 1750K .......... .......... .......... .......... .......... 25% 4.13M 2s\n",
331
+ " 1800K .......... .......... .......... .......... .......... 25% 7.95M 2s\n",
332
+ " 1850K .......... .......... .......... .......... .......... 26% 3.69M 2s\n",
333
+ " 1900K .......... .......... .......... .......... .......... 27% 4.00M 2s\n",
334
+ " 1950K .......... .......... .......... .......... .......... 28% 3.50M 2s\n",
335
+ " 2000K .......... .......... .......... .......... .......... 28% 4.04M 2s\n",
336
+ " 2050K .......... .......... .......... .......... .......... 29% 3.31M 2s\n",
337
+ " 2100K .......... .......... .......... .......... .......... 30% 2.49M 2s\n",
338
+ " 2150K .......... .......... .......... .......... .......... 30% 4.19M 2s\n",
339
+ " 2200K .......... .......... .......... .......... .......... 31% 5.18M 2s\n",
340
+ " 2250K .......... .......... .......... .......... .......... 32% 9.49M 2s\n",
341
+ " 2300K .......... .......... .......... .......... .......... 32% 8.67M 2s\n",
342
+ " 2350K .......... .......... .......... .......... .......... 33% 4.88M 2s\n",
343
+ " 2400K .......... .......... .......... .......... .......... 34% 4.56M 2s\n",
344
+ " 2450K .......... .......... .......... .......... .......... 35% 4.94M 2s\n",
345
+ " 2500K .......... .......... .......... .......... .......... 35% 4.38M 2s\n",
346
+ " 2550K .......... .......... .......... .......... .......... 36% 3.78M 2s\n",
347
+ " 2600K .......... .......... .......... .......... .......... 37% 4.95M 2s\n",
348
+ " 2650K .......... .......... .......... .......... .......... 37% 5.50M 2s\n",
349
+ " 2700K .......... .......... .......... .......... .......... 38% 5.23M 2s\n",
350
+ " 2750K .......... .......... .......... .......... .......... 39% 3.77M 2s\n",
351
+ " 2800K .......... .......... .......... .......... .......... 39% 10.7M 2s\n",
352
+ " 2850K .......... .......... .......... .......... .......... 40% 7.16M 2s\n",
353
+ " 2900K .......... .......... .......... .......... .......... 41% 5.36M 2s\n",
354
+ " 2950K .......... .......... .......... .......... .......... 42% 6.80M 1s\n",
355
+ " 3000K .......... .......... .......... .......... .......... 42% 6.57M 1s\n",
356
+ " 3050K .......... .......... .......... .......... .......... 43% 7.21M 1s\n",
357
+ " 3100K .......... .......... .......... .......... .......... 44% 6.66M 1s\n",
358
+ " 3150K .......... .......... .......... .......... .......... 44% 6.42M 1s\n",
359
+ " 3200K .......... .......... .......... .......... .......... 45% 8.02M 1s\n",
360
+ " 3250K .......... .......... .......... .......... .......... 46% 5.96M 1s\n",
361
+ " 3300K .......... .......... .......... .......... .......... 46% 5.13M 1s\n",
362
+ " 3350K .......... .......... .......... .......... .......... 47% 5.19M 1s\n",
363
+ " 3400K .......... .......... .......... .......... .......... 48% 7.64M 1s\n",
364
+ " 3450K .......... .......... .......... .......... .......... 49% 6.11M 1s\n",
365
+ " 3500K .......... .......... .......... .......... .......... 49% 4.01M 1s\n",
366
+ " 3550K .......... .......... .......... .......... .......... 50% 4.52M 1s\n",
367
+ " 3600K .......... .......... .......... .......... .......... 51% 6.72M 1s\n",
368
+ " 3650K .......... .......... .......... .......... .......... 51% 5.45M 1s\n",
369
+ " 3700K .......... .......... .......... .......... .......... 52% 4.37M 1s\n",
370
+ " 3750K .......... .......... .......... .......... .......... 53% 5.39M 1s\n",
371
+ " 3800K .......... .......... .......... .......... .......... 53% 7.40M 1s\n",
372
+ " 3850K .......... .......... .......... .......... .......... 54% 6.70M 1s\n",
373
+ " 3900K .......... .......... .......... .......... .......... 55% 5.14M 1s\n",
374
+ " 3950K .......... .......... .......... .......... .......... 56% 5.02M 1s\n",
375
+ " 4000K .......... .......... .......... .......... .......... 56% 6.70M 1s\n",
376
+ " 4050K .......... .......... .......... .......... .......... 57% 6.76M 1s\n",
377
+ " 4100K .......... .......... .......... .......... .......... 58% 2.52M 1s\n",
378
+ " 4150K .......... .......... .......... .......... .......... 58% 887K 1s\n",
379
+ " 4200K .......... .......... .......... .......... .......... 59% 9.25M 1s\n",
380
+ " 4250K .......... .......... .......... .......... .......... 60% 1.27M 1s\n",
381
+ " 4300K .......... .......... .......... .......... .......... 61% 5.72M 1s\n",
382
+ " 4350K .......... .......... .......... .......... .......... 61% 4.48M 1s\n",
383
+ " 4400K .......... .......... .......... .......... .......... 62% 5.20M 1s\n",
384
+ " 4450K .......... .......... .......... .......... .......... 63% 6.21M 1s\n",
385
+ " 4500K .......... .......... .......... .......... .......... 63% 7.94M 1s\n",
386
+ " 4550K .......... .......... .......... .......... .......... 64% 4.76M 1s\n",
387
+ " 4600K .......... .......... .......... .......... .......... 65% 4.74M 1s\n",
388
+ " 4650K .......... .......... .......... .......... .......... 65% 6.94M 1s\n",
389
+ " 4700K .......... .......... .......... .......... .......... 66% 5.62M 1s\n",
390
+ " 4750K .......... .......... .......... .......... .......... 67% 4.44M 1s\n",
391
+ " 4800K .......... .......... .......... .......... .......... 68% 6.02M 1s\n",
392
+ " 4850K .......... .......... .......... .......... .......... 68% 6.61M 1s\n",
393
+ " 4900K .......... .......... .......... .......... .......... 69% 3.04M 1s\n",
394
+ " 4950K .......... .......... .......... .......... .......... 70% 5.34M 1s\n",
395
+ " 5000K .......... .......... .......... .......... .......... 70% 3.03M 1s\n",
396
+ " 5050K .......... .......... .......... .......... .......... 71% 19.8M 1s\n",
397
+ " 5100K .......... .......... .......... .......... .......... 72% 6.17M 1s\n",
398
+ " 5150K .......... .......... .......... .......... .......... 72% 5.58M 1s\n",
399
+ " 5200K .......... .......... .......... .......... .......... 73% 7.38M 1s\n",
400
+ " 5250K .......... .......... .......... .......... .......... 74% 7.11M 1s\n",
401
+ " 5300K .......... .......... .......... .......... .......... 75% 6.24M 1s\n",
402
+ " 5350K .......... .......... .......... .......... .......... 75% 4.62M 1s\n",
403
+ " 5400K .......... .......... .......... .......... .......... 76% 7.64M 0s\n",
404
+ " 5450K .......... .......... .......... .......... .......... 77% 6.06M 0s\n",
405
+ " 5500K .......... .......... .......... .......... .......... 77% 5.56M 0s\n",
406
+ " 5550K .......... .......... .......... .......... .......... 78% 2.96M 0s\n",
407
+ " 5600K .......... .......... .......... .......... .......... 79% 6.17M 0s\n",
408
+ " 5650K .......... .......... .......... .......... .......... 79% 9.58M 0s\n",
409
+ " 5700K .......... .......... .......... .......... .......... 80% 2.58M 0s\n",
410
+ " 5750K .......... .......... .......... .......... .......... 81% 4.23M 0s\n",
411
+ " 5800K .......... .......... .......... .......... .......... 82% 5.70M 0s\n",
412
+ " 5850K .......... .......... .......... .......... .......... 82% 4.72M 0s\n",
413
+ " 5900K .......... .......... .......... .......... .......... 83% 6.52M 0s\n",
414
+ " 5950K .......... .......... .......... .......... .......... 84% 5.86M 0s\n",
415
+ " 6000K .......... .......... .......... .......... .......... 84% 5.22M 0s\n",
416
+ " 6050K .......... .......... .......... .......... .......... 85% 5.50M 0s\n",
417
+ " 6100K .......... .......... .......... .......... .......... 86% 6.29M 0s\n",
418
+ " 6150K .......... .......... .......... .......... .......... 86% 6.93M 0s\n",
419
+ " 6200K .......... .......... .......... .......... .......... 87% 5.50M 0s\n",
420
+ " 6250K .......... .......... .......... .......... .......... 88% 5.82M 0s\n",
421
+ " 6300K .......... .......... .......... .......... .......... 89% 6.76M 0s\n",
422
+ " 6350K .......... .......... .......... .......... .......... 89% 3.73M 0s\n",
423
+ " 6400K .......... .......... .......... .......... .......... 90% 5.98M 0s\n",
424
+ " 6450K .......... .......... .......... .......... .......... 91% 5.78M 0s\n",
425
+ " 6500K .......... .......... .......... .......... .......... 91% 5.60M 0s\n",
426
+ " 6550K .......... .......... .......... .......... .......... 92% 4.84M 0s\n",
427
+ " 6600K .......... .......... .......... .......... .......... 93% 7.25M 0s\n",
428
+ " 6650K .......... .......... .......... .......... .......... 93% 2.60M 0s\n",
429
+ " 6700K .......... .......... .......... .......... .......... 94% 6.02M 0s\n",
430
+ " 6750K .......... .......... .......... .......... .......... 95% 6.57M 0s\n",
431
+ " 6800K .......... .......... .......... .......... .......... 96% 8.30M 0s\n",
432
+ " 6850K .......... .......... .......... .......... .......... 96% 14.4M 0s\n",
433
+ " 6900K .......... .......... .......... .......... .......... 97% 4.58M 0s\n",
434
+ " 6950K .......... .......... .......... .......... .......... 98% 3.31M 0s\n",
435
+ " 7000K .......... .......... .......... .......... .......... 98% 6.88M 0s\n",
436
+ " 7050K .......... .......... .......... .......... .......... 99% 4.40M 0s\n",
437
+ " 7100K .......... .......... .......... 100% 15.1M=1.9s\n",
438
+ "\n",
439
+ "2021-12-18 21:32:01 (3.64 MB/s) - 'benchmarks.zip' saved [7301872/7301872]\n",
440
+ "\n"
441
+ ]
442
+ },
443
+ {
444
+ "name": "stdout",
445
+ "output_type": "stream",
446
+ "text": [
447
+ "Archive: samanatar-en-indic-v0.2.zip\n"
448
+ ]
449
+ },
450
+ {
451
+ "name": "stderr",
452
+ "output_type": "stream",
453
+ "text": [
454
+ " End-of-central-directory signature not found. Either this file is not\n",
455
+ " a zipfile, or it constitutes one disk of a multi-part archive. In the\n",
456
+ " latter case the central directory and zipfile comment will be found on\n",
457
+ " the last disk(s) of this archive.\n",
458
+ "unzip: cannot find zipfile directory in one of samanatar-en-indic-v0.2.zip or\n",
459
+ " samanatar-en-indic-v0.2.zip.zip, and cannot find samanatar-en-indic-v0.2.zip.ZIP, period.\n"
460
+ ]
461
+ },
462
+ {
463
+ "name": "stdout",
464
+ "output_type": "stream",
465
+ "text": [
466
+ "Archive: benchmarks.zip\n",
467
+ " creating: benchmarks/\n",
468
+ " creating: benchmarks/pmi/\n",
469
+ " creating: benchmarks/pmi/en-as/\n",
470
+ " inflating: benchmarks/pmi/en-as/dev.as \n",
471
+ " inflating: benchmarks/pmi/en-as/dev.en \n",
472
+ " inflating: benchmarks/pmi/en-as/test.as \n",
473
+ " inflating: benchmarks/pmi/en-as/test.en \n",
474
+ " creating: benchmarks/wat2021-devtest/\n",
475
+ " inflating: benchmarks/wat2021-devtest/dev.gu \n",
476
+ " inflating: benchmarks/wat2021-devtest/dev.en \n",
477
+ " inflating: benchmarks/wat2021-devtest/test.bn \n",
478
+ " inflating: benchmarks/wat2021-devtest/dev.bn \n",
479
+ " inflating: benchmarks/wat2021-devtest/test.hi \n",
480
+ " inflating: benchmarks/wat2021-devtest/dev.kn \n",
481
+ " inflating: benchmarks/wat2021-devtest/dev.ta \n",
482
+ " inflating: benchmarks/wat2021-devtest/test.pa \n",
483
+ " inflating: benchmarks/wat2021-devtest/test.en \n",
484
+ " inflating: benchmarks/wat2021-devtest/test.mr \n",
485
+ " inflating: benchmarks/wat2021-devtest/test.kn \n",
486
+ " inflating: benchmarks/wat2021-devtest/dev.ml \n",
487
+ " inflating: benchmarks/wat2021-devtest/test.ta \n",
488
+ " inflating: benchmarks/wat2021-devtest/test.gu \n",
489
+ " inflating: benchmarks/wat2021-devtest/dev.or \n",
490
+ " inflating: benchmarks/wat2021-devtest/test.or \n",
491
+ " inflating: benchmarks/wat2021-devtest/test.te \n",
492
+ " inflating: benchmarks/wat2021-devtest/dev.mr \n",
493
+ " inflating: benchmarks/wat2021-devtest/test.ml \n",
494
+ " inflating: benchmarks/wat2021-devtest/dev.pa \n",
495
+ " inflating: benchmarks/wat2021-devtest/dev.te \n",
496
+ " inflating: benchmarks/wat2021-devtest/dev.hi \n",
497
+ " creating: benchmarks/wat2020-devtest/\n",
498
+ " creating: benchmarks/wat2020-devtest/en-bn/\n",
499
+ " inflating: benchmarks/wat2020-devtest/en-bn/dev.en \n",
500
+ " inflating: benchmarks/wat2020-devtest/en-bn/test.bn \n",
501
+ " inflating: benchmarks/wat2020-devtest/en-bn/dev.bn \n",
502
+ " inflating: benchmarks/wat2020-devtest/en-bn/test.en \n",
503
+ " creating: benchmarks/wat2020-devtest/en-ta/\n",
504
+ " inflating: benchmarks/wat2020-devtest/en-ta/dev.en \n",
505
+ " inflating: benchmarks/wat2020-devtest/en-ta/dev.ta \n",
506
+ " inflating: benchmarks/wat2020-devtest/en-ta/test.en \n",
507
+ " inflating: benchmarks/wat2020-devtest/en-ta/test.ta \n",
508
+ " creating: benchmarks/wat2020-devtest/en-mr/\n",
509
+ " inflating: benchmarks/wat2020-devtest/en-mr/dev.en \n",
510
+ " inflating: benchmarks/wat2020-devtest/en-mr/test.en \n",
511
+ " inflating: benchmarks/wat2020-devtest/en-mr/test.mr \n",
512
+ " inflating: benchmarks/wat2020-devtest/en-mr/dev.mr \n",
513
+ " creating: benchmarks/wat2020-devtest/en-te/\n",
514
+ " inflating: benchmarks/wat2020-devtest/en-te/dev.en \n",
515
+ " inflating: benchmarks/wat2020-devtest/en-te/test.en \n",
516
+ " inflating: benchmarks/wat2020-devtest/en-te/test.te \n",
517
+ " inflating: benchmarks/wat2020-devtest/en-te/dev.te \n",
518
+ " creating: benchmarks/wat2020-devtest/en-hi/\n",
519
+ " inflating: benchmarks/wat2020-devtest/en-hi/dev.en \n",
520
+ " inflating: benchmarks/wat2020-devtest/en-hi/test.hi \n",
521
+ " inflating: benchmarks/wat2020-devtest/en-hi/test.en \n",
522
+ " inflating: benchmarks/wat2020-devtest/en-hi/dev.hi \n",
523
+ " creating: benchmarks/wat2020-devtest/en-gu/\n",
524
+ " inflating: benchmarks/wat2020-devtest/en-gu/dev.gu \n",
525
+ " inflating: benchmarks/wat2020-devtest/en-gu/dev.en \n",
526
+ " inflating: benchmarks/wat2020-devtest/en-gu/test.en \n",
527
+ " inflating: benchmarks/wat2020-devtest/en-gu/test.gu \n",
528
+ " creating: benchmarks/wat2020-devtest/en-ml/\n",
529
+ " inflating: benchmarks/wat2020-devtest/en-ml/dev.en \n",
530
+ " inflating: benchmarks/wat2020-devtest/en-ml/test.en \n",
531
+ " inflating: benchmarks/wat2020-devtest/en-ml/dev.ml \n",
532
+ " inflating: benchmarks/wat2020-devtest/en-ml/test.ml \n",
533
+ " creating: benchmarks/ufal-ta/\n",
534
+ " creating: benchmarks/ufal-ta/en-ta/\n",
535
+ " inflating: benchmarks/ufal-ta/en-ta/dev.en \n",
536
+ " inflating: benchmarks/ufal-ta/en-ta/dev.ta \n",
537
+ " inflating: benchmarks/ufal-ta/en-ta/test.en \n",
538
+ " inflating: benchmarks/ufal-ta/en-ta/test.ta \n",
539
+ " creating: benchmarks/wmt-news/\n",
540
+ " creating: benchmarks/wmt-news/en-ta/\n",
541
+ " inflating: benchmarks/wmt-news/en-ta/dev.en \n",
542
+ " inflating: benchmarks/wmt-news/en-ta/dev.ta \n",
543
+ " inflating: benchmarks/wmt-news/en-ta/test.en \n",
544
+ " inflating: benchmarks/wmt-news/en-ta/test.ta \n",
545
+ " creating: benchmarks/wmt-news/en-hi/\n",
546
+ " inflating: benchmarks/wmt-news/en-hi/dev.en \n",
547
+ " inflating: benchmarks/wmt-news/en-hi/test.hi \n",
548
+ " inflating: benchmarks/wmt-news/en-hi/test.en \n",
549
+ " inflating: benchmarks/wmt-news/en-hi/dev.hi \n",
550
+ " creating: benchmarks/wmt-news/en-gu/\n",
551
+ " inflating: benchmarks/wmt-news/en-gu/test.en \n",
552
+ " inflating: benchmarks/wmt-news/en-gu/test.gu \n"
553
+ ]
554
+ }
555
+ ],
556
+ "source": [
557
+ "## for the latest samanantar dataset v0.3 -> please use this link: https://storage.googleapis.com/samanantar-public/V0.3/source_wise_splits.zip\n",
558
+ "# This v0.3 dataset has source wise splits to indicate where the data has been collected from\n",
559
+ "# For preprocessing simplicity we will use v0.2( which just uses raw text files without source information) in this tutorial\n",
560
+ "# \n",
561
+ "# \n",
562
+ "# lets now download the indictrans data v0.2 dataset\n",
563
+ "! wget https://storage.googleapis.com/samanantar-public/V0.2/data/en2indic/samanatar-en-indic-v0.2.zip\n",
564
+ "\n",
565
+ "\n",
566
+ "\n",
567
+ "# lets also download the benchmarks for dev and test set\n",
568
+ "\n",
569
+ "! wget https://storage.googleapis.com/samanantar-public/benchmarks.zip\n",
570
+ "\n",
571
+ "# training data is organized as en-X folders where each folder contains two text files containing parallel data for en-X lang pair.\n",
572
+ "\n",
573
+ "# final_data\n",
574
+ "# ├── en-as\n",
575
+ "# │ ├── train.as\n",
576
+ "# │ └── train.en\n",
577
+ "# ├── en-bn\n",
578
+ "# │ ├── train.bn\n",
579
+ "# │ └── train.en\n",
580
+ "# ├── en-gu\n",
581
+ "# │ ├── train.en\n",
582
+ "# │ └── train.gu\n",
583
+ "# ├── en-hi\n",
584
+ "# │ ├── train.en\n",
585
+ "# │ └── train.hi\n",
586
+ "# ├── en-kn\n",
587
+ "# │ ├── train.en\n",
588
+ "# │ └── train.kn\n",
589
+ "# ├── en-ml\n",
590
+ "# │ ├── train.en\n",
591
+ "# │ └── train.ml\n",
592
+ "# ├── en-mr\n",
593
+ "# │ ├── train.en\n",
594
+ "# │ └── train.mr\n",
595
+ "# ├── en-or\n",
596
+ "# │ ├── train.en\n",
597
+ "# │ └── train.or\n",
598
+ "# ├── en-pa\n",
599
+ "# │ ├── train.en\n",
600
+ "# │ └── train.pa\n",
601
+ "# ├── en-ta\n",
602
+ "# │ ├── train.en\n",
603
+ "# │ └── train.ta\n",
604
+ "# └── en-te\n",
605
+ "# ├── train.en\n",
606
+ "# └── train.te\n",
607
+ "\n",
608
+ "\n",
609
+ "! unzip samanatar-en-indic-v0.2.zip\n",
610
+ "\n",
611
+ "# benchmarks folder consists of all the benchmarks we report in the paper - pmi, ufal-ta, wat2020, wat2021, wmt-news\n",
612
+ "\n",
613
+ "! unzip benchmarks.zip"
614
+ ]
615
+ },
616
+ {
617
+ "cell_type": "code",
618
+ "execution_count": null,
619
+ "metadata": {
620
+ "id": "MR_2GQoa84Jn"
621
+ },
622
+ "outputs": [],
623
+ "source": [
624
+ "# create an experiment dir to store train data, devtest data. \n",
625
+ "# This folder will also store vocabulary files (created with subword_nmt for bpe), fairseq bin files (for training), model checkpoints.\n",
626
+ "\n",
627
+ "# for this example we will be training indic to en translation model. We will name our exp_dir as indic-en-exp\n",
628
+ "! mkdir indic-en-exp\n",
629
+ "# copying all the train folders to exp_dir\n",
630
+ "! cp -r final_data/* indic-en-exp\n",
631
+ "\n",
632
+ "! mkdir -p indic-en-exp/devtest\n",
633
+ "\n",
634
+ "# copying all benchmarks to devtest folder in exp_dir\n",
635
+ "! cp -r benchmarks/* indic-en-exp/devtest\n",
636
+ "\n",
637
+ "# folder to store combined devtest data (based on the domains you want to test, you can combine multiple benchmarks dev datasets, remove duplicates)\n",
638
+ "! mkdir -p indic-en-exp/devtest/all\n",
639
+ "\n",
640
+ "# in this tutorial, for simplicity, we will just use wat2020 devtest for dev and test set\n",
641
+ "! cp -r indic-en-exp/devtest/wat2020-devtest/* indic-en-exp/devtest/all\n",
642
+ "\n"
643
+ ]
644
+ },
645
+ {
646
+ "cell_type": "code",
647
+ "execution_count": null,
648
+ "metadata": {
649
+ "id": "lorcT8wkFPtQ"
650
+ },
651
+ "outputs": [],
652
+ "source": [
653
+ "% cd indicTrans"
654
+ ]
655
+ },
656
+ {
657
+ "cell_type": "code",
658
+ "execution_count": null,
659
+ "metadata": {
660
+ "id": "vhvYXUc1FaVn"
661
+ },
662
+ "outputs": [],
663
+ "source": [
664
+ "# prepare_data_joint_training.sh takes experiment dir, src_lang, tgt_lang as input \n",
665
+ "# This does preprocessing, building vocab, binarization for joint training\n",
666
+ "\n",
667
+ "# The learning and applying vocabulary will take a while if the dataset is huge. To make it faster, run it on a multicore system\n",
668
+ "\n",
669
+ "! bash prepare_data_joint_training.sh '../indic-en-exp' 'indic' 'en'"
670
+ ]
671
+ },
672
+ {
673
+ "cell_type": "code",
674
+ "execution_count": null,
675
+ "metadata": {
676
+ "id": "p1i3fRQzF2-x"
677
+ },
678
+ "outputs": [],
679
+ "source": [
680
+ "# Training the model\n",
681
+ "\n",
682
+ "# pls refer to fairseq documentaion to know more about each of these options (https://fairseq.readthedocs.io/en/latest/command_line_tools.html)\n",
683
+ "\n",
684
+ "\n",
685
+ "# some notable args:\n",
686
+ "# --max-updates -> maximum update steps the model will be trained for\n",
687
+ "# --arch=transformer_4x -> we use a custom transformer model and name it transformer_4x (4 times the parameter size of transformer base)\n",
688
+ "# --user_dir -> we define the custom transformer arch in model_configs folder and pass it as an argument to user_dir for fairseq to register this architechture\n",
689
+ "# --lr -> learning rate. From our limited experiments, we find that lower learning rates like 3e-5 works best for finetuning.\n",
690
+ "# --max_tokens -> this is max tokens per batch. You should limit to lower values if you get oom errors.\n",
691
+ "# --update-freq -> gradient accumulation steps\n",
692
+ "\n",
693
+ "\n",
694
+ "!( fairseq-train ../indic-en-exp/final_bin \\\n",
695
+ "--max-source-positions=210 \\\n",
696
+ "--max-target-positions=210 \\\n",
697
+ "--max-update=<max_updates> \\\n",
698
+ "--save-interval=1 \\\n",
699
+ "--arch=transformer_4x \\\n",
700
+ "--criterion=label_smoothed_cross_entropy \\\n",
701
+ "--source-lang=SRC \\\n",
702
+ "--lr-scheduler=inverse_sqrt \\\n",
703
+ "--target-lang=TGT \\\n",
704
+ "--label-smoothing=0.1 \\\n",
705
+ "--optimizer adam \\\n",
706
+ "--adam-betas \"(0.9, 0.98)\" \\\n",
707
+ "--clip-norm 1.0 \\\n",
708
+ "--warmup-init-lr 1e-07 \\\n",
709
+ "--lr 0.0005 \\\n",
710
+ "--warmup-updates 4000 \\\n",
711
+ "--dropout 0.2 \\\n",
712
+ "--save-dir ../indic-en-exp/model \\\n",
713
+ "--keep-last-epochs 5 \\\n",
714
+ "--patience 5 \\\n",
715
+ "--skip-invalid-size-inputs-valid-test \\\n",
716
+ "--fp16 \\\n",
717
+ "--user-dir model_configs \\\n",
718
+ "--wandb-project <wandb_project_name> \\\n",
719
+ "--update-freq=<grad_accumulation_steps> \\\n",
720
+ "--distributed-world-size <num_gpus> \\\n",
721
+ "--max-tokens <max_tokens_in_a_batch> )"
722
+ ]
723
+ }
724
+ ],
725
+ "metadata": {
726
+ "colab": {
727
+ "authorship_tag": "ABX9TyO6AA5gXphZ5kJ6h+dgeSqb",
728
+ "collapsed_sections": [],
729
+ "include_colab_link": true,
730
+ "name": "IndicTrans_training.ipynb",
731
+ "provenance": []
732
+ },
733
+ "kernelspec": {
734
+ "display_name": "Python 3",
735
+ "name": "python3"
736
+ },
737
+ "language_info": {
738
+ "codemirror_mode": {
739
+ "name": "ipython",
740
+ "version": 3
741
+ },
742
+ "file_extension": ".py",
743
+ "mimetype": "text/x-python",
744
+ "name": "python",
745
+ "nbconvert_exporter": "python",
746
+ "pygments_lexer": "ipython3",
747
+ "version": "3.7.7"
748
+ }
749
+ },
750
+ "nbformat": 4,
751
+ "nbformat_minor": 0
752
+ }
indicTrans/LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2021 Gowtham.R
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
indicTrans/README.md ADDED
@@ -0,0 +1,296 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <div align="center">
2
+ <h1><b><i>IndicTrans</i></b></h1>
3
+ <a href="http://indicnlp.ai4bharat.org/samanantar">Website</a> |
4
+ <a href="https://arxiv.org/abs/2104.05596">Paper</a> |
5
+ <a href="https://youtu.be/QwYPOd1eBtQ?t=383">Video</a><br><br>
6
+ </div>
7
+
8
+ **IndicTrans** is a Transformer-4x ( ~434M ) multilingual NMT model trained on [Samanantar](https://indicnlp.ai4bharat.org/samanantar) dataset which is the largest publicly available parallel corpora collection for Indic languages at the time of writing ( 14 April 2021 ). It is a single script model i.e we convert all the Indic data to the Devanagari script which allows for ***better lexical sharing between languages for transfer learning, prevents fragmentation of the subword vocabulary between Indic languages and allows using a smaller subword vocabulary***. We currently release two models - Indic to English and English to Indic and support the following 11 indic languages:
9
+
10
+ | <!-- --> | <!-- --> | <!-- --> | <!-- --> |
11
+ | ------------- | -------------- | ------------ | ----------- |
12
+ | Assamese (as) | Hindi (hi) | Marathi (mr) | Tamil (ta) |
13
+ | Bengali (bn) | Kannada (kn) | Oriya (or) | Telugu (te) |
14
+ | Gujarati (gu) | Malayalam (ml) | Punjabi (pa) |
15
+
16
+
17
+
18
+
19
+ - [Updates](#updates)
20
+ - [Download IndicTrans models:](#download-indictrans-models)
21
+ - [Using the model for translating any input](#using-the-model-for-translating-any-input)
22
+ - [Finetuning the model on your input dataset](#finetuning-the-model-on-your-input-dataset)
23
+ - [Mining Indic to Indic pairs from english centric corpus](#mining-indic-to-indic-pairs-from-english-centric-corpus)
24
+ - [Installation](#installation)
25
+ - [How to train the indictrans model on your training data?](#how-to-train-the-indictrans-model-on-your-training-data)
26
+ - [Network & Training Details](#network--training-details)
27
+ - [Folder Structure](#folder-structure)
28
+ - [Citing](#citing)
29
+ - [License](#license)
30
+ - [Contributors](#contributors)
31
+ - [Contact](#contact)
32
+
33
+
34
+ ## Updates
35
+ <details><summary>Click to expand </summary>
36
+ 18 December 2021
37
+
38
+ ```
39
+ Tutorials updated with latest model links
40
+ ```
41
+
42
+
43
+ 26 November 2021
44
+ ```
45
+ - v0.3 models are now available for download
46
+ ```
47
+
48
+ 27 June 2021
49
+ ```
50
+ - Updated links for indic to indic model
51
+ - Add more comments to training scripts
52
+ - Add link to [Samanantar Video](https://youtu.be/QwYPOd1eBtQ?t=383)
53
+ - Add folder structure in readme
54
+ - Add python wrapper for model inference
55
+ ```
56
+
57
+ 09 June 2021
58
+ ```
59
+ - Updated links for models
60
+ - Added Indic to Indic model
61
+ ```
62
+
63
+ 09 May 2021
64
+ ```
65
+ - Added fix for finetuning on datasets where some lang pairs are not present. Previously the script assumed the finetuning dataset will have data for all 11 indic lang pairs
66
+ - Added colab notebook for finetuning instructions
67
+ ```
68
+ </details>
69
+
70
+ ## Download IndicTrans models:
71
+
72
+ Indic to English: [v0.3](https://storage.googleapis.com/samanantar-public/V0.3/models/indic-en.zip)
73
+
74
+ English to Indic: [v0.3](https://storage.googleapis.com/samanantar-public/V0.3/models/en-indic.zip)
75
+
76
+ Indic to Indic: [v0.3](https://storage.googleapis.com/samanantar-public/V0.3/models/m2m.zip)
77
+
78
+
79
+
80
+ ## Using the model for translating any input
81
+
82
+ The model is trained on single sentences and hence, users need to split parapgraphs to sentences before running the translation when using our command line interface (The python interface has `translate_paragraph` method to handle multi sentence translations).
83
+
84
+ Note: IndicTrans is trained with a max sequence length of **200** tokens (subwords). If your sentence is too long (> 200 tokens), the sentence will be truncated to 200 tokens before translation.
85
+
86
+ Here is an example snippet to split paragraphs into sentences for English and Indic languages supported by our model:
87
+ ```python
88
+ # install these libraries
89
+ # pip install mosestokenizer
90
+ # pip install indic-nlp-library
91
+
92
+ from mosestokenizer import *
93
+ from indicnlp.tokenize import sentence_tokenize
94
+
95
+ INDIC = ["as", "bn", "gu", "hi", "kn", "ml", "mr", "or", "pa", "ta", "te"]
96
+
97
+ def split_sentences(paragraph, language):
98
+ if language == "en":
99
+ with MosesSentenceSplitter(language) as splitter:
100
+ return splitter([paragraph])
101
+ elif language in INDIC:
102
+ return sentence_tokenize.sentence_split(paragraph, lang=language)
103
+
104
+ split_sentences("""COVID-19 is caused by infection with the severe acute respiratory
105
+ syndrome coronavirus 2 (SARS-CoV-2) virus strain. The disease is mainly transmitted via the respiratory
106
+ route when people inhale droplets and particles that infected people release as they breathe, talk, cough, sneeze, or sing. """, language='en')
107
+
108
+ >> ['COVID-19 is caused by infection with the severe acute respiratory syndrome coronavirus 2 (SARS-CoV-2) virus strain.',
109
+ 'The disease is mainly transmitted via the respiratory route when people inhale droplets and particles that infected people release as they breathe, talk, cough, sneeze, or sing.']
110
+
111
+ split_sentences("""இத்தொற்றுநோய் உலகளாவிய சமூக மற்���ும் பொருளாதார சீர்குலைவை ஏற்படுத்தியுள்ளது.இதனால் பெரும் பொருளாதார மந்தநிலைக்குப் பின்னர் உலகளவில் மிகப்பெரிய மந்தநிலை ஏற்பட்டுள்ளது. இது விளையாட்டு,மத, அரசியல் மற்றும் கலாச்சார நிகழ்வுகளை ஒத்திவைக்க அல்லது ரத்து செய்ய வழிவகுத்தது.
112
+ அச்சம் காரணமாக முகக்கவசம், கிருமிநாசினி உள்ளிட்ட பொருட்களை அதிக நபர்கள் வாங்கியதால் விநியோகப் பற்றாக்குறை ஏற்பட்டது.""",
113
+ language='ta')
114
+
115
+ >> ['இத்தொற்றுநோய் உலகளாவிய சமூக மற்றும் பொருளாதார சீர்குலைவை ஏற்படுத்தியுள்ளது.',
116
+ 'இதனால் பெரும் பொருளாதார மந்தநிலைக்குப் பின்னர் உலகளவில் மிகப்பெரிய மந்தநிலை ஏற்பட்டுள்ளது.',
117
+ 'இது விளையாட்டு,மத, அரசியல் மற்றும் கலாச்சார நிகழ்வுகளை ஒத்திவைக்க அல்லது ரத்து செய்ய வழிவகுத்தது.',
118
+ 'அச்சம் காரணமாக முகக்கவசம், கிருமிநாசினி உள்ளிட்ட பொருட்களை அதிக நபர்கள் வாங்கியதால் விநியோகப் பற்றாக்குறை ஏற்பட்டது.']
119
+
120
+
121
+ ```
122
+
123
+ Follow the colab notebook to setup the environment, download the trained _IndicTrans_ models and translating your own text.
124
+
125
+ Command line interface --> [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/AI4Bharat/indicTrans/blob/main/indictrans_fairseq_inference.ipynb)
126
+
127
+
128
+ Python interface --> [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/AI4Bharat/indicTrans/blob/main/indicTrans_python_interface.ipynb)
129
+
130
+ The python interface is useful in case you want to reuse the model for multiple translations and do not want to reinitialize the model each time
131
+
132
+
133
+ ## Finetuning the model on your input dataset
134
+
135
+ [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/AI4Bharat/indicTrans/blob/main/indicTrans_Finetuning.ipynb)
136
+
137
+ The colab notebook can be used to setup the environment, download the trained _IndicTrans_ models and prepare your custom dataset for funetuning the indictrans model. There is also a section on mining indic to indic data from english centric corpus for finetuning indic to indic model.
138
+
139
+ **Note**: Since this is a big model (400M params), you might not be able to train with reasonable batch sizes in the free google Colab account. We are planning to release smaller models (after pruning / distallation) soon.
140
+
141
+ ## Mining Indic to Indic pairs from english centric corpus
142
+
143
+ The `extract_non_english_pairs` in `scripts/extract_non_english_pairs.py` can be used to mine indic to indic pairs from english centric corpus.
144
+
145
+ As described in the [paper](https://arxiv.org/pdf/2104.05596.pdf) (section 2.5) , we use a very strict deduplication criterion to avoid the creation of very similar parallel sentences. For example, if an en sentence is aligned to *M* hi sentences and *N* ta sentences, then we would get *MN* hi-ta pairs. However, these pairs would be very similar and not contribute much to the training process. Hence, we retain only 1 randomly chosen pair out of these *MN* pairs.
146
+
147
+ ```bash
148
+ extract_non_english_pairs(indir, outdir, LANGS):
149
+ """
150
+ Extracts non-english pair parallel corpora
151
+ indir: contains english centric data in the following form:
152
+ - directory named en-xx for language xx
153
+ - each directory contains a train.en and train.xx
154
+ outdir: output directory to store mined data for each pair.
155
+ One directory is created for each pair.
156
+ LANGS: list of languages in the corpus (other than English).
157
+ The language codes must correspond to the ones used in the
158
+ files and directories in indir. Preferably, sort the languages
159
+ in this list in alphabetic order. outdir will contain data for xx-yy,
160
+ but not for yy-xx, so it will be convenient to have this list in sorted order.
161
+ """
162
+ ```
163
+
164
+ ## Installation
165
+ <details><summary>Click to expand </summary>
166
+
167
+ ```bash
168
+ cd indicTrans
169
+ git clone https://github.com/anoopkunchukuttan/indic_nlp_library.git
170
+ git clone https://github.com/anoopkunchukuttan/indic_nlp_resources.git
171
+ git clone https://github.com/rsennrich/subword-nmt.git
172
+ # install required libraries
173
+ pip install sacremoses pandas mock sacrebleu tensorboardX pyarrow indic-nlp-library
174
+
175
+ # Install fairseq from source
176
+ git clone https://github.com/pytorch/fairseq.git
177
+ cd fairseq
178
+ pip install --editable ./
179
+
180
+ ```
181
+ </details>
182
+
183
+ ## How to train the indictrans model on your training data?
184
+ [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/AI4Bharat/indicTrans/blob/main/IndicTrans_training.ipynb)
185
+
186
+
187
+ Follow the colab notebook to set up the environment, download the dataset and train the indicTrans model
188
+
189
+ ## Network & Training Details
190
+
191
+ - Architecture: IndicTrans uses 6 encoder and decoder layers, input embeddings of size 1536 with 16 attention heads and
192
+ feedforward dimension of 4096 with total number of parameters of 434M
193
+ - Loss: Cross entropy loss
194
+ - Optimizer: Adam
195
+ - Label Smoothing: 0.1
196
+ - Gradient clipping: 1.0
197
+ - Learning rate: 5e-4
198
+ - Warmup_steps: 4000
199
+
200
+ Please refer to section 4, 5 of our [paper](https://arxiv.org/ftp/arxiv/papers/2104/2104.05596.pdf) for more details on training/experimental setup.
201
+
202
+ ## Folder Structure
203
+ ```
204
+
205
+ IndicTrans
206
+ │ .gitignore
207
+ │ apply_bpe_traindevtest_notag.sh # apply bpe for joint vocab (Train, dev and test)
208
+ │ apply_single_bpe_traindevtest_notag.sh # apply bpe for separate vocab (Train, dev and test)
209
+ │ binarize_training_exp.sh # binarize the training data after preprocessing for fairseq-training
210
+ │ compute_bleu.sh # Compute bleu scores with postprocessing after translating with `joint_translate.sh`
211
+ │ indictrans_fairseq_inference.ipynb # colab example to show how to use model for inference
212
+ │ indicTrans_Finetuning.ipynb # colab example to show how to use model for finetuning on custom domain data
213
+ │ joint_translate.sh # used for inference (see colab inference notebook for more details on usage)
214
+ │ learn_bpe.sh # learning joint bpe on preprocessed text
215
+ │ learn_single_bpe.sh # learning separate bpe on preprocessed text
216
+ │ LICENSE
217
+ │ prepare_data.sh # prepare data given an experiment dir (this does preprocessing,
218
+ │ # building vocab, binarization ) for bilingual training
219
+ │ prepare_data_joint_training.sh # prepare data given an experiment dir (this does preprocessing,
220
+ │ # building vocab, binarization ) for joint training
221
+ │ README.md
222
+
223
+ ├───legacy # old unused scripts
224
+ ├───model_configs # custom model configurations are stored here
225
+ │ custom_transformer.py # contains custom 4x transformer models
226
+ │ __init__.py
227
+ ├───inference
228
+ │ custom_interactive.py # for python wrapper around fairseq-interactive
229
+ │ engine.py # python interface for model inference
230
+ └───scripts # stores python scripts that are used by other bash scripts
231
+ │ add_joint_tags_translate.py # add lang tags to the processed training data for bilingual training
232
+ │ add_tags_translate.py # add lang tags to the processed training data for joint training
233
+ │ clean_vocab.py # clean vocabulary after building with subword_nmt
234
+ │ concat_joint_data.py # concatenates lang pair data and creates text files to keep track
235
+ │ # of number of lines in each lang pair.
236
+ │ extract_non_english_pairs.py # Mining Indic to Indic pairs from english centric corpus
237
+ │ postprocess_translate.py # Postprocesses translations
238
+ │ preprocess_translate.py # Preprocess translations and for script conversion (from indic to devanagari)
239
+ │ remove_large_sentences.py # to remove large sentences from training data
240
+ └───remove_train_devtest_overlaps.py # Finds and removes overlapped data of train with dev and test sets
241
+ ```
242
+
243
+
244
+ ## Citing
245
+
246
+ If you are using any of the resources, please cite the following article:
247
+ ```
248
+ @misc{ramesh2021samanantar,
249
+ title={Samanantar: The Largest Publicly Available Parallel Corpora Collection for 11 Indic Languages},
250
+ author={Gowtham Ramesh and Sumanth Doddapaneni and Aravinth Bheemaraj and Mayank Jobanputra and Raghavan AK and Ajitesh Sharma and Sujit Sahoo and Harshita Diddee and Mahalakshmi J and Divyanshu Kakwani and Navneet Kumar and Aswin Pradeep and Kumar Deepak and Vivek Raghavan and Anoop Kunchukuttan and Pratyush Kumar and Mitesh Shantadevi Khapra},
251
+ year={2021},
252
+ eprint={2104.05596},
253
+ archivePrefix={arXiv},
254
+ primaryClass={cs.CL}
255
+ }
256
+ ```
257
+
258
+ We would like to hear from you if:
259
+
260
+ - You are using our resources. Please let us know how you are putting these resources to use.
261
+ - You have any feedback on these resources.
262
+
263
+
264
+
265
+ ### License
266
+
267
+ The IndicTrans code (and models) are released under the MIT License.
268
+
269
+
270
+ ### Contributors
271
+
272
+ - Gowtham Ramesh, <sub>([RBCDSAI](https://rbcdsai.iitm.ac.in), [IITM](https://www.iitm.ac.in))</sub>
273
+ - Sumanth Doddapaneni, <sub>([RBCDSAI](https://rbcdsai.iitm.ac.in), [IITM](https://www.iitm.ac.in))</sub>
274
+ - Aravinth Bheemaraj, <sub>([Tarento](https://www.linkedin.com/company/tarento-group/), [EkStep](https://ekstep.in))</sub>
275
+ - Mayank Jobanputra, <sub>([IITM](https://www.iitm.ac.in))</sub>
276
+ - Raghavan AK, <sub>([AI4Bharat](https://ai4bharat.org))</sub>
277
+ - Ajitesh Sharma, <sub>([Tarento](https://www.linkedin.com/company/tarento-group/), [EkStep](https://ekstep.in))</sub>
278
+ - Sujit Sahoo, <sub>([Tarento](https://www.linkedin.com/company/tarento-group/), [EkStep](https://ekstep.in))</sub>
279
+ - Harshita Diddee, <sub>([AI4Bharat](https://ai4bharat.org))</sub>
280
+ - Mahalakshmi J, <sub>([AI4Bharat](https://ai4bharat.org))</sub>
281
+ - Divyanshu Kakwani, <sub>([IITM](https://www.iitm.ac.in), [AI4Bharat](https://ai4bharat.org))</sub>
282
+ - Navneet Kumar, <sub>([Tarento](https://www.linkedin.com/company/tarento-group/), [EkStep](https://ekstep.in))</sub>
283
+ - Aswin Pradeep, <sub>([Tarento](https://www.linkedin.com/company/tarento-group/), [EkStep](https://ekstep.in))</sub>
284
+ - Kumar Deepak, <sub>([Tarento](https://www.linkedin.com/company/tarento-group/), [EkStep](https://ekstep.in))</sub>
285
+ - Vivek Raghavan, <sub>([EkStep](https://ekstep.in))</sub>
286
+ - Anoop Kunchukuttan, <sub>([Microsoft](https://www.microsoft.com/en-in/), [AI4Bharat](https://ai4bharat.org))</sub>
287
+ - Pratyush Kumar, <sub>([RBCDSAI](https://rbcdsai.iitm.ac.in), [AI4Bharat](https://ai4bharat.org), [IITM](https://www.iitm.ac.in))</sub>
288
+ - Mitesh Shantadevi Khapra, <sub>([RBCDSAI](https://rbcdsai.iitm.ac.in), [AI4Bharat](https://ai4bharat.org), [IITM](https://www.iitm.ac.in))</sub>
289
+
290
+
291
+
292
+ ### Contact
293
+
294
+ - Anoop Kunchukuttan ([anoop.kunchukuttan@gmail.com](mailto:anoop.kunchukuttan@gmail.com))
295
+ - Mitesh Khapra ([miteshk@cse.iitm.ac.in](mailto:miteshk@cse.iitm.ac.in))
296
+ - Pratyush Kumar ([pratyush@cse.iitm.ac.in](mailto:pratyush@cse.iitm.ac.in))
indicTrans/api.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import time
2
+
3
+ from fairseq import checkpoint_utils, distributed_utils, options, tasks, utils
4
+ from inference.engine import Model
5
+ from flask import Flask, request
6
+ from flask import jsonify
7
+ from flask_cors import CORS, cross_origin
8
+ import webvtt
9
+ from io import StringIO
10
+
11
+
12
+ app = Flask(__name__)
13
+ cors = CORS(app)
14
+ app.config['CORS_HEADERS'] = 'Content-Type'
15
+
16
+ indic2en_model = Model(expdir='../models/v3/indic-en')
17
+ en2indic_model = Model(expdir='../models/v3/en-indic')
18
+ m2m_model = Model(expdir='../models/m2m')
19
+
20
+ language_dict = {
21
+ 'Assamese': 'as',
22
+ 'Hindi' : 'hi',
23
+ 'Marathi' : 'mr',
24
+ 'Tamil' : 'ta',
25
+ 'Bengali' : 'bn',
26
+ 'Kannada' : 'kn',
27
+ 'Oriya' : 'or',
28
+ 'Telugu' : 'te',
29
+ 'Gujarati' : 'gu',
30
+ 'Malayalam' : 'ml',
31
+ 'Punjabi' : 'pa',
32
+ }
33
+
34
+ def get_inference_params():
35
+ model_type = request.form['model_type']
36
+ source_language = request.form['source_language']
37
+ target_language = request.form['target_language']
38
+
39
+ if model_type == 'indic-en':
40
+ model = indic2en_model
41
+ source_lang = language_dict[source_language]
42
+ assert target_language == 'English'
43
+ target_lang = 'en'
44
+ elif model_type == 'en-indic':
45
+ model = en2indic_model
46
+ assert source_language == 'English'
47
+ source_lang = 'en'
48
+ target_lang = language_dict[target_language]
49
+ elif model_type == 'm2m':
50
+ model = m2m_model
51
+ source_lang = language_dict[source_language]
52
+ target_lang = language_dict[target_language]
53
+
54
+ return model, source_lang, target_lang
55
+
56
+ @app.route('/', methods=['GET'])
57
+ def main():
58
+ return "IndicTrans API"
59
+
60
+ @app.route("/translate", methods=['POST'])
61
+ @cross_origin()
62
+ def infer_indic_en():
63
+ model, source_lang, target_lang = get_inference_params()
64
+ source_text = request.form['text']
65
+
66
+ start_time = time.time()
67
+ target_text = model.translate_paragraph(source_text, source_lang, target_lang)
68
+ end_time = time.time()
69
+ return {'text':target_text, 'duration':round(end_time-start_time, 2)}
70
+
71
+ @app.route("/translate_vtt", methods=['POST'])
72
+ @cross_origin()
73
+ def infer_vtt_indic_en():
74
+ model, source_lang, target_lang = get_inference_params()
75
+ source_text = request.form['text']
76
+ captions = webvtt.read_buffer(StringIO(source_text))
77
+ source_sentences = [caption.text.replace('\r', '').replace('\n', ' ') for caption in captions]
78
+
79
+ start_time = time.time()
80
+ target_sentences = model.batch_translate(source_sentences, source_lang, target_lang)
81
+ end_time = time.time()
82
+
83
+ for i in range(len(target_sentences)):
84
+ captions[i].text = target_sentences[i]
85
+
86
+ return {'text': captions.content, 'duration':round(end_time-start_time, 2)}
indicTrans/apply_bpe_traindevtest_notag.sh ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ expdir=$1 # EXPDIR
4
+
5
+ SUBWORD_NMT_DIR="subword-nmt"
6
+
7
+ data_dir="$expdir/data"
8
+ mkdir -p $expdir/bpe
9
+
10
+ for dset in `echo train dev test`
11
+ do
12
+ echo $dset
13
+ in_dset_dir="$data_dir/$dset"
14
+ out_dset_dir="$expdir/bpe/$dset"
15
+ # out_dset_dir="$expdir/final/$dset"
16
+ echo "Apply joint vocab to SRC corpus"
17
+ # for very large datasets, use gnu-parallel to speed up applying bpe
18
+ # uncomment the below line if the apply bpe is slow
19
+
20
+ # parallel --pipe --keep-order \
21
+ python $SUBWORD_NMT_DIR/subword_nmt/apply_bpe.py \
22
+ -c $expdir/vocab/bpe_codes.32k.SRC_TGT \
23
+ --vocabulary $expdir/vocab/vocab.SRC \
24
+ --vocabulary-threshold 5 \
25
+ --num-workers "-1" \
26
+ < $in_dset_dir.SRC \
27
+ > $out_dset_dir.SRC
28
+ echo "Apply joint vocab to TGT corpus"
29
+
30
+ # for very large datasets, use gnu-parallel to speed up applying bpe
31
+ # uncomment the below line if the apply bpe is slow
32
+
33
+ # parallel --pipe --keep-order \
34
+ python $SUBWORD_NMT_DIR/subword_nmt/apply_bpe.py \
35
+ -c $expdir/vocab/bpe_codes.32k.SRC_TGT \
36
+ --vocabulary $expdir/vocab/vocab.TGT \
37
+ --vocabulary-threshold 5 \
38
+ --num-workers "-1" \
39
+ < $in_dset_dir.TGT \
40
+ > $out_dset_dir.TGT
41
+ done
indicTrans/apply_single_bpe_traindevtest_notag.sh ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ expdir=$1 # EXPDIR
4
+
5
+ SUBWORD_NMT_DIR="subword-nmt"
6
+
7
+ data_dir="$expdir/data"
8
+ mkdir -p $expdir/bpe
9
+
10
+ for dset in `echo train dev test`
11
+ do
12
+ echo $dset
13
+ in_dset_dir="$data_dir/$dset"
14
+ out_dset_dir="$expdir/bpe/$dset"
15
+ # out_dset_dir="$expdir/final/$dset"
16
+ echo "Apply to SRC corpus"
17
+ # for very large datasets, use gnu-parallel to speed up applying bpe
18
+ # uncomment the below line if the apply bpe is slow
19
+
20
+ # parallel --pipe --keep-order \
21
+ python $SUBWORD_NMT_DIR/subword_nmt/apply_bpe.py \
22
+ -c $expdir/vocab/bpe_codes.32k.SRC \
23
+ --vocabulary $expdir/vocab/vocab.SRC \
24
+ --vocabulary-threshold 5 \
25
+ --num-workers "-1" \
26
+ < $in_dset_dir.SRC \
27
+ > $out_dset_dir.SRC
28
+ echo "Apply to TGT corpus"
29
+ # for very large datasets, use gnu-parallel to speed up applying bpe
30
+ # uncomment the below line if the apply bpe is slow
31
+
32
+ # parallel --pipe --keep-order \
33
+ python $SUBWORD_NMT_DIR/subword_nmt/apply_bpe.py \
34
+ -c $expdir/vocab/bpe_codes.32k.TGT \
35
+ --vocabulary $expdir/vocab/vocab.TGT \
36
+ --vocabulary-threshold 5 \
37
+ --num-workers "-1" \
38
+ < $in_dset_dir.TGT \
39
+ > $out_dset_dir.TGT
40
+ done
indicTrans/binarize_training_exp.sh ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ exp_dir=$1
4
+ src_lang=$2
5
+ tgt_lang=$3
6
+
7
+ # use cpu_count to get num_workers instead of setting it manually when running in different
8
+ # instances
9
+ num_workers=`python -c "import multiprocessing; print(multiprocessing.cpu_count())"`
10
+
11
+ data_dir=$exp_dir/final
12
+ out_data_dir=$exp_dir/final_bin
13
+
14
+ rm -rf $out_data_dir
15
+
16
+ fairseq-preprocess \
17
+ --source-lang $src_lang --target-lang $tgt_lang \
18
+ --trainpref $data_dir/train \
19
+ --validpref $data_dir/dev \
20
+ --testpref $data_dir/test \
21
+ --destdir $out_data_dir \
22
+ --workers $num_workers \
23
+ --thresholdtgt 5 \
24
+ --thresholdsrc 5
indicTrans/compute_bleu.sh ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ pred_fname=$1
2
+ ref_fname=$2
3
+ src_lang=$3
4
+ tgt_lang=$4
5
+
6
+ # we compute and report tokenized bleu scores.
7
+ # For computing BLEU scores, systems should output detokenized outputs. Your MT system might be doing it out of the box if you are using SentencePiece - nothing to do in that case.
8
+ # If you are using BPE then:
9
+ # 1. For English, you can use MosesDetokenizer (either the scripts in moses or the sacremoses python package)
10
+ # 2. For Indian languages, you can use the IndicNLP library detokenizer (note: please don't skip this step, since detok/tokenizer are not guaranteed to be reversible).
11
+ # ^ both 1. and 2. are scripts/postprocess_translate.py
12
+
13
+
14
+ # For computing BLEU, we use sacrebleu:
15
+ # For English output: sacrebleu reffile < outputfile. This internally tokenizes using mteval-v13a
16
+ # For Indian language output, we need tokenized output and reference since we don't know how well the sacrebleu tokenizer works for Indic input.
17
+ # Hence we tokenize both preds and target files with IndicNLP tokenizer and then run: sacrebleu --tokenize none reffile < outputfile
18
+ if [ $tgt_lang == 'en' ]; then
19
+ # indic to en models
20
+ sacrebleu $ref_fname < $pred_fname
21
+ else
22
+ # indicnlp tokenize predictions and reference files before evaluation
23
+ input_size=`python scripts/preprocess_translate.py $ref_fname $ref_fname.tok $tgt_lang`
24
+ input_size=`python scripts/preprocess_translate.py $pred_fname $pred_fname.tok $tgt_lang`
25
+
26
+ # since we are tokenizing with indicnlp separately, we are setting tokenize to none here
27
+ sacrebleu --tokenize none $ref_fname.tok < $pred_fname.tok
28
+ fi
indicTrans/indicTrans_Finetuning.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
indicTrans/indicTrans_python_interface.ipynb ADDED
@@ -0,0 +1,462 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {
6
+ "colab_type": "text",
7
+ "id": "view-in-github"
8
+ },
9
+ "source": [
10
+ "<a href=\"https://colab.research.google.com/github/gowtham1997/indicTrans-1/blob/main/indicTrans_python_interface.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
11
+ ]
12
+ },
13
+ {
14
+ "cell_type": "code",
15
+ "execution_count": 1,
16
+ "metadata": {
17
+ "colab": {
18
+ "base_uri": "https://localhost:8080/"
19
+ },
20
+ "id": "CjfzxXZLHed_",
21
+ "outputId": "69a66b95-41b2-4413-82d1-0caacbddb3f3"
22
+ },
23
+ "outputs": [
24
+ {
25
+ "name": "stdout",
26
+ "output_type": "stream",
27
+ "text": [
28
+ "Cloning into 'indicTrans-1'...\n",
29
+ "remote: Enumerating objects: 486, done.\u001b[K\n",
30
+ "remote: Counting objects: 100% (189/189), done.\u001b[K\n",
31
+ "remote: Compressing objects: 100% (67/67), done.\u001b[K\n",
32
+ "remote: Total 486 (delta 154), reused 134 (delta 121), pack-reused 297\u001b[K\n",
33
+ "Receiving objects: 100% (486/486), 1.48 MiB | 17.61 MiB/s, done.\n",
34
+ "Resolving deltas: 100% (281/281), done.\n",
35
+ "/content/indicTrans\n",
36
+ "Cloning into 'indic_nlp_library'...\n",
37
+ "remote: Enumerating objects: 1325, done.\u001b[K\n",
38
+ "remote: Counting objects: 100% (147/147), done.\u001b[K\n",
39
+ "remote: Compressing objects: 100% (103/103), done.\u001b[K\n",
40
+ "remote: Total 1325 (delta 84), reused 89 (delta 41), pack-reused 1178\u001b[K\n",
41
+ "Receiving objects: 100% (1325/1325), 9.57 MiB | 13.55 MiB/s, done.\n",
42
+ "Resolving deltas: 100% (688/688), done.\n",
43
+ "Cloning into 'indic_nlp_resources'...\n",
44
+ "remote: Enumerating objects: 133, done.\u001b[K\n",
45
+ "remote: Counting objects: 100% (7/7), done.\u001b[K\n",
46
+ "remote: Compressing objects: 100% (7/7), done.\u001b[K\n",
47
+ "remote: Total 133 (delta 0), reused 2 (delta 0), pack-reused 126\u001b[K\n",
48
+ "Receiving objects: 100% (133/133), 149.77 MiB | 33.48 MiB/s, done.\n",
49
+ "Resolving deltas: 100% (51/51), done.\n",
50
+ "Checking out files: 100% (28/28), done.\n",
51
+ "Cloning into 'subword-nmt'...\n",
52
+ "remote: Enumerating objects: 580, done.\u001b[K\n",
53
+ "remote: Counting objects: 100% (4/4), done.\u001b[K\n",
54
+ "remote: Compressing objects: 100% (4/4), done.\u001b[K\n",
55
+ "remote: Total 580 (delta 0), reused 1 (delta 0), pack-reused 576\u001b[K\n",
56
+ "Receiving objects: 100% (580/580), 237.41 KiB | 18.26 MiB/s, done.\n",
57
+ "Resolving deltas: 100% (349/349), done.\n",
58
+ "/content\n"
59
+ ]
60
+ }
61
+ ],
62
+ "source": [
63
+ "# clone the repo for running evaluation\n",
64
+ "!git clone https://github.com/AI4Bharat/indicTrans.git\n",
65
+ "%cd indicTrans\n",
66
+ "# clone requirements repositories\n",
67
+ "!git clone https://github.com/anoopkunchukuttan/indic_nlp_library.git\n",
68
+ "!git clone https://github.com/anoopkunchukuttan/indic_nlp_resources.git\n",
69
+ "!git clone https://github.com/rsennrich/subword-nmt.git\n",
70
+ "%cd .."
71
+ ]
72
+ },
73
+ {
74
+ "cell_type": "code",
75
+ "execution_count": 2,
76
+ "metadata": {
77
+ "colab": {
78
+ "base_uri": "https://localhost:8080/"
79
+ },
80
+ "id": "IeYW2BJhlJvx",
81
+ "outputId": "3357bc85-44d8-43b0-8c64-eef9f18be716"
82
+ },
83
+ "outputs": [
84
+ {
85
+ "name": "stdout",
86
+ "output_type": "stream",
87
+ "text": [
88
+ "Collecting sacremoses\n",
89
+ "\u001b[?25l Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)\n",
90
+ "\r\u001b[K |▍ | 10kB 14.0MB/s eta 0:00:01\r\u001b[K |▊ | 20kB 18.8MB/s eta 0:00:01\r\u001b[K |█ | 30kB 22.5MB/s eta 0:00:01\r\u001b[K |█▌ | 40kB 25.7MB/s eta 0:00:01\r\u001b[K |█▉ | 51kB 27.6MB/s eta 0:00:01\r\u001b[K |██▏ | 61kB 29.2MB/s eta 0:00:01\r\u001b[K |██▋ | 71kB 27.3MB/s eta 0:00:01\r\u001b[K |███ | 81kB 27.7MB/s eta 0:00:01\r\u001b[K |███▎ | 92kB 28.8MB/s eta 0:00:01\r\u001b[K |███▋ | 102kB 29.9MB/s eta 0:00:01\r\u001b[K |████ | 112kB 29.9MB/s eta 0:00:01\r\u001b[K |████▍ | 122kB 29.9MB/s eta 0:00:01\r\u001b[K |████▊ | 133kB 29.9MB/s eta 0:00:01\r\u001b[K |█████▏ | 143kB 29.9MB/s eta 0:00:01\r\u001b[K |█████▌ | 153kB 29.9MB/s eta 0:00:01\r\u001b[K |█████▉ | 163kB 29.9MB/s eta 0:00:01\r\u001b[K |██████▎ | 174kB 29.9MB/s eta 0:00:01\r\u001b[K |██████▋ | 184kB 29.9MB/s eta 0:00:01\r\u001b[K |███████ | 194kB 29.9MB/s eta 0:00:01\r\u001b[K |███████▎ | 204kB 29.9MB/s eta 0:00:01\r\u001b[K |███████▊ | 215kB 29.9MB/s eta 0:00:01\r\u001b[K |████████ | 225kB 29.9MB/s eta 0:00:01\r\u001b[K |████████▍ | 235kB 29.9MB/s eta 0:00:01\r\u001b[K |████████▉ | 245kB 29.9MB/s eta 0:00:01\r\u001b[K |█████████▏ | 256kB 29.9MB/s eta 0:00:01\r\u001b[K |█████████▌ | 266kB 29.9MB/s eta 0:00:01\r\u001b[K |█████████▉ | 276kB 29.9MB/s eta 0:00:01\r\u001b[K |██████████▎ | 286kB 29.9MB/s eta 0:00:01\r\u001b[K |██████████▋ | 296kB 29.9MB/s eta 0:00:01\r\u001b[K |███████████ | 307kB 29.9MB/s eta 0:00:01\r\u001b[K |███████████▍ | 317kB 29.9MB/s eta 0:00:01\r\u001b[K |███████████▊ | 327kB 29.9MB/s eta 0:00:01\r\u001b[K |████████████ | 337kB 29.9MB/s eta 0:00:01\r\u001b[K |████████████▌ | 348kB 29.9MB/s eta 0:00:01\r\u001b[K |████████████▉ | 358kB 29.9MB/s eta 0:00:01\r\u001b[K |█████████████▏ | 368kB 29.9MB/s eta 0:00:01\r\u001b[K |█████████████▌ | 378kB 29.9MB/s eta 0:00:01\r\u001b[K |██████████████ | 389kB 29.9MB/s eta 0:00:01\r\u001b[K |██████████████▎ | 399kB 29.9MB/s eta 0:00:01\r\u001b[K |██████████████▋ | 409kB 29.9MB/s eta 0:00:01\r\u001b[K |███████████████ | 419kB 29.9MB/s eta 
0:00:01\r\u001b[K |███████████████▍ | 430kB 29.9MB/s eta 0:00:01\r\u001b[K |███████████████▊ | 440kB 29.9MB/s eta 0:00:01\r\u001b[K |████████████████ | 450kB 29.9MB/s eta 0:00:01\r\u001b[K |████████████████▌ | 460kB 29.9MB/s eta 0:00:01\r\u001b[K |████████████████▉ | 471kB 29.9MB/s eta 0:00:01\r\u001b[K |█████████████████▏ | 481kB 29.9MB/s eta 0:00:01\r\u001b[K |█████████████████▋ | 491kB 29.9MB/s eta 0:00:01\r\u001b[K |██████████████████ | 501kB 29.9MB/s eta 0:00:01\r\u001b[K |██████████████████▎ | 512kB 29.9MB/s eta 0:00:01\r\u001b[K |██████████████████▊ | 522kB 29.9MB/s eta 0:00:01\r\u001b[K |███████████████████ | 532kB 29.9MB/s eta 0:00:01\r\u001b[K |███████████████████▍ | 542kB 29.9MB/s eta 0:00:01\r\u001b[K |███████████████████▊ | 552kB 29.9MB/s eta 0:00:01\r\u001b[K |████████████████████▏ | 563kB 29.9MB/s eta 0:00:01\r\u001b[K |████████████████████▌ | 573kB 29.9MB/s eta 0:00:01\r\u001b[K |████████████████████▉ | 583kB 29.9MB/s eta 0:00:01\r\u001b[K |█████████████████████▎ | 593kB 29.9MB/s eta 0:00:01\r\u001b[K |█████████████████████▋ | 604kB 29.9MB/s eta 0:00:01\r\u001b[K |██████████████████████ | 614kB 29.9MB/s eta 0:00:01\r\u001b[K |██████████████████████▎ | 624kB 29.9MB/s eta 0:00:01\r\u001b[K |██████████████████████▊ | 634kB 29.9MB/s eta 0:00:01\r\u001b[K |███████████████████████ | 645kB 29.9MB/s eta 0:00:01\r\u001b[K |███████████████████████▍ | 655kB 29.9MB/s eta 0:00:01\r\u001b[K |███████████████████████▉ | 665kB 29.9MB/s eta 0:00:01\r\u001b[K |████████████████████████▏ | 675kB 29.9MB/s eta 0:00:01\r\u001b[K |████████████████████████▌ | 686kB 29.9MB/s eta 0:00:01\r\u001b[K |█████████████████████████ | 696kB 29.9MB/s eta 0:00:01\r\u001b[K |█████████████████████████▎ | 706kB 29.9MB/s eta 0:00:01\r\u001b[K |█████████████████████████▋ | 716kB 29.9MB/s eta 0:00:01\r\u001b[K |██████████████████████████ | 727kB 29.9MB/s eta 0:00:01\r\u001b[K |██████████████████████████▍ | 737kB 29.9MB/s eta 0:00:01\r\u001b[K |██████████████████████████▊ | 747kB 29.9MB/s eta 
0:00:01\r\u001b[K |███████████████████████████ | 757kB 29.9MB/s eta 0:00:01\r\u001b[K |███████████████████████████▌ | 768kB 29.9MB/s eta 0:00:01\r\u001b[K |███████████████████████████▉ | 778kB 29.9MB/s eta 0:00:01\r\u001b[K |████████████████████████████▏ | 788kB 29.9MB/s eta 0:00:01\r\u001b[K |████████████████████████████▌ | 798kB 29.9MB/s eta 0:00:01\r\u001b[K |█████████████████████████████ | 808kB 29.9MB/s eta 0:00:01\r\u001b[K |█████████████████████████████▎ | 819kB 29.9MB/s eta 0:00:01\r\u001b[K |█████████████████████████████▋ | 829kB 29.9MB/s eta 0:00:01\r\u001b[K |██████████████████████████████ | 839kB 29.9MB/s eta 0:00:01\r\u001b[K |██████████████████████████████▍ | 849kB 29.9MB/s eta 0:00:01\r\u001b[K |██████████████████████████████▊ | 860kB 29.9MB/s eta 0:00:01\r\u001b[K |███████████████████████████████▏| 870kB 29.9MB/s eta 0:00:01\r\u001b[K |███████████████████████████████▌| 880kB 29.9MB/s eta 0:00:01\r\u001b[K |███████████████████████████████▉| 890kB 29.9MB/s eta 0:00:01\r\u001b[K |████████████████████████████████| 901kB 29.9MB/s \n",
91
+ "\u001b[?25hRequirement already satisfied: pandas in /usr/local/lib/python3.7/dist-packages (1.1.5)\n",
92
+ "Collecting mock\n",
93
+ " Downloading https://files.pythonhosted.org/packages/5c/03/b7e605db4a57c0f6fba744b11ef3ddf4ddebcada35022927a2b5fc623fdf/mock-4.0.3-py3-none-any.whl\n",
94
+ "Collecting sacrebleu\n",
95
+ "\u001b[?25l Downloading https://files.pythonhosted.org/packages/7e/57/0c7ca4e31a126189dab99c19951910bd081dea5bbd25f24b77107750eae7/sacrebleu-1.5.1-py3-none-any.whl (54kB)\n",
96
+ "\u001b[K |████████████████████████████████| 61kB 7.5MB/s \n",
97
+ "\u001b[?25hCollecting tensorboardX\n",
98
+ "\u001b[?25l Downloading https://files.pythonhosted.org/packages/42/36/2b147652c40c3a858efa0afbf7b8236fae968e88ff530511a4cfa299a506/tensorboardX-2.3-py2.py3-none-any.whl (124kB)\n",
99
+ "\u001b[K |████████████████████████████████| 133kB 47.5MB/s \n",
100
+ "\u001b[?25hRequirement already satisfied: pyarrow in /usr/local/lib/python3.7/dist-packages (3.0.0)\n",
101
+ "Collecting indic-nlp-library\n",
102
+ "\u001b[?25l Downloading https://files.pythonhosted.org/packages/84/d4/495bb43b88a2a6d04b09c29fc5115f24872af74cd8317fe84026abd4ddb1/indic_nlp_library-0.81-py3-none-any.whl (40kB)\n",
103
+ "\u001b[K |████████████████████████████████| 40kB 5.2MB/s \n",
104
+ "\u001b[?25hRequirement already satisfied: tqdm in /usr/local/lib/python3.7/dist-packages (from sacremoses) (4.41.1)\n",
105
+ "Requirement already satisfied: six in /usr/local/lib/python3.7/dist-packages (from sacremoses) (1.15.0)\n",
106
+ "Requirement already satisfied: joblib in /usr/local/lib/python3.7/dist-packages (from sacremoses) (1.0.1)\n",
107
+ "Requirement already satisfied: regex in /usr/local/lib/python3.7/dist-packages (from sacremoses) (2019.12.20)\n",
108
+ "Requirement already satisfied: click in /usr/local/lib/python3.7/dist-packages (from sacremoses) (7.1.2)\n",
109
+ "Requirement already satisfied: numpy>=1.15.4 in /usr/local/lib/python3.7/dist-packages (from pandas) (1.19.5)\n",
110
+ "Requirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.7/dist-packages (from pandas) (2.8.1)\n",
111
+ "Requirement already satisfied: pytz>=2017.2 in /usr/local/lib/python3.7/dist-packages (from pandas) (2018.9)\n",
112
+ "Collecting portalocker==2.0.0\n",
113
+ " Downloading https://files.pythonhosted.org/packages/89/a6/3814b7107e0788040870e8825eebf214d72166adf656ba7d4bf14759a06a/portalocker-2.0.0-py2.py3-none-any.whl\n",
114
+ "Requirement already satisfied: protobuf>=3.8.0 in /usr/local/lib/python3.7/dist-packages (from tensorboardX) (3.12.4)\n",
115
+ "Collecting sphinx-rtd-theme\n",
116
+ "\u001b[?25l Downloading https://files.pythonhosted.org/packages/ac/24/2475e8f83519b54b2148d4a56eb1111f9cec630d088c3ffc214492c12107/sphinx_rtd_theme-0.5.2-py2.py3-none-any.whl (9.1MB)\n",
117
+ "\u001b[K |████████████████████████████████| 9.2MB 42.0MB/s \n",
118
+ "\u001b[?25hCollecting morfessor\n",
119
+ " Downloading https://files.pythonhosted.org/packages/39/e6/7afea30be2ee4d29ce9de0fa53acbb033163615f849515c0b1956ad074ee/Morfessor-2.0.6-py3-none-any.whl\n",
120
+ "Collecting sphinx-argparse\n",
121
+ " Downloading https://files.pythonhosted.org/packages/06/2b/dfad6a1831c3aeeae25d8d3d417224684befbf45e10c7f2141631616a6ed/sphinx-argparse-0.2.5.tar.gz\n",
122
+ "Requirement already satisfied: setuptools in /usr/local/lib/python3.7/dist-packages (from protobuf>=3.8.0->tensorboardX) (57.0.0)\n",
123
+ "Requirement already satisfied: sphinx in /usr/local/lib/python3.7/dist-packages (from sphinx-rtd-theme->indic-nlp-library) (1.8.5)\n",
124
+ "Collecting docutils<0.17\n",
125
+ "\u001b[?25l Downloading https://files.pythonhosted.org/packages/81/44/8a15e45ffa96e6cf82956dd8d7af9e666357e16b0d93b253903475ee947f/docutils-0.16-py2.py3-none-any.whl (548kB)\n",
126
+ "\u001b[K |████████████████████████████████| 552kB 31.5MB/s \n",
127
+ "\u001b[?25hRequirement already satisfied: sphinxcontrib-websupport in /usr/local/lib/python3.7/dist-packages (from sphinx->sphinx-rtd-theme->indic-nlp-library) (1.2.4)\n",
128
+ "Requirement already satisfied: snowballstemmer>=1.1 in /usr/local/lib/python3.7/dist-packages (from sphinx->sphinx-rtd-theme->indic-nlp-library) (2.1.0)\n",
129
+ "Requirement already satisfied: Jinja2>=2.3 in /usr/local/lib/python3.7/dist-packages (from sphinx->sphinx-rtd-theme->indic-nlp-library) (2.11.3)\n",
130
+ "Requirement already satisfied: packaging in /usr/local/lib/python3.7/dist-packages (from sphinx->sphinx-rtd-theme->indic-nlp-library) (20.9)\n",
131
+ "Requirement already satisfied: alabaster<0.8,>=0.7 in /usr/local/lib/python3.7/dist-packages (from sphinx->sphinx-rtd-theme->indic-nlp-library) (0.7.12)\n",
132
+ "Requirement already satisfied: imagesize in /usr/local/lib/python3.7/dist-packages (from sphinx->sphinx-rtd-theme->indic-nlp-library) (1.2.0)\n",
133
+ "Requirement already satisfied: Pygments>=2.0 in /usr/local/lib/python3.7/dist-packages (from sphinx->sphinx-rtd-theme->indic-nlp-library) (2.6.1)\n",
134
+ "Requirement already satisfied: requests>=2.0.0 in /usr/local/lib/python3.7/dist-packages (from sphinx->sphinx-rtd-theme->indic-nlp-library) (2.23.0)\n",
135
+ "Requirement already satisfied: babel!=2.0,>=1.3 in /usr/local/lib/python3.7/dist-packages (from sphinx->sphinx-rtd-theme->indic-nlp-library) (2.9.1)\n",
136
+ "Requirement already satisfied: sphinxcontrib-serializinghtml in /usr/local/lib/python3.7/dist-packages (from sphinxcontrib-websupport->sphinx->sphinx-rtd-theme->indic-nlp-library) (1.1.5)\n",
137
+ "Requirement already satisfied: MarkupSafe>=0.23 in /usr/local/lib/python3.7/dist-packages (from Jinja2>=2.3->sphinx->sphinx-rtd-theme->indic-nlp-library) (2.0.1)\n",
138
+ "Requirement already satisfied: pyparsing>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from packaging->sphinx->sphinx-rtd-theme->indic-nlp-library) (2.4.7)\n",
139
+ "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests>=2.0.0->sphinx->sphinx-rtd-theme->indic-nlp-library) (2.10)\n",
140
+ "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests>=2.0.0->sphinx->sphinx-rtd-theme->indic-nlp-library) (3.0.4)\n",
141
+ "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests>=2.0.0->sphinx->sphinx-rtd-theme->indic-nlp-library) (1.24.3)\n",
142
+ "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests>=2.0.0->sphinx->sphinx-rtd-theme->indic-nlp-library) (2021.5.30)\n",
143
+ "Building wheels for collected packages: sphinx-argparse\n",
144
+ " Building wheel for sphinx-argparse (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
145
+ " Created wheel for sphinx-argparse: filename=sphinx_argparse-0.2.5-cp37-none-any.whl size=11552 sha256=d8cbdca000085e2e2c122c305bb21aa76a9600012ded8e06c300e03d1c4d1e32\n",
146
+ " Stored in directory: /root/.cache/pip/wheels/2a/18/1b/4990a1859da4edc77ab312bc2986c08d2733fb5713d06e44f5\n",
147
+ "Successfully built sphinx-argparse\n",
148
+ "\u001b[31mERROR: datascience 0.10.6 has requirement folium==0.2.1, but you'll have folium 0.8.3 which is incompatible.\u001b[0m\n",
149
+ "Installing collected packages: sacremoses, mock, portalocker, sacrebleu, tensorboardX, docutils, sphinx-rtd-theme, morfessor, sphinx-argparse, indic-nlp-library\n",
150
+ " Found existing installation: docutils 0.17.1\n",
151
+ " Uninstalling docutils-0.17.1:\n",
152
+ " Successfully uninstalled docutils-0.17.1\n",
153
+ "Successfully installed docutils-0.16 indic-nlp-library-0.81 mock-4.0.3 morfessor-2.0.6 portalocker-2.0.0 sacrebleu-1.5.1 sacremoses-0.0.45 sphinx-argparse-0.2.5 sphinx-rtd-theme-0.5.2 tensorboardX-2.3\n",
154
+ "Collecting mosestokenizer\n",
155
+ " Downloading https://files.pythonhosted.org/packages/4b/b3/c0af235b16c4f44a2828ef017f7947d1262b2646e440f85c6a2ff26a8c6f/mosestokenizer-1.1.0.tar.gz\n",
156
+ "Collecting subword-nmt\n",
157
+ " Downloading https://files.pythonhosted.org/packages/74/60/6600a7bc09e7ab38bc53a48a20d8cae49b837f93f5842a41fe513a694912/subword_nmt-0.3.7-py2.py3-none-any.whl\n",
158
+ "Requirement already satisfied: docopt in /usr/local/lib/python3.7/dist-packages (from mosestokenizer) (0.6.2)\n",
159
+ "Collecting openfile\n",
160
+ " Downloading https://files.pythonhosted.org/packages/93/e6/805db6867faacb488b44ba8e0829ef4de151dd0499f3c5da5f4ad11698a7/openfile-0.0.7-py3-none-any.whl\n",
161
+ "Collecting uctools\n",
162
+ " Downloading https://files.pythonhosted.org/packages/04/cb/70ed842d9a43460eedaa11f7503b4ab6537b43b63f0d854d59d8e150fac1/uctools-1.3.0.tar.gz\n",
163
+ "Collecting toolwrapper\n",
164
+ " Downloading https://files.pythonhosted.org/packages/41/7b/34bf8fb69426d8a18bcc61081e9d126f4fcd41c3c832072bef39af1602cd/toolwrapper-2.1.0.tar.gz\n",
165
+ "Building wheels for collected packages: mosestokenizer, uctools, toolwrapper\n",
166
+ " Building wheel for mosestokenizer (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
167
+ " Created wheel for mosestokenizer: filename=mosestokenizer-1.1.0-cp37-none-any.whl size=49120 sha256=4fc04046040e73bd5d13c606ebbfc65ac38c7d073f7fc0b0e4cc1d4215b595f3\n",
168
+ " Stored in directory: /root/.cache/pip/wheels/a2/e7/48/48d5e0f9c0cd5def2dfd7cb8543945f906448ed1313de24a29\n",
169
+ " Building wheel for uctools (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
170
+ " Created wheel for uctools: filename=uctools-1.3.0-cp37-none-any.whl size=6163 sha256=c5a865107c59f98c4da5d18ddc754fa141ab494574187281de1502561c6a004e\n",
171
+ " Stored in directory: /root/.cache/pip/wheels/06/b6/8f/935d5bf5bca85d47c6f5ec31641879bba057d336ab36b1e773\n",
172
+ " Building wheel for toolwrapper (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
173
+ " Created wheel for toolwrapper: filename=toolwrapper-2.1.0-cp37-none-any.whl size=3356 sha256=41a3e12078d5681e8467701735208d880ba588b0f5dbfb3b99c4e04bc643eccc\n",
174
+ " Stored in directory: /root/.cache/pip/wheels/84/ea/29/e02f3b855bf19344972092873a1091b329309bbc3d3d0cbaef\n",
175
+ "Successfully built mosestokenizer uctools toolwrapper\n",
176
+ "Installing collected packages: openfile, uctools, toolwrapper, mosestokenizer, subword-nmt\n",
177
+ "Successfully installed mosestokenizer-1.1.0 openfile-0.0.7 subword-nmt-0.3.7 toolwrapper-2.1.0 uctools-1.3.0\n",
178
+ "Cloning into 'fairseq'...\n",
179
+ "remote: Enumerating objects: 28410, done.\u001b[K\n",
180
+ "remote: Counting objects: 100% (229/229), done.\u001b[K\n",
181
+ "remote: Compressing objects: 100% (127/127), done.\u001b[K\n",
182
+ "remote: Total 28410 (delta 114), reused 187 (delta 99), pack-reused 28181\u001b[K\n",
183
+ "Receiving objects: 100% (28410/28410), 11.96 MiB | 24.16 MiB/s, done.\n",
184
+ "Resolving deltas: 100% (21310/21310), done.\n",
185
+ "/content/fairseq\n",
186
+ "Obtaining file:///content/fairseq\n",
187
+ " Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n",
188
+ " Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n",
189
+ " Installing backend dependencies ... \u001b[?25l\u001b[?25hdone\n",
190
+ " Preparing wheel metadata ... \u001b[?25l\u001b[?25hdone\n",
191
+ "Requirement already satisfied: cffi in /usr/local/lib/python3.7/dist-packages (from fairseq==1.0.0a0+f887152) (1.14.5)\n",
192
+ "Collecting hydra-core<1.1\n",
193
+ "\u001b[?25l Downloading https://files.pythonhosted.org/packages/52/e3/fbd70dd0d3ce4d1d75c22d56c0c9f895cfa7ed6587a9ffb821d6812d6a60/hydra_core-1.0.6-py3-none-any.whl (123kB)\n",
194
+ "\u001b[K |████████████████████████████████| 133kB 11.6MB/s \n",
195
+ "\u001b[?25hRequirement already satisfied: tqdm in /usr/local/lib/python3.7/dist-packages (from fairseq==1.0.0a0+f887152) (4.41.1)\n",
196
+ "Collecting omegaconf<2.1\n",
197
+ " Downloading https://files.pythonhosted.org/packages/d0/eb/9d63ce09dd8aa85767c65668d5414958ea29648a0eec80a4a7d311ec2684/omegaconf-2.0.6-py3-none-any.whl\n",
198
+ "Requirement already satisfied: regex in /usr/local/lib/python3.7/dist-packages (from fairseq==1.0.0a0+f887152) (2019.12.20)\n",
199
+ "Requirement already satisfied: numpy; python_version >= \"3.7\" in /usr/local/lib/python3.7/dist-packages (from fairseq==1.0.0a0+f887152) (1.19.5)\n",
200
+ "Requirement already satisfied: torch in /usr/local/lib/python3.7/dist-packages (from fairseq==1.0.0a0+f887152) (1.9.0+cu102)\n",
201
+ "Requirement already satisfied: sacrebleu>=1.4.12 in /usr/local/lib/python3.7/dist-packages (from fairseq==1.0.0a0+f887152) (1.5.1)\n",
202
+ "Requirement already satisfied: cython in /usr/local/lib/python3.7/dist-packages (from fairseq==1.0.0a0+f887152) (0.29.23)\n",
203
+ "Requirement already satisfied: pycparser in /usr/local/lib/python3.7/dist-packages (from cffi->fairseq==1.0.0a0+f887152) (2.20)\n",
204
+ "Collecting antlr4-python3-runtime==4.8\n",
205
+ "\u001b[?25l Downloading https://files.pythonhosted.org/packages/56/02/789a0bddf9c9b31b14c3e79ec22b9656185a803dc31c15f006f9855ece0d/antlr4-python3-runtime-4.8.tar.gz (112kB)\n",
206
+ "\u001b[K |████████████████████████████████| 112kB 33.5MB/s \n",
207
+ "\u001b[?25hRequirement already satisfied: importlib-resources; python_version < \"3.9\" in /usr/local/lib/python3.7/dist-packages (from hydra-core<1.1->fairseq==1.0.0a0+f887152) (5.1.4)\n",
208
+ "Collecting PyYAML>=5.1.*\n",
209
+ "\u001b[?25l Downloading https://files.pythonhosted.org/packages/7a/a5/393c087efdc78091afa2af9f1378762f9821c9c1d7a22c5753fb5ac5f97a/PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636kB)\n",
210
+ "\u001b[K |████████████████████████████████| 645kB 30.2MB/s \n",
211
+ "\u001b[?25hRequirement already satisfied: typing-extensions in /usr/local/lib/python3.7/dist-packages (from omegaconf<2.1->fairseq==1.0.0a0+f887152) (3.7.4.3)\n",
212
+ "Requirement already satisfied: portalocker==2.0.0 in /usr/local/lib/python3.7/dist-packages (from sacrebleu>=1.4.12->fairseq==1.0.0a0+f887152) (2.0.0)\n",
213
+ "Requirement already satisfied: zipp>=3.1.0; python_version < \"3.10\" in /usr/local/lib/python3.7/dist-packages (from importlib-resources; python_version < \"3.9\"->hydra-core<1.1->fairseq==1.0.0a0+f887152) (3.4.1)\n",
214
+ "Building wheels for collected packages: antlr4-python3-runtime\n",
215
+ " Building wheel for antlr4-python3-runtime (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
216
+ " Created wheel for antlr4-python3-runtime: filename=antlr4_python3_runtime-4.8-cp37-none-any.whl size=141231 sha256=69960f774a6fdb385fed1a63fb02ae50b57299408cfd6fb33be60d686be878b7\n",
217
+ " Stored in directory: /root/.cache/pip/wheels/e3/e2/fa/b78480b448b8579ddf393bebd3f47ee23aa84c89b6a78285c8\n",
218
+ "Successfully built antlr4-python3-runtime\n",
219
+ "Installing collected packages: antlr4-python3-runtime, PyYAML, omegaconf, hydra-core, fairseq\n",
220
+ " Found existing installation: PyYAML 3.13\n",
221
+ " Uninstalling PyYAML-3.13:\n",
222
+ " Successfully uninstalled PyYAML-3.13\n",
223
+ " Running setup.py develop for fairseq\n",
224
+ "Successfully installed PyYAML-5.4.1 antlr4-python3-runtime-4.8 fairseq hydra-core-1.0.6 omegaconf-2.0.6\n",
225
+ "/content\n"
226
+ ]
227
+ }
228
+ ],
229
+ "source": [
230
+ "# Install the necessary libraries\n",
231
+ "!pip install sacremoses pandas mock sacrebleu tensorboardX pyarrow indic-nlp-library\n",
232
+ "! pip install mosestokenizer subword-nmt\n",
233
+ "# Install fairseq from source\n",
234
+ "!git clone https://github.com/pytorch/fairseq.git\n",
235
+ "%cd fairseq\n",
236
+ "# !git checkout da9eaba12d82b9bfc1442f0e2c6fc1b895f4d35d\n",
237
+ "!pip install --editable ./\n",
238
+ "\n",
239
+ "%cd .."
240
+ ]
241
+ },
242
+ {
243
+ "cell_type": "code",
244
+ "execution_count": 1,
245
+ "metadata": {
246
+ "id": "TktUu9NW_PLq"
247
+ },
248
+ "outputs": [],
249
+ "source": [
250
+ "# this step is only required if you are running the code on colab\n",
251
+ "# restart the runtime after running prev cell (to update). See this -> https://stackoverflow.com/questions/57838013/modulenotfounderror-after-successful-pip-install-in-google-colaboratory\n",
252
+ "\n",
253
+ "# this import will not work without restarting runtime\n",
254
+ "from fairseq import checkpoint_utils, distributed_utils, options, tasks, utils"
255
+ ]
256
+ },
257
+ {
258
+ "cell_type": "code",
259
+ "execution_count": 9,
260
+ "metadata": {
261
+ "colab": {
262
+ "base_uri": "https://localhost:8080/"
263
+ },
264
+ "id": "E_4JxNdRlPQB",
265
+ "outputId": "82ab5e2f-d560-4f4e-bf3f-f1ca0a8d31b8"
266
+ },
267
+ "outputs": [
268
+ {
269
+ "name": "stdout",
270
+ "output_type": "stream",
271
+ "text": [
272
+ "--2021-06-27 12:43:16-- https://storage.googleapis.com/samanantar-public/V0.2/models/indic-en.zip\n",
273
+ "Resolving storage.googleapis.com (storage.googleapis.com)... 172.217.13.240, 172.217.15.80, 142.251.33.208, ...\n",
274
+ "Connecting to storage.googleapis.com (storage.googleapis.com)|172.217.13.240|:443... connected.\n",
275
+ "HTTP request sent, awaiting response... 200 OK\n",
276
+ "Length: 4551079075 (4.2G) [application/zip]\n",
277
+ "Saving to: ‘indic-en.zip’\n",
278
+ "\n",
279
+ "indic-en.zip 100%[===================>] 4.24G 28.8MB/s in 83s \n",
280
+ "\n",
281
+ "2021-06-27 12:44:39 (52.1 MB/s) - ‘indic-en.zip’ saved [4551079075/4551079075]\n",
282
+ "\n",
283
+ "Archive: indic-en.zip\n",
284
+ " creating: indic-en/\n",
285
+ " creating: indic-en/vocab/\n",
286
+ " inflating: indic-en/vocab/bpe_codes.32k.SRC \n",
287
+ " inflating: indic-en/vocab/vocab.SRC \n",
288
+ " inflating: indic-en/vocab/vocab.TGT \n",
289
+ " inflating: indic-en/vocab/bpe_codes.32k.TGT \n",
290
+ " creating: indic-en/final_bin/\n",
291
+ " inflating: indic-en/final_bin/dict.TGT.txt \n",
292
+ " inflating: indic-en/final_bin/dict.SRC.txt \n",
293
+ " creating: indic-en/model/\n",
294
+ " inflating: indic-en/model/checkpoint_best.pt \n",
295
+ "/content/indicTrans\n"
296
+ ]
297
+ }
298
+ ],
299
+ "source": [
300
+ "# download the indictrans model\n",
301
+ "\n",
302
+ "\n",
303
+ "# downloading the indic-en model\n",
304
+ "!wget https://storage.googleapis.com/samanantar-public/V0.3/models/indic-en.zip\n",
305
+ "!unzip indic-en.zip\n",
306
+ "\n",
307
+ "# downloading the en-indic model\n",
308
+ "# !wget https://storage.googleapis.com/samanantar-public/V0.3/models/en-indic.zip\n",
309
+ "# !unzip en-indic.zip\n",
310
+ "\n",
311
+ "# # downloading the indic-indic model\n",
312
+ "# !wget https://storage.googleapis.com/samanantar-public/V0.3/models/m2m.zip\n",
313
+ "# !unzip m2m.zip\n",
314
+ "\n",
315
+ "%cd indicTrans"
316
+ ]
317
+ },
318
+ {
319
+ "cell_type": "code",
320
+ "execution_count": 10,
321
+ "metadata": {
322
+ "colab": {
323
+ "base_uri": "https://localhost:8080/"
324
+ },
325
+ "id": "yTnWbHqY01-B",
326
+ "outputId": "0d075f51-097b-46ad-aade-407a4437aa62"
327
+ },
328
+ "outputs": [
329
+ {
330
+ "name": "stdout",
331
+ "output_type": "stream",
332
+ "text": [
333
+ "Initializing vocab and bpe\n",
334
+ "Initializing model for translation\n"
335
+ ]
336
+ }
337
+ ],
338
+ "source": [
339
+ "from indicTrans.inference.engine import Model\n",
340
+ "\n",
341
+ "indic2en_model = Model(expdir='../indic-en')"
342
+ ]
343
+ },
344
+ {
345
+ "cell_type": "code",
346
+ "execution_count": 11,
347
+ "metadata": {
348
+ "colab": {
349
+ "base_uri": "https://localhost:8080/"
350
+ },
351
+ "id": "QTp2NOgQ__sB",
352
+ "outputId": "e015a71e-8206-4e1d-cb3e-11ecb4d44f76"
353
+ },
354
+ "outputs": [
355
+ {
356
+ "name": "stderr",
357
+ "output_type": "stream",
358
+ "text": [
359
+ "100%|██████████| 3/3 [00:00<00:00, 1225.21it/s]\n",
360
+ "/usr/local/lib/python3.7/dist-packages/torch/_tensor.py:575: UserWarning: floor_divide is deprecated, and will be removed in a future version of pytorch. It currently rounds toward 0 (like the 'trunc' function NOT 'floor'). This results in incorrect rounding for negative values.\n",
361
+ "To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor'). (Triggered internally at /pytorch/aten/src/ATen/native/BinaryOps.cpp:467.)\n",
362
+ " return torch.floor_divide(self, other)\n"
363
+ ]
364
+ },
365
+ {
366
+ "data": {
367
+ "text/plain": [
368
+ "['He seems to know us.',\n",
369
+ " 'I couldnt find it anywhere.',\n",
370
+ " 'If someone in your neighbourhood develops these symptoms, staying at home can help prevent the spread of the coronavirus infection.']"
371
+ ]
372
+ },
373
+ "execution_count": 11,
374
+ "metadata": {
375
+ "tags": []
376
+ },
377
+ "output_type": "execute_result"
378
+ }
379
+ ],
380
+ "source": [
381
+ "ta_sents = ['அவனுக்கு நம்மைப் தெரியும் என்று தோன்றுகிறது',\n",
382
+ " \"இது எங்கே இருக்கு என்று என்னால் கண்டுபிடிக்க முடியவில்லை.\",\n",
383
+ " 'உங்களுக்கு உங்கள் அருகில் இருக்கும் ஒருவருக்கோ இத்தகைய அறிகுறிகள் தென்பட்டால், வீட்டிலேயே இருப்பது, கொரோனா வைரஸ் தொற்று பிறருக்கு வராமல் தடுக்க உதவும்.']\n",
384
+ "\n",
385
+ "\n",
386
+ "indic2en_model.batch_translate(ta_sents, 'ta', 'en')\n",
387
+ "\n"
388
+ ]
389
+ },
390
+ {
391
+ "cell_type": "code",
392
+ "execution_count": 13,
393
+ "metadata": {
394
+ "colab": {
395
+ "base_uri": "https://localhost:8080/",
396
+ "height": 68
397
+ },
398
+ "id": "VFXrCNZGEN7Z",
399
+ "outputId": "f72aad17-1cc0-4774-a7ee-5b3a5d954de3"
400
+ },
401
+ "outputs": [
402
+ {
403
+ "name": "stderr",
404
+ "output_type": "stream",
405
+ "text": [
406
+ "100%|██████████| 4/4 [00:00<00:00, 1496.76it/s]\n"
407
+ ]
408
+ },
409
+ {
410
+ "data": {
411
+ "application/vnd.google.colaboratory.intrinsic+json": {
412
+ "type": "string"
413
+ },
414
+ "text/plain": [
415
+ "'The pandemic has resulted in worldwide social and economic disruption. The world is facing the worst recession since the global financial crisis. This led to the postponement or cancellation of sporting, religious, political and cultural events. Due to the fear, there was shortage of supply as more people purchased items like masks, sanitizers etc.'"
416
+ ]
417
+ },
418
+ "execution_count": 13,
419
+ "metadata": {
420
+ "tags": []
421
+ },
422
+ "output_type": "execute_result"
423
+ }
424
+ ],
425
+ "source": [
426
+ "\n",
427
+ "ta_paragraph = \"\"\"இத்தொற்றுநோய் உலகளாவிய சமூக மற்றும் பொருளாதார சீர்குலைவை ஏற்படுத்தியுள்ளது.இதனால் பெரும் பொருளாதார மந்தநிலைக்குப் பின்னர் உலகளவில் மிகப்பெரிய மந்தநிலை ஏற்பட்டுள்ளது. இது விளையாட்டு,மத, அரசியல் மற்றும் கலாச்சார நிகழ்வுகளை ஒத்திவைக்க அல்லது ரத்து செய்ய வழிவகுத்தது.\n",
428
+ "அச்சம் காரணமாக முகக்கவசம், கிருமிநாசினி உள்ளிட்ட பொருட்களை அதிக நபர்கள் வாங்கியதால் விநியோகப் பற்றாக்குறை ஏற்பட்டது.\"\"\"\n",
429
+ "\n",
430
+ "indic2en_model.translate_paragraph(ta_paragraph, 'ta', 'en')"
431
+ ]
432
+ },
433
+ {
434
+ "cell_type": "code",
435
+ "execution_count": null,
436
+ "metadata": {
437
+ "id": "Hi_D7s_VIjis"
438
+ },
439
+ "outputs": [],
440
+ "source": []
441
+ }
442
+ ],
443
+ "metadata": {
444
+ "accelerator": "GPU",
445
+ "colab": {
446
+ "authorship_tag": "ABX9TyM3t8oQYMhBUuq4/Pyhcr0+",
447
+ "collapsed_sections": [],
448
+ "include_colab_link": true,
449
+ "name": "indicTrans_python_interface.ipynb",
450
+ "provenance": []
451
+ },
452
+ "kernelspec": {
453
+ "display_name": "Python 3",
454
+ "name": "python3"
455
+ },
456
+ "language_info": {
457
+ "name": "python"
458
+ }
459
+ },
460
+ "nbformat": 4,
461
+ "nbformat_minor": 0
462
+ }
indicTrans/indic_nlp_library/LICENSE ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2013-present Anoop Kunchukuttan
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
6
+
7
+ The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
8
+
9
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
indicTrans/indic_nlp_library/README.md ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Indic NLP Library
2
+
3
+ The goal of the Indic NLP Library is to build Python based libraries for common text processing and Natural Language Processing in Indian languages. Indian languages share a lot of similarity in terms of script, phonology, language syntax, etc. and this library is an attempt to provide a general solution to very commonly required toolsets for Indian language text.
4
+
5
+ The library provides the following functionalities:
6
+
7
+ - Text Normalization
8
+ - Script Information
9
+ - Word Tokenization and Detokenization
10
+ - Sentence Splitting
11
+ - Word Segmentation
12
+ - Syllabification
13
+ - Script Conversion
14
+ - Romanization
15
+ - Indicization
16
+ - Transliteration
17
+ - Translation
18
+
19
+ The data resources required by the Indic NLP Library are hosted in a different repository. These resources are required for some modules. You can download from the [Indic NLP Resources](https://github.com/anoopkunchukuttan/indic_nlp_resources) project.
20
+
21
+ **If you are interested in Indian language NLP resources, you should check the [Indic NLP Catalog](https://github.com/indicnlpweb/indicnlp_catalog) for pointers.**
22
+
23
+ ## Pre-requisites
24
+
25
+ - Python 3.x
26
+ - (For Python 2.x version check the tag `PYTHON_2.7_FINAL_JAN_2019`. Not actively supporting Python 2.x anymore, but will try to maintain as much compatibility as possible)
27
+ - [Indic NLP Resources](https://github.com/anoopkunchukuttan/indic_nlp_resources)
28
+ - [Urduhack](https://github.com/urduhack/urduhack): Needed only if Urdu normalization is required. It has other dependencies like Tensorflow.
29
+ - Other dependencies are listed in setup.py
30
+
31
+
32
+ ## Configuration
33
+
34
+ - Installation from pip:
35
+
36
+ `pip install indic-nlp-library`
37
+
38
+ - If you want to use the project from the github repo, add the project to the Python Path:
39
+
40
+ - Clone this repository
41
+ - Install dependencies: `pip install -r requirements.txt`
42
+ - Run: `export PYTHONPATH=$PYTHONPATH:<project base directory>`
43
+
44
+ - In either case, export the path to the _Indic NLP Resources_ directory
45
+
46
+ Run: `export INDIC_RESOURCES_PATH=<path to Indic NLP resources>`
47
+
48
+ ## Usage
49
+
50
+ You can use the Python API to access all the features of the library. Many of the most common operations are also accessible via a unified commandline API.
51
+
52
+ ### Getting Started
53
+
54
+ Check [this IPython Notebook](http://nbviewer.ipython.org/url/anoopkunchukuttan.github.io/indic_nlp_library/doc/indic_nlp_examples.ipynb) for examples to use the Python API.
55
+ - You can find the Python 2.x Notebook [here](http://nbviewer.ipython.org/url/anoopkunchukuttan.github.io/indic_nlp_library/doc/indic_nlp_examples_2_7.ipynb)
56
+
57
+ ### Documentation
58
+
59
+ You can find detailed documentation [HERE](https://indic-nlp-library.readthedocs.io/en/latest)
60
+
61
+ This documents the Python API as well as the commandline reference.
62
+
63
+ ## Citing
64
+
65
+ If you use this library, please include the following citation:
66
+
67
+ ```
68
+ @misc{kunchukuttan2020indicnlp,
69
+ author = "Anoop Kunchukuttan",
70
+ title = "{The IndicNLP Library}",
71
+ year = "2020",
72
+ howpublished={\url{https://github.com/anoopkunchukuttan/indic_nlp_library/blob/master/docs/indicnlp.pdf}}
73
+ }
74
+ ```
75
+ You can find the document [HERE](docs/indicnlp.pdf)
76
+
77
+ ## Website
78
+
79
+ `http://anoopkunchukuttan.github.io/indic_nlp_library`
80
+
81
+ ## Author
82
+ Anoop Kunchukuttan ([anoop.kunchukuttan@gmail.com](anoop.kunchukuttan@gmail.com))
83
+
84
+ ## Companies, Organizations, Projects using IndicNLP Library
85
+
86
+ - [AI4Bharat-IndicNLPSuite](https://indicnlp.ai4bharat.org)
87
+ - [The Classical Language Toolkit](http://cltk.org)
88
+ - [Microsoft NLP Recipes](https://github.com/microsoft/nlp-recipes)
89
+ - [Facebook M2M-100](https://github.com/pytorch/fairseq/tree/master/examples/m2m_100)
90
+
91
+ ## Revision Log
92
+
93
+
94
+ 0.81 : 26 May 2021
95
+
96
+ - Bug fix in version number extraction
97
+
98
+ 0.80 : 24 May 2021
99
+
100
+ - Improved sentence splitting
101
+ - Bug fixes
102
+ - Support for Urdu Normalizer
103
+
104
+ 0.71 : 03 Sep 2020
105
+
106
+ - Improved documentation
107
+ - Bug fixes
108
+
109
+ 0.7 : 02 Apr 2020:
110
+
111
+ - Unified commandline
112
+ - Improved documentation
113
+ - Added setup.py
114
+
115
+ 0.6 : 16 Dec 2019:
116
+
117
+ - New romanizer and indicizer
118
+ - Script Unifiers
119
+ - Improved script normalizers
120
+ - Added contrib directory for sample uses
121
+ - changed to MIT license
122
+
123
+ 0.5 : 03 Jun 2019:
124
+
125
+ - Improved word tokenizer to handle dates and numbers.
126
+ - Added sentence splitter that can handle common prefixes/honorofics and uses some heuristics.
127
+ - Added detokenizer
128
+ - Added acronym transliterator that can convert English acronyms to Brahmi-derived scripts
129
+
130
+ 0.4 : 28 Jan 2019: Ported to Python 3, and lots of feature additions since last release; primarily around script information, script similarity and syllabification.
131
+
132
+ 0.3 : 21 Oct 2014: Supports morph-analysis between Indian languages
133
+
134
+ 0.2 : 13 Jun 2014: Supports transliteration between Indian languages and tokenization of Indian languages
135
+
136
+ 0.1 : 12 Mar 2014: Initial version. Supports text normalization.
137
+
138
+ ## LICENSE
139
+
140
+ Indic NLP Library is released under the MIT license
141
+
142
+
indicTrans/indic_nlp_library/contrib/README.md ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ # Contrib
2
+
3
+ Contains additional utilities and applications using Indic NLP library core
4
+
5
+ - `indic_scraper_project_sample.ipynb`: A simple pipeline for building monolingual corpora for Indian languages from crawled web content, Wikipedia, etc. An extensible framework which allows incorporation of website specific extractors, whereas generic NLP tasks like tokenization, sentence splitting, normalization, etc. are handled by the framework.
6
+ - `correct_moses_tokenizer.py`: This script corrects the incorrect tokenization done by Moses tokenizer. The Moses tokenizer splits on nukta and halant characters.
7
+ - `hindi_to_kannada_transliterator.py`: This script transliterates Hindi to Kannada. It removes/remaps characters only found in Hindi. It also adds halanta to words ending with consonant - as is the convention in Kannada.
indicTrans/indic_nlp_library/contrib/correct_moses_tokenizer.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import sys

from indicnlp import langinfo
from indicnlp import loader

if __name__ == '__main__':
    """
    Correct the incorrect tokenization done by the Moses tokenizer,
    which splits on nukta and halant characters.

    Usage: python correct_moses_tokenizer.py <infname> <outfname> <langcode>
    """

    loader.load()

    in_path = sys.argv[1]
    out_path = sys.argv[2]
    lang_code = sys.argv[3]

    halanta_char = langinfo.offset_to_char(langinfo.HALANTA_OFFSET, lang_code)
    nukta_char = langinfo.offset_to_char(langinfo.NUKTA_OFFSET, lang_code)

    # (pattern, replacement) pairs that undo the spurious splits, applied in order
    fixes = [
        (' {} '.format(halanta_char), halanta_char),
        (' {} '.format(nukta_char), nukta_char),
        (' {}{}'.format(nukta_char, halanta_char),
         '{}{}'.format(nukta_char, halanta_char)),
    ]

    with open(in_path, 'r', encoding='utf-8') as src, \
            open(out_path, 'w', encoding='utf-8') as dst:
        for line in src:
            for pattern, replacement in fixes:
                line = line.replace(pattern, replacement)
            dst.write(line)
indicTrans/indic_nlp_library/contrib/hindi_to_kannada_transliterator.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import sys

from indicnlp import common

# The resources path must be set before loading other indicnlp modules.
# The original script referenced an undefined INDIC_NLP_RESOURCES name
# (NameError at import time); read it from the environment instead.
INDIC_NLP_RESOURCES = os.environ.get('INDIC_NLP_RESOURCES', '')
common.set_resources_path(INDIC_NLP_RESOURCES)

from indicnlp import loader
from indicnlp.normalize import indic_normalize
from indicnlp.script import indic_scripts as isc  # was missing; `isc` is used below
from indicnlp.transliterate import unicode_transliterate

if __name__ == '__main__':
    """
    Transliterate Hindi text to Kannada.

    Removes/remaps characters only found in Hindi, and adds a halanta to
    words ending with a consonant - as is the convention in Kannada.

    Usage: python hindi_to_kannada_transliterator.py <infname> <outfname>
    infname: one sentence/word per line; sentences should be space-tokenized
    outfname: output file
    """

    infname = sys.argv[1]
    outfname = sys.argv[2]  # fixed: original had the typo `sys.agv[2]`
    loader.load()

    normalizer_factory = indic_normalize.IndicNormalizerFactory()
    normalizer = normalizer_factory.get_normalizer('hi')

    with open(infname, 'r', encoding='utf-8') as infile, \
            open(outfname, 'w', encoding='utf-8') as outfile:
        for line in infile:
            line = line.strip()
            line = normalizer.normalize(line)

            ## replace chandrabindus with anusvara
            line = line.replace('\u0900', '\u0902')
            line = line.replace('\u0901', '\u0902')

            ### replace chandra e and o diacritics with e and o respectively
            # line = line.replace('\u0945', '\u0947')
            # line = line.replace('\u0949', '\u094b')

            ### replace chandra e and o diacritics with a diacritic
            ## this seems to be general usage
            line = line.replace('\u0945', '\u093e')
            line = line.replace('\u0949', '\u093e')

            ## remove nukta
            line = line.replace('\u093c', '')

            ## add halant if word ends with consonant
            outwords = []
            for word in line.split(' '):
                # guard against empty tokens (e.g. runs of spaces), which
                # would make word[-1] raise IndexError in the original
                if word and isc.is_consonant(
                        isc.get_phonetic_feature_vector(word[-1], 'hi')):
                    word = word + '\u094d'
                outwords.append(word)
            line = ' '.join(outwords)

            ## script conversion hi -> kn
            line = unicode_transliterate.UnicodeIndicTransliterator.transliterate(
                line, 'hi', 'kn')

            outfile.write(line + '\n')
+
indicTrans/indic_nlp_library/contrib/indic_scraper_project_sample.ipynb ADDED
@@ -0,0 +1,569 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "# Pre-requisites\n",
8
+ "\n",
9
+ "- Python 3.5+\n",
10
+ "- Python packages: \n",
11
+ " - `pip install bs4 pandas mmh3`\n",
12
+ "- [Indic NLP Library](https://github.com/anoopkunchukuttan/indic_nlp_library)\n",
13
+ "- [Indic NLP Resources](https://github.com/anoopkunchukuttan/indic_nlp_resources)"
14
+ ]
15
+ },
16
+ {
17
+ "cell_type": "markdown",
18
+ "metadata": {},
19
+ "source": [
20
+ "# Initialize the Indic NLP Library\n",
21
+ "\n",
22
+ "Run the cell below to initialize the Indic NLP Library"
23
+ ]
24
+ },
25
+ {
26
+ "cell_type": "code",
27
+ "execution_count": null,
28
+ "metadata": {},
29
+ "outputs": [],
30
+ "source": [
31
+ "# The path to the local git repo for Indic NLP Library\n",
32
+ "INDIC_NLP_LIB_HOME=\"/disk1/src/indic_nlp_library\"\n",
33
+ "\n",
34
+ "# The path to the local git repo for Indic NLP Resources\n",
35
+ "INDIC_NLP_RESOURCES=\"/disk1/src/indic_nlp_resources\"\n",
36
+ "\n",
37
+ "import sys\n",
38
+ "sys.path.append('{}/src'.format(INDIC_NLP_LIB_HOME))\n",
39
+ "\n",
40
+ "from indicnlp import common\n",
41
+ "common.set_resources_path(INDIC_NLP_RESOURCES)\n",
42
+ "\n",
43
+ "from indicnlp import loader\n",
44
+ "loader.load()"
45
+ ]
46
+ },
47
+ {
48
+ "cell_type": "code",
49
+ "execution_count": null,
50
+ "metadata": {},
51
+ "outputs": [],
52
+ "source": [
53
+ "from bs4 import BeautifulSoup\n",
54
+ "import os\n",
55
+ "import string\n",
56
+ "import indicnlp\n",
57
+ "from indicnlp.tokenize import indic_tokenize\n",
58
+ "from indicnlp.normalize import indic_normalize\n",
59
+ "from indicnlp.transliterate import unicode_transliterate\n",
60
+ "from indicnlp.tokenize import sentence_tokenize\n",
61
+ "import re\n",
62
+ "import collections\n",
63
+ "import random\n",
64
+ "import mmh3"
65
+ ]
66
+ },
67
+ {
68
+ "cell_type": "markdown",
69
+ "metadata": {},
70
+ "source": [
71
+ "# Common Functions"
72
+ ]
73
+ },
74
+ {
75
+ "cell_type": "code",
76
+ "execution_count": null,
77
+ "metadata": {},
78
+ "outputs": [],
79
+ "source": [
80
+ "def preprocess_sent(text,lang,normalizer):\n",
81
+ " \"\"\"\n",
82
+ " Pre-process text (normalization and tokenization)\n",
83
+ " \n",
84
+ " text: text string to preprocess\n",
85
+ " lang: language code (2-letter ISO code)\n",
86
+ " normalizer: normalizer object for language\n",
87
+ " \n",
88
+ " returns the processed text string\n",
89
+ " \"\"\"\n",
90
+ " return ' '.join(indic_tokenize.trivial_tokenize(normalizer.normalize(text.replace('\\n',' ')),lang)) \n",
91
+ "\n",
92
+ "def sent_split(text,lang):\n",
93
+ " \"\"\"\n",
94
+ " Sentence splitter\n",
95
+ " \n",
96
+ " text: text to sentence split \n",
97
+ " lang: language\n",
98
+ " \n",
99
+ " returns list of sentences \n",
100
+ " \"\"\"\n",
101
+ " return sentence_tokenize.sentence_split(text,lang)\n",
102
+ "\n",
103
+ "def extract_all_content(indir,lang,\n",
104
+ " article_extract_fn,\n",
105
+ " preprocess_fn=preprocess_sent,\n",
106
+ " narticles=-1,\n",
107
+ " start_artid=0):\n",
108
+ " \"\"\"\n",
109
+ " This method reads all files from the input directory, extracts text content from each file,\n",
110
+ " and pre-processes the text. This method is a generator. \n",
111
+ " For each sentence, the method yields a tuple of the format: \n",
112
+ " \n",
113
+ " (artid, fname, paraid, sentid, processed_text)\n",
114
+ " \n",
115
+ "    indir: path to input directory containing files to be parsed \n",
116
+ " \n",
117
+ "    lang: language of the files in the input directory\n",
118
+ " \n",
119
+ " article_extract_fn: the function to extract text content from each file. \n",
120
+ " Signature of the function: get_article_contents(fname,lang,encoding) \n",
121
+ " `fname` is name of the file, `lang` is langcode, \n",
122
+ " `encoding` is text-encoding (default=utf-8). \n",
123
+ " The function yields a tuple (paraid, sentid, extracted_text) \n",
124
+ " for each sentence.\n",
125
+ " \n",
126
+ " preprocess_fn: pre-processing function to apply to the extracted text. \n",
127
+ " The function takes a string as input and returns processed string as output.\n",
128
+ " \n",
129
+ " narticles: extract and process the first `narticles` from input directory. \n",
130
+ " if narticles=-1 (default), all files are extracted\n",
131
+ " \n",
132
+ " start_artid: the start of the article id to assign to extracted articles (default=0)\n",
133
+ " \n",
134
+ " \"\"\"\n",
135
+ "\n",
136
+ " fnames = os.listdir(indir)\n",
137
+ " if narticles>0:\n",
138
+ " fnames=fnames[:narticles]\n",
139
+ " nsent=0\n",
140
+ "\n",
141
+ " normalizer_factory=indic_normalize.IndicNormalizerFactory()\n",
142
+ " normalizer=normalizer_factory.get_normalizer(lang)\n",
143
+ " \n",
144
+ " print('Number of articles: {}'.format(len(fnames)))\n",
145
+ " for artid, fname in enumerate(fnames,start_artid):\n",
146
+ "# print(fname)\n",
147
+ " if artid%100 == 0:\n",
148
+ " print('({}|{})'.format(artid,nsent),end=' ... ')\n",
149
+ " \n",
150
+ " try:\n",
151
+ " fpath=os.sep.join([indir,fname])\n",
152
+ " for paraid, sentid, sent in article_extract_fn(fpath,lang):\n",
153
+ " nsent+=1\n",
154
+ " yield( ( artid, fname, paraid, sentid, preprocess_fn(sent,lang,normalizer) ) )\n",
155
+ " except:\n",
156
+ " print('Cannot parse {}'.format(fname))\n",
157
+ " \n",
158
+ "def write_corpus(corpus_iterator,content_fname,article_mapping_fname,delimiter=' ||| ', encoding='utf-8'):\n",
159
+ " \"\"\"\n",
160
+ " Writes the extracted corpus to a file. The extracted data is organized in terms of articles, paragraphs \n",
161
+ " and sentences. The following is the format of the output file: \n",
162
+ " - one line per sentence\n",
163
+ " - format of line: article_id, para_id, sent_id, sentence\n",
164
+ "    In addition to the content file mentioned above, a metadata file which maps the article id to the filename is also written. \n",
165
+ " \n",
166
+ " corpus_iterator: iterator over the corpus, yielding tuple (artid, fname, paraid, sentid, processed_text). \n",
167
+ " The function `extract_all_content` yields a generator in this format. \n",
168
+ " content_fname: output content file to write the extracted data to in the format mentioned above\n",
169
+ " article_mapping_fname: output metadata file to write article id to filename mapping.\n",
170
+ " delimiter=' ||| ': delimiter for the content file. The default delimiter is the same \n",
171
+ " as used in the Moses phrase table\n",
172
+ " encoding: text encoding default - 'utf-8'\n",
173
+ " \n",
174
+ " \"\"\"\n",
175
+ " \n",
176
+ " artid_name_mapping={}\n",
177
+ " with open(content_fname,'w',encoding=encoding) as contentfile:\n",
178
+ " for artid, fname, paraid, sentid, text in corpus_iterator:\n",
179
+ " contentfile.write(delimiter.join([str(artid), str(paraid), str(sentid), text]) + '\\n')\n",
180
+ " artid_name_mapping[artid]=fname\n",
181
+ "\n",
182
+ " with open(article_mapping_fname,'w',encoding=encoding) as artmappingfile:\n",
183
+ " for artid, name in sorted(artid_name_mapping.items(),key=lambda x: x[0]):\n",
184
+ " artmappingfile.write('{} {} {}\\n'.format(artid,delimiter,name))\n",
185
+ "\n",
186
+ "def convert_txt_to_csv_format(infname, outfname, encoding='utf-8'):\n",
187
+ " \"\"\"\n",
188
+ " convert txt file to csv format. This method is used when the text file is directly available.\n",
189
+ " The input file has one sentence per line. Assumed to be preprocessed (tokenized, normalized)\n",
190
+ " \n",
191
+ " \"\"\"\n",
192
+ " with open(infname,'r',encoding=encoding) as infile, \\\n",
193
+ " open(outfname,'w',encoding=encoding) as outfile: \n",
194
+ " for i, line in enumerate(infile):\n",
195
+ " outfile.write('0 ||| 0 ||| {} ||| {}\\n'.format(i,line.strip()))\n",
196
+ " \n",
197
+ "def preprocess_convert_txt_to_csv_format(infname, outfname, lang, encoding='utf-8'):\n",
198
+ " \"\"\"\n",
199
+ " Convert raw text file to csv format\n",
200
+ " \"\"\"\n",
201
+ " \n",
202
+ " normalizer_factory=indic_normalize.IndicNormalizerFactory()\n",
203
+ " normalizer=normalizer_factory.get_normalizer(lang)\n",
204
+ " \n",
205
+ " with open(infname,'r',encoding=encoding) as infile, \\\n",
206
+ " open(outfname,'w',encoding=encoding) as outfile: \n",
207
+ " i=0\n",
208
+ " for line in infile:\n",
209
+ " sents = sent_split(line.strip(),lang)\n",
210
+ " for sent in sents:\n",
211
+ " outfile.write('0 ||| 0 ||| {} ||| {}\\n'.format(i,\n",
212
+ " preprocess_sent(sent.strip(), lang, normalizer)) )\n",
213
+ " i=i+1\n",
214
+ "\n",
215
+ "def print_txt(infnames, outfname, encoding='utf-8'):\n",
216
+ " \"\"\"\n",
217
+ "    Extract only the text from the content csv file. The output file has one sentence per line.\n",
218
+ " \"\"\"\n",
219
+ " with open(outfname,'w',encoding=encoding) as outfile: \n",
220
+ " for infname in filter(lambda x: os.path.isfile(x),infnames):\n",
221
+ " with open(infname,'r',encoding=encoding) as infile:\n",
222
+ " for i, line in enumerate(infile):\n",
223
+ " fields=line.strip().split('|||')\n",
224
+ " if len(fields) >=4:\n",
225
+ " outfile.write('{}\\n'.format(fields[3].strip()))\n",
226
+ " \n",
227
+ "# def dedup_and_print_txt(infnames, outfname, encoding='utf-8'):\n",
228
+ " \n",
229
+ "# total=0\n",
230
+ "# unique=0\n",
231
+ "# hash_codes=set()\n",
232
+ " \n",
233
+ "# with open(outfname,'w',encoding=encoding) as outfile: \n",
234
+ "# for infname in filter(lambda x: os.path.isfile(x),infnames):\n",
235
+ "# with open(infname,'r',encoding=encoding) as infile:\n",
236
+ "# for i, line in enumerate(infile):\n",
237
+ "# fields=line.strip().split('|||')\n",
238
+ "# if len(fields) >=4:\n",
239
+ "# sent=fields[3].strip()\n",
240
+ "# total+=1\n",
241
+ "# hs=hash(sent)\n",
242
+ "# if hs not in hash_codes:\n",
243
+ "# outfile.write('{}\\n'.format(sent))\n",
244
+ "# hash_codes.add(hs)\n",
245
+ "# unique+=1\n",
246
+ " \n",
247
+ "# print('Total: {}'.format(total))\n",
248
+ "# print('Unique: {}'.format(unique))\n",
249
+ "\n",
250
+ "def dedup_shuffle_and_print_txt(infnames, outfname, max_buf_size=100000,encoding='utf-8'):\n",
251
+ " \"\"\"\n",
252
+ " The method creates a sentence level corpora from multiple content csv files.\n",
253
+ " All sentences are extracted, they are de-duplicated using murmurhash and shuffled\n",
254
+ " before writing the entire corpus to the output file. The output file has one sentence per line.\n",
255
+ "\n",
256
+ " \"\"\"\n",
257
+ " \n",
258
+ " total=0\n",
259
+ " unique=0\n",
260
+ " hash_codes=set()\n",
261
+ " sent_buffer=[]\n",
262
+ " \n",
263
+ " with open(outfname,'w',encoding=encoding) as outfile: \n",
264
+ " for infname in filter(lambda x: os.path.isfile(x),infnames):\n",
265
+ " print('Processing: {}'.format(infname))\n",
266
+ " with open(infname,'r',encoding=encoding) as infile:\n",
267
+ " for i, line in enumerate(infile):\n",
268
+ " fields=line.strip().split('|||')\n",
269
+ " if len(fields) >=4:\n",
270
+ " sent=fields[3].strip()\n",
271
+ " total+=1\n",
272
+ "# hs=hash(sent)\n",
273
+ " hs=mmh3.hash128(sent)\n",
274
+ " if hs not in hash_codes:\n",
275
+ "# outfile.write('{}\\n'.format(sent))\n",
276
+ " sent_buffer.append(sent)\n",
277
+ " hash_codes.add(hs)\n",
278
+ " unique+=1\n",
279
+ " if len(sent_buffer)>=max_buf_size:\n",
280
+ " random.shuffle(sent_buffer)\n",
281
+ " for sent in sent_buffer: \n",
282
+ " outfile.write('{}\\n'.format(sent))\n",
283
+ " sent_buffer.clear()\n",
284
+ " \n",
285
+ " if len(sent_buffer)>0:\n",
286
+ " random.shuffle(sent_buffer)\n",
287
+ " for sent in sent_buffer: \n",
288
+ " outfile.write('{}\\n'.format(sent))\n",
289
+ " sent_buffer.clear() \n",
290
+ " \n",
291
+ " print('Total: {}'.format(total))\n",
292
+ " print('Unique: {}'.format(unique))\n",
293
+ "\n",
294
+ "def extract_wikiextractor_file(infname, outfname, lang, \n",
295
+ " encoding='utf-8', delimiter=' ||| ', preprocess_fn=preprocess_sent):\n",
296
+ " \"\"\"\n",
297
+ " Extract text content into a content csv file from wikipedia article page. \n",
298
+ " The wikipedia article page is the output from `wikiextractor` [https://github.com/attardi/wikiextractor] \n",
299
+ " \n",
300
+ " \"\"\"\n",
301
+ " normalizer_factory=indic_normalize.IndicNormalizerFactory()\n",
302
+ " normalizer=normalizer_factory.get_normalizer(lang)\n",
303
+ " \n",
304
+ " with open(infname,'r',encoding=encoding) as infile, \\\n",
305
+ " open(outfname,'w',encoding=encoding) as outfile: \n",
306
+ " artid=-1\n",
307
+ " paraid=0\n",
308
+ " for line in infile:\n",
309
+ " if line.find('<doc')==0:\n",
310
+ " artid+=1\n",
311
+ " paraid=0\n",
312
+ " continue\n",
313
+ " if line.find('</doc')==0:\n",
314
+ " continue\n",
315
+ " if len(line.strip())>0:\n",
316
+ " for sentid, sent in enumerate(sent_split(line.strip(),lang)):\n",
317
+ " sent=sent.strip()\n",
318
+ " if sent!='':\n",
319
+ " sent = preprocess_fn(sent,lang,normalizer)\n",
320
+ " outfile.write(delimiter.join([str(artid), str(paraid), str(sentid), sent]) + '\\n')\n",
321
+ " paraid+=1\n",
322
+ "\n",
323
+ " \n",
324
+ "def extract_leipzig_corpus(infname,outfname,lang,encoding='utf-8'):\n",
325
+ " \"\"\"\n",
326
+ "    Extractor for files from the Leipzig corpus\n",
327
+ " [http://wortschatz.uni-leipzig.de/en/download/]\n",
328
+ " \n",
329
+ " \"\"\"\n",
330
+ " normalizer_factory=indic_normalize.IndicNormalizerFactory()\n",
331
+ " normalizer=normalizer_factory.get_normalizer(lang) \n",
332
+ "\n",
333
+ " with open(infname,'r',encoding=encoding) as infile, \\\n",
334
+ " open(outfname,'w',encoding=encoding) as outfile: \n",
335
+ " for i, line in enumerate(infile):\n",
336
+ " outfile.write('0 ||| 0 ||| {} ||| {}\\n'.format(i,preprocess_sent(line,lang,normalizer))) \n",
337
+ " \n",
338
+ "def dataset_stats(fname):\n",
339
+ " \"\"\"\n",
340
+ " Extracts dataset statistics from the final extracted file. This input file contains\n",
341
+ " one sentence per line. The sentences are tokenized.\n",
342
+ " \"\"\"\n",
343
+ "\n",
344
+ " all_puncs=set(string.punctuation+'\\u0964\\u0965')\n",
345
+ " \n",
346
+ " sent_count=0\n",
347
+ " token_cnt=0\n",
348
+ " true_token_cnt=0\n",
349
+ " tokens=set()\n",
350
+ " \n",
351
+ " with open(fname,'r',encoding='utf-8') as infile:\n",
352
+ " for line in infile:\n",
353
+ " sent_count+=1\n",
354
+ " a=line.strip().split(' ')\n",
355
+ " token_cnt+=len(a)\n",
356
+ " b=list(filter(lambda x: x not in all_puncs,a))\n",
357
+ " true_token_cnt+=len(b)\n",
358
+ " tokens.update(b)\n",
359
+ " \n",
360
+ " print('== Stats ==')\n",
361
+ " print('Sent count: {}'.format(sent_count))\n",
362
+ " print('Token count: {}'.format(token_cnt))\n",
363
+ " print('True Token count: {}'.format(true_token_cnt))\n",
364
+ " print('Unique Token count: {}'.format(len(tokens)))\n"
365
+ ]
366
+ },
367
+ {
368
+ "cell_type": "markdown",
369
+ "metadata": {},
370
+ "source": [
371
+ "# Marathi"
372
+ ]
373
+ },
374
+ {
375
+ "cell_type": "markdown",
376
+ "metadata": {},
377
+ "source": [
378
+ "## Wikipedia"
379
+ ]
380
+ },
381
+ {
382
+ "cell_type": "markdown",
383
+ "metadata": {},
384
+ "source": [
385
+ "### Wikipedia extraction commands using wikiextractor\n",
386
+ "\n",
387
+ "```\n",
388
+ "### This uses WikiExtractor (https://github.com/attardi/wikiextractor)\n",
389
+ "\n",
390
+ "x=/disk1/crawl_project/ta/wikipedia\n",
391
+ "mkdir $x\n",
392
+ "cd $x\n",
393
+ "wget https://dumps.wikimedia.org/tawiki/20190501/tawiki-20190501-pages-articles-multistream.xml.bz2\n",
394
+ "cd /disk1/src/wikiextractor\n",
395
+ "python3 WikiExtractor.py -cb 250k -o $x/extracted $x/tawiki-20190501-pages-articles-multistream.xml.bz2\n",
396
+ "cd -\n",
397
+ "find extracted -name '*bz2' -exec bunzip2 -c {} \\; > text.xml\n",
398
+ "rm text.xml\n",
399
+ "rm tawiki-20190501-pages-articles-multistream.xml.bz2\n",
400
+ "rm -rf extracted\n",
401
+ "```"
402
+ ]
403
+ },
404
+ {
405
+ "cell_type": "markdown",
406
+ "metadata": {},
407
+ "source": [
408
+ "mrwiki-20190401-pages-articles-multistream.xml.bz2\n",
409
+ "\n",
410
+ "INFO: Finished 1-process extraction of 53715 articles in 123.6s (434.7 art/s)\n",
411
+ "\n",
412
+ "INFO: total of page: 102025, total of articl page: 53715; total of used articl page: 53715"
413
+ ]
414
+ },
415
+ {
416
+ "cell_type": "markdown",
417
+ "metadata": {},
418
+ "source": [
419
+ "### Post-processing output generated by wikiextractor"
420
+ ]
421
+ },
422
+ {
423
+ "cell_type": "code",
424
+ "execution_count": null,
425
+ "metadata": {},
426
+ "outputs": [],
427
+ "source": [
428
+ "## text.xml is extracted as shown in commands above\n",
429
+ "extract_wikiextractor_file('text.xml',\n",
430
+ " 'content_fname1.csv',\n",
431
+ " 'mr')"
432
+ ]
433
+ },
434
+ {
435
+ "cell_type": "markdown",
436
+ "metadata": {},
437
+ "source": [
438
+ "## Loksatta"
439
+ ]
440
+ },
441
+ {
442
+ "cell_type": "markdown",
443
+ "metadata": {},
444
+ "source": [
445
+ "**Extractor function for Marathi Loksatta page**"
446
+ ]
447
+ },
448
+ {
449
+ "cell_type": "code",
450
+ "execution_count": null,
451
+ "metadata": {},
452
+ "outputs": [],
453
+ "source": [
454
+ "def get_article_contents_mr_loksatta(fname,lang,encoding='utf-8'):\n",
455
+ " with open(fname,'r',encoding=encoding) as infile: \n",
456
+ " soup = BeautifulSoup(infile)\n",
457
+ " for elem in soup.find_all('div'):\n",
458
+ " if 'itemprop' in elem.attrs and 'articleBody' in elem['itemprop']:\n",
459
+ " filtered_paras=list(filter(lambda x: x.name=='p' and len(x.attrs)==0,elem.children))\n",
460
+ " paraid=0\n",
461
+ " for blockid, block in enumerate(filtered_paras):\n",
462
+ "# print('Para: {}'.format(blockid))\n",
463
+ "# print(list(block.strings))\n",
464
+ " text=' '.join(block.strings)\n",
465
+ " if blockid==0 and text.find(':')>=0 and text.find(':')<20:\n",
466
+ " text=':'.join(text.split(':')[1:])\n",
467
+ " for para_text in text.split('\\n'): \n",
468
+ " for sentid, sent in enumerate(sent_split(para_text,lang)):\n",
469
+ " sent=sent.strip()\n",
470
+ " if sent!='':\n",
471
+ " # print('{}: {}'.format(sentid, sent))\n",
472
+ " yield((paraid,sentid,sent))\n",
473
+ " # yield((paraid,sentid,preprocess_sent(sent,'ml',normalizer)))\n",
474
+ " # print() \n",
475
+ " paraid+=1"
476
+ ]
477
+ },
478
+ {
479
+ "cell_type": "markdown",
480
+ "metadata": {},
481
+ "source": [
482
+ "**Extracting data from crawled HTML files**"
483
+ ]
484
+ },
485
+ {
486
+ "cell_type": "code",
487
+ "execution_count": null,
488
+ "metadata": {},
489
+ "outputs": [],
490
+ "source": [
491
+ "lang='mr'\n",
492
+ "posts_dir='directory_containing_crawled_html_pages'\n",
493
+ "content_fname='content_fname2.csv'\n",
494
+ "article_mapping_fname='article_mapping_fname'\n",
495
+ "get_article_contents=get_article_contents_mr_loksatta\n",
496
+ "narticles=-1"
497
+ ]
498
+ },
499
+ {
500
+ "cell_type": "code",
501
+ "execution_count": null,
502
+ "metadata": {},
503
+ "outputs": [],
504
+ "source": [
505
+ "write_corpus(\n",
506
+ " extract_all_content(posts_dir, lang, article_extract_fn=get_article_contents,narticles=narticles),\n",
507
+ " content_fname,\n",
508
+ " article_mapping_fname\n",
509
+ " )"
510
+ ]
511
+ },
512
+ {
513
+ "cell_type": "markdown",
514
+ "metadata": {},
515
+ "source": [
516
+ "## Aggregating all crawled data"
517
+ ]
518
+ },
519
+ {
520
+ "cell_type": "code",
521
+ "execution_count": null,
522
+ "metadata": {},
523
+ "outputs": [],
524
+ "source": [
525
+ "### aggregating, de-duplicating and shuffling all the data \n",
526
+ "dedup_shuffle_and_print_txt([ 'content_fname1.csv', 'content_fname2.csv' ], 'output_fname.txt' )\n",
527
+ "### extract dataset statistics\n",
528
+ "dataset_stats('output_fname.txt')"
529
+ ]
530
+ }
531
+ ],
532
+ "metadata": {
533
+ "kernelspec": {
534
+ "display_name": "Python 3",
535
+ "language": "python",
536
+ "name": "python3"
537
+ },
538
+ "language_info": {
539
+ "codemirror_mode": {
540
+ "name": "ipython",
541
+ "version": 3
542
+ },
543
+ "file_extension": ".py",
544
+ "mimetype": "text/x-python",
545
+ "name": "python",
546
+ "nbconvert_exporter": "python",
547
+ "pygments_lexer": "ipython3",
548
+ "version": "3.6.7"
549
+ },
550
+ "toc": {
551
+ "base_numbering": 1,
552
+ "nav_menu": {
553
+ "height": "703px",
554
+ "width": "326px"
555
+ },
556
+ "number_sections": true,
557
+ "sideBar": true,
558
+ "skip_h1_title": false,
559
+ "title_cell": "Table of Contents",
560
+ "title_sidebar": "Contents",
561
+ "toc_cell": false,
562
+ "toc_position": {},
563
+ "toc_section_display": true,
564
+ "toc_window_display": false
565
+ }
566
+ },
567
+ "nbformat": 4,
568
+ "nbformat_minor": 2
569
+ }
indicTrans/indic_nlp_library/docs/Makefile ADDED
@@ -0,0 +1,153 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Makefile for Sphinx documentation
2
+ #
3
+
4
+ # You can set these variables from the command line.
5
+ SPHINXOPTS =
6
+ SPHINXBUILD = sphinx-build
7
+ PAPER =
8
+ BUILDDIR = _build
9
+
10
+ # Internal variables.
11
+ PAPEROPT_a4 = -D latex_paper_size=a4
12
+ PAPEROPT_letter = -D latex_paper_size=letter
13
+ ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
14
+ # the i18n builder cannot share the environment and doctrees with the others
15
+ I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
16
+
17
+ .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext
18
+
19
+ help:
20
+ @echo "Please use \`make <target>' where <target> is one of"
21
+ @echo " html to make standalone HTML files"
22
+ @echo " dirhtml to make HTML files named index.html in directories"
23
+ @echo " singlehtml to make a single large HTML file"
24
+ @echo " pickle to make pickle files"
25
+ @echo " json to make JSON files"
26
+ @echo " htmlhelp to make HTML files and a HTML help project"
27
+ @echo " qthelp to make HTML files and a qthelp project"
28
+ @echo " devhelp to make HTML files and a Devhelp project"
29
+ @echo " epub to make an epub"
30
+ @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
31
+ @echo " latexpdf to make LaTeX files and run them through pdflatex"
32
+ @echo " text to make text files"
33
+ @echo " man to make manual pages"
34
+ @echo " texinfo to make Texinfo files"
35
+ @echo " info to make Texinfo files and run them through makeinfo"
36
+ @echo " gettext to make PO message catalogs"
37
+ @echo " changes to make an overview of all changed/added/deprecated items"
38
+ @echo " linkcheck to check all external links for integrity"
39
+ @echo " doctest to run all doctests embedded in the documentation (if enabled)"
40
+
41
+ clean:
42
+ -rm -rf $(BUILDDIR)/*
43
+
44
+ html:
45
+ $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
46
+ @echo
47
+ @echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
48
+
49
+ dirhtml:
50
+ $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
51
+ @echo
52
+ @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
53
+
54
+ singlehtml:
55
+ $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
56
+ @echo
57
+ @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."
58
+
59
+ pickle:
60
+ $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
61
+ @echo
62
+ @echo "Build finished; now you can process the pickle files."
63
+
64
+ json:
65
+ $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
66
+ @echo
67
+ @echo "Build finished; now you can process the JSON files."
68
+
69
+ htmlhelp:
70
+ $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
71
+ @echo
72
+ @echo "Build finished; now you can run HTML Help Workshop with the" \
73
+ ".hhp project file in $(BUILDDIR)/htmlhelp."
74
+
75
+ qthelp:
76
+ $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
77
+ @echo
78
+ @echo "Build finished; now you can run "qcollectiongenerator" with the" \
79
+ ".qhcp project file in $(BUILDDIR)/qthelp, like this:"
80
+ @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/IndicNLPLibrary.qhcp"
81
+ @echo "To view the help file:"
82
+ @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/IndicNLPLibrary.qhc"
83
+
84
+ devhelp:
85
+ $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
86
+ @echo
87
+ @echo "Build finished."
88
+ @echo "To view the help file:"
89
+ @echo "# mkdir -p $$HOME/.local/share/devhelp/IndicNLPLibrary"
90
+ @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/IndicNLPLibrary"
91
+ @echo "# devhelp"
92
+
93
+ epub:
94
+ $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
95
+ @echo
96
+ @echo "Build finished. The epub file is in $(BUILDDIR)/epub."
97
+
98
+ latex:
99
+ $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
100
+ @echo
101
+ @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
102
+ @echo "Run \`make' in that directory to run these through (pdf)latex" \
103
+ "(use \`make latexpdf' here to do that automatically)."
104
+
105
+ latexpdf:
106
+ $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
107
+ @echo "Running LaTeX files through pdflatex..."
108
+ $(MAKE) -C $(BUILDDIR)/latex all-pdf
109
+ @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
110
+
111
+ text:
112
+ $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
113
+ @echo
114
+ @echo "Build finished. The text files are in $(BUILDDIR)/text."
115
+
116
+ man:
117
+ $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
118
+ @echo
119
+ @echo "Build finished. The manual pages are in $(BUILDDIR)/man."
120
+
121
+ texinfo:
122
+ $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
123
+ @echo
124
+ @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo."
125
+ @echo "Run \`make' in that directory to run these through makeinfo" \
126
+ "(use \`make info' here to do that automatically)."
127
+
128
+ info:
129
+ $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
130
+ @echo "Running Texinfo files through makeinfo..."
131
+ make -C $(BUILDDIR)/texinfo info
132
+ @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo."
133
+
134
+ gettext:
135
+ $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale
136
+ @echo
137
+ @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale."
138
+
139
+ changes:
140
+ $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
141
+ @echo
142
+ @echo "The overview file is in $(BUILDDIR)/changes."
143
+
144
+ linkcheck:
145
+ $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
146
+ @echo
147
+ @echo "Link check complete; look for any errors in the above output " \
148
+ "or in $(BUILDDIR)/linkcheck/output.txt."
149
+
150
+ doctest:
151
+ $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
152
+ @echo "Testing of doctests in the sources finished, look at the " \
153
+ "results in $(BUILDDIR)/doctest/output.txt."
indicTrans/indic_nlp_library/docs/cmd.rst ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ Commandline
2
+ ===========
3
+
4
+ .. argparse::
5
+ :module: indicnlp.cli.cliparser
6
+ :func: get_parser
7
+ :prog: cliparser.py
8
+
indicTrans/indic_nlp_library/docs/code.rst ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ Auto Generated Documentation
2
+ ============================
3
+
4
+ .. automodule:: indicnlp.langinfo indicnlp.common
5
+ :members:
indicTrans/indic_nlp_library/docs/conf.py ADDED
@@ -0,0 +1,242 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ #
3
+ # Indic NLP Library documentation build configuration file, created by
4
+ # sphinx-quickstart on Tue Nov 3 01:50:37 2015.
5
+ #
6
+ # This file is execfile()d with the current directory set to its containing dir.
7
+ #
8
+ # Note that not all possible configuration values are present in this
9
+ # autogenerated file.
10
+ #
11
+ # All configuration values have a default; values that are commented out
12
+ # serve to show the default.
13
+
14
+ import sys, os
15
+
16
+ # If extensions (or modules to document with autodoc) are in another directory,
17
+ # add these directories to sys.path here. If the directory is relative to the
18
+ # documentation root, use os.path.abspath to make it absolute, like shown here.
19
+ sys.path.insert(0, os.path.abspath('..'))
20
+
21
+ # -- General configuration -----------------------------------------------------
22
+
23
+ # If your documentation needs a minimal Sphinx version, state it here.
24
+ #needs_sphinx = '1.0'
25
+
26
+ # Add any Sphinx extension module names here, as strings. They can be extensions
27
+ # coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
28
+ extensions = ['sphinx.ext.autodoc', 'sphinx.ext.mathjax', 'sphinx.ext.viewcode', 'sphinx.ext.napoleon', 'sphinxarg.ext']
29
+
30
+ # Add any paths that contain templates here, relative to this directory.
31
+ templates_path = ['_templates']
32
+
33
+ # The suffix of source filenames.
34
+ source_suffix = '.rst'
35
+
36
+ # The encoding of source files.
37
+ #source_encoding = 'utf-8-sig'
38
+
39
+ # The master toctree document.
40
+ master_doc = 'index'
41
+
42
+ # General information about the project.
43
+ project = 'Indic NLP Library'
44
+ copyright = '2015, Anoop Kunchukuttan'
45
+
46
+ # The version info for the project you're documenting, acts as replacement for
47
+ # |version| and |release|, also used in various other places throughout the
48
+ # built documents.
49
+ #
50
+ # The short X.Y version.
51
+ version = '0.2'
52
+ # The full version, including alpha/beta/rc tags.
53
+ release = '0.2'
54
+
55
+ # The language for content autogenerated by Sphinx. Refer to documentation
56
+ # for a list of supported languages.
57
+ #language = None
58
+
59
+ # There are two options for replacing |today|: either, you set today to some
60
+ # non-false value, then it is used:
61
+ #today = ''
62
+ # Else, today_fmt is used as the format for a strftime call.
63
+ #today_fmt = '%B %d, %Y'
64
+
65
+ # List of patterns, relative to source directory, that match files and
66
+ # directories to ignore when looking for source files.
67
+ exclude_patterns = ['_build']
68
+
69
+ # The reST default role (used for this markup: `text`) to use for all documents.
70
+ #default_role = None
71
+
72
+ # If true, '()' will be appended to :func: etc. cross-reference text.
73
+ #add_function_parentheses = True
74
+
75
+ # If true, the current module name will be prepended to all description
76
+ # unit titles (such as .. function::).
77
+ #add_module_names = True
78
+
79
+ # If true, sectionauthor and moduleauthor directives will be shown in the
80
+ # output. They are ignored by default.
81
+ #show_authors = False
82
+
83
+ # The name of the Pygments (syntax highlighting) style to use.
84
+ pygments_style = 'sphinx'
85
+
86
+ # A list of ignored prefixes for module index sorting.
87
+ #modindex_common_prefix = []
88
+
89
+
90
+ # -- Options for HTML output ---------------------------------------------------
91
+
92
+ # The theme to use for HTML and HTML Help pages. See the documentation for
93
+ # a list of builtin themes.
94
+ html_theme = 'sphinx_rtd_theme'
95
+
96
+ # Theme options are theme-specific and customize the look and feel of a theme
97
+ # further. For a list of options available for each theme, see the
98
+ # documentation.
99
+ #html_theme_options = {}
100
+
101
+ # Add any paths that contain custom themes here, relative to this directory.
102
+ #html_theme_path = []
103
+
104
+ # The name for this set of Sphinx documents. If None, it defaults to
105
+ # "<project> v<release> documentation".
106
+ #html_title = None
107
+
108
+ # A shorter title for the navigation bar. Default is the same as html_title.
109
+ #html_short_title = None
110
+
111
+ # The name of an image file (relative to this directory) to place at the top
112
+ # of the sidebar.
113
+ #html_logo = None
114
+
115
+ # The name of an image file (within the static path) to use as favicon of the
116
+ # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32
117
+ # pixels large.
118
+ #html_favicon = None
119
+
120
+ # Add any paths that contain custom static files (such as style sheets) here,
121
+ # relative to this directory. They are copied after the builtin static files,
122
+ # so a file named "default.css" will overwrite the builtin "default.css".
123
+ html_static_path = ['_static']
124
+
125
+ # If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
126
+ # using the given strftime format.
127
+ #html_last_updated_fmt = '%b %d, %Y'
128
+
129
+ # If true, SmartyPants will be used to convert quotes and dashes to
130
+ # typographically correct entities.
131
+ #html_use_smartypants = True
132
+
133
+ # Custom sidebar templates, maps document names to template names.
134
+ #html_sidebars = {}
135
+
136
+ # Additional templates that should be rendered to pages, maps page names to
137
+ # template names.
138
+ #html_additional_pages = {}
139
+
140
+ # If false, no module index is generated.
141
+ #html_domain_indices = True
142
+
143
+ # If false, no index is generated.
144
+ #html_use_index = True
145
+
146
+ # If true, the index is split into individual pages for each letter.
147
+ #html_split_index = False
148
+
149
+ # If true, links to the reST sources are added to the pages.
150
+ #html_show_sourcelink = True
151
+
152
+ # If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
153
+ #html_show_sphinx = True
154
+
155
+ # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
156
+ #html_show_copyright = True
157
+
158
+ # If true, an OpenSearch description file will be output, and all pages will
159
+ # contain a <link> tag referring to it. The value of this option must be the
160
+ # base URL from which the finished HTML is served.
161
+ #html_use_opensearch = ''
162
+
163
+ # This is the file name suffix for HTML files (e.g. ".xhtml").
164
+ #html_file_suffix = None
165
+
166
+ # Output file base name for HTML help builder.
167
+ htmlhelp_basename = 'IndicNLPLibrarydoc'
168
+
169
+
170
+ # -- Options for LaTeX output --------------------------------------------------
171
+
172
+ latex_elements = {
173
+ # The paper size ('letterpaper' or 'a4paper').
174
+ #'papersize': 'letterpaper',
175
+
176
+ # The font size ('10pt', '11pt' or '12pt').
177
+ #'pointsize': '10pt',
178
+
179
+ # Additional stuff for the LaTeX preamble.
180
+ #'preamble': '',
181
+ }
182
+
183
+ # Grouping the document tree into LaTeX files. List of tuples
184
+ # (source start file, target name, title, author, documentclass [howto/manual]).
185
+ latex_documents = [
186
+ ('index', 'IndicNLPLibrary.tex', 'Indic NLP Library Documentation',
187
+ 'Anoop Kunchukuttan', 'manual'),
188
+ ]
189
+
190
+ # The name of an image file (relative to this directory) to place at the top of
191
+ # the title page.
192
+ #latex_logo = None
193
+
194
+ # For "manual" documents, if this is true, then toplevel headings are parts,
195
+ # not chapters.
196
+ #latex_use_parts = False
197
+
198
+ # If true, show page references after internal links.
199
+ #latex_show_pagerefs = False
200
+
201
+ # If true, show URL addresses after external links.
202
+ #latex_show_urls = False
203
+
204
+ # Documents to append as an appendix to all manuals.
205
+ #latex_appendices = []
206
+
207
+ # If false, no module index is generated.
208
+ #latex_domain_indices = True
209
+
210
+
211
+ # -- Options for manual page output --------------------------------------------
212
+
213
+ # One entry per manual page. List of tuples
214
+ # (source start file, name, description, authors, manual section).
215
+ man_pages = [
216
+ ('index', 'indicnlplibrary', 'Indic NLP Library Documentation',
217
+ ['Anoop Kunchukuttan'], 1)
218
+ ]
219
+
220
+ # If true, show URL addresses after external links.
221
+ #man_show_urls = False
222
+
223
+
224
+ # -- Options for Texinfo output ------------------------------------------------
225
+
226
+ # Grouping the document tree into Texinfo files. List of tuples
227
+ # (source start file, target name, title, author,
228
+ # dir menu entry, description, category)
229
+ texinfo_documents = [
230
+ ('index', 'IndicNLPLibrary', 'Indic NLP Library Documentation',
231
+ 'Anoop Kunchukuttan', 'IndicNLPLibrary', 'NLP library for Indian languages',
232
+ 'NLP'),
233
+ ]
234
+
235
+ # Documents to append as an appendix to all manuals.
236
+ #texinfo_appendices = []
237
+
238
+ # If false, no module index is generated.
239
+ #texinfo_domain_indices = True
240
+
241
+ # How to display URL addresses: 'footnote', 'no', or 'inline'.
242
+ #texinfo_show_urls = 'footnote'
indicTrans/indic_nlp_library/docs/index.rst ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ .. Indic NLP Library documentation master file, created by
2
+ sphinx-quickstart on Tue Nov 3 01:50:37 2015.
3
+ You can adapt this file completely to your liking, but it should at least
4
+ contain the root `toctree` directive.
5
+
6
+ :github_url: https://github.com/anoopkunchukuttan/indic_nlp_library
7
+
8
+ .. toctree::
9
+ :maxdepth: 2
10
+ :caption: Packages
11
+
12
+ indicnlp
13
+
14
+ .. toctree::
15
+ :maxdepth: 2
16
+ :caption: Commandline
17
+
18
+ cmd
19
+
20
+
21
+
22
+
indicTrans/indic_nlp_library/docs/indicnlp.MD ADDED
@@ -0,0 +1,122 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Indic NLP Library
2
+ ## A unified approach to NLP for Indian languages
3
+
4
+ ### Anoop Kunchukuttan (`anoop.kunchukuttan@gmail.com`)
5
+
6
+ The goal of the Indic NLP Library is to build Python based libraries for common text processing and Natural Language Processing in Indian languages. Indian languages share a lot of similarity in terms of script, phonology, language syntax, etc. and this library is an attempt to provide a general solution to very commonly required toolsets for Indian language text.
7
+
8
+ The library provides the following functionalities:
9
+
10
+ - Text Normalization
11
+ - Script Information
12
+ - Word Tokenization and Detokenization
13
+ - Sentence Splitting
14
+ - Word Segmentation
15
+ - Syllabification
16
+ - Script Conversion
17
+ - Romanization
18
+ - Indicization
19
+ - Transliteration
20
+ - Translation
21
+
22
+ The data resources required by the Indic NLP Library are hosted in a different repository. These resources are required for some modules. You can download from the [Indic NLP Resources](https://github.com/anoopkunchukuttan/indic_nlp_resources) project.
23
+
24
+ **If you are interested in Indian language NLP resources, you should check the [Indic NLP Catalog](https://github.com/anoopkunchukuttan/indic_nlp_library) for pointers.**
25
+
26
+ ## Pre-requisites
27
+
28
+ - Python 3.x
29
+ - (For Python 2.x version check the tag `PYTHON_2.7_FINAL_JAN_2019`. Not actively supporting Python 2.x anymore, but will try to maintain as much compatibility as possible)
30
+ - [Indic NLP Resources](https://github.com/anoopkunchukuttan/indic_nlp_resources)
31
+ - Other dependencies are listed in setup.py
32
+
33
+
34
+ ## Configuration
35
+
36
+ - Installation from pip:
37
+
38
+ `pip install indic-nlp-library`
39
+
40
+ - If you want to use the project from the github repo, add the project to the Python Path:
41
+
42
+ - Clone this repository
43
+ - Install dependencies: `pip install -r requirements.txt`
44
+ - Run: `export PYTHONPATH=$PYTHONPATH:<project base directory>`
45
+
46
+ - In either case, export the path to the _Indic NLP Resources_ directory
47
+
48
+ Run: `export INDIC_RESOURCES_PATH=<path to Indic NLP resources>`
49
+
50
+ ## Usage
51
+
52
+ You can use the Python API to access all the features of the library. Many of the most common operations are also accessible via a unified commandline API.
53
+
54
+ ### Getting Started
55
+
56
+ Check [this IPython Notebook](http://nbviewer.ipython.org/url/anoopkunchukuttan.github.io/indic_nlp_library/doc/indic_nlp_examples.ipynb) for examples to use the Python API.
57
+ - You can find the Python 2.x Notebook [here](http://nbviewer.ipython.org/url/anoopkunchukuttan.github.io/indic_nlp_library/doc/indic_nlp_examples_2_7.ipynb)
58
+
59
+ ### Documentation
60
+
61
+ You can find detailed documentation [HERE](https://indic-nlp-library.readthedocs.io/en/latest)
62
+
63
+ This documents the Python API as well as the commandline reference.
64
+
65
+ ## Citing
66
+
67
+ If you use this library, please include the following citation:
68
+
69
+ ```
70
+ @unpublished{kunchukuttan2020indicnlp,
71
+ author = "Anoop Kunchukuttan",
72
+ title = "The IndicNLP Library",
73
+ year = "2020",
74
+ }
75
+ ```
76
+ You can find the document [HERE](docs/indicnlp.pdf)
77
+
78
+ ## Website
79
+
80
+ `http://anoopkunchukuttan.github.io/indic_nlp_library`
81
+
82
+ ## Author
83
+ Anoop Kunchukuttan ([anoop.kunchukuttan@gmail.com](anoop.kunchukuttan@gmail.com))
84
+
85
+ ## Version: 0.7
86
+
87
+ ## Revision Log
88
+
89
+ 0.7 : 02 Apr 2020:
90
+
91
+ - Unified commandline
92
+ - Improved documentation
93
+ - Added setup.py
94
+
95
+ 0.6 : 16 Dec 2019:
96
+
97
+ - New romanizer and indicizer
98
+ - Script Unifiers
99
+ - Improved script normalizers
100
+ - Added contrib directory for sample uses
101
+ - changed to MIT license
102
+
103
+ 0.5 : 03 Jun 2019:
104
+
105
+ - Improved word tokenizer to handle dates and numbers.
106
+ - Added sentence splitter that can handle common prefixes/honorofics and uses some heuristics.
107
+ - Added detokenizer
108
+ - Added acronym transliterator that can convert English acronyms to Brahmi-derived scripts
109
+
110
+ 0.4 : 28 Jan 2019: Ported to Python 3, and lots of feature additions since last release; primarily around script information, script similarity and syllabification.
111
+
112
+ 0.3 : 21 Oct 2014: Supports morph-analysis between Indian languages
113
+
114
+ 0.2 : 13 Jun 2014: Supports transliteration between Indian languages and tokenization of Indian languages
115
+
116
+ 0.1 : 12 Mar 2014: Initial version. Supports text normalization.
117
+
118
+ ## LICENSE
119
+
120
+ Indic NLP Library is released under the MIT license
121
+
122
+
indicTrans/indic_nlp_library/docs/indicnlp.cli.rst ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ cli Package
2
+ =============
3
+
4
+ :mod:`cliparser` Module
5
+ --------------------------------
6
+
7
+ .. automodule:: indicnlp.cli.cliparser
8
+ :members:
9
+ :undoc-members:
10
+ :show-inheritance:
11
+
indicTrans/indic_nlp_library/docs/indicnlp.morph.rst ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ morph Package
2
+ =============
3
+
4
+ :mod:`unsupervised_morph` Module
5
+ --------------------------------
6
+
7
+ .. automodule:: indicnlp.morph.unsupervised_morph
8
+ :members:
9
+ :undoc-members:
10
+ :show-inheritance:
11
+
indicTrans/indic_nlp_library/docs/indicnlp.normalize.rst ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ normalize Package
2
+ =================
3
+
4
+ :mod:`indic_normalize` Module
5
+ -----------------------------
6
+
7
+ .. automodule:: indicnlp.normalize.indic_normalize
8
+ :members:
9
+ :undoc-members:
10
+ :show-inheritance:
11
+
12
+ .. autoclass:: indicnlp.normalize.indic_normalize.
13
+ :members:
14
+ :undoc-members:
15
+ :show-inheritance:
indicTrans/indic_nlp_library/docs/indicnlp.pdf ADDED
Binary file (38.1 kB). View file
 
indicTrans/indic_nlp_library/docs/indicnlp.rst ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ indicnlp Package
2
+ ================
3
+
4
+ :mod:`common` Module
5
+ --------------------
6
+
7
+ .. automodule:: indicnlp.common
8
+ :members:
9
+ :undoc-members:
10
+ :show-inheritance:
11
+
12
+ :mod:`langinfo` Module
13
+ ----------------------
14
+
15
+ .. automodule:: indicnlp.langinfo
16
+ :members:
17
+ :undoc-members:
18
+ :show-inheritance:
19
+
20
+ :mod:`loader` Module
21
+ --------------------
22
+
23
+ .. automodule:: indicnlp.loader
24
+ :members:
25
+ :undoc-members:
26
+ :show-inheritance:
27
+
28
+ Subpackages
29
+ -----------
30
+
31
+ .. toctree::
32
+
33
+ indicnlp.cli
34
+ indicnlp.morph
35
+ indicnlp.normalize
36
+ indicnlp.script
37
+ indicnlp.syllable
38
+ indicnlp.tokenize
39
+ indicnlp.transliterate
40
+
41
+ Indices and tables
42
+ ==================
43
+
44
+ * :ref:`genindex`
45
+ * :ref:`modindex`
46
+ * :ref:`search`
47
+
indicTrans/indic_nlp_library/docs/indicnlp.script.rst ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ script Package
2
+ ==============
3
+
4
+ :mod:`indic_scripts` Module
5
+ ---------------------------
6
+
7
+ .. automodule:: indicnlp.script.indic_scripts
8
+ :members:
9
+ :undoc-members:
10
+ :show-inheritance:
11
+
12
+ :mod:`english_script` Module
13
+ ---------------------------
14
+
15
+ .. automodule:: indicnlp.script.english_script
16
+ :members:
17
+ :undoc-members:
18
+ :show-inheritance:
19
+
20
+ :mod:`phonetic_sim` Module
21
+ ---------------------------
22
+
23
+ .. automodule:: indicnlp.script.phonetic_sim
24
+ :members:
25
+ :undoc-members:
26
+ :show-inheritance:
indicTrans/indic_nlp_library/docs/indicnlp.syllable.rst ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ syllable Package
2
+ ==============
3
+
4
+ :mod:`syllabifier` Module
5
+ ---------------------------
6
+
7
+ .. automodule:: indicnlp.syllable.syllabifier
8
+ :members:
9
+ :undoc-members:
10
+ :show-inheritance:
11
+
indicTrans/indic_nlp_library/docs/indicnlp.tokenize.rst ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ tokenize Package
2
+ ================
3
+
4
+ :mod:`indic_tokenize` Module
5
+ ----------------------------
6
+
7
+ .. automodule:: indicnlp.tokenize.indic_tokenize
8
+ :members:
9
+ :undoc-members:
10
+ :show-inheritance:
11
+
12
+ :mod:`indic_detokenize` Module
13
+ ------------------------------
14
+
15
+ .. automodule:: indicnlp.tokenize.indic_detokenize
16
+ :members:
17
+ :undoc-members:
18
+ :show-inheritance:
19
+
20
+ :mod:`sentence_tokenize` Module
21
+ ----------------------------
22
+
23
+ .. automodule:: indicnlp.tokenize.sentence_tokenize
24
+ :members:
25
+ :undoc-members:
26
+ :show-inheritance:
indicTrans/indic_nlp_library/docs/indicnlp.transliterate.rst ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ transliterate Package
2
+ =====================
3
+
4
+ :mod:`sinhala_transliterator` Module
5
+ ------------------------------------
6
+
7
+ .. automodule:: indicnlp.transliterate.sinhala_transliterator
8
+ :members:
9
+ :undoc-members:
10
+ :show-inheritance:
11
+
12
+ :mod:`unicode_transliterate` Module
13
+ -----------------------------------
14
+
15
+ .. automodule:: indicnlp.transliterate.unicode_transliterate
16
+ :members:
17
+ :undoc-members:
18
+ :show-inheritance:
19
+
20
+ :mod:`acronym_transliterator` Module
21
+ -----------------------------------
22
+
23
+ .. automodule:: indicnlp.transliterate.acronym_transliterator
24
+ :members:
25
+ :undoc-members:
26
+ :show-inheritance:
27
+
28
+ :mod:`script_unifier` Module
29
+ -----------------------------------
30
+
31
+ .. automodule:: indicnlp.transliterate.script_unifier
32
+ :members:
33
+ :undoc-members:
34
+ :show-inheritance:
indicTrans/indic_nlp_library/docs/make.bat ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ @ECHO OFF
2
+
3
+ pushd %~dp0
4
+
5
+ REM Command file for Sphinx documentation
6
+
7
+ if "%SPHINXBUILD%" == "" (
8
+ set SPHINXBUILD=sphinx-build
9
+ )
10
+ set SOURCEDIR=.
11
+ set BUILDDIR=_build
12
+
13
+ if "%1" == "" goto help
14
+
15
+ %SPHINXBUILD% >NUL 2>NUL
16
+ if errorlevel 9009 (
17
+ echo.
18
+ echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
19
+ echo.installed, then set the SPHINXBUILD environment variable to point
20
+ echo.to the full path of the 'sphinx-build' executable. Alternatively you
21
+ echo.may add the Sphinx directory to PATH.
22
+ echo.
23
+ echo.If you don't have Sphinx installed, grab it from
24
+ echo.http://sphinx-doc.org/
25
+ exit /b 1
26
+ )
27
+
28
+ %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
29
+ goto end
30
+
31
+ :help
32
+ %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
33
+
34
+ :end
35
+ popd
indicTrans/indic_nlp_library/docs/modules.rst ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ indicnlp
2
+ ===
3
+
4
+ .. toctree::
5
+ :maxdepth: 4
6
+
7
+ indicnlp
indicTrans/indic_nlp_library/indicnlp/__init__.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+
4
+ try:
5
+ from .version import __version__ # noqa
6
+ except ImportError:
7
+ version_txt = os.path.join(os.path.dirname(__file__), "version.txt")
8
+ with open(version_txt) as f:
9
+ __version__ = f.read().strip()
10
+
indicTrans/indic_nlp_library/indicnlp/cli/__init__.py ADDED
File without changes
indicTrans/indic_nlp_library/indicnlp/cli/cliparser.py ADDED
@@ -0,0 +1,266 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import sys
3
+
4
+ from indicnlp import loader
5
+ from indicnlp.tokenize import indic_tokenize
6
+ from indicnlp.tokenize import indic_detokenize
7
+ from indicnlp.normalize import indic_normalize
8
+ from indicnlp.morph import unsupervised_morph
9
+ from indicnlp.tokenize import sentence_tokenize
10
+ from indicnlp.syllable import syllabifier
11
+ from indicnlp.transliterate import unicode_transliterate
12
+ from indicnlp.transliterate import script_unifier
13
+
14
+ DEFAULT_ENCODING='utf-8'
15
+
16
+ def run_detokenize(args):
17
+ for line in args.infile:
18
+ args.outfile.write(indic_detokenize.trivial_detokenize(line,args.lang))
19
+
20
+ def run_tokenize(args):
21
+ for line in args.infile:
22
+ args.outfile.write(' '.join(
23
+ indic_tokenize.trivial_tokenize(line,args.lang)))
24
+
25
+ def run_sentence_split(args):
26
+ text=' '.join([ l.replace('\n','').replace('\r','') for l in args.infile])
27
+ outlines=sentence_tokenize.sentence_split(text,args.lang)
28
+ for line in outlines:
29
+ args.outfile.write(line+'\n')
30
+
31
+ def run_normalize(args):
32
+
33
+ # TODO: add more options to cli
34
+ remove_nuktas=False
35
+ normalize_nasals='do_nothing'
36
+
37
+ # create normalizer
38
+ factory=indic_normalize.IndicNormalizerFactory()
39
+ normalizer=factory.get_normalizer(args.lang,
40
+ remove_nuktas=remove_nuktas,
41
+ nasals_mode=normalize_nasals)
42
+
43
+ # DO normalization
44
+ for line in args.infile:
45
+ normalized_line=normalizer.normalize(line)
46
+ args.outfile.write(normalized_line)
47
+
48
+ def run_morph(args):
49
+
50
+ add_marker=False
51
+ analyzer=unsupervised_morph.UnsupervisedMorphAnalyzer(args.lang,add_marker)
52
+ for line in args.infile:
53
+ morph_tokens=analyzer.morph_analyze_document(line.strip().split(' '))
54
+ args.outfile.write(' '.join(morph_tokens) + '\n')
55
+
56
+ def run_syllabify(args):
57
+ for line in args.infile:
58
+ new_line = ' '.join(
59
+ [ ' '.join(syllabifier.orthographic_syllabify(w,args.lang))
60
+ for w in line.strip().split(' ') ]
61
+ )
62
+ args.outfile.write(new_line+'\n')
63
+
64
+ def run_wc(args):
65
+ # if args.l==False and args.w==False and args.c==False:
66
+ # args.l, args.w, args.c= True, True, True
67
+
68
+ nl=0
69
+ nw=0
70
+ nc=0
71
+
72
+ for line in args.infile:
73
+ nl+=1
74
+ nw+=len(line.strip(' ').split(' '))
75
+ nc+=len(line)
76
+
77
+ print('{} {} {}'.format(nl,nw,nc))
78
+
79
+ def run_indic2roman(args):
80
+ for line in args.infile:
81
+ transliterated_line=unicode_transliterate.ItransTransliterator.to_itrans(
82
+ line,args.lang)
83
+ args.outfile.write(transliterated_line)
84
+
85
+ def run_roman2indic(args):
86
+ for line in args.infile:
87
+ transliterated_line=unicode_transliterate.ItransTransliterator.from_itrans(
88
+ line,args.lang)
89
+ args.outfile.write(transliterated_line)
90
+
91
+ def run_script_unify(args):
92
+
93
+ unifier=None
94
+
95
+ if args.mode=='aggressive':
96
+ unifier=script_unifier.AggressiveScriptUnifier(nasals_mode='to_anusvaara_relaxed', common_lang=args.common_lang)
97
+
98
+ elif args.mode=='basic':
99
+ unifier=script_unifier.BasicScriptUnifier(nasals_mode='do_nothing',
100
+ common_lang=args.common_lang)
101
+
102
+ elif args.mode=='naive':
103
+ unifier=script_unifier.NaiveScriptUnifier(common_lang=args.common_lang)
104
+
105
+ assert(unifier is not None)
106
+
107
+ for line in args.infile:
108
+ transliterated_line=unifier.transform(line,args.lang)
109
+ args.outfile.write(transliterated_line)
110
+
111
+ def run_script_convert(args):
112
+ for line in args.infile:
113
+ transliterated_line=unicode_transliterate.UnicodeIndicTransliterator.transliterate(
114
+ line,args.srclang,args.tgtlang)
115
+ args.outfile.write(transliterated_line)
116
+
117
+ def add_common_monolingual_args(task_parser):
118
+ task_parser.add_argument('infile',
119
+ type=argparse.FileType('r',encoding=DEFAULT_ENCODING),
120
+ nargs='?',
121
+ default=sys.stdin,
122
+ help='Input File path',
123
+ )
124
+ task_parser.add_argument('outfile',
125
+ type=argparse.FileType('w',encoding=DEFAULT_ENCODING),
126
+ nargs='?',
127
+ default=sys.stdout,
128
+ help='Output File path',
129
+ )
130
+ task_parser.add_argument('-l', '--lang',
131
+ help='Language',
132
+ )
133
+
134
+ def add_common_bilingual_args(task_parser):
135
+ task_parser.add_argument('infile',
136
+ type=argparse.FileType('r',encoding=DEFAULT_ENCODING),
137
+ nargs='?',
138
+ default=sys.stdin,
139
+ help='Input File path',
140
+ )
141
+ task_parser.add_argument('outfile',
142
+ type=argparse.FileType('w',encoding=DEFAULT_ENCODING),
143
+ nargs='?',
144
+ default=sys.stdout,
145
+ help='Output File path',
146
+ )
147
+ task_parser.add_argument('-s', '--srclang',
148
+ help='Source Language',
149
+ )
150
+
151
+ task_parser.add_argument('-t', '--tgtlang',
152
+ help='Target Language',
153
+ )
154
+
155
+ def add_tokenize_parser(subparsers):
156
+ task_parser=subparsers.add_parser('tokenize',
157
+ help='tokenizer help')
158
+ add_common_monolingual_args(task_parser)
159
+ task_parser.set_defaults(func=run_tokenize)
160
+
161
+ def add_detokenize_parser(subparsers):
162
+ task_parser=subparsers.add_parser('detokenize',
163
+ help='de-tokenizer help')
164
+ add_common_monolingual_args(task_parser)
165
+ task_parser.set_defaults(func=run_detokenize)
166
+
167
+ def add_sentence_split_parser(subparsers):
168
+ task_parser=subparsers.add_parser('sentence_split', help='sentence split help')
169
+ add_common_monolingual_args(task_parser)
170
+ task_parser.set_defaults(func=run_sentence_split)
171
+
172
+ def add_normalize_parser(subparsers):
173
+ task_parser=subparsers.add_parser('normalize', help='normalizer help')
174
+ add_common_monolingual_args(task_parser)
175
+ task_parser.set_defaults(func=run_normalize)
176
+
177
+ def add_morph_parser(subparsers):
178
+ task_parser=subparsers.add_parser('morph', help='morph help')
179
+ add_common_monolingual_args(task_parser)
180
+ task_parser.set_defaults(func=run_morph)
181
+
182
+ def add_syllabify_parser(subparsers):
183
+ task_parser=subparsers.add_parser('syllabify', help='syllabify help')
184
+ add_common_monolingual_args(task_parser)
185
+ task_parser.set_defaults(func=run_syllabify)
186
+
187
+ def add_wc_parser(subparsers):
188
+ task_parser=subparsers.add_parser('wc', help='wc help')
189
+
190
+ task_parser.add_argument('infile',
191
+ type=argparse.FileType('r',encoding=DEFAULT_ENCODING),
192
+ nargs='?',
193
+ default=sys.stdin,
194
+ help='Input File path',
195
+ )
196
+ # task_parser.add_argument('-l', action='store_true')
197
+ # task_parser.add_argument('-w', action='store_true')
198
+ # task_parser.add_argument('-c', action='store_true')
199
+ # task_parser.set_defaults(l=False)
200
+ # task_parser.set_defaults(w=False)
201
+ # task_parser.set_defaults(c=False)
202
+
203
+ task_parser.set_defaults(func=run_wc)
204
+
205
+ def add_indic2roman_parser(subparsers):
206
+ task_parser=subparsers.add_parser('indic2roman', help='indic2roman help')
207
+ add_common_monolingual_args(task_parser)
208
+ task_parser.set_defaults(func=run_indic2roman)
209
+
210
+ def add_roman2indic_parser(subparsers):
211
+ task_parser=subparsers.add_parser('roman2indic', help='roman2indic help')
212
+ add_common_monolingual_args(task_parser)
213
+ task_parser.set_defaults(func=run_indic2roman)
214
+
215
+ def add_script_unify_parser(subparsers):
216
+ task_parser=subparsers.add_parser('script_unify', help='script_unify help')
217
+ add_common_monolingual_args(task_parser)
218
+ task_parser.add_argument('-m','--mode',
219
+ default='basic',
220
+ choices=['naive', 'basic', 'aggressive'] ,
221
+ help='Script unification mode',
222
+ )
223
+ task_parser.add_argument('-c','--common_lang',
224
+ default='hi',
225
+ help='Common language in which all languages are represented',
226
+ )
227
+
228
+ task_parser.set_defaults(func=run_script_unify)
229
+
230
+ def add_script_convert_parser(subparsers):
231
+ task_parser=subparsers.add_parser('script_convert', help='script convert help')
232
+ add_common_bilingual_args(task_parser)
233
+ task_parser.set_defaults(func=run_script_convert)
234
+
235
+ def get_parser():
236
+ parser = argparse.ArgumentParser(prog='indicnlp')
237
+ subparsers = parser.add_subparsers(help='Invoke each operation with one of the subcommands', dest='subcommand')
238
+
239
+ add_tokenize_parser(subparsers)
240
+ add_detokenize_parser(subparsers)
241
+ add_sentence_split_parser(subparsers)
242
+ add_normalize_parser(subparsers)
243
+
244
+ add_morph_parser(subparsers)
245
+ add_syllabify_parser(subparsers)
246
+
247
+ add_wc_parser(subparsers)
248
+
249
+ add_indic2roman_parser(subparsers)
250
+ add_roman2indic_parser(subparsers)
251
+ add_script_unify_parser(subparsers)
252
+
253
+ add_script_convert_parser(subparsers)
254
+
255
+ return parser
256
+
257
+ def main():
258
+ parser=get_parser()
259
+ args=parser.parse_args()
260
+ # print(args)
261
+ args.func(args)
262
+
263
+ if __name__ == '__main__':
264
+ loader.load()
265
+ main()
266
+
indicTrans/indic_nlp_library/indicnlp/common.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #
2
+ # Copyright (c) 2013-present, Anoop Kunchukuttan
3
+ # All rights reserved.
4
+ #
5
+ # This source code is licensed under the MIT license found in the
6
+ # LICENSE file in the root directory of this source tree.
7
+ #
8
+
9
+ import os
10
+
11
+ """
12
+ Path to the Indic NLP Resources directory
13
+ """
14
+ INDIC_RESOURCES_PATH=''
15
+
16
+ def init():
17
+ """
18
+ Initialize the module. The following actions are performed:
19
+
20
+ - Checks of INDIC_RESOURCES_PATH variable is set. If not, checks if it can beb initialized from
21
+ INDIC_RESOURCES_PATH environment variable. If that fails, an exception is raised
22
+ """
23
+ global INDIC_RESOURCES_PATH
24
+ try:
25
+ if INDIC_RESOURCES_PATH=='':
26
+ INDIC_RESOURCES_PATH=os.environ['INDIC_RESOURCES_PATH']
27
+ except Exception as e:
28
+ raise IndicNlpException('INDIC_RESOURCES_PATH not set')
29
+
30
+ if INDIC_RESOURCES_PATH=='':
31
+ raise IndicNlpException('INDIC_RESOURCES_PATH not set')
32
+
33
+
34
+
35
+ def get_resources_path():
36
+ """
37
+ Get the path to the Indic NLP Resources directory
38
+ """
39
+ return INDIC_RESOURCES_PATH
40
+
41
+ def set_resources_path(resources_path):
42
+ """
43
+ Set the path to the Indic NLP Resources directory
44
+ """
45
+ global INDIC_RESOURCES_PATH
46
+ INDIC_RESOURCES_PATH=resources_path
47
+
48
+ class IndicNlpException(Exception):
49
+ """
50
+ Exceptions thrown by Indic NLP Library components are instances of this class.
51
+ 'msg' attribute contains exception details.
52
+ """
53
+ def __init__(self, msg):
54
+ self.msg = msg
55
+
56
+ def __str__(self):
57
+ return repr(self.msg)
58
+
indicTrans/indic_nlp_library/indicnlp/langinfo.py ADDED
@@ -0,0 +1,488 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #
2
+ # Copyright (c) 2013-present, Anoop Kunchukuttan
3
+ # All rights reserved.
4
+ #
5
+ # This source code is licensed under the MIT license found in the
6
+ # LICENSE file in the root directory of this source tree.
7
+ #
8
+
9
+ ## language codes
10
+ LC_TA='ta'
11
+
12
+ SCRIPT_RANGES={
13
+ 'pa':[0x0a00,0x0a7f] ,
14
+ 'gu':[0x0a80,0x0aff] ,
15
+ 'or':[0x0b00,0x0b7f] ,
16
+ 'ta':[0x0b80,0x0bff] ,
17
+ 'te':[0x0c00,0x0c7f] ,
18
+ 'kn':[0x0c80,0x0cff] ,
19
+ 'ml':[0x0d00,0x0d7f] ,
20
+ 'si':[0x0d80,0x0dff] ,
21
+ 'hi':[0x0900,0x097f] ,
22
+ 'mr':[0x0900,0x097f] ,
23
+ 'kK':[0x0900,0x097f] ,
24
+ 'sa':[0x0900,0x097f] ,
25
+ 'ne':[0x0900,0x097f] ,
26
+ 'sd':[0x0900,0x097f] ,
27
+ 'bn':[0x0980,0x09ff] ,
28
+ 'as':[0x0980,0x09ff] ,
29
+ }
30
+
31
+ DRAVIDIAN_LANGUAGES=['ta', 'te', 'kn', 'ml',]
32
+ IE_LANGUAGES=['hi', 'mr', 'kK', 'sa', 'ne', 'sd', 'bn', 'as', 'pa', 'gu', 'or', 'si', ]
33
+ DANDA_DELIM_LANGUAGES=['as','bn','hi','ne','or','pa','sa','sd']
34
+
35
+ URDU_RANGES=[
36
+ [0x0600,0x06ff],
37
+ [0x0750,0x077f],
38
+ [0xfb50,0xfdff],
39
+ [0xfe70,0xfeff],
40
+ ]
41
+
42
+ COORDINATED_RANGE_START_INCLUSIVE=0
43
+ COORDINATED_RANGE_END_INCLUSIVE=0x6f
44
+
45
+ NUMERIC_OFFSET_START=0x66
46
+ NUMERIC_OFFSET_END=0x6f
47
+
48
+ HALANTA_OFFSET=0x4d
49
+ AUM_OFFSET=0x50
50
+ NUKTA_OFFSET=0x3c
51
+
52
+ RUPEE_SIGN=0x20b9
53
+
54
+ DANDA=0x0964
55
+ DOUBLE_DANDA=0x0965
56
+
57
+ #TODO: add missing fricatives and approximants
58
+ VELAR_RANGE=[0x15,0x19]
59
+ PALATAL_RANGE=[0x1a,0x1e]
60
+ RETROFLEX_RANGE=[0x1f,0x23]
61
+ DENTAL_RANGE=[0x24,0x29]
62
+ LABIAL_RANGE=[0x2a,0x2e]
63
+
64
+ # verify
65
+ VOICED_LIST=[0x17,0x18,0x1c,0x1d,0x21,0x22,0x26,0x27,0x2c,0x2d]
66
+ UNVOICED_LIST=[0x15,0x16,0x1a,0x1b,0x1f,0x20,0x24,0x25,0x2a,0x2b] #TODO: add sibilants/sonorants
67
+ ASPIRATED_LIST=[0x16,0x18,0x1b,0x1d,0x20,0x22,0x25,0x27,0x2b,0x2d]
68
+ UNASPIRATED_LIST=[0x15,0x17,0x1a,0x1c,0x1f,0x21,0x24,0x26,0x2a,0x2c]
69
+ NASAL_LIST=[0x19,0x1e,0x23,0x28,0x29,0x2d]
70
+ FRICATIVE_LIST=[0x36,0x37,0x38]
71
+ APPROXIMANT_LIST=[0x2f,0x30,0x31,0x32,0x33,0x34,0x35]
72
+
73
+ #TODO: ha has to be properly categorized
74
+
75
+ def is_danda_delim(lang):
76
+ """
77
+ Returns True if danda/double danda is a possible delimiter for the language
78
+ """
79
+ return lang in DANDA_DELIM_LANGUAGES
80
+
81
+ def get_offset(c,lang):
82
+ """
83
+ Applicable to Brahmi derived Indic scripts
84
+ """
85
+ return ord(c)-SCRIPT_RANGES[lang][0]
86
+
87
+ def offset_to_char(c,lang):
88
+ """
89
+ Applicable to Brahmi derived Indic scripts
90
+ """
91
+ return chr(c+SCRIPT_RANGES[lang][0])
92
+
93
+ def in_coordinated_range(c_offset):
94
+ """
95
+ Applicable to Brahmi derived Indic scripts
96
+ """
97
+ return (c_offset>=COORDINATED_RANGE_START_INCLUSIVE and c_offset<=COORDINATED_RANGE_END_INCLUSIVE)
98
+
99
+ def is_indiclang_char(c,lang):
100
+ """
101
+ Applicable to Brahmi derived Indic scripts
102
+ """
103
+ o=get_offset(c,lang)
104
+ return (o>=0 and o<=0x7f) or ord(c)==DANDA or ord(c)==DOUBLE_DANDA
105
+
106
+ # def is_vowel(c,lang):
107
+ # """
108
+ # Is the character a vowel
109
+ # """
110
+ # o=get_offset(c,lang)
111
+ # return (o>=0x04 and o<=0x14)
112
+
113
+ # def is_vowel_sign(c,lang):
114
+ # """
115
+ # Is the character a vowel sign (maatraa)
116
+ # """
117
+ # o=get_offset(c,lang)
118
+ # return (o>=0x3e and o<=0x4c)
119
+
120
+ # def is_halanta(c,lang):
121
+ # """
122
+ # Is the character the halanta character
123
+ # """
124
+ # o=get_offset(c,lang)
125
+ # return (o==HALANTA_OFFSET)
126
+
127
+ # def is_nukta(c,lang):
128
+ # """
129
+ # Is the character the halanta character
130
+ # """
131
+ # o=get_offset(c,lang)
132
+ # return (o==NUKTA_OFFSET)
133
+
134
+ # def is_aum(c,lang):
135
+ # """
136
+ # Is the character a vowel sign (maatraa)
137
+ # """
138
+ # o=get_offset(c,lang)
139
+ # return (o==AUM_OFFSET)
140
+
141
+ # def is_consonant(c,lang):
142
+ # """
143
+ # Is the character a consonant
144
+ # """
145
+ # o=get_offset(c,lang)
146
+ # return (o>=0x15 and o<=0x39)
147
+
148
+ # def is_velar(c,lang):
149
+ # """
150
+ # Is the character a velar
151
+ # """
152
+ # o=get_offset(c,lang)
153
+ # return (o>=VELAR_RANGE[0] and o<=VELAR_RANGE[1])
154
+
155
+ # def is_palatal(c,lang):
156
+ # """
157
+ # Is the character a palatal
158
+ # """
159
+ # o=get_offset(c,lang)
160
+ # return (o>=PALATAL_RANGE[0] and o<=PALATAL_RANGE[1])
161
+
162
+ # def is_retroflex(c,lang):
163
+ # """
164
+ # Is the character a retroflex
165
+ # """
166
+ # o=get_offset(c,lang)
167
+ # return (o>=RETROFLEX_RANGE[0] and o<=RETROFLEX_RANGE[1])
168
+
169
+ # def is_dental(c,lang):
170
+ # """
171
+ # Is the character a dental
172
+ # """
173
+ # o=get_offset(c,lang)
174
+ # return (o>=DENTAL_RANGE[0] and o<=DENTAL_RANGE[1])
175
+
176
+ # def is_labial(c,lang):
177
+ # """
178
+ # Is the character a labial
179
+ # """
180
+ # o=get_offset(c,lang)
181
+ # return (o>=LABIAL_RANGE[0] and o<=LABIAL_RANGE[1])
182
+
183
+ # def is_voiced(c,lang):
184
+ # """
185
+ # Is the character a voiced consonant
186
+ # """
187
+ # o=get_offset(c,lang)
188
+ # return o in VOICED_LIST
189
+
190
+ # def is_unvoiced(c,lang):
191
+ # """
192
+ # Is the character a unvoiced consonant
193
+ # """
194
+ # o=get_offset(c,lang)
195
+ # return o in UNVOICED_LIST
196
+
197
+ # def is_aspirated(c,lang):
198
+ # """
199
+ # Is the character a aspirated consonant
200
+ # """
201
+ # o=get_offset(c,lang)
202
+ # return o in ASPIRATED_LIST
203
+
204
+ # def is_unaspirated(c,lang):
205
+ # """
206
+ # Is the character a unaspirated consonant
207
+ # """
208
+ # o=get_offset(c,lang)
209
+ # return o in UNASPIRATED_LIST
210
+
211
+ # def is_nasal(c,lang):
212
+ # """
213
+ # Is the character a nasal consonant
214
+ # """
215
+ # o=get_offset(c,lang)
216
+ # return o in NASAL_LIST
217
+
218
+ # def is_fricative(c,lang):
219
+ # """
220
+ # Is the character a fricative consonant
221
+ # """
222
+ # o=get_offset(c,lang)
223
+ # return o in FRICATIVE_LIST
224
+
225
+ # def is_approximant(c,lang):
226
+ # """
227
+ # Is the character an approximant consonant
228
+ # """
229
+ # o=get_offset(c,lang)
230
+ # return o in APPROXIMANT_LIST
231
+
232
+ # def is_number(c,lang):
233
+ # """
234
+ # Is the character a number
235
+ # """
236
+ # o=get_offset(c,lang)
237
+ # return (o>=0x66 and o<=0x6f)
238
+
239
+
240
def is_vowel(c, lang):
    """Is the character an independent vowel of the given language's script?"""
    o = get_offset(c, lang)
    return 0x04 <= o <= 0x14

def is_vowel_sign(c, lang):
    """Is the character a dependent vowel sign (maatraa)?"""
    o = get_offset(c, lang)
    return 0x3e <= o <= 0x4c

def is_halanta(c, lang):
    """Is the character the halanta (virama) character?"""
    o = get_offset(c, lang)
    return o == HALANTA_OFFSET

def is_nukta(c, lang):
    """Is the character the nukta character?

    (Docstring fixed: previously said "halanta character" — a copy-paste error;
    the code tests against NUKTA_OFFSET.)
    """
    o = get_offset(c, lang)
    return o == NUKTA_OFFSET

def is_aum(c, lang):
    """Is the character the aum (om) sign?

    (Docstring fixed: previously said "vowel sign (maatraa)" — a copy-paste
    error; the code tests against AUM_OFFSET.)
    """
    o = get_offset(c, lang)
    return o == AUM_OFFSET

def is_consonant(c, lang):
    """Is the character a consonant?"""
    o = get_offset(c, lang)
    return 0x15 <= o <= 0x39
281
+
282
def is_velar(c, lang):
    """Is the character a velar consonant?"""
    lo, hi = VELAR_RANGE
    return lo <= get_offset(c, lang) <= hi

def is_palatal(c, lang):
    """Is the character a palatal consonant?"""
    lo, hi = PALATAL_RANGE
    return lo <= get_offset(c, lang) <= hi

def is_retroflex(c, lang):
    """Is the character a retroflex consonant?"""
    lo, hi = RETROFLEX_RANGE
    return lo <= get_offset(c, lang) <= hi

def is_dental(c, lang):
    """Is the character a dental consonant?"""
    lo, hi = DENTAL_RANGE
    return lo <= get_offset(c, lang) <= hi

def is_labial(c, lang):
    """Is the character a labial consonant?"""
    lo, hi = LABIAL_RANGE
    return lo <= get_offset(c, lang) <= hi

def is_voiced(c, lang):
    """Is the character a voiced consonant?"""
    return get_offset(c, lang) in VOICED_LIST

def is_unvoiced(c, lang):
    """Is the character an unvoiced consonant?"""
    return get_offset(c, lang) in UNVOICED_LIST

def is_aspirated(c, lang):
    """Is the character an aspirated consonant?"""
    return get_offset(c, lang) in ASPIRATED_LIST

def is_unaspirated(c, lang):
    """Is the character an unaspirated consonant?"""
    return get_offset(c, lang) in UNASPIRATED_LIST

def is_nasal(c, lang):
    """Is the character a nasal consonant?"""
    return get_offset(c, lang) in NASAL_LIST

def is_fricative(c, lang):
    """Is the character a fricative consonant?"""
    return get_offset(c, lang) in FRICATIVE_LIST

def is_approximant(c, lang):
    """Is the character an approximant consonant?"""
    return get_offset(c, lang) in APPROXIMANT_LIST

def is_number(c, lang):
    """Is the character a digit of the language's script?"""
    return 0x66 <= get_offset(c, lang) <= 0x6f
371
+ return (o>=0x66 and o<=0x6f)
372
+
373
+
374
+ ##################################################
375
+
376
def is_vowel_offset(c_offset):
    """Is the offset an independent vowel?"""
    return 0x04 <= c_offset <= 0x14

def is_vowel_sign_offset(c_offset):
    """Is the offset a dependent vowel sign (maatraa)?"""
    return 0x3e <= c_offset <= 0x4c

def is_halanta_offset(c_offset):
    """Is the offset the halanta (virama) offset?"""
    return c_offset == HALANTA_OFFSET

def is_nukta_offset(c_offset):
    """Is the offset the nukta offset?

    (Docstring fixed: previously said "halanta offset" — a copy-paste error;
    the code tests against NUKTA_OFFSET.)
    """
    return c_offset == NUKTA_OFFSET

def is_aum_offset(c_offset):
    """Is the offset the aum (om) sign offset?

    (Docstring fixed: previously said "vowel sign (maatraa)" — a copy-paste
    error; the code tests against AUM_OFFSET.)
    """
    return c_offset == AUM_OFFSET

def is_consonant_offset(c_offset):
    """Is the offset a consonant?"""
    return 0x15 <= c_offset <= 0x39

def is_velar_offset(c_offset):
    """Is the offset a velar consonant?"""
    return VELAR_RANGE[0] <= c_offset <= VELAR_RANGE[1]

def is_palatal_offset(c_offset):
    """Is the offset a palatal consonant?"""
    return PALATAL_RANGE[0] <= c_offset <= PALATAL_RANGE[1]

def is_retroflex_offset(c_offset):
    """Is the offset a retroflex consonant?"""
    return RETROFLEX_RANGE[0] <= c_offset <= RETROFLEX_RANGE[1]

def is_dental_offset(c_offset):
    """Is the offset a dental consonant?"""
    return DENTAL_RANGE[0] <= c_offset <= DENTAL_RANGE[1]

def is_labial_offset(c_offset):
    """Is the offset a labial consonant?"""
    return LABIAL_RANGE[0] <= c_offset <= LABIAL_RANGE[1]

def is_voiced_offset(c_offset):
    """Is the offset a voiced consonant?"""
    return c_offset in VOICED_LIST

def is_unvoiced_offset(c_offset):
    """Is the offset an unvoiced consonant?"""
    return c_offset in UNVOICED_LIST

def is_aspirated_offset(c_offset):
    """Is the offset an aspirated consonant?"""
    return c_offset in ASPIRATED_LIST

def is_unaspirated_offset(c_offset):
    """Is the offset an unaspirated consonant?"""
    return c_offset in UNASPIRATED_LIST

def is_nasal_offset(c_offset):
    """Is the offset a nasal consonant?"""
    return c_offset in NASAL_LIST

def is_fricative_offset(c_offset):
    """Is the offset a fricative consonant?"""
    return c_offset in FRICATIVE_LIST

def is_approximant_offset(c_offset):
    """Is the offset an approximant consonant?"""
    return c_offset in APPROXIMANT_LIST

def is_number_offset(c_offset):
    """Is the offset a digit?"""
    return 0x66 <= c_offset <= 0x6f
indicTrans/indic_nlp_library/indicnlp/loader.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #
2
+ # Copyright (c) 2013-present, Anoop Kunchukuttan
3
+ # All rights reserved.
4
+ #
5
+ # This source code is licensed under the MIT license found in the
6
+ # LICENSE file in the root directory of this source tree.
7
+ #
8
+
9
+ from indicnlp import common
10
+ from indicnlp.script import indic_scripts
11
+ from indicnlp.script import english_script
12
+ from indicnlp.transliterate import unicode_transliterate
13
+
14
def load():
    """
    Initialize the Indic NLP library. Clients must call this before using the library.

    Every module requiring initialization exposes an init() function that is
    invoked from here.
    """
    # Initialization order matters: `common` must run first so the other
    # modules can access resources through it.
    for initializer in (
        common.init,                  # resource paths
        indic_scripts.init,           # Indic scripts module
        english_script.init,          # English script module
        unicode_transliterate.init,   # transliteration module
    ):
        initializer()
+
indicTrans/indic_nlp_library/indicnlp/morph/__init__.py ADDED
File without changes
indicTrans/indic_nlp_library/indicnlp/morph/unsupervised_morph.py ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #
2
+ # Copyright (c) 2013-present, Anoop Kunchukuttan
3
+ # All rights reserved.
4
+ #
5
+ # This source code is licensed under the MIT license found in the
6
+ # LICENSE file in the root directory of this source tree.
7
+ #
8
+
9
+ import codecs, sys, itertools,re,os
10
+ import morfessor
11
+
12
+ from functools import lru_cache
13
+
14
+ from indicnlp import langinfo
15
+ from indicnlp import common
16
+ from indicnlp.tokenize import indic_tokenize
17
+
18
+ # Unsupervised Morphological Analyser for Indian languages.
19
+ #
20
+ # @author Anoop Kunchukuttan
21
+ #
22
+
23
+ class MorphAnalyzerI(object):
24
+ """
25
+ Interface for Morph Analyzer
26
+ """
27
+
28
+ def morph_analyze(word):
29
+ pass
30
+
31
+ def morph_analyze_document(tokens):
32
+ pass
33
+
34
+ class UnsupervisedMorphAnalyzer(MorphAnalyzerI):
35
+ """
36
+ Unsupervised Morphological analyser built using Morfessor 2.0
37
+ """
38
+
39
+ def __init__(self,lang,add_marker=False):
40
+ self.lang=lang
41
+ self.add_marker=add_marker
42
+
43
+ io = morfessor.MorfessorIO()
44
+ self._morfessor_model=io.read_any_model(os.path.join(common.INDIC_RESOURCES_PATH,'morph','morfessor','{}.model'.format(lang)))
45
+
46
+ self._script_range_pat=r'^[{}-{}]+$'.format(chr(langinfo.SCRIPT_RANGES[lang][0]),chr(langinfo.SCRIPT_RANGES[lang][1]))
47
+ self._script_check_re=re.compile(self._script_range_pat)
48
+
49
+ def _contains_number(self,text):
50
+ if self.lang in langinfo.SCRIPT_RANGES:
51
+ for c in text:
52
+ offset=ord(c)-langinfo.SCRIPT_RANGES[self.lang][0]
53
+ if offset >=langinfo.NUMERIC_OFFSET_START and offset <= langinfo.NUMERIC_OFFSET_END:
54
+ return True
55
+ return False
56
+
57
+ def _morphanalysis_needed(self,word):
58
+ return self._script_check_re.match(word) and not self._contains_number(word)
59
+
60
+ @lru_cache(maxsize=16384)
61
+ def morph_analyze(self,word):
62
+ """
63
+ Morphanalyzes a single word and returns a list of component morphemes
64
+
65
+ @param word: string input word
66
+ """
67
+ m_list=[]
68
+ if self._morphanalysis_needed(word):
69
+ val=self._morfessor_model.viterbi_segment(word)
70
+ m_list=val[0]
71
+ if self.add_marker:
72
+ m_list= [ '{}_S_'.format(m) if i>0 else '{}_R_'.format(m) for i,m in enumerate(m_list)]
73
+ else:
74
+ if self.add_marker:
75
+ word='{}_E_'.format(word)
76
+ m_list=[word]
77
+ return m_list
78
+
79
+ ### Older implementation
80
+ #val=self._morfessor_model.viterbi_segment(word)
81
+ #m_list=val[0]
82
+ #if self.add_marker:
83
+ # m_list= [ u'{}_S_'.format(m) if i>0 else u'{}_R_'.format(m) for i,m in enumerate(m_list)]
84
+ #return m_list
85
+
86
+
87
+ def morph_analyze_document(self,tokens):
88
+ """
89
+ Morphanalyzes a document, represented as a list of tokens
90
+ Each word is morphanalyzed and result is a list of morphemes constituting the document
91
+
92
+ @param tokens: string sequence of words
93
+
94
+ @return list of segments in the document after morph analysis
95
+ """
96
+
97
+ out_tokens=[]
98
+ for token in tokens:
99
+ morphs=self.morph_analyze(token)
100
+ out_tokens.extend(morphs)
101
+ return out_tokens
102
+
103
+ #### Older implementation
104
+ #out_tokens=[]
105
+ #for token in tokens:
106
+ # if self._morphanalysis_needed(token):
107
+ # morphs=self.morph_analyze(token)
108
+ # out_tokens.extend(morphs)
109
+ # else:
110
+ # if self.add_marker:
111
+ # token=u'{}_E_'.format(token)
112
+ # out_tokens.append(token)
113
+ #return out_tokens
114
+
115
+
116
+ if __name__ == '__main__':
117
+
118
+ if len(sys.argv)<4:
119
+ print("Usage: python unsupervised_morph.py <infile> <outfile> <language> <indic_resources_path> [<add_marker>]")
120
+ sys.exit(1)
121
+
122
+ language=sys.argv[3]
123
+ common.INDIC_RESOURCES_PATH=sys.argv[4]
124
+
125
+ add_marker=False
126
+
127
+ if len(sys.argv)==6:
128
+ add_marker= True if sys.argv[5] == 'True' else False
129
+
130
+ print('Loading morph analyser for ' + language)
131
+ analyzer=UnsupervisedMorphAnalyzer(language,add_marker)
132
+ print('Loaded morph analyser for ' + language)
133
+
134
+ with codecs.open(sys.argv[1],'r','utf-8') as ifile:
135
+ with codecs.open(sys.argv[2],'w','utf-8') as ofile:
136
+ for line in ifile.readlines():
137
+ line=line.strip()
138
+ tokens=indic_tokenize.trivial_tokenize(line)
139
+ morph_tokens=analyzer.morph_analyze_document(tokens)
140
+ ofile.write(' '.join(morph_tokens))
141
+ ofile.write('\n')
142
+
indicTrans/indic_nlp_library/indicnlp/normalize/__init__.py ADDED
File without changes
indicTrans/indic_nlp_library/indicnlp/normalize/indic_normalize.py ADDED
@@ -0,0 +1,984 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+
3
+ #
4
+ # Copyright (c) 2013-present, Anoop Kunchukuttan
5
+ # All rights reserved.
6
+ #
7
+ # This source code is licensed under the MIT license found in the
8
+ # LICENSE file in the root directory of this source tree.
9
+ #
10
+
11
+ #Program for normalization of text written in Unicode. This is mainly geared towards Indic scripts
12
+ #
13
+ # @author Anoop Kunchukuttan
14
+ #
15
+
16
+ import sys, codecs, string, itertools, re
17
+ from indicnlp import langinfo
18
+
19
+
20
class NormalizerI(object):
    """
    The normalizer classes do the following:

    * Some characters have multiple Unicode codepoints. The normalizer chooses a single standard representation
    * Some control characters are deleted
    * While typing using the Latin keyboard, certain typical mistakes occur which are corrected by the module

    Base class for normalizer. Performs some common normalization, which includes:

    * Byte order mark, word joiner, etc. removal
    * ZERO_WIDTH_NON_JOINER and ZERO_WIDTH_JOINER removal
    * ZERO_WIDTH_SPACE and NO_BREAK_SPACE replaced by spaces

    Script specific normalizers should derive from this class and override the
    normalize() method. They can call the super class normalize() method to
    avail of the common normalization.
    """

    BYTE_ORDER_MARK='\uFEFF'
    BYTE_ORDER_MARK_2='\uFFFE'
    WORD_JOINER='\u2060'
    SOFT_HYPHEN='\u00AD'

    ZERO_WIDTH_SPACE='\u200B'
    NO_BREAK_SPACE='\u00A0'

    ZERO_WIDTH_NON_JOINER='\u200C'
    ZERO_WIDTH_JOINER='\u200D'

    def _normalize_punctuations(self, text):
        """
        Normalize punctuation.

        Applies many of the punctuation normalizations performed by
        MosesNormalizer from sacremoses (typographic quotes/dashes/ellipsis
        to their ASCII equivalents; BOM stripped).
        """
        # Ordered substitution table: order matters, e.g. single right quotes
        # are mapped to "'" before the double-apostrophe "''" is collapsed
        # to a double quote.
        substitutions = (
            (NormalizerI.BYTE_ORDER_MARK, ''),
            ('„', '"'),
            ('“', '"'),
            ('”', '"'),
            ('–', '-'),
            ('—', ' - '),
            ('´', "'"),
            ('‘', "'"),
            ('‚', "'"),
            ('’', "'"),
            ("''", '"'),
            ('´´', '"'),
            ('…', '...'),
        )
        for src, tgt in substitutions:
            text = text.replace(src, tgt)

        return text

    def normalize(self, text):
        """Script-specific normalization; implemented by subclasses."""
        pass
74
+
75
+
76
class BaseNormalizer(NormalizerI):
    """
    Script-agnostic normalizer for Brahmi-derived Indic scripts.

    In addition to the common character clean-up from NormalizerI, supports:

    * optional canonicalization of chandra vowels/chandrabindu
    * optional canonicalization of nasals (to anusvaara or to nasal consonants)
    * optional normalization of word-final vowels (IE vs Dravidian rules)

    Script-specific normalizers derive from this class and extend normalize().
    """

    def __init__(self,lang,
                remove_nuktas=False,
                nasals_mode='do_nothing',
                do_normalize_chandras=False,
                do_normalize_vowel_ending=False):
        """
        lang: ISO language code understood by the langinfo module.
        remove_nuktas: strip the nukta sign (used by subclasses).
        nasals_mode: one of 'do_nothing', 'to_anusvaara_strict',
            'to_anusvaara_relaxed', 'to_nasal_consonants'.
        do_normalize_chandras: canonicalize chandra vowels/chandrabindu.
        do_normalize_vowel_ending: normalize word-final vowels.
        """
        self.lang=lang
        self.remove_nuktas=remove_nuktas
        self.nasals_mode=nasals_mode
        self.do_normalize_chandras=do_normalize_chandras
        self.do_normalize_vowel_ending=do_normalize_vowel_ending

        self._init_normalize_chandras()
        self._init_normalize_nasals()
        self._init_normalize_vowel_ending()
        #self._init_visarga_correction()

    def _init_normalize_vowel_ending(self):
        # Pick the word-ending normalization rule by language family;
        # identity function when the language belongs to neither family.
        if self.lang in langinfo.IE_LANGUAGES:
            self.fn_vowel_ending=self._normalize_word_vowel_ending_ie
        elif self.lang in langinfo.DRAVIDIAN_LANGUAGES:
            self.fn_vowel_ending=self._normalize_word_vowel_ending_dravidian
        else:
            self.fn_vowel_ending=lambda x: x

    def _init_normalize_chandras(self):
        """Precompute (chandra form -> canonical form) character pairs for this script."""

        substitution_offsets =\
            [
                [0x0d , 0x0f], # chandra e, independent
                [0x11 , 0x13], # chandra o, independent
                [0x45 , 0x47], # chandra e , dependent
                [0x49 , 0x4b], # chandra o , dependent
                # [0x72 , 0x0f], # mr: chandra e, independent

                [0x00 , 0x02], # chandrabindu
                [0x01 , 0x02], # chandrabindu
            ]

        self.chandra_substitutions =  [
                (langinfo.offset_to_char(x[0],self.lang), langinfo.offset_to_char(x[1],self.lang))
                    for x in substitution_offsets ]

    def _normalize_chandras(self,text):
        # Apply the precomputed chandra substitutions.
        for match, repl in self.chandra_substitutions:
            text=text.replace(match,repl)
        return text

    def _init_to_anusvaara_strict(self):
        """
        Build patterns mapping nasal+halant before a homorganic plosive to anusvaara.

        `r1_nasal=re.compile(r'\\u0919\\u094D([\\u0915-\\u0918])')`
        """

        # Each signature: [nasal offset, start of plosive range, end of plosive range]
        pat_signatures=\
            [
                [0x19,0x15,0x18],
                [0x1e,0x1a,0x1d],
                [0x23,0x1f,0x22],
                [0x28,0x24,0x27],
                [0x29,0x24,0x27],
                [0x2e,0x2a,0x2d],
            ]

        halant_offset=0x4d
        anusvaara_offset=0x02

        pats=[]

        for pat_signature in pat_signatures:
            pat=re.compile(r'{nasal}{halant}([{start_r}-{end_r}])'.format(
                nasal=langinfo.offset_to_char(pat_signature[0],self.lang),
                halant=langinfo.offset_to_char(halant_offset,self.lang),
                start_r=langinfo.offset_to_char(pat_signature[1],self.lang),
                end_r=langinfo.offset_to_char(pat_signature[2],self.lang),
            ))
            pats.append(pat)

        repl_string='{anusvaara}\\1'.format(anusvaara=langinfo.offset_to_char(anusvaara_offset,self.lang))

        self.pats_repls=(pats,repl_string)

    def _to_anusvaara_strict(self,text):
        # Replace nasal+halant with anusvaara only before homorganic plosives.
        pats, repl_string = self.pats_repls
        for pat in pats:
            text=pat.sub(repl_string,text)

        return text

    def _init_to_anusvaara_relaxed(self):
        """
        Build a single pattern mapping any nasal+halant to anusvaara.

        `r1_nasal=re.compile(r'\\u0919\\u094D([\\u0915-\\u0918])')`
        """

        nasals_list=[0x19,0x1e,0x23,0x28,0x29,0x2e]
        nasals_list_str=','.join([langinfo.offset_to_char(x,self.lang) for x in nasals_list])

        halant_offset=0x4d
        anusvaara_offset=0x02

        pat=re.compile(r'[{nasals_list_str}]{halant}'.format(
            nasals_list_str=nasals_list_str,
            halant=langinfo.offset_to_char(halant_offset,self.lang),
        ))

        repl_string='{anusvaara}'.format(anusvaara=langinfo.offset_to_char(anusvaara_offset,self.lang))

        self.pats_repls = (pat,repl_string)

    def _to_anusvaara_relaxed(self,text):
        pat, repl_string = self.pats_repls
        return pat.sub(repl_string,text)


    def _init_to_nasal_consonants(self):
        """
        Build patterns mapping anusvaara before a plosive to the homorganic nasal+halant.

        `r1_nasal=re.compile(r'\\u0919\\u094D([\\u0915-\\u0918])')`
        """

        pat_signatures=\
            [
                [0x19,0x15,0x18],
                [0x1e,0x1a,0x1d],
                [0x23,0x1f,0x22],
                [0x28,0x24,0x27],
                [0x29,0x24,0x27],
                [0x2e,0x2a,0x2d],
            ]

        halant_offset=0x4d
        anusvaara_offset=0x02

        pats=[]
        repl_strings=[]

        for pat_signature in pat_signatures:
            pat=re.compile(r'{anusvaara}([{start_r}-{end_r}])'.format(
                anusvaara=langinfo.offset_to_char(anusvaara_offset,self.lang),
                start_r=langinfo.offset_to_char(pat_signature[1],self.lang),
                end_r=langinfo.offset_to_char(pat_signature[2],self.lang),
            ))
            pats.append(pat)
            repl_string='{nasal}{halant}\\1'.format(
                nasal=langinfo.offset_to_char(pat_signature[0],self.lang),
                halant=langinfo.offset_to_char(halant_offset,self.lang),
                )
            repl_strings.append(repl_string)

        self.pats_repls=list(zip(pats,repl_strings))

    def _to_nasal_consonants(self,text):

        for pat, repl in self.pats_repls:
            text=pat.sub(repl,text)

        return text

    def _init_normalize_nasals(self):
        # Precompute patterns for the selected nasal-normalization mode.
        if self.nasals_mode == 'to_anusvaara_strict':
            self._init_to_anusvaara_strict()
        elif self.nasals_mode == 'to_anusvaara_relaxed':
            self._init_to_anusvaara_relaxed()
        elif self.nasals_mode == 'to_nasal_consonants':
            self._init_to_nasal_consonants()

    def _normalize_nasals(self,text):
        # Dispatch on the configured mode; 'do_nothing' (or any other value)
        # leaves the text unchanged.
        if self.nasals_mode == 'to_anusvaara_strict':
            return self._to_anusvaara_strict(text)
        elif self.nasals_mode == 'to_anusvaara_relaxed':
            return self._to_anusvaara_relaxed(text)
        elif self.nasals_mode == 'to_nasal_consonants':
            return self._to_nasal_consonants(text)
        else:
            return text


    def _normalize_word_vowel_ending_dravidian(self,word):
        """
        for Dravidian
        - consonant ending: add 'a' ki maatra
        - halant ending: no change
        - 'a' ki maatra: no change
        """
        if len(word)>0 and langinfo.is_consonant(word[-1],self.lang):
            return word+langinfo.offset_to_char(0x3e,self.lang)
        else:
            return word

    def _normalize_word_vowel_ending_ie(self,word):
        """
        for IE
        - consonant ending: add halant
        - halant ending: no change
        - 'a' ki maatra: no change
        """
        if len(word)>0 and langinfo.is_consonant(word[-1],self.lang):
            return word+langinfo.offset_to_char(langinfo.HALANTA_OFFSET,self.lang)
        else:
            return word

    def _normalize_vowel_ending(self,text):
        # Apply the family-specific rule word by word (space-separated).
        return ' '.join([ self.fn_vowel_ending(w) for w in text.split(' ') ])

    def normalize(self,text):
        """
        Common normalization for Indic scripts; subclasses extend this method.
        """
        text=text.replace(NormalizerI.BYTE_ORDER_MARK,'')
        text=text.replace(NormalizerI.BYTE_ORDER_MARK_2,'')
        text=text.replace(NormalizerI.WORD_JOINER,'')
        text=text.replace(NormalizerI.SOFT_HYPHEN,'')

        text=text.replace(NormalizerI.ZERO_WIDTH_SPACE,' ') # ??
        text=text.replace(NormalizerI.NO_BREAK_SPACE,' ')

        text=text.replace(NormalizerI.ZERO_WIDTH_NON_JOINER, '')
        text=text.replace(NormalizerI.ZERO_WIDTH_JOINER,'')

        text=self._normalize_punctuations(text)

        if self.do_normalize_chandras:
            text=self._normalize_chandras(text)
        text=self._normalize_nasals(text)
        if self.do_normalize_vowel_ending:
            text=self._normalize_vowel_ending(text)

        return text


    def get_char_stats(self,text):
        # Diagnostic: print counts of the special characters handled by normalize().
        print(len(re.findall(NormalizerI.BYTE_ORDER_MARK,text)))
        print(len(re.findall(NormalizerI.BYTE_ORDER_MARK_2,text)))
        print(len(re.findall(NormalizerI.WORD_JOINER,text)))
        print(len(re.findall(NormalizerI.SOFT_HYPHEN,text)))

        print(len(re.findall(NormalizerI.ZERO_WIDTH_SPACE,text) ))
        print(len(re.findall(NormalizerI.NO_BREAK_SPACE,text)))

        print(len(re.findall(NormalizerI.ZERO_WIDTH_NON_JOINER,text)))
        print(len(re.findall(NormalizerI.ZERO_WIDTH_JOINER,text)))

        #for mobj in re.finditer(NormalizerI.ZERO_WIDTH_NON_JOINER,text):
        #    print text[mobj.start()-10:mobj.end()+10].replace('\n', ' ').replace(NormalizerI.ZERO_WIDTH_NON_JOINER,'').encode('utf-8')
        #print hex(ord(text[mobj.end():mobj.end()+1]))

    def correct_visarga(self,text,visarga_char,char_range):
        """
        Replace an ASCII colon following a Devanagari character with the visarga sign.

        Bug fix: the original computed the substitution but never returned it,
        so the method always returned None and discarded the result.

        NOTE(review): `visarga_char` and `char_range` are accepted but unused —
        the Devanagari range and visarga are hard-coded; parameters kept for
        interface compatibility.
        """
        return re.sub(r'([\u0900-\u097f]):','\\1\u0903',text)
327
+
328
+
329
+
330
class DevanagariNormalizer(BaseNormalizer):
    """
    Normalizer for the Devanagari script. On top of the common normalization
    from BaseNormalizer:

    * decomposes precomposed nukta characters into base consonant + nukta
    * replaces the pipe character '|' with the poorna virama
    * replaces an ASCII colon with the visarga when it follows a Devanagari character
    """

    NUKTA='\u093C'

    # (precomposed character, base consonant) pairs; the nukta sign is appended
    # to the base during decomposition.
    _NUKTA_DECOMPOSITIONS = (
        ('\u0929', '\u0928'),
        ('\u0931', '\u0930'),
        ('\u0934', '\u0933'),
        ('\u0958', '\u0915'),
        ('\u0959', '\u0916'),
        ('\u095A', '\u0917'),
        ('\u095B', '\u091C'),
        ('\u095C', '\u0921'),
        ('\u095D', '\u0922'),
        ('\u095E', '\u092B'),
        ('\u095F', '\u092F'),
    )

    def __init__(self, lang='hi', remove_nuktas=False, nasals_mode='do_nothing',
                 do_normalize_chandras=False, do_normalize_vowel_ending=False):
        super(DevanagariNormalizer, self).__init__(lang, remove_nuktas, nasals_mode,
                                                   do_normalize_chandras, do_normalize_vowel_ending)

    def normalize(self, text):
        # Common normalization for Indic scripts first.
        text = super(DevanagariNormalizer, self).normalize(text)

        # Chandra-a replacement for Marathi.
        text = text.replace('\u0972', '\u090f')

        # Decompose precomposed nukta characters.
        nukta = DevanagariNormalizer.NUKTA
        for composed, base in DevanagariNormalizer._NUKTA_DECOMPOSITIONS:
            text = text.replace(composed, base + nukta)

        if self.remove_nuktas:
            text = text.replace(nukta, '')

        # Pipe character -> poorna virama.
        text = text.replace('\u007c', '\u0964')

        # ASCII colon after a Devanagari character -> visarga.
        text = re.sub(r'([\u0900-\u097f]):', '\\1\u0903', text)

        return text

    def get_char_stats(self, text):
        super(DevanagariNormalizer, self).get_char_stats(text)

        # Counts of the precomposed nukta characters, in decomposition order.
        for composed, _base in DevanagariNormalizer._NUKTA_DECOMPOSITIONS:
            print((len(re.findall(composed, text))))
404
+
405
class GurmukhiNormalizer(BaseNormalizer):
    """
    Normalizer for the Gurmukhi script. On top of the common normalization
    from BaseNormalizer:

    * decomposes precomposed nukta characters into base consonant + nukta
    * replaces script-specific poorna virama codepoints with the generic Indic ones
    * replaces the pipe character '|' with the poorna virama
    * replaces an ASCII colon with the visarga when it follows a Gurmukhi character
    * canonicalizes the multiple encodings of independent vowels (iri/ura bases)
    """

    NUKTA='\u0A3C'

    VOWEL_NORM_MAPS={
        ## http://www.unicode.org/versions/Unicode12.1.0/ch12.pdf
        ## Table 12-16
        '\u0a05\u0a3e': '\u0a06',
        '\u0a72\u0a3f': '\u0a07',
        '\u0a72\u0a40': '\u0a08',
        '\u0a73\u0a41': '\u0a09',
        '\u0a73\u0a42': '\u0a0a',
        '\u0a72\u0a47': '\u0a0f',
        '\u0a05\u0a48': '\u0a10',
        '\u0a73\u0a4b': '\u0a13',
        '\u0a05\u0a4c': '\u0a14',
    }

    # (precomposed character, base consonant) pairs for nukta decomposition.
    _NUKTA_DECOMPOSITIONS = (
        ('\u0a33', '\u0a32'),
        ('\u0a36', '\u0a38'),
        ('\u0a59', '\u0a16'),
        ('\u0a5a', '\u0a17'),
        ('\u0a5b', '\u0a1c'),
        ('\u0a5e', '\u0a2b'),
    )

    def __init__(self, lang='pa', remove_nuktas=False, nasals_mode='do_nothing', do_normalize_chandras=False,
                 do_normalize_vowel_ending=False,
                 do_canonicalize_addak=False,
                 do_canonicalize_tippi=False,
                 do_replace_vowel_bases=False):
        super(GurmukhiNormalizer, self).__init__(lang, remove_nuktas, nasals_mode,
                                                 do_normalize_chandras, do_normalize_vowel_ending)
        self.do_canonicalize_addak = do_canonicalize_addak
        self.do_canonicalize_tippi = do_canonicalize_tippi
        self.do_replace_vowel_bases = do_replace_vowel_bases


    def _normalize_vowels(self, text):
        """Canonicalize the multiple encodings of independent vowels."""

        ## standard vowel replacements as per suggestions in
        ## http://www.unicode.org/versions/Unicode12.1.0/ch12.pdf
        ## Table 12-16
        for composed, canonical in GurmukhiNormalizer.VOWEL_NORM_MAPS.items():
            text = text.replace(composed, canonical)

        ## The mappings above account for the majority of variations.
        ## If the special vowel-base characters occur without any diacritic,
        ## optionally replace them with their closest equivalent vowels.
        if self.do_replace_vowel_bases:
            text = text.replace('\u0a72', '\u0a07')
            text = text.replace('\u0a73', '\u0a09')

        return text


    def normalize(self, text):

        # Addak: optionally rewrite addak+consonant as consonant+halant+consonant.
        if self.do_canonicalize_addak:
            text = re.sub(r'\u0a71(.)', '\\1\u0a4d\\1', text)

        # Tippi: optionally canonicalize to bindi.
        if self.do_canonicalize_tippi:
            text = text.replace('\u0a70', '\u0a02')

        # Vowels: Gurmukhi has multiple ways of representing independent vowels
        # due to the characters 'iri' and 'ura'.
        text = self._normalize_vowels(text)

        # Common normalization for Indic scripts.
        text = super(GurmukhiNormalizer, self).normalize(text)

        # Decompose precomposed nukta characters.
        for composed, base in GurmukhiNormalizer._NUKTA_DECOMPOSITIONS:
            text = text.replace(composed, base + GurmukhiNormalizer.NUKTA)

        if self.remove_nuktas:
            text = text.replace(GurmukhiNormalizer.NUKTA, '')

        # Script-specific poorna virama codes -> generic Indic codes.
        text = text.replace('\u0a64', '\u0964')
        text = text.replace('\u0a65', '\u0965')

        # Pipe character -> poorna virama.
        text = text.replace('\u007c', '\u0964')

        # ASCII colon after a Gurmukhi character -> visarga.
        text = re.sub(r'([\u0a00-\u0a7f]):', '\\1\u0a03', text)

        return text
509
+
510
+
511
class GujaratiNormalizer(BaseNormalizer):
    """
    Normalizer for the Gujarati script. On top of the common normalization
    from BaseNormalizer:

    * replaces script-specific poorna virama codepoints with the generic Indic ones
    * replaces an ASCII colon with the visarga when it follows a Gujarati character
    """

    NUKTA='\u0ABC'

    def __init__(self, lang='gu', remove_nuktas=False, nasals_mode='do_nothing',
                 do_normalize_chandras=False, do_normalize_vowel_ending=False):
        super(GujaratiNormalizer, self).__init__(lang, remove_nuktas, nasals_mode,
                                                 do_normalize_chandras, do_normalize_vowel_ending)

    def normalize(self, text):

        # Common normalization for Indic scripts first.
        text = super(GujaratiNormalizer, self).normalize(text)

        # Optionally strip the nukta sign.
        if self.remove_nuktas:
            text = text.replace(GujaratiNormalizer.NUKTA, '')

        # Script-specific poorna virama codes -> generic Indic codes.
        for script_specific, generic in (('\u0ae4', '\u0964'), ('\u0ae5', '\u0965')):
            text = text.replace(script_specific, generic)

        # ASCII colon after a Gujarati character -> visarga.
        text = re.sub(r'([\u0a80-\u0aff]):', '\\1\u0a83', text)

        return text
544
+
545
+
546
class OriyaNormalizer(BaseNormalizer):
    """
    Oriya-script normalizer.

    On top of the generic Indic normalization done by BaseNormalizer it:

    * decomposes the precomposed nukta consonants
    * maps the script-specific poorna virama codepoints to the generic Indic ones
    * canonicalizes two-part dependent vowel signs
    * rewrites 'va' (and optionally 'wa') as 'ba'
    * maps the pipe-shaped Oriya danda to the generic poorna virama
    * turns a colon ':' into a visarga when it directly follows an Oriya character
    """

    # Oriya sign nukta
    NUKTA='\u0B3C'

    # canonical single codepoints for independent vowels written as two codepoints
    ## See Table 12-22 in http://www.unicode.org/versions/Unicode12.1.0/ch12.pdf
    VOWEL_NORM_MAPS={
        '\u0b05\u0b3e': '\u0b06',
        '\u0b0f\u0b57': '\u0b10',
        '\u0b13\u0b57': '\u0b14',
    }

    def __init__(self,lang='or',remove_nuktas=False,nasals_mode='do_nothing',do_normalize_chandras=False,
                do_normalize_vowel_ending=False,
                do_remap_wa=False):
        super(OriyaNormalizer,self).__init__(lang,remove_nuktas,nasals_mode,do_normalize_chandras,do_normalize_vowel_ending)
        self.do_remap_wa=do_remap_wa

    def normalize(self,text):
        """Return the normalized form of `text`."""
        # shared Indic-script normalization first
        text=super(OriyaNormalizer,self).normalize(text)

        # canonical independent vowels, as recommended by the Unicode standard
        for two_part,canonical in OriyaNormalizer.VOWEL_NORM_MAPS.items():
            text=text.replace(two_part,canonical)

        # precomposed nukta consonants -> base consonant + nukta
        text=text.replace('\u0b5c','\u0b21'+OriyaNormalizer.NUKTA)
        text=text.replace('\u0b5d','\u0b22'+OriyaNormalizer.NUKTA)

        # optionally strip the nukta sign altogether
        if self.remove_nuktas:
            text=text.replace(OriyaNormalizer.NUKTA,'')

        # script-specific danda / double danda -> generic Indic codepoints
        text=text.replace('\u0b64','\u0964')
        text=text.replace('\u0b65','\u0965')

        # the Oriya pipe-shaped danda also becomes the generic poorna virama
        text=text.replace('\u0b7c','\u0964')

        # optionally fold 'wa' into 'ba'
        # NOTE: documentation (chapter on Indic scripts) and codepoint chart seem contradictory
        # for both the wa->ba and the va->ba rules below
        if self.do_remap_wa:
            text=text.replace('\u0b71','\u0b2c')

        # fold 'va' into 'ba'
        text=text.replace('\u0b35','\u0b2c')

        # AI dependent vowel sign written as two codepoints
        text=text.replace('\u0b47\u0b56','\u0b58')

        # two-part dependent vowels O and AU
        text=text.replace('\u0b47\u0b3e','\u0b4b')
        text=text.replace('\u0b47\u0b57','\u0b4c')

        # additional consonant: unclear how to handle, deliberately left untouched

        # a colon after an Oriya character is really a visarga
        text=re.sub(r'([\u0b00-\u0b7f]):','\\1\u0b03',text)

        return text
622
+
623
+
624
class BengaliNormalizer(BaseNormalizer):
    """
    Bengali/Assamese-script normalizer.

    On top of the generic Indic normalization done by BaseNormalizer it:

    * decomposes the precomposed nukta consonants
    * optionally remaps the Assamese-only 'ra'/'va' letters to their Bengali shapes
    * maps the script-specific poorna virama codepoints to the generic Indic ones
    * canonicalizes two-part dependent vowel signs
    * maps the ASCII pipe '|' (and the currency-numerator-four glyph) to the poorna virama
    * turns a colon ':' into a visarga when it directly follows a Bengali character
    """

    # Bengali sign nukta
    NUKTA='\u09BC'

    def __init__(self,lang='bn',remove_nuktas=False,nasals_mode='do_nothing',do_normalize_chandras=False,
                do_normalize_vowel_ending=False,
                do_remap_assamese_chars=False):
        super(BengaliNormalizer,self).__init__(lang,remove_nuktas,nasals_mode,do_normalize_chandras,do_normalize_vowel_ending)
        self.do_remap_assamese_chars=do_remap_assamese_chars

    def normalize(self,text):
        """Return the normalized form of `text`."""
        # shared Indic-script normalization first
        text=super(BengaliNormalizer,self).normalize(text)

        # precomposed nukta consonants -> base consonant + nukta
        nukta=BengaliNormalizer.NUKTA
        text=text.replace('\u09dc','\u09a1'+nukta)
        text=text.replace('\u09dd','\u09a2'+nukta)
        text=text.replace('\u09df','\u09af'+nukta)

        # optionally strip the nukta sign altogether
        if self.remove_nuktas:
            text=text.replace(nukta,'')

        # Assamese-specific letter shapes, only when requested for Assamese input
        if self.do_remap_assamese_chars and self.lang=='as':
            text=text.replace('\u09f0','\u09b0') # 'ra' character
            text=text.replace('\u09f1','\u09ac') # 'va' character

        # script-specific danda / double danda -> generic Indic codepoints
        text=text.replace('\u09e4','\u0964')
        text=text.replace('\u09e5','\u0965')

        # ASCII pipe used as a danda -> generic poorna virama
        text=text.replace('\u007c','\u0964')
        # the Bengali currency-numerator-four glyph looks similar and is used as a substitute
        text=text.replace('\u09f7','\u0964')

        # two-part dependent vowels O and AU
        text=text.replace('\u09c7\u09be','\u09cb')
        text=text.replace('\u09c7\u09d7','\u09cc')

        # a colon after a Bengali character is really a visarga
        text=re.sub(r'([\u0980-\u09ff]):','\\1\u0983',text)

        return text
679
+
680
+
681
class TamilNormalizer(BaseNormalizer):
    """
    Tamil-script normalizer.

    On top of the generic Indic normalization done by BaseNormalizer it:

    * maps the script-specific poorna virama codepoints to the generic Indic ones
    * canonicalizes two-part dependent vowel signs
    * turns a colon ':' into a visarga when it directly follows a Tamil character
    """

    def __init__(self,lang='ta',remove_nuktas=False,nasals_mode='do_nothing',
                do_normalize_chandras=False,do_normalize_vowel_ending=False):
        super(TamilNormalizer,self).__init__(lang,remove_nuktas,nasals_mode,do_normalize_chandras,do_normalize_vowel_ending)

    def normalize(self,text):
        """Return the normalized form of `text`."""
        # shared Indic-script normalization first
        text=super(TamilNormalizer,self).normalize(text)

        # script-specific danda / double danda -> generic Indic codepoints
        text=text.replace('\u0be4','\u0964')
        text=text.replace('\u0be5','\u0965')

        # two-part vowel sequences -> precomposed codepoints
        for split_form,composed in (
                ('\u0b92\u0bd7','\u0b94'),
                ('\u0bc6\u0bbe','\u0bca'),
                ('\u0bc7\u0bbe','\u0bcb'),
                ('\u0bc6\u0bd7','\u0bcc')):
            text=text.replace(split_form,composed)

        # a colon after a Tamil character is really a visarga
        text=re.sub(r'([\u0b80-\u0bff]):','\\1\u0b83',text)

        return text
714
+
715
+
716
class TeluguNormalizer(BaseNormalizer):
    """
    Telugu-script normalizer.

    On top of the generic Indic normalization done by BaseNormalizer it:

    * maps the script-specific poorna virama codepoints to the generic Indic ones
    * canonicalizes the two-part AI dependent vowel sign
    * turns a colon ':' into a visarga when it directly follows a Telugu character
    """

    def __init__(self,lang='te',remove_nuktas=False,nasals_mode='do_nothing',
                do_normalize_chandras=False,do_normalize_vowel_ending=False):
        super(TeluguNormalizer,self).__init__(lang,remove_nuktas,nasals_mode,do_normalize_chandras,do_normalize_vowel_ending)

    def normalize(self,text):
        """Return the normalized form of `text`."""
        # shared Indic-script normalization first
        text=super(TeluguNormalizer,self).normalize(text)

        # script-specific danda / double danda -> generic Indic codepoints
        text=text.replace('\u0c64','\u0964')
        text=text.replace('\u0c65','\u0965')

        # AI dependent vowel written as two codepoints -> precomposed sign
        text=text.replace('\u0c46\u0c56','\u0c48')

        # a colon after a Telugu character is really a visarga
        text=re.sub(r'([\u0c00-\u0c7f]):','\\1\u0c03',text)

        return text

    def get_char_stats(self,text):
        # not implemented for Telugu
        pass
749
+
750
class KannadaNormalizer(BaseNormalizer):
    """
    Kannada-script normalizer.

    On top of the generic Indic normalization done by BaseNormalizer it:

    * maps the script-specific poorna virama codepoints to the generic Indic ones
    * canonicalizes two-part dependent vowel signs
    * turns a colon ':' into a visarga when it directly follows a Kannada character
    """

    def __init__(self,lang='kn',remove_nuktas=False,nasals_mode='do_nothing',
                do_normalize_chandras=False,do_normalize_vowel_ending=False):
        super(KannadaNormalizer,self).__init__(lang,remove_nuktas,nasals_mode,do_normalize_chandras,do_normalize_vowel_ending)

    def normalize(self,text):
        """Return the normalized form of `text`."""
        # shared Indic-script normalization first
        text=super(KannadaNormalizer,self).normalize(text)

        # script-specific danda / double danda -> generic Indic codepoints
        text=text.replace('\u0ce4','\u0964')
        text=text.replace('\u0ce5','\u0965')

        # two-part dependent vowel signs -> precomposed codepoints
        for split_form,composed in (
                ('\u0cbf\u0cd5','\u0cc0'),
                ('\u0cc6\u0cd5','\u0cc7'),
                ('\u0cc6\u0cd6','\u0cc8'),
                ('\u0cc6\u0cc2','\u0cca'),
                ('\u0cca\u0cd5','\u0ccb')):
            text=text.replace(split_form,composed)

        # a colon after a Kannada character is really a visarga
        text=re.sub(r'([\u0c80-\u0cff]):','\\1\u0c83',text)

        return text
785
+
786
+
787
class MalayalamNormalizer(BaseNormalizer):
    """
    Malayalam-script normalizer.

    On top of the generic Indic normalization done by BaseNormalizer it:

    * re-encodes chillu letters from the pre-Unicode-5.1 consonant+virama+ZWJ
      sequences to the atomic chillu codepoints
    * optionally expands chillus back to consonant+virama form
    * maps the script-specific poorna virama codepoints to the generic Indic ones
    * canonicalizes two-part dependent vowel signs and the AU length mark
    * optionally rewrites geminated /rr/ as geminated /tt/
    * turns a colon ':' into a visarga when it directly follows a Malayalam character
    """

    # atomic chillu codepoint -> base consonant (a virama is appended when expanding)
    CHILLU_CHAR_MAP= {
        '\u0d7a': '\u0d23',
        '\u0d7b': '\u0d28',
        '\u0d7c': '\u0d30',
        '\u0d7d': '\u0d32',
        '\u0d7e': '\u0d33',
        '\u0d7f': '\u0d15',
    }

    def _canonicalize_chillus(self,text):
        # expand each atomic chillu into base consonant + virama
        for chillu,base in MalayalamNormalizer.CHILLU_CHAR_MAP.items():
            text=text.replace(chillu,base+'\u0d4d')
        return text

    def _correct_geminated_T(self,text):
        # geminated /rr/ -> geminated /tt/
        return text.replace('\u0d31\u0d4d\u0d31','\u0d1f\u0d4d\u0d1f')

    def __init__(self,lang='ml',remove_nuktas=False,nasals_mode='do_nothing',do_normalize_chandras=False,
                do_normalize_vowel_ending=False,
                do_canonicalize_chillus=False, do_correct_geminated_T=False):
        super(MalayalamNormalizer,self).__init__(lang,remove_nuktas,nasals_mode,do_normalize_chandras,do_normalize_vowel_ending)
        self.do_canonicalize_chillus=do_canonicalize_chillus
        self.do_correct_geminated_T=do_correct_geminated_T

    def normalize(self,text):
        """Return the normalized form of `text`."""
        # old (pre Unicode 5.1) consonant+virama+ZWJ sequences -> atomic chillus
        for old_seq,atomic in (
                ('\u0d23\u0d4d\u200d','\u0d7a'),
                ('\u0d28\u0d4d\u200d','\u0d7b'),
                ('\u0d30\u0d4d\u200d','\u0d7c'),
                ('\u0d32\u0d4d\u200d','\u0d7d'),
                ('\u0d33\u0d4d\u200d','\u0d7e'),
                ('\u0d15\u0d4d\u200d','\u0d7f')):
            text=text.replace(old_seq,atomic)

        # optionally expand chillus back to consonant+virama form
        if self.do_canonicalize_chillus:
            text=self._canonicalize_chillus(text)

        # shared Indic-script normalization
        text=super(MalayalamNormalizer,self).normalize(text)

        # script-specific danda / double danda -> generic Indic codepoints
        text=text.replace('\u0d64','\u0964')
        text=text.replace('\u0d65','\u0965')

        # two-part dependent vowels O and OO
        text=text.replace('\u0d46\u0d3e','\u0d4a')
        text=text.replace('\u0d47\u0d3e','\u0d4b')

        # AU: the two-part form, then any remaining bare AU length mark
        text=text.replace('\u0d46\u0d57','\u0d4c')
        text=text.replace('\u0d57','\u0d4c')

        # optionally normalize geminated T
        if self.do_correct_geminated_T:
            text=self._correct_geminated_T(text)

        # a colon after a Malayalam character is really a visarga
        text=re.sub(r'([\u0d00-\u0d7f]):','\\1\u0d03',text)

        return text
859
+
860
class UrduNormalizer(NormalizerI):
    '''Urdu normalizer backed by the UrduHack library.

    https://docs.urduhack.com/en/stable/_modules/urduhack/normalization/character.html#normalize

    UrduHack is imported lazily in __init__ so the dependency is only
    required when Urdu text is actually processed.
    '''

    def __init__(self, lang, remove_nuktas=True):
        self.lang = lang
        self.remove_nuktas = remove_nuktas

        # BUGFIX: a plain `from ... import name` here only creates locals
        # that disappear when __init__ returns; the previous code then
        # accessed them as class attributes (UrduNormalizer.normalize_whitespace
        # etc.) in normalize(), which raised AttributeError on every call.
        # Bind the imported helpers to the instance instead.
        from urduhack.normalization import (
            remove_diacritics,
            normalize_characters,
            normalize_combine_characters
        )  # TODO: Use only required normalizers
        from urduhack.preprocessing import (
            normalize_whitespace,
            digits_space,
            all_punctuations_space,
            english_characters_space
        )
        self._remove_diacritics = remove_diacritics
        self._normalize_characters = normalize_characters
        self._normalize_combine_characters = normalize_combine_characters
        self._normalize_whitespace = normalize_whitespace
        self._digits_space = digits_space
        self._all_punctuations_space = all_punctuations_space
        self._english_characters_space = english_characters_space

    def normalize(self, text):
        """Return `text` normalized with the UrduHack pipeline."""
        text = self._normalize_punctuations(text)
        text = self._normalize_whitespace(text)
        if self.remove_nuktas:
            text = self._remove_diacritics(text)
        text = self._normalize_characters(text)
        text = self._normalize_combine_characters(text)
        text = self._digits_space(text)
        text = self._all_punctuations_space(text)
        text = self._english_characters_space(text)
        return text
892
+
893
+
894
class IndicNormalizerFactory(object):
    """
    Factory class to create language specific normalizers.
    """

    def get_normalizer(self,language,**kwargs):
        """
        Return the language-specific normalizer for `language`.

        Parameters:
        |language: language code
        |remove_nuktas: boolean, should the normalizer remove nukta characters
        """
        # Urdu is the only non-Brahmi script handled here
        if language=='ur':
            return UrduNormalizer(lang=language, **kwargs)

        # Brahmi-derived scripts, grouped by the normalizer that handles them;
        # anything unknown falls back to the generic BaseNormalizer.
        script_normalizer={
            'hi': DevanagariNormalizer, 'mr': DevanagariNormalizer,
            'sa': DevanagariNormalizer, 'kK': DevanagariNormalizer,
            'ne': DevanagariNormalizer, 'sd': DevanagariNormalizer,
            'pa': GurmukhiNormalizer,
            'gu': GujaratiNormalizer,
            'bn': BengaliNormalizer, 'as': BengaliNormalizer,
            'or': OriyaNormalizer,
            'ml': MalayalamNormalizer,
            'kn': KannadaNormalizer,
            'ta': TamilNormalizer,
            'te': TeluguNormalizer,
        }
        normalizer_class=script_normalizer.get(language,BaseNormalizer)
        return normalizer_class(lang=language, **kwargs)

    def is_language_supported(self,language):
        """
        Is the language supported?
        """
        supported={'hi','mr','sa','kK','ne','sd',
                   'ur',
                   'pa',
                   'gu',
                   'bn','as',
                   'or',
                   'ml',
                   'kn',
                   'ta',
                   'te'}
        return language in supported
953
+
954
+
955
if __name__ == '__main__':

    if len(sys.argv)<4:
        print("Usage: python normalize.py <infile> <outfile> <language> [<replace_nukta(True,False)>] [<normalize_nasals(do_nothing|to_anusvaara_strict|to_anusvaara_relaxed|to_nasal_consonants)>]")
        sys.exit(1)

    language=sys.argv[3]
    remove_nuktas=False
    normalize_nasals='do_nothing'
    if len(sys.argv)>=5:
        # BUGFIX: bool(sys.argv[4]) is True for ANY non-empty string,
        # including the literal "False"; parse the True/False token instead.
        remove_nuktas=(sys.argv[4].strip().lower()=='true')
    if len(sys.argv)>=6:
        normalize_nasals=sys.argv[5]

    # create the language-specific normalizer
    factory=IndicNormalizerFactory()
    normalizer=factory.get_normalizer(language,remove_nuktas=remove_nuktas,nasals_mode=normalize_nasals)

    # normalize the input file line by line
    with codecs.open(sys.argv[1],'r','utf-8') as ifile:
        with codecs.open(sys.argv[2],'w','utf-8') as ofile:
            for line in ifile:
                ofile.write(normalizer.normalize(line))

    ## gather status about normalization
    #with codecs.open(sys.argv[1],'r','utf-8') as ifile:
    #    normalizer=DevanagariNormalizer()
    #    text=string.join(ifile.readlines(),sep='')
    #    normalizer.get_char_stats(text)
indicTrans/indic_nlp_library/indicnlp/script/__init__.py ADDED
File without changes
indicTrans/indic_nlp_library/indicnlp/script/english_script.py ADDED
@@ -0,0 +1,154 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #
2
+ # Copyright (c) 2013-present, Anoop Kunchukuttan
3
+ # All rights reserved.
4
+ #
5
+ # This source code is licensed under the MIT license found in the
6
+ # LICENSE file in the root directory of this source tree.
7
+ #
8
+
9
+ import pandas as pd
10
+ import numpy as np
11
+
12
+ from indicnlp import common
13
+ from indicnlp.common import IndicNlpException
14
+
15
+
16
+ #### Maps from ARPABET to Internal Id
17
+ ARPABET_ID_MAP={}
18
+ ID_ARPABET_MAP={}
19
+
20
+
21
+ ###
22
+ # Phonetic Information about script characters
23
+ ###
24
+
25
+ """ Phonetic data for English """
26
+ ENGLISH_PHONETIC_DATA=None
27
+
28
+ """ Phonetic vector for English"""
29
+ ENGLISH_PHONETIC_VECTORS=None
30
+
31
+ """ Length of phonetic vector """
32
+ PHONETIC_VECTOR_LENGTH=38
33
+
34
+ """ Start offset for the phonetic feature vector in the phonetic data vector """
35
+ PHONETIC_VECTOR_START_OFFSET=6
36
+
37
+ ## PHONETIC PROPERTIES in order in which they occur in the vector
38
+ ## This list must be in sync with the keys in the PV_PROP_RANGES dictionary
39
+ PV_PROP=['basic_type',
40
+ 'vowel_length',
41
+ 'vowel_strength',
42
+ 'vowel_status',
43
+ 'consonant_type',
44
+ 'articulation_place',
45
+ 'aspiration',
46
+ 'voicing',
47
+ 'nasalization',
48
+ 'vowel_horizontal',
49
+ 'vowel_vertical',
50
+ 'vowel_roundness',
51
+ ]
52
+
53
+ ###
54
+ # Bit vector ranges for various properties
55
+ ###
56
+
57
+ PV_PROP_RANGES={
58
+ 'basic_type': [0,6],
59
+ 'vowel_length': [6,8],
60
+ 'vowel_strength': [8,11],
61
+ 'vowel_status': [11,13],
62
+ 'consonant_type': [13,18],
63
+ 'articulation_place': [18,23],
64
+ 'aspiration': [23,25],
65
+ 'voicing': [25,27],
66
+ 'nasalization': [27,29],
67
+ 'vowel_horizontal': [29,32],
68
+ 'vowel_vertical': [32,36],
69
+ 'vowel_roundness': [36,38],
70
+ }
71
+
72
+
73
+ ####
74
+ # Indexes into the Phonetic Vector
75
+ ####
76
+ PVIDX_BT_VOWEL=0
77
+ PVIDX_BT_CONSONANT=1
78
+ PVIDX_BT_NUKTA=2
79
+ PVIDX_BT_HALANT=3
80
+ PVIDX_BT_ANUSVAAR=4
81
+ PVIDX_BT_MISC=5
82
+ PVIDX_BT_S=PVIDX_BT_VOWEL
83
+ PVIDX_BT_E=PVIDX_BT_MISC+1
84
+
85
+ PVIDX_VSTAT_DEP=12
86
+
87
+ ####
88
+ SCRIPT_RANGE_START=0x0D00
89
+ ## TBD
90
+ SCRIPT_RANGE_END=0x0D2E
91
+
92
+
93
+ def init():
94
+ """
95
+ To be called by library loader, do not call it in your program
96
+ """
97
+
98
+ global ENGLISH_PHONETIC_DATA, ENGLISH_PHONETIC_VECTORS, PHONETIC_VECTOR_LENGTH, PHONETIC_VECTOR_START_OFFSET
99
+
100
+ ENGLISH_PHONETIC_DATA=pd.read_csv(common.get_resources_path()+'/script/english_script_phonetic_data.csv',encoding='utf-8')
101
+
102
+ ENGLISH_PHONETIC_VECTORS=ENGLISH_PHONETIC_DATA.iloc[:,PHONETIC_VECTOR_START_OFFSET:].values
103
+
104
+ PHONETIC_VECTOR_LENGTH=ENGLISH_PHONETIC_VECTORS.shape[1]
105
+
106
+ ### Load mapping from ARPABET representation of phoneme to internal ID
107
+ global ARPABET_ID_MAP, ID_ARPABET_MAP
108
+
109
+ with open(common.get_resources_path()+'/script/english_arpabet_list.csv','r',encoding='utf-8') as infile:
110
+ for ph_id, name in enumerate(iter(infile)):
111
+ name=name.strip()
112
+ ARPABET_ID_MAP[name]=ph_id
113
+ ID_ARPABET_MAP[ph_id]=name
114
+
115
+
116
+ def phoneme_to_offset(ph):
117
+ return ARPABET_ID_MAP[ph]
118
+
119
+ def offset_to_phoneme(ph_id):
120
+ return ID_ARPABET_MAP[ph_id]
121
+
122
+ def phoneme_to_enc(ph):
123
+ return chr(SCRIPT_RANGE_START+phoneme_to_offset(ph))
124
+
125
+ def enc_to_phoneme(ph):
126
+ return offset_to_phoneme(enc_to_offset(ph))
127
+
128
+ def enc_to_offset(c):
129
+ return ord(c)-SCRIPT_RANGE_START
130
+
131
def in_range(offset):
    """True if `offset` — an offset RELATIVE to SCRIPT_RANGE_START, as produced
    by enc_to_offset() — lies inside the script block.

    BUGFIX: the relative offset was previously compared against the absolute
    codepoint bounds (SCRIPT_RANGE_START/END), so every valid phoneme offset
    was rejected and get_phonetic_feature_vector() always returned the
    invalid vector.
    """
    return 0 <= offset < (SCRIPT_RANGE_END - SCRIPT_RANGE_START)
133
+
134
+ def get_phonetic_info(lang):
135
+ return (ENGLISH_PHONETIC_DATA, ENGLISH_PHONETIC_VECTORS)
136
+
137
+ def invalid_vector():
138
+ ## TODO: check if np datatype is correct?
139
+ return np.array([0]*PHONETIC_VECTOR_LENGTH)
140
+
141
+ def get_phonetic_feature_vector(p,lang):
142
+
143
+ offset=enc_to_offset(p)
144
+
145
+ if not in_range(offset):
146
+ return invalid_vector()
147
+
148
+ phonetic_data, phonetic_vectors= get_phonetic_info(lang)
149
+
150
+ if phonetic_data.iloc[offset]['Valid Vector Representation']==0:
151
+ return invalid_vector()
152
+
153
+ return phonetic_vectors[offset]
154
+
indicTrans/indic_nlp_library/indicnlp/script/indic_scripts.py ADDED
@@ -0,0 +1,301 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #
2
+ # Copyright (c) 2013-present, Anoop Kunchukuttan
3
+ # All rights reserved.
4
+ #
5
+ # This source code is licensed under the MIT license found in the
6
+ # LICENSE file in the root directory of this source tree.
7
+ #
8
+
9
+ import pandas as pd
10
+ import numpy as np
11
+ import os
12
+
13
+ from indicnlp import common
14
+ from indicnlp.common import IndicNlpException
15
+ from indicnlp import langinfo as li
16
+
17
+ ###
18
+ # Phonetic Information about script characters
19
+ ###
20
+
21
+ """ Phonetic data about all languages except Tamil """
22
+ ALL_PHONETIC_DATA=None
23
+
24
+ """ Phonetic data for Tamil """
25
+ TAMIL_PHONETIC_DATA=None
26
+
27
+ """ Phonetic vector for all languages except Tamil """
28
+ ALL_PHONETIC_VECTORS=None
29
+
30
+ """ Phonetic vector for Tamil """
31
+ TAMIL_PHONETIC_VECTORS=None
32
+
33
+ """ Length of phonetic vector """
34
+ PHONETIC_VECTOR_LENGTH=38
35
+
36
+ """ Start offset for the phonetic feature vector in the phonetic data vector """
37
+ PHONETIC_VECTOR_START_OFFSET=6
38
+
39
+ ## PHONETIC PROPERTIES in order in which they occur in the vector
40
+ ## This list must be in sync with the keys in the PV_PROP_RANGES dictionary
41
+ PV_PROP=['basic_type',
42
+ 'vowel_length',
43
+ 'vowel_strength',
44
+ 'vowel_status',
45
+ 'consonant_type',
46
+ 'articulation_place',
47
+ 'aspiration',
48
+ 'voicing',
49
+ 'nasalization',
50
+ 'vowel_horizontal',
51
+ 'vowel_vertical',
52
+ 'vowel_roundness',
53
+ ]
54
+
55
+ ###
56
+ # Bit vector ranges for various properties
57
+ ###
58
+
59
+ PV_PROP_RANGES={
60
+ 'basic_type': [0,6],
61
+ 'vowel_length': [6,8],
62
+ 'vowel_strength': [8,11],
63
+ 'vowel_status': [11,13],
64
+ 'consonant_type': [13,18],
65
+ 'articulation_place': [18,23],
66
+ 'aspiration': [23,25],
67
+ 'voicing': [25,27],
68
+ 'nasalization': [27,29],
69
+ 'vowel_horizontal': [29,32],
70
+ 'vowel_vertical': [32,36],
71
+ 'vowel_roundness': [36,38],
72
+ }
73
+
74
+
75
+ ####
76
+ # Indexes into the Phonetic Vector
77
+ ####
78
+ PVIDX_BT_VOWEL=0
79
+ PVIDX_BT_CONSONANT=1
80
+ PVIDX_BT_NUKTA=2
81
+ PVIDX_BT_HALANT=3
82
+ PVIDX_BT_ANUSVAAR=4
83
+ PVIDX_BT_MISC=5
84
+ PVIDX_BT_S=PVIDX_BT_VOWEL
85
+ PVIDX_BT_E=PVIDX_BT_MISC+1
86
+
87
+ PVIDX_VSTAT_DEP=12
88
+
89
+ #####
90
+ # Unicode information about characters
91
+ #####
92
+
93
+ SCRIPT_OFFSET_START=0
94
+ SCRIPT_OFFSET_RANGE=0x80
95
+
96
+ def init():
97
+ """
98
+ To be called by library loader, do not call it in your program
99
+ """
100
+
101
+ global ALL_PHONETIC_DATA, ALL_PHONETIC_VECTORS, TAMIL_PHONETIC_DATA, TAMIL_PHONETIC_VECTORS, PHONETIC_VECTOR_LENGTH, PHONETIC_VECTOR_START_OFFSET
102
+
103
+ ALL_PHONETIC_DATA=pd.read_csv(os.path.join(common.get_resources_path(),'script','all_script_phonetic_data.csv'),encoding='utf-8')
104
+ TAMIL_PHONETIC_DATA=pd.read_csv(os.path.join(common.get_resources_path(),'script','tamil_script_phonetic_data.csv'),encoding='utf-8')
105
+
106
+ ALL_PHONETIC_VECTORS= ALL_PHONETIC_DATA.iloc[:,PHONETIC_VECTOR_START_OFFSET:].values
107
+ TAMIL_PHONETIC_VECTORS=TAMIL_PHONETIC_DATA.iloc[:,PHONETIC_VECTOR_START_OFFSET:].values
108
+
109
+ PHONETIC_VECTOR_LENGTH=ALL_PHONETIC_VECTORS.shape[1]
110
+
111
def is_supported_language(lang):
    """True if `lang` has a script range registered in langinfo."""
    # membership test directly on the dict: no need to materialize the key list
    return lang in li.SCRIPT_RANGES
113
+
114
+ def get_offset(c,lang):
115
+ if not is_supported_language(lang):
116
+ raise IndicNlpException('Language {} not supported'.format(lang))
117
+ return ord(c)-li.SCRIPT_RANGES[lang][0]
118
+
119
+ def offset_to_char(off,lang):
120
+ """
121
+ Applicable to Brahmi derived Indic scripts
122
+ """
123
+ if not is_supported_language(lang):
124
+ raise IndicNlpException('Language {} not supported'.format(lang))
125
+ return chr(off+li.SCRIPT_RANGES[lang][0])
126
+
127
+ def is_indiclang_char(c,lang):
128
+ """
129
+ Applicable to Brahmi derived Indic scripts
130
+ Note that DANDA and DOUBLE_DANDA have the same Unicode codepoint for all Indic scripts
131
+ """
132
+ if not is_supported_language(lang):
133
+ raise IndicNlpException('Language {} not supported'.format(lang))
134
+ o=get_offset(c,lang)
135
+ return (o>=SCRIPT_OFFSET_START and o<SCRIPT_OFFSET_RANGE) \
136
+ or ord(c)==li.DANDA or ord(c)==li.DOUBLE_DANDA
137
+
138
+ def in_coordinated_range_offset(c_offset):
139
+ """
140
+ Applicable to Brahmi derived Indic scripts
141
+ """
142
+ return (c_offset>=li.COORDINATED_RANGE_START_INCLUSIVE and c_offset<=li.COORDINATED_RANGE_END_INCLUSIVE)
143
+
144
+ def in_coordinated_range(c,lang):
145
+ if not is_supported_language(lang):
146
+ raise IndicNlpException('Language {} not supported'.format(lang))
147
+ return in_coordinated_range_offset(get_offset(c,lang))
148
+
149
+ def get_phonetic_info(lang):
150
+ if not is_supported_language(lang):
151
+ raise IndicNlpException('Language {} not supported'.format(lang))
152
+ phonetic_data= ALL_PHONETIC_DATA if lang!=li.LC_TA else TAMIL_PHONETIC_DATA
153
+ phonetic_vectors= ALL_PHONETIC_VECTORS if lang!=li.LC_TA else TAMIL_PHONETIC_VECTORS
154
+
155
+ return (phonetic_data, phonetic_vectors)
156
+
157
+ def invalid_vector():
158
+ ## TODO: check if np datatype is correct?
159
+ return np.array([0]*PHONETIC_VECTOR_LENGTH)
160
+
161
+ def get_phonetic_feature_vector(c,lang):
162
+
163
+ offset=get_offset(c,lang)
164
+
165
+ if not in_coordinated_range_offset(offset):
166
+ return invalid_vector()
167
+
168
+ phonetic_data, phonetic_vectors= get_phonetic_info(lang)
169
+
170
+ if phonetic_data.iloc[offset]['Valid Vector Representation']==0:
171
+ return invalid_vector()
172
+
173
+ return phonetic_vectors[offset]
174
+
175
+ def get_phonetic_feature_vector_offset(offset,lang):
176
+
177
+ if not in_coordinated_range_offset(offset):
178
+ return invalid_vector()
179
+
180
+ phonetic_data, phonetic_vectors= get_phonetic_info(lang)
181
+
182
+ if phonetic_data.iloc[offset]['Valid Vector Representation']==0:
183
+ return invalid_vector()
184
+
185
+ return phonetic_vectors[offset]
186
+
187
+ ### Unary operations on vectors
188
+ def is_valid(v):
189
+ return np.sum(v)>0
190
+
191
+ def is_vowel(v):
192
+ return v[PVIDX_BT_VOWEL]==1
193
+
194
+ def is_consonant(v):
195
+ return v[PVIDX_BT_CONSONANT]==1
196
+
197
+ def is_halant(v):
198
+ return v[PVIDX_BT_HALANT]==1
199
+
200
+ def is_nukta(v):
201
+ return v[PVIDX_BT_NUKTA]==1
202
+
203
+ def is_anusvaar(v):
204
+ return v[PVIDX_BT_ANUSVAAR]==1
205
+
206
+ def is_misc(v):
207
+ return v[PVIDX_BT_MISC]==1
208
+
209
+ def is_dependent_vowel(v):
210
+ return is_vowel(v) and v[PVIDX_VSTAT_DEP]==1
211
+
212
+ def is_plosive(v):
213
+ return is_consonant(v) and get_property_vector(v,'consonant_type')[0]==1
214
+
215
+ ### Binary operations on phonetic vectors
216
+
217
+ def or_vectors(v1,v2):
218
+ return np.array([ 1 if (b1+b2)>=1 else 0 for b1,b2 in zip(v1,v2) ])
219
+
220
+ def xor_vectors(v1,v2):
221
+ return np.array([ 1 if b1!=b2 else 0 for b1,b2 in zip(v1,v2) ])
222
+
223
+ ### Getting properties from phonetic vectors
224
+
225
+ def get_property_vector(v,prop_name):
226
+ return v[PV_PROP_RANGES[prop_name][0]:PV_PROP_RANGES[prop_name][1]]
227
+
228
+ def get_property_value(v,prop_name):
229
+ factor_bits=get_property_vector(v,prop_name).tolist()
230
+
231
+ v=0
232
+ c=1
233
+ for b in factor_bits[::-1]:
234
+ v+=(c*b)
235
+ c=c*2.0
236
+
237
+ return int(v)
238
+
239
+ def lcsr_indic(srcw,tgtw,slang,tlang):
240
+ """
241
+ compute the Longest Common Subsequence Ratio (LCSR) between two strings at the character level.
242
+ This works for Indic scripts by mapping both languages to a common script
243
+
244
+ srcw: source language string
245
+ tgtw: source language string
246
+ slang: source language
247
+ tlang: target language
248
+ """
249
+ score_mat=np.zeros((len(srcw)+1,len(tgtw)+1))
250
+
251
+ for si,sc in enumerate(srcw,1):
252
+ for ti,tc in enumerate(tgtw,1):
253
+ so=get_offset(sc,slang)
254
+ to=get_offset(tc,tlang)
255
+
256
+ if in_coordinated_range_offset(so) and in_coordinated_range_offset(to) and so==to:
257
+ score_mat[si,ti]=score_mat[si-1,ti-1]+1.0
258
+ elif not (in_coordinated_range_offset(so) or in_coordinated_range_offset(to)) and sc==tc:
259
+ score_mat[si,ti]=score_mat[si-1,ti-1]+1.0
260
+ else:
261
+ score_mat[si,ti]= max(
262
+ score_mat[si,ti-1],
263
+ score_mat[si-1,ti])
264
+
265
+ return (score_mat[-1,-1]/float(max(len(srcw),len(tgtw))),float(len(srcw)),float(len(tgtw)))
266
+
267
+ def lcsr_any(srcw,tgtw):
268
+ """
269
+ LCSR computation if both languages have the same script
270
+ """
271
+ score_mat=np.zeros((len(srcw)+1,len(tgtw)+1))
272
+
273
+ for si,sc in enumerate(srcw,1):
274
+ for ti,tc in enumerate(tgtw,1):
275
+
276
+ if sc==tc:
277
+ score_mat[si,ti]=score_mat[si-1,ti-1]+1.0
278
+ else:
279
+ score_mat[si,ti]= max(
280
+ score_mat[si,ti-1],
281
+ score_mat[si-1,ti])
282
+
283
+ return (score_mat[-1,-1]/float(max(len(srcw),len(tgtw))),float(len(srcw)),float(len(tgtw)))
284
+
285
def lcsr(srcw,tgtw,slang,tlang):
    """
    compute the Longest Common Subsequence Ratio (LCSR) between two strings at the character level.

    srcw: source language string
    tgtw: target language string
    slang: source language
    tlang: target language

    Returns the tuple (lcsr, len(srcw), len(tgtw)).
    """
    # BUGFIX: the argument lists of the two helper calls were swapped —
    # lcsr_any takes 2 arguments but was called with 4, and lcsr_indic takes
    # 4 but was called with 2 — so either branch raised a TypeError.
    if slang==tlang or not is_supported_language(slang) or not is_supported_language(tlang):
        # same language, or a script we cannot map: plain character-level LCSR
        return lcsr_any(srcw,tgtw)
    else:
        # different Indic scripts: compare in the common offset space
        return lcsr_indic(srcw,tgtw,slang,tlang)
299
+
300
+
301
+
indicTrans/indic_nlp_library/indicnlp/script/phonetic_sim.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #
2
+ # Copyright (c) 2013-present, Anoop Kunchukuttan
3
+ # All rights reserved.
4
+ #
5
+ # This source code is licensed under the MIT license found in the
6
+ # LICENSE file in the root directory of this source tree.
7
+ #
8
+
9
+ from indicnlp import loader
10
+ from indicnlp import langinfo
11
+ from indicnlp.script.indic_scripts import *
12
+ import numpy as np
13
+ import gzip
14
+ import pandas as pd
15
+ import sys
16
+
17
def equal(v1, v2):
    """Return 1.0 if the two phonetic feature vectors are identical, else 0.0."""
    # Any set position in the XOR means the vectors differ somewhere.
    differs = np.sum(xor_vectors(v1, v2)) > 0
    return 0.0 if differs else 1.0
19
+
20
def dice(v1, v2):
    """Dice coefficient of two (binary) feature vectors."""
    overlap = float(np.dot(v1, v2.T))
    # 2*|intersection| / (|v1| + |v2|)
    return (2 * overlap) / float(len(v1) + len(v2))
23
+
24
def jaccard(v1, v2):
    """Jaccard similarity of two (binary) feature vectors."""
    intersection = float(np.dot(v1, v2.T))
    # |intersection| / |union|, with |union| = |v1| + |v2| - |intersection|
    return intersection / float(len(v1) + len(v2) - intersection)
27
+
28
def cosine(v1, v2):
    """Cosine similarity of two feature vectors (epsilon guards zero norms)."""
    numerator = float(np.dot(v1, v2.T))
    norm_sq_1 = float(np.dot(v1, v1.T))
    norm_sq_2 = float(np.dot(v2, v2.T))
    # The small constant avoids division by zero for all-zero vectors.
    return numerator / (np.sqrt(norm_sq_1 * norm_sq_2) + 0.00001)
33
+
34
def dotprod(v1, v2):
    """Dot product of the two feature vectors, as a Python float."""
    similarity = np.dot(v1, v2.T)
    return float(similarity)
36
+
37
def sim1(v1, v2, base=5.0):
    """Exponentiated dot-product similarity: base ** <v1, v2>."""
    # Inlined dot product (same computation as the dotprod helper).
    exponent = float(np.dot(v1, v2.T))
    return np.power(base, exponent)
39
+
40
def softmax(v1, v2):
    """Natural-base exponentiated similarity: e ** <v1, v2>."""
    # Same as sim1(v1, v2, np.e), inlined.
    return np.power(np.e, float(np.dot(v1, v2.T)))
42
+
43
def create_similarity_matrix(sim_func, slang, tlang, normalize=True):
    """
    Build a dense character-similarity matrix over the coordinated offset
    range shared by the two scripts.

    sim_func: similarity function taking two phonetic feature vectors
    slang: source language
    tlang: target language
    normalize: if True, scale every row to sum to 1
    """
    start = langinfo.COORDINATED_RANGE_START_INCLUSIVE
    end = langinfo.COORDINATED_RANGE_END_INCLUSIVE
    dim = end - start + 1
    sim_mat = np.zeros((dim, dim))

    for o1 in range(start, end + 1):
        # Source-side vector is invariant across the inner loop.
        v1 = get_phonetic_feature_vector(offset_to_char(o1, slang), slang)
        for o2 in range(start, end + 1):
            v2 = get_phonetic_feature_vector(offset_to_char(o2, tlang), tlang)
            sim_mat[o1, o2] = sim_func(v1, v2)

    if normalize:
        # Row-normalize: divide each row by its sum.
        row_sums = np.sum(sim_mat, axis=1)
        sim_mat = (sim_mat.T / row_sums).T

    return sim_mat
59
+
indicTrans/indic_nlp_library/indicnlp/syllable/__init__.py ADDED
File without changes
indicTrans/indic_nlp_library/indicnlp/syllable/syllabifier.py ADDED
@@ -0,0 +1,302 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #
2
+ # Copyright (c) 2013-present, Anoop Kunchukuttan
3
+ # All rights reserved.
4
+ #
5
+ # This source code is licensed under the MIT license found in the
6
+ # LICENSE file in the root directory of this source tree.
7
+ #
8
+
9
+ import codecs, sys
10
+ from indicnlp.script import indic_scripts as si
11
+ import re
12
+
13
# Malayalam chillu letters mapped to their base consonants.
chillu_char_map = {
    '\u0d7a': '\u0d23',
    '\u0d7b': '\u0d28',
    '\u0d7c': '\u0d30',
    '\u0d7d': '\u0d32',
    '\u0d7e': '\u0d33',
    '\u0d7f': '\u0d15',
}

# Reverse lookup: base consonant -> chillu form.
char_chillu_map = {consonant: chillu for chillu, consonant in chillu_char_map.items()}

def normalize_malayalam(word):
    """
    Rewrite every chillu letter as consonant+halant.

    Returns (normalized_word, mask) where the mask marks each rewritten
    pair with '41' and every untouched position with '0'.
    """
    mask = re.sub(r'[0-9]', '0', word)

    # instead of chillu characters, use consonant+halant
    for chillu, consonant in chillu_char_map.items():
        word = word.replace(chillu, '{}\u0d4d'.format(consonant))
        mask = mask.replace(chillu, '41')

    # Collapse everything that is not a rewrite marker to '0'.
    mask = re.sub(r'[^0-9]', '0', mask)

    return word, mask

def denormalize_malayalam(word, word_mask):
    """
    Undo normalize_malayalam: collapse each consonant+halant pair flagged
    with '4' in the mask back into the corresponding chillu letter.
    """
    chars = list(word)
    mask = list(word_mask)

    ## pattern 4
    pos = 0
    while pos >= 0:
        try:
            pos = mask.index('4', pos)
            # Replace the two-character pair with the single chillu letter.
            chars[pos:pos + 2] = char_chillu_map[chars[pos]]
            mask[pos:pos + 2] = '0'
        except ValueError:
            break

    return ''.join(chars)
56
+
57
def normalize_punjabi(word):
    """
    Normalize Punjabi tippi and addak.

    Returns (normalized_word, mask): tippi becomes anusvaar (mask '2'),
    addak+consonant becomes consonant+halant+consonant (mask '311'),
    all other positions are '0' in the mask.
    """
    mask = re.sub(r'[0-9]', '0', word)

    ## replace tippi with anusvaar
    word = word.replace('\u0a70', '\u0a02')
    mask = mask.replace('\u0a70', '2')

    ## replace addak+consonant with consonat+halant+consonant
    word = re.sub(r'\u0a71(.)', '\\1\u0a4d\\1', word)
    mask = re.sub(r'\u0a71(.)', '311', mask)

    # Collapse everything that is not a rewrite marker to '0'.
    mask = re.sub(r'[^0-9]', '0', mask)

    return word, mask
71
+
72
def denormalize_punjabi(word, word_mask):
    """
    Undo normalize_punjabi using the mask: positions flagged '2' become
    tippi again, and '3' positions collapse consonant+halant+consonant
    back to addak+consonant.
    """
    chars = list(word)
    mask = list(word_mask)

    ## pattern 2
    pos = 0
    while pos >= 0:
        try:
            pos = mask.index('2', pos)
            chars[pos] = '\u0a70'
            mask[pos] = '0'
        except ValueError:
            break

    ## pattern 3
    pos = 0
    while pos >= 0:
        try:
            pos = mask.index('3', pos)
            # Replace the three-character run with addak + the consonant.
            chars[pos:pos + 3] = '\u0a71{}'.format(chars[pos])
            mask[pos:pos + 3] = '00'
        except ValueError:
            break

    return ''.join(chars)
100
+
101
def char_backoff(syllables_list, vocab):
    """
    Keep syllables found in vocab; split out-of-vocabulary syllables into
    their individual characters. A vocab of None returns the list as-is.
    """
    if vocab is None:
        return syllables_list

    backed_off = []
    for syllable in syllables_list:
        if syllable in vocab:
            backed_off.append(syllable)
        else:
            # Back off to single characters for unknown syllables.
            backed_off.extend(syllable)
    return backed_off
115
+
116
+
117
def orthographic_syllabify_improved(word,lang,vocab=None):
    """
    Split `word` into orthographic syllables using phonetic feature vectors,
    with Malayalam/Punjabi normalization applied first and undone afterwards.

    word: the word to syllabify
    lang: 2-letter ISO language code
    vocab: optional collection of known syllables; out-of-vocabulary
        syllables are backed off to single characters (see char_backoff)

    Returns a list of syllable strings.
    """

    # Per-character mask recording which normalization rewrite (if any)
    # produced each character, so the rewrites can be undone at the end.
    word_mask=['0']*len(word)

    if lang=='ml':
        word, word_mask = normalize_malayalam(word)
        word=word  # no-op, kept as in the original
    elif lang=='pa':
        word, word_mask = normalize_punjabi(word)

    # One phonetic feature vector per character of the (normalized) word.
    p_vectors=[si.get_phonetic_feature_vector(c,lang) for c in word]

    syllables=[]
    syllables_mask=[]

    for i in range(len(word)):
        v=p_vectors[i]

        syllables.append(word[i])
        syllables_mask.append(word_mask[i])

        ### simplified syllabification
        #if i+1<len(word) and \
        #        (not si.is_valid(p_vectors[i+1]) or si.is_misc(p_vectors[i+1])):
        #    syllables.append(u' ')
        #    syllables_mask.append(u'0')

        #elif not si.is_valid(v) or si.is_misc(v) or si.is_vowel(v):
        #    syllables.append(u' ')
        #    syllables_mask.append(u'0')

        #elif i+1<len(word) and \
        #        (si.is_consonant(v) or si.is_nukta(v)) and \
        #        (si.is_consonant(p_vectors[i+1]) or si.is_anusvaar(p_vectors[i+1])):
        #    syllables.append(u' ')
        #    syllables_mask.append(u'0')

        #### better syllabification
        # Break before an invalid/miscellaneous character (i.e. after the
        # current one).
        if i+1<len(word) and (not si.is_valid(p_vectors[i+1]) or si.is_misc(p_vectors[i+1])):
            syllables.append(' ')
            syllables_mask.append('0')

        # Break after an invalid/miscellaneous character.
        elif not si.is_valid(v) or si.is_misc(v) :
            syllables.append(' ')
            syllables_mask.append('0')

        elif si.is_vowel(v):

            # Keep a following anusvaar attached to this vowel when the
            # anusvaar is word-final or is not followed by a plosive.
            anu_nonplos= ( i+2<len(word) and \
                si.is_anusvaar(p_vectors[i+1]) and \
                not si.is_plosive(p_vectors[i+2])\
                    )

            anu_eow= ( i+2==len(word) and \
                si.is_anusvaar(p_vectors[i+1]) )

            if not(anu_nonplos or anu_eow):
                syllables.append(' ')
                syllables_mask.append('0')

        elif i+1<len(word) and \
            (si.is_consonant(v) or si.is_nukta(v)):
            if si.is_consonant(p_vectors[i+1]):
                # Consonant followed by consonant starts a new syllable.
                syllables.append(' ')
                syllables_mask.append('0')
            elif si.is_vowel(p_vectors[i+1]) and \
                not si.is_dependent_vowel(p_vectors[i+1]):
                # An independent vowel after a consonant starts a new syllable.
                syllables.append(' ')
                syllables_mask.append('0')
            elif si.is_anusvaar(p_vectors[i+1]):
                # Same anusvaar-attachment rule as in the vowel branch above.
                anu_nonplos= ( i+2<len(word) and \
                    not si.is_plosive(p_vectors[i+2])\
                        )

                anu_eow= i+2==len(word)

                if not(anu_nonplos or anu_eow):
                    syllables.append(' ')
                    syllables_mask.append('0')

    syllables_mask=''.join(syllables_mask)
    syllables=''.join(syllables)

    #assert len(syllables_mask) == len(syllables)
    #assert syllables_mask.find('01') == -1
    # '01' in the mask would mean a syllable break fell inside a
    # normalization rewrite; warn rather than fail hard.
    if syllables_mask.find('01') >= 0:
        print('Warning')

    # Undo the language-specific normalization applied above.
    if lang=='ml':
        syllables = denormalize_malayalam(syllables,syllables_mask)
    elif lang=='pa':
        syllables = denormalize_punjabi(syllables,syllables_mask)

    syllables_list = syllables.strip().split(' ')
    return(char_backoff(syllables_list,vocab))
212
+
213
def orthographic_syllabify(word,lang,vocab=None):
    """
    Split `word` into orthographic syllables using phonetic feature vectors
    (no Malayalam/Punjabi normalization pass).

    word: the word to syllabify
    lang: 2-letter ISO language code
    vocab: optional collection of known syllables; out-of-vocabulary
        syllables are backed off to single characters (see char_backoff)

    Returns a list of syllable strings.
    """

    # One phonetic feature vector per character.
    p_vectors=[si.get_phonetic_feature_vector(c,lang) for c in word]

    syllables=[]

    for i in range(len(word)):
        v=p_vectors[i]

        syllables.append(word[i])

        ### simplified syllabification
        #if i+1<len(word) and \
        #        (not si.is_valid(p_vectors[i+1]) or si.is_misc(p_vectors[i+1])):
        #    syllables.append(u' ')

        #elif not si.is_valid(v) or si.is_misc(v) or si.is_vowel(v):
        #    syllables.append(u' ')

        #elif i+1<len(word) and \
        #        (si.is_consonant(v) or si.is_nukta(v)) and \
        #        (si.is_consonant(p_vectors[i+1]) or si.is_anusvaar(p_vectors[i+1])):
        #    syllables.append(u' ')

        #### better syllabification
        # Break before an invalid/miscellaneous character (i.e. after the
        # current one).
        if i+1<len(word) and (not si.is_valid(p_vectors[i+1]) or si.is_misc(p_vectors[i+1])):
            syllables.append(' ')

        # Break after an invalid/miscellaneous character.
        elif not si.is_valid(v) or si.is_misc(v) :
            syllables.append(' ')

        elif si.is_vowel(v):

            # Keep a following anusvaar attached to this vowel when it is
            # word-final or not followed by a plosive.
            anu_nonplos= ( i+2<len(word) and \
                si.is_anusvaar(p_vectors[i+1]) and \
                not si.is_plosive(p_vectors[i+2])\
                    )

            anu_eow= ( i+2==len(word) and \
                si.is_anusvaar(p_vectors[i+1]) )

            if not(anu_nonplos or anu_eow):
                syllables.append(' ')

        elif i+1<len(word) and \
            (si.is_consonant(v) or si.is_nukta(v)):
            if si.is_consonant(p_vectors[i+1]):
                # Consonant followed by consonant starts a new syllable.
                syllables.append(' ')
            elif si.is_vowel(p_vectors[i+1]) and \
                not si.is_dependent_vowel(p_vectors[i+1]):
                # An independent vowel after a consonant starts a new syllable.
                syllables.append(' ')
            elif si.is_anusvaar(p_vectors[i+1]):
                # Same anusvaar-attachment rule as in the vowel branch above.
                anu_nonplos= ( i+2<len(word) and \
                    not si.is_plosive(p_vectors[i+2])\
                        )

                anu_eow= i+2==len(word)

                if not(anu_nonplos or anu_eow):
                    syllables.append(' ')

    syllables_list = ''.join(syllables).strip().split(' ')
    return(char_backoff(syllables_list,vocab))
276
+
277
def orthographic_simple_syllabify(word, lang, vocab=None):
    """
    Simplified orthographic syllabification driven by phonetic feature
    vectors: a break is added after a character when the next character is
    invalid/miscellaneous, after any invalid/miscellaneous/vowel character,
    or between consonant (or nukta) and a following consonant/anusvaar.
    Unknown syllables are backed off to characters via char_backoff.
    """
    feats = [si.get_phonetic_feature_vector(ch, lang) for ch in word]

    pieces = []
    last = len(word) - 1
    for idx, ch in enumerate(word):
        cur = feats[idx]
        pieces.append(ch)

        nxt = feats[idx + 1] if idx < last else None

        ## simplified syllabification
        if nxt is not None and (not si.is_valid(nxt) or si.is_misc(nxt)):
            pieces.append(' ')
        elif not si.is_valid(cur) or si.is_misc(cur) or si.is_vowel(cur):
            pieces.append(' ')
        elif nxt is not None and \
                (si.is_consonant(cur) or si.is_nukta(cur)) and \
                (si.is_consonant(nxt) or si.is_anusvaar(nxt)):
            pieces.append(' ')

    syllable_list = ''.join(pieces).strip().split(' ')
    return char_backoff(syllable_list, vocab)