Hussain Shaikh committed on
Commit
7edceed
1 Parent(s): 6325f49

final commit added required files

.gitignore ADDED
@@ -0,0 +1,143 @@
1
+ #ignore libs folder we use
2
+ indic_nlp_library
3
+ indic_nlp_resources
4
+ subword-nmt
5
+
6
+ # Byte-compiled / optimized / DLL files
7
+ __pycache__/
8
+ *.py[cod]
9
+ *$py.class
10
+
11
+ # C extensions
12
+ *.so
13
+
14
+ # Distribution / packaging
15
+ .Python
16
+ build/
17
+ develop-eggs/
18
+ dist/
19
+ downloads/
20
+ eggs/
21
+ .eggs/
22
+ lib/
23
+ lib64/
24
+ parts/
25
+ sdist/
26
+ var/
27
+ wheels/
28
+ share/python-wheels/
29
+ *.egg-info/
30
+ .installed.cfg
31
+ *.egg
32
+ MANIFEST
33
+
34
+ # PyInstaller
35
+ # Usually these files are written by a python script from a template
36
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
37
+ *.manifest
38
+ *.spec
39
+
40
+ # Installer logs
41
+ pip-log.txt
42
+ pip-delete-this-directory.txt
43
+
44
+ # Unit test / coverage reports
45
+ htmlcov/
46
+ .tox/
47
+ .nox/
48
+ .coverage
49
+ .coverage.*
50
+ .cache
51
+ nosetests.xml
52
+ coverage.xml
53
+ *.cover
54
+ *.py,cover
55
+ .hypothesis/
56
+ .pytest_cache/
57
+ cover/
58
+
59
+ # Translations
60
+ *.mo
61
+ *.pot
62
+
63
+ # Django stuff:
64
+ *.log
65
+ local_settings.py
66
+ db.sqlite3
67
+ db.sqlite3-journal
68
+
69
+ # Flask stuff:
70
+ instance/
71
+ .webassets-cache
72
+
73
+ # Scrapy stuff:
74
+ .scrapy
75
+
76
+ # Sphinx documentation
77
+ docs/_build/
78
+
79
+ # PyBuilder
80
+ .pybuilder/
81
+ target/
82
+
83
+ # Jupyter Notebook
84
+ .ipynb_checkpoints
85
+
86
+ # IPython
87
+ profile_default/
88
+ ipython_config.py
89
+
90
+ # pyenv
91
+ # For a library or package, you might want to ignore these files since the code is
92
+ # intended to run in multiple environments; otherwise, check them in:
93
+ # .python-version
94
+
95
+ # pipenv
96
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
97
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
98
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
99
+ # install all needed dependencies.
100
+ #Pipfile.lock
101
+
102
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow
103
+ __pypackages__/
104
+
105
+ # Celery stuff
106
+ celerybeat-schedule
107
+ celerybeat.pid
108
+
109
+ # SageMath parsed files
110
+ *.sage.py
111
+
112
+ # Environments
113
+ .env
114
+ .venv
115
+ env/
116
+ venv/
117
+ ENV/
118
+ env.bak/
119
+ venv.bak/
120
+
121
+ # Spyder project settings
122
+ .spyderproject
123
+ .spyproject
124
+
125
+ # Rope project settings
126
+ .ropeproject
127
+
128
+ # mkdocs documentation
129
+ /site
130
+
131
+ # mypy
132
+ .mypy_cache/
133
+ .dmypy.json
134
+ dmypy.json
135
+
136
+ # Pyre type checker
137
+ .pyre/
138
+
139
+ # pytype static type analyzer
140
+ .pytype/
141
+
142
+ # Cython debug symbols
143
+ cython_debug/
api/api.py ADDED
@@ -0,0 +1,152 @@
1
+ import time
2
+
3
+ import re
4
+ from math import floor, ceil
5
+ from fairseq import checkpoint_utils, distributed_utils, options, tasks, utils
6
+ # from nltk.tokenize import sent_tokenize
7
+ from flask import Flask, request, jsonify
8
+ from flask_cors import CORS, cross_origin
9
+ import webvtt
10
+ from io import StringIO
11
+ from mosestokenizer import MosesSentenceSplitter
12
+
13
+ from indicTrans.inference.engine import Model
14
+ from punctuate import RestorePuncts
15
+ from indicnlp.tokenize.sentence_tokenize import sentence_split
16
+
17
+ app = Flask(__name__)
18
+ cors = CORS(app)
19
+ app.config['CORS_HEADERS'] = 'Content-Type'
20
+
21
+ indic2en_model = Model(expdir='models/v3/indic-en')
22
+ en2indic_model = Model(expdir='models/v3/en-indic')
23
+ m2m_model = Model(expdir='models/m2m')
24
+
25
+ rpunct = RestorePuncts()
26
+
27
+ indic_language_dict = {
28
+ 'Assamese': 'as',
29
+ 'Hindi' : 'hi',
30
+ 'Marathi' : 'mr',
31
+ 'Tamil' : 'ta',
32
+ 'Bengali' : 'bn',
33
+ 'Kannada' : 'kn',
34
+ 'Oriya' : 'or',
35
+ 'Telugu' : 'te',
36
+ 'Gujarati' : 'gu',
37
+ 'Malayalam' : 'ml',
38
+ 'Punjabi' : 'pa',
39
+ }
40
+
41
+ splitter = MosesSentenceSplitter('en')
42
+
43
+ def get_inference_params():
44
+ source_language = request.form['source_language']
45
+ target_language = request.form['target_language']
46
+
47
+ if source_language in indic_language_dict and target_language == 'English':
48
+ model = indic2en_model
49
+ source_lang = indic_language_dict[source_language]
50
+ target_lang = 'en'
51
+ elif source_language == 'English' and target_language in indic_language_dict:
52
+ model = en2indic_model
53
+ source_lang = 'en'
54
+ target_lang = indic_language_dict[target_language]
55
+ elif source_language in indic_language_dict and target_language in indic_language_dict:
56
+ model = m2m_model
57
+ source_lang = indic_language_dict[source_language]
58
+ target_lang = indic_language_dict[target_language]
59
+
60
+ return model, source_lang, target_lang
61
+
62
+ @app.route('/', methods=['GET'])
63
+ def main():
64
+ return "IndicTrans API"
65
+
66
+ @app.route('/supported_languages', methods=['GET'])
67
+ @cross_origin()
68
+ def supported_languages():
69
+ return jsonify(indic_language_dict)
70
+
71
+ @app.route("/translate", methods=['POST'])
72
+ @cross_origin()
73
+ def infer_indic_en():
74
+ model, source_lang, target_lang = get_inference_params()
75
+ source_text = request.form['text']
76
+
77
+ start_time = time.time()
78
+ target_text = model.translate_paragraph(source_text, source_lang, target_lang)
79
+ end_time = time.time()
80
+ return {'text':target_text, 'duration':round(end_time-start_time, 2)}
81
+
82
+ @app.route("/translate_vtt", methods=['POST'])
83
+ @cross_origin()
84
+ def infer_vtt_indic_en():
85
+ start_time = time.time()
86
+ model, source_lang, target_lang = get_inference_params()
87
+ source_text = request.form['text']
88
+ # vad_segments = request.form['vad_nochunk'] # Assuming it is an array of start & end timestamps
89
+
90
+ vad = webvtt.read_buffer(StringIO(source_text))
91
+ source_sentences = [v.text.replace('\r', '').replace('\n', ' ') for v in vad]
92
+
93
+ ## SUMANTH LOGIC HERE ##
94
+
95
+ # for each vad timestamp, do:
96
+ large_sentence = ' '.join(source_sentences) # only sentences in that time range
97
+ large_sentence = large_sentence.lower()
98
+ # split_sents = sentence_split(large_sentence, 'en')
99
+ # print(split_sents)
100
+
101
+ large_sentence = re.sub(r'[^\w\s]', '', large_sentence)
102
+ punctuated = rpunct.punctuate(large_sentence, batch_size=32)
103
+ end_time = time.time()
104
+ print("Time Taken for punctuation: {} s".format(end_time - start_time))
105
+ start_time = time.time()
106
+ split_sents = splitter([punctuated]) ### Please uncomment
107
+
108
+
109
+ # print(split_sents)
110
+ # output_sentence_punctuated = model.translate_paragraph(punctuated, source_lang, target_lang)
111
+ output_sents = model.batch_translate(split_sents, source_lang, target_lang)
112
+ # print(output_sents)
113
+ # output_sents = split_sents
114
+ # print(output_sents)
115
+ # align this to those range of source_sentences in `captions`
116
+
117
+ map_ = {split_sents[i] : output_sents[i] for i in range(len(split_sents))}
118
+ # print(map_)
119
+ punct_para = ' '.join(list(map_.keys()))
120
+ nmt_para = ' '.join(list(map_.values()))
121
+ nmt_words = nmt_para.split(' ')
122
+
123
+ len_punct = len(punct_para.split(' '))
124
+ len_nmt = len(nmt_para.split(' '))
125
+
126
+ start = 0
127
+ for i in range(len(vad)):
128
+ if vad[i].text == '':
129
+ continue
130
+
131
+ len_caption = len(vad[i].text.split(' '))
132
+ frac = (len_caption / len_punct)
133
+ # frac = round(frac, 2)
134
+
135
+ req_nmt_size = floor(frac * len_nmt)
136
+ # print(frac, req_nmt_size)
137
+
138
+ vad[i].text = ' '.join(nmt_words[start:start+req_nmt_size])
139
+ # print(vad[i].text)
140
+ # print(start, req_nmt_size)
141
+ start += req_nmt_size
142
+
143
+ end_time = time.time()
144
+
145
+ print("Time Taken for translation: {} s".format(end_time - start_time))
146
+
147
+ # vad.save('aligned.vtt')
148
+
149
+ return {
150
+ 'text': vad.content,
151
+ # 'duration':round(end_time-start_time, 2)
152
+ }
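For reference, a minimal client sketch of calling the /translate route defined above. The host and port are assumptions (Flask's development default is localhost:5000) and the example sentence is illustrative; the form fields mirror the request.form lookups in api/api.py.

import requests

# Form-encoded POST, matching request.form access in the /translate handler.
resp = requests.post(
    "http://localhost:5000/translate",   # host/port are an assumption
    data={
        "source_language": "English",
        "target_language": "Hindi",
        "text": "The weather is nice today.",
    },
)
print(resp.json())  # expected shape: {'text': ..., 'duration': ...}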
api/punctuate.py ADDED
@@ -0,0 +1,220 @@
1
+ # -*- coding: utf-8 -*-
2
+ # 💾⚙️🔮
3
+
4
+ # taken from https://github.com/Felflare/rpunct/blob/master/rpunct/punctuate.py
5
+ # modified to support batching during gpu inference
6
+
7
+
8
+ __author__ = "Daulet N."
9
+ __email__ = "daulet.nurmanbetov@gmail.com"
10
+
11
+ import time
12
+ import logging
13
+ import webvtt
14
+ import torch
15
+ from io import StringIO
16
+ from nltk.tokenize import sent_tokenize
17
+ #from langdetect import detect
18
+ from simpletransformers.ner import NERModel
19
+
20
+
21
+ class RestorePuncts:
22
+ def __init__(self, wrds_per_pred=250):
23
+ self.wrds_per_pred = wrds_per_pred
24
+ self.overlap_wrds = 30
25
+ self.valid_labels = ['OU', 'OO', '.O', '!O', ',O', '.U', '!U', ',U', ':O', ';O', ':U', "'O", '-O', '?O', '?U']
26
+ self.model = NERModel("bert", "felflare/bert-restore-punctuation", labels=self.valid_labels,
27
+ args={"silent": True, "max_seq_length": 512})
28
+ # use_cuda isn't working and this hack seems to load the model correctly onto the GPU
29
+ self.model.device = torch.device("cuda:1")
30
+ # dummy punctuate to load the model onto gpu
31
+ self.punctuate("hello how are you")
32
+
33
+ def punctuate(self, text: str, batch_size:int=32, lang:str=''):
34
+ """
35
+ Performs punctuation restoration on arbitrarily large text.
36
+ Detects if the input is not English; if non-English text is detected, predictions are terminated.
37
+ Override by supplying `lang='en'`
38
+
39
+ Args:
40
+ - text (str): Text to punctuate; can be anything from a few words to arbitrarily large.
41
+ - lang (str): Explicit language of input text.
42
+ """
43
+ #if not lang and len(text) > 10:
44
+ # lang = detect(text)
45
+ #if lang != 'en':
46
+ # raise Exception(F"""Non English text detected. Restore Punctuation works only for English.
47
+ # If you are certain the input is English, pass argument lang='en' to this function.
48
+ # Punctuate received: {text}""")
49
+
50
+ def chunks(L, n):
51
+ return [L[x : x + n] for x in range(0, len(L), n)]
52
+
53
+
54
+
55
+ # split up large text into BERT-digestible chunks
56
+ splits = self.split_on_toks(text, self.wrds_per_pred, self.overlap_wrds)
57
+
58
+ texts = [i["text"] for i in splits]
59
+ batches = chunks(texts, batch_size)
60
+ preds_lst = []
61
+
62
+
63
+ for batch in batches:
64
+ batch_preds, _ = self.model.predict(batch)
65
+ preds_lst.extend(batch_preds)
66
+
67
+
68
+ # predict slices
69
+ # full_preds_lst contains tuple of labels and logits
70
+ #full_preds_lst = [self.predict(i['text']) for i in splits]
71
+ # extract predictions, and discard logits
72
+ #preds_lst = [i[0][0] for i in full_preds_lst]
73
+ # join text slices
74
+ combined_preds = self.combine_results(text, preds_lst)
75
+ # create punctuated prediction
76
+ punct_text = self.punctuate_texts(combined_preds)
77
+ return punct_text
78
+
79
+ def predict(self, input_slice):
80
+ """
81
+ Passes the unpunctuated text to the model for punctuation.
82
+ """
83
+ predictions, raw_outputs = self.model.predict([input_slice])
84
+ return predictions, raw_outputs
85
+
86
+ @staticmethod
87
+ def split_on_toks(text, length, overlap):
88
+ """
89
+ Splits text into predefined slices of overlapping text with indexes (offsets)
90
+ that tie back to the original text.
91
+ This is done to bypass 512 token limit on transformer models by sequentially
92
+ feeding chunks of < 512 toks.
93
+ Example output:
94
+ [{...}, {"text": "...", 'start_idx': 31354, 'end_idx': 32648}, {...}]
95
+ """
96
+ wrds = text.replace('\n', ' ').split(" ")
97
+ resp = []
98
+ lst_chunk_idx = 0
99
+ i = 0
100
+
101
+ while True:
102
+ # words in the chunk and the overlapping portion
103
+ wrds_len = wrds[(length * i):(length * (i + 1))]
104
+ wrds_ovlp = wrds[(length * (i + 1)):((length * (i + 1)) + overlap)]
105
+ wrds_split = wrds_len + wrds_ovlp
106
+
107
+ # Break loop if no more words
108
+ if not wrds_split:
109
+ break
110
+
111
+ wrds_str = " ".join(wrds_split)
112
+ nxt_chunk_start_idx = len(" ".join(wrds_len))
113
+ lst_char_idx = len(" ".join(wrds_split))
114
+
115
+ resp_obj = {
116
+ "text": wrds_str,
117
+ "start_idx": lst_chunk_idx,
118
+ "end_idx": lst_char_idx + lst_chunk_idx,
119
+ }
120
+
121
+ resp.append(resp_obj)
122
+ lst_chunk_idx += nxt_chunk_start_idx + 1
123
+ i += 1
124
+ logging.info(f"Sliced transcript into {len(resp)} slices.")
125
+ return resp
126
+
127
+ @staticmethod
128
+ def combine_results(full_text: str, text_slices):
129
+ """
130
+ Given a full text and predictions of each slice combines predictions into a single text again.
131
+ Performs validation of whether the text was combined correctly.
132
+ """
133
+ split_full_text = full_text.replace('\n', ' ').split(" ")
134
+ split_full_text = [i for i in split_full_text if i]
135
+ split_full_text_len = len(split_full_text)
136
+ output_text = []
137
+ index = 0
138
+
139
+ if len(text_slices[-1]) <= 3 and len(text_slices) > 1:
140
+ text_slices = text_slices[:-1]
141
+
142
+ for _slice in text_slices:
143
+ slice_wrds = len(_slice)
144
+ for ix, wrd in enumerate(_slice):
145
+ # print(index, "|", str(list(wrd.keys())[0]), "|", split_full_text[index])
146
+ if index == split_full_text_len:
147
+ break
148
+
149
+ if split_full_text[index] == str(list(wrd.keys())[0]) and \
150
+ ix <= slice_wrds - 3 and text_slices[-1] != _slice:
151
+ index += 1
152
+ pred_item_tuple = list(wrd.items())[0]
153
+ output_text.append(pred_item_tuple)
154
+ elif split_full_text[index] == str(list(wrd.keys())[0]) and text_slices[-1] == _slice:
155
+ index += 1
156
+ pred_item_tuple = list(wrd.items())[0]
157
+ output_text.append(pred_item_tuple)
158
+ assert [i[0] for i in output_text] == split_full_text
159
+ return output_text
160
+
161
+ @staticmethod
162
+ def punctuate_texts(full_pred: list):
163
+ """
164
+ Given a list of Predictions from the model, applies the predictions to text,
165
+ thus punctuating it.
166
+ """
167
+ punct_resp = ""
168
+ for i in full_pred:
169
+ word, label = i
170
+ if label[-1] == "U":
171
+ punct_wrd = word.capitalize()
172
+ else:
173
+ punct_wrd = word
174
+
175
+ if label[0] != "O":
176
+ punct_wrd += label[0]
177
+
178
+ punct_resp += punct_wrd + " "
179
+ punct_resp = punct_resp.strip()
180
+ # Append a trailing period if one doesn't exist.
181
+ if punct_resp[-1].isalnum():
182
+ punct_resp += "."
183
+ return punct_resp
184
+
185
+
186
+ if __name__ == "__main__":
187
+
188
+ start = time.time()
189
+ punct_model = RestorePuncts()
190
+
191
+ load_model = time.time()
192
+ print(f'Time to load model: {load_model - start}')
193
+ # read test file
194
+ # with open('en_lower.txt', 'r') as fp:
195
+ # # test_sample = fp.read()
196
+ # lines = fp.readlines()
197
+
198
+ with open('sample.vtt', 'r') as fp:
199
+ source_text = fp.read()
200
+
201
+ # captions = webvtt.read_buffer(StringIO(source_text))
202
+ captions = webvtt.read('sample.vtt')
203
+ source_sentences = [caption.text.replace('\r', '').replace('\n', ' ') for caption in captions]
204
+
205
+ # print(source_sentences)
206
+
207
+ sent = ' '.join(source_sentences)
208
+ punctuated = punct_model.punctuate(sent)
209
+
210
+ tokenised = sent_tokenize(punctuated)
211
+ # print(tokenised)
212
+
213
+ for i in range(len(tokenised)):
214
+ captions[i].text = tokenised[i]
215
+ # return captions.content
216
+ captions.save('my_captions.vtt')
217
+
218
+ end = time.time()
219
+ print(f'Time for run: {end - load_model}')
220
+ print(f'Total time: {end - start}')
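A minimal standalone sketch of the RestorePuncts class above; note that __init__ pins the model to cuda:1, so a machine with at least two GPUs is assumed, and the input sentence is illustrative.

rp = RestorePuncts()
# Restores capitalization and punctuation over the lower-cased, unpunctuated input.
print(rp.punctuate("hello how are you i am fine thanks", batch_size=32))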
app.py CHANGED
@@ -1,7 +1,30 @@
 import gradio as gr

- def greet(name):
- return "Hell" + name + "!!"

- iface = gr.Interface(fn=greet, inputs="text", outputs="text")
- iface.launch()

+ import os
 import gradio as gr

+ download="wget --load-cookies /tmp/cookies.txt \"https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=1IpcnaQ2ScX_zodt2aLlXa_5Kkntl0nue' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\\n/p')&id=1IpcnaQ2ScX_zodt2aLlXa_5Kkntl0nue\" -O en-indic.zip && rm -rf /tmp/cookies.txt"
+ os.system(download)
+ os.system('unzip /home/user/app/en-indic.zip')

+ from fairseq import checkpoint_utils, distributed_utils, options, tasks, utils
+ import gradio as gr
+ from inference.engine import Model
+ indic2en_model = Model(expdir='/home/user/app/en-indic')
+
+ INDIC = {"Assamese": "as", "Bengali": "bn", "Gujarati": "gu", "Hindi": "hi","Kannada": "kn","Malayalam": "ml", "Marathi": "mr", "Odia": "or","Punjabi": "pa","Tamil": "ta", "Telugu" : "te"}
+
+
+ def translate(text, lang):
+ return indic2en_model.translate_paragraph(text, 'en', INDIC[lang])
+
+
+ languages = list(INDIC.keys())
+ drop_down = gr.inputs.Dropdown(languages, type="value", default="Hindi", label="Select Target Language")
+ text = gr.inputs.Textbox(lines=5, placeholder="Enter Text to translate", default="", label="Enter Text in English")
+ text_output = gr.outputs.Textbox(type="auto", label="Translated text in Target Language")
+
+ # example=[['I want to translate this sentence in Hindi','Hindi'],
+ # ['I am feeling very good today.', 'Bengali']]
+
+ supported_lang = ', '.join(languages)
+ iface = gr.Interface(fn=translate, inputs=[text,drop_down] , outputs=text_output, title='IndicTrans NMT System', description = 'Currently the model supports ' + supported_lang, article = 'Original repository can be found [here](https://github.com/AI4Bharat/indicTrans)' , examples=None)
+ iface.launch(enable_queue=True)
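For a quick sanity check outside the Gradio UI, the translate() helper above can be called directly; the sentence below is illustrative.

# Translates from English into the selected Indic language via the en-indic model loaded above.
print(translate("How are you feeling today?", "Hindi"))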
inference/__init__.py ADDED
File without changes
inference/custom_interactive.py ADDED
@@ -0,0 +1,298 @@
1
+ # python wrapper for fairseq-interactive command line tool
2
+
3
+ #!/usr/bin/env python3 -u
4
+ # Copyright (c) Facebook, Inc. and its affiliates.
5
+ #
6
+ # This source code is licensed under the MIT license found in the
7
+ # LICENSE file in the root directory of this source tree.
8
+ """
9
+ Translate raw text with a trained model. Batches data on-the-fly.
10
+ """
11
+
12
+ import ast
13
+ from collections import namedtuple
14
+
15
+ import torch
16
+ from fairseq import checkpoint_utils, options, tasks, utils
17
+ from fairseq.dataclass.utils import convert_namespace_to_omegaconf
18
+ from fairseq.token_generation_constraints import pack_constraints, unpack_constraints
19
+ from fairseq_cli.generate import get_symbols_to_strip_from_output
20
+
21
+ import codecs
22
+
23
+
24
+ Batch = namedtuple("Batch", "ids src_tokens src_lengths constraints")
25
+ Translation = namedtuple("Translation", "src_str hypos pos_scores alignments")
26
+
27
+
28
+ def make_batches(
29
+ lines, cfg, task, max_positions, encode_fn, constrained_decoding=False
30
+ ):
31
+ def encode_fn_target(x):
32
+ return encode_fn(x)
33
+
34
+ if constrained_decoding:
35
+ # Strip (tab-delimited) constraints, if present, from input lines,
36
+ # store them in batch_constraints
37
+ batch_constraints = [list() for _ in lines]
38
+ for i, line in enumerate(lines):
39
+ if "\t" in line:
40
+ lines[i], *batch_constraints[i] = line.split("\t")
41
+
42
+ # Convert each List[str] to List[Tensor]
43
+ for i, constraint_list in enumerate(batch_constraints):
44
+ batch_constraints[i] = [
45
+ task.target_dictionary.encode_line(
46
+ encode_fn_target(constraint),
47
+ append_eos=False,
48
+ add_if_not_exist=False,
49
+ )
50
+ for constraint in constraint_list
51
+ ]
52
+
53
+ if constrained_decoding:
54
+ constraints_tensor = pack_constraints(batch_constraints)
55
+ else:
56
+ constraints_tensor = None
57
+
58
+ tokens, lengths = task.get_interactive_tokens_and_lengths(lines, encode_fn)
59
+
60
+ itr = task.get_batch_iterator(
61
+ dataset=task.build_dataset_for_inference(
62
+ tokens, lengths, constraints=constraints_tensor
63
+ ),
64
+ max_tokens=cfg.dataset.max_tokens,
65
+ max_sentences=cfg.dataset.batch_size,
66
+ max_positions=max_positions,
67
+ ignore_invalid_inputs=cfg.dataset.skip_invalid_size_inputs_valid_test,
68
+ ).next_epoch_itr(shuffle=False)
69
+ for batch in itr:
70
+ ids = batch["id"]
71
+ src_tokens = batch["net_input"]["src_tokens"]
72
+ src_lengths = batch["net_input"]["src_lengths"]
73
+ constraints = batch.get("constraints", None)
74
+
75
+ yield Batch(
76
+ ids=ids,
77
+ src_tokens=src_tokens,
78
+ src_lengths=src_lengths,
79
+ constraints=constraints,
80
+ )
81
+
82
+
83
+ class Translator:
84
+ def __init__(
85
+ self, data_dir, checkpoint_path, batch_size=25, constrained_decoding=False
86
+ ):
87
+
88
+ self.constrained_decoding = constrained_decoding
89
+ self.parser = options.get_generation_parser(interactive=True)
90
+ # buffer_size is currently not used but we just initialize it to batch
91
+ # size + 1 to avoid any assertion errors.
92
+ if self.constrained_decoding:
93
+ self.parser.set_defaults(
94
+ path=checkpoint_path,
95
+ remove_bpe="subword_nmt",
96
+ num_workers=-1,
97
+ constraints="ordered",
98
+ batch_size=batch_size,
99
+ buffer_size=batch_size + 1,
100
+ )
101
+ else:
102
+ self.parser.set_defaults(
103
+ path=checkpoint_path,
104
+ remove_bpe="subword_nmt",
105
+ num_workers=-1,
106
+ batch_size=batch_size,
107
+ buffer_size=batch_size + 1,
108
+ )
109
+ args = options.parse_args_and_arch(self.parser, input_args=[data_dir])
110
+ # we are explicitly setting src_lang and tgt_lang here
111
+ # generally the data_dir we pass contains {split}-{src_lang}-{tgt_lang}.*.idx files from
112
+ # which fairseq infers the src and tgt langs (if these are not passed). In deployment we don't
113
+ # use any idx files and only store the SRC and TGT dictionaries.
114
+ args.source_lang = "SRC"
115
+ args.target_lang = "TGT"
116
+ # since we are truncating sentences to max_seq_len in engine, we can set it to False here
117
+ args.skip_invalid_size_inputs_valid_test = False
118
+
119
+ # we have custom architectures in this folder and we will let fairseq
120
+ # import this
121
+ args.user_dir = "model_configs"
122
+ self.cfg = convert_namespace_to_omegaconf(args)
123
+
124
+ utils.import_user_module(self.cfg.common)
125
+
126
+ if self.cfg.interactive.buffer_size < 1:
127
+ self.cfg.interactive.buffer_size = 1
128
+ if self.cfg.dataset.max_tokens is None and self.cfg.dataset.batch_size is None:
129
+ self.cfg.dataset.batch_size = 1
130
+
131
+ assert (
132
+ not self.cfg.generation.sampling
133
+ or self.cfg.generation.nbest == self.cfg.generation.beam
134
+ ), "--sampling requires --nbest to be equal to --beam"
135
+ assert (
136
+ not self.cfg.dataset.batch_size
137
+ or self.cfg.dataset.batch_size <= self.cfg.interactive.buffer_size
138
+ ), "--batch-size cannot be larger than --buffer-size"
139
+
140
+ # Fix seed for stochastic decoding
141
+ # if self.cfg.common.seed is not None and not self.cfg.generation.no_seed_provided:
142
+ # np.random.seed(self.cfg.common.seed)
143
+ # utils.set_torch_seed(self.cfg.common.seed)
144
+
145
+ # if not self.constrained_decoding:
146
+ # self.use_cuda = torch.cuda.is_available() and not self.cfg.common.cpu
147
+ # else:
148
+ # self.use_cuda = False
149
+
150
+ self.use_cuda = torch.cuda.is_available() and not self.cfg.common.cpu
151
+
152
+ # Setup task, e.g., translation
153
+ self.task = tasks.setup_task(self.cfg.task)
154
+
155
+ # Load ensemble
156
+ overrides = ast.literal_eval(self.cfg.common_eval.model_overrides)
157
+ self.models, self._model_args = checkpoint_utils.load_model_ensemble(
158
+ utils.split_paths(self.cfg.common_eval.path),
159
+ arg_overrides=overrides,
160
+ task=self.task,
161
+ suffix=self.cfg.checkpoint.checkpoint_suffix,
162
+ strict=(self.cfg.checkpoint.checkpoint_shard_count == 1),
163
+ num_shards=self.cfg.checkpoint.checkpoint_shard_count,
164
+ )
165
+
166
+ # Set dictionaries
167
+ self.src_dict = self.task.source_dictionary
168
+ self.tgt_dict = self.task.target_dictionary
169
+
170
+ # Optimize ensemble for generation
171
+ for model in self.models:
172
+ if model is None:
173
+ continue
174
+ if self.cfg.common.fp16:
175
+ model.half()
176
+ if (
177
+ self.use_cuda
178
+ and not self.cfg.distributed_training.pipeline_model_parallel
179
+ ):
180
+ model.cuda()
181
+ model.prepare_for_inference_(self.cfg)
182
+
183
+ # Initialize generator
184
+ self.generator = self.task.build_generator(self.models, self.cfg.generation)
185
+
186
+ # Handle tokenization and BPE
187
+ self.tokenizer = self.task.build_tokenizer(self.cfg.tokenizer)
188
+ self.bpe = self.task.build_bpe(self.cfg.bpe)
189
+
190
+ # Load alignment dictionary for unknown word replacement
191
+ # (None if no unknown word replacement, empty if no path to align dictionary)
192
+ self.align_dict = utils.load_align_dict(self.cfg.generation.replace_unk)
193
+
194
+ self.max_positions = utils.resolve_max_positions(
195
+ self.task.max_positions(), *[model.max_positions() for model in self.models]
196
+ )
197
+
198
+ def encode_fn(self, x):
199
+ if self.tokenizer is not None:
200
+ x = self.tokenizer.encode(x)
201
+ if self.bpe is not None:
202
+ x = self.bpe.encode(x)
203
+ return x
204
+
205
+ def decode_fn(self, x):
206
+ if self.bpe is not None:
207
+ x = self.bpe.decode(x)
208
+ if self.tokenizer is not None:
209
+ x = self.tokenizer.decode(x)
210
+ return x
211
+
212
+ def translate(self, inputs, constraints=None):
213
+ if self.constrained_decoding and constraints is None:
214
+ raise ValueError("Constraints can't be None in constrained decoding mode")
215
+ if not self.constrained_decoding and constraints is not None:
216
+ raise ValueError("Cannot pass constraints during normal translation")
217
+ if constraints:
218
+ constrained_decoding = True
219
+ modified_inputs = []
220
+ for _input, constraint in zip(inputs, constraints):
221
+ modified_inputs.append(_input + f"\t{constraint}")
222
+ inputs = modified_inputs
223
+ else:
224
+ constrained_decoding = False
225
+
226
+ start_id = 0
227
+ results = []
228
+ final_translations = []
229
+ for batch in make_batches(
230
+ inputs,
231
+ self.cfg,
232
+ self.task,
233
+ self.max_positions,
234
+ self.encode_fn,
235
+ constrained_decoding,
236
+ ):
237
+ bsz = batch.src_tokens.size(0)
238
+ src_tokens = batch.src_tokens
239
+ src_lengths = batch.src_lengths
240
+ constraints = batch.constraints
241
+ if self.use_cuda:
242
+ src_tokens = src_tokens.cuda()
243
+ src_lengths = src_lengths.cuda()
244
+ if constraints is not None:
245
+ constraints = constraints.cuda()
246
+
247
+ sample = {
248
+ "net_input": {
249
+ "src_tokens": src_tokens,
250
+ "src_lengths": src_lengths,
251
+ },
252
+ }
253
+
254
+ translations = self.task.inference_step(
255
+ self.generator, self.models, sample, constraints=constraints
256
+ )
257
+
258
+ list_constraints = [[] for _ in range(bsz)]
259
+ if constrained_decoding:
260
+ list_constraints = [unpack_constraints(c) for c in constraints]
261
+ for i, (id, hypos) in enumerate(zip(batch.ids.tolist(), translations)):
262
+ src_tokens_i = utils.strip_pad(src_tokens[i], self.tgt_dict.pad())
263
+ constraints = list_constraints[i]
264
+ results.append(
265
+ (
266
+ start_id + id,
267
+ src_tokens_i,
268
+ hypos,
269
+ {
270
+ "constraints": constraints,
271
+ },
272
+ )
273
+ )
274
+
275
+ # sort output to match input order
276
+ for id_, src_tokens, hypos, _ in sorted(results, key=lambda x: x[0]):
277
+ src_str = ""
278
+ if self.src_dict is not None:
279
+ src_str = self.src_dict.string(
280
+ src_tokens, self.cfg.common_eval.post_process
281
+ )
282
+
283
+ # Process top predictions
284
+ for hypo in hypos[: min(len(hypos), self.cfg.generation.nbest)]:
285
+ hypo_tokens, hypo_str, alignment = utils.post_process_prediction(
286
+ hypo_tokens=hypo["tokens"].int().cpu(),
287
+ src_str=src_str,
288
+ alignment=hypo["alignment"],
289
+ align_dict=self.align_dict,
290
+ tgt_dict=self.tgt_dict,
291
+ remove_bpe="subword_nmt",
292
+ extra_symbols_to_ignore=get_symbols_to_strip_from_output(
293
+ self.generator
294
+ ),
295
+ )
296
+ detok_hypo_str = self.decode_fn(hypo_str)
297
+ final_translations.append(detok_hypo_str)
298
+ return final_translations
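A minimal sketch of using the Translator wrapper above on its own; in this repo it is normally driven by inference/engine.py, which handles normalization, BPE and language tagging first, so the paths and the pre-tagged input below are placeholder assumptions.

translator = Translator(
    data_dir="models/v3/en-indic/final_bin",                        # dir holding SRC/TGT dictionaries (assumed layout)
    checkpoint_path="models/v3/en-indic/model/checkpoint_best.pt",  # assumed checkpoint path
    batch_size=25,
)
# Inputs must already be preprocessed, BPE-split and tagged, e.g. by engine.Model.batch_translate.
hyps = translator.translate(["__src__en__ __tgt__hi__ hello world"])
print(hyps[0])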
inference/engine.py ADDED
@@ -0,0 +1,198 @@
1
+ from os import truncate
2
+ from sacremoses import MosesPunctNormalizer
3
+ from sacremoses import MosesTokenizer
4
+ from sacremoses import MosesDetokenizer
5
+ from subword_nmt.apply_bpe import BPE, read_vocabulary
6
+ import codecs
7
+ from tqdm import tqdm
8
+ from indicnlp.tokenize import indic_tokenize
9
+ from indicnlp.tokenize import indic_detokenize
10
+ from indicnlp.normalize import indic_normalize
11
+ from indicnlp.transliterate import unicode_transliterate
12
+ from mosestokenizer import MosesSentenceSplitter
13
+ from indicnlp.tokenize import sentence_tokenize
14
+
15
+ from inference.custom_interactive import Translator
16
+
17
+
18
+ INDIC = ["as", "bn", "gu", "hi", "kn", "ml", "mr", "or", "pa", "ta", "te"]
19
+
20
+
21
+ def split_sentences(paragraph, language):
22
+ if language == "en":
23
+ with MosesSentenceSplitter(language) as splitter:
24
+ return splitter([paragraph])
25
+ elif language in INDIC:
26
+ return sentence_tokenize.sentence_split(paragraph, lang=language)
27
+
28
+
29
+ def add_token(sent, tag_infos):
30
+ """add special tokens specified by tag_infos to each element in list
31
+
32
+ tag_infos: list of tuples (tag_type,tag)
33
+
34
+ each tag_info results in a token of the form: __{tag_type}__{tag}__
35
+
36
+ """
37
+
38
+ tokens = []
39
+ for tag_type, tag in tag_infos:
40
+ token = "__" + tag_type + "__" + tag + "__"
41
+ tokens.append(token)
42
+
43
+ return " ".join(tokens) + " " + sent
44
+
45
+
46
+ def apply_lang_tags(sents, src_lang, tgt_lang):
47
+ tagged_sents = []
48
+ for sent in sents:
49
+ tagged_sent = add_token(sent.strip(), [("src", src_lang), ("tgt", tgt_lang)])
50
+ tagged_sents.append(tagged_sent)
51
+ return tagged_sents
52
+
53
+
54
+ def truncate_long_sentences(sents):
55
+
56
+ MAX_SEQ_LEN = 200
57
+ new_sents = []
58
+
59
+ for sent in sents:
60
+ words = sent.split()
61
+ num_words = len(words)
62
+ if num_words > MAX_SEQ_LEN:
63
+ print_str = " ".join(words[:5]) + " .... " + " ".join(words[-5:])
64
+ sent = " ".join(words[:MAX_SEQ_LEN])
65
+ print(
66
+ f"WARNING: Sentence {print_str} truncated to 200 tokens as it exceeds maximum length limit"
67
+ )
68
+
69
+ new_sents.append(sent)
70
+ return new_sents
71
+
72
+
73
+ class Model:
74
+ def __init__(self, expdir):
75
+ self.expdir = expdir
76
+ self.en_tok = MosesTokenizer(lang="en")
77
+ self.en_normalizer = MosesPunctNormalizer()
78
+ self.en_detok = MosesDetokenizer(lang="en")
79
+ self.xliterator = unicode_transliterate.UnicodeIndicTransliterator()
80
+ print("Initializing vocab and bpe")
81
+ self.vocabulary = read_vocabulary(
82
+ codecs.open(f"{expdir}/vocab/vocab.SRC", encoding="utf-8"), 5
83
+ )
84
+ self.bpe = BPE(
85
+ codecs.open(f"{expdir}/vocab/bpe_codes.32k.SRC", encoding="utf-8"),
86
+ -1,
87
+ "@@",
88
+ self.vocabulary,
89
+ None,
90
+ )
91
+
92
+ print("Initializing model for translation")
93
+ # initialize the model
94
+ self.translator = Translator(
95
+ f"{expdir}/final_bin", f"{expdir}/model/checkpoint_best.pt", batch_size=100
96
+ )
97
+
98
+ # translate a batch of sentences from src_lang to tgt_lang
99
+ def batch_translate(self, batch, src_lang, tgt_lang):
100
+
101
+ assert isinstance(batch, list)
102
+ preprocessed_sents = self.preprocess(batch, lang=src_lang)
103
+ bpe_sents = self.apply_bpe(preprocessed_sents)
104
+ tagged_sents = apply_lang_tags(bpe_sents, src_lang, tgt_lang)
105
+ tagged_sents = truncate_long_sentences(tagged_sents)
106
+
107
+ translations = self.translator.translate(tagged_sents)
108
+ postprocessed_sents = self.postprocess(translations, tgt_lang)
109
+
110
+ return postprocessed_sents
111
+
112
+ # translate a paragraph from src_lang to tgt_lang
113
+ def translate_paragraph(self, paragraph, src_lang, tgt_lang):
114
+
115
+ assert isinstance(paragraph, str)
116
+ sents = split_sentences(paragraph, src_lang)
117
+
118
+ postprocessed_sents = self.batch_translate(sents, src_lang, tgt_lang)
119
+
120
+ translated_paragraph = " ".join(postprocessed_sents)
121
+
122
+ return translated_paragraph
123
+
124
+ def preprocess_sent(self, sent, normalizer, lang):
125
+ if lang == "en":
126
+ return " ".join(
127
+ self.en_tok.tokenize(
128
+ self.en_normalizer.normalize(sent.strip()), escape=False
129
+ )
130
+ )
131
+ else:
132
+ # line = indic_detokenize.trivial_detokenize(line.strip(), lang)
133
+ return unicode_transliterate.UnicodeIndicTransliterator.transliterate(
134
+ " ".join(
135
+ indic_tokenize.trivial_tokenize(
136
+ normalizer.normalize(sent.strip()), lang
137
+ )
138
+ ),
139
+ lang,
140
+ "hi",
141
+ ).replace(" ् ", "्")
142
+
143
+ def preprocess(self, sents, lang):
144
+ """
145
+ Normalize, tokenize and script-convert (for Indic languages).
146
+ Returns the list of processed sentences.
147
+
148
+ """
149
+
150
+ if lang == "en":
151
+
152
+ # processed_sents = Parallel(n_jobs=-1, backend="multiprocessing")(
153
+ # delayed(preprocess_line)(line, None, lang) for line in tqdm(sents, total=num_lines)
154
+ # )
155
+ processed_sents = [
156
+ self.preprocess_sent(line, None, lang) for line in tqdm(sents)
157
+ ]
158
+
159
+ else:
160
+ normfactory = indic_normalize.IndicNormalizerFactory()
161
+ normalizer = normfactory.get_normalizer(lang)
162
+
163
+ # processed_sents = Parallel(n_jobs=-1, backend="multiprocessing")(
164
+ # delayed(preprocess_line)(line, normalizer, lang) for line in tqdm(infile, total=num_lines)
165
+ # )
166
+ processed_sents = [
167
+ self.preprocess_sent(line, normalizer, lang) for line in tqdm(sents)
168
+ ]
169
+
170
+ return processed_sents
171
+
172
+ def postprocess(self, sents, lang, common_lang="hi"):
173
+ """
174
+ parse fairseq interactive output, convert script back to native Indic script (in case of Indic languages) and detokenize.
175
+
176
+ infname: fairseq log file
177
+ outfname: output file of translation (sentences not translated contain the dummy string 'DUMMY_OUTPUT')
178
+ input_size: expected number of output sentences
179
+ lang: language
180
+ """
181
+ postprocessed_sents = []
182
+
183
+ if lang == "en":
184
+ for sent in sents:
185
+ # outfile.write(en_detok.detokenize(sent.split(" ")) + "\n")
186
+ postprocessed_sents.append(self.en_detok.detokenize(sent.split(" ")))
187
+ else:
188
+ for sent in sents:
189
+ outstr = indic_detokenize.trivial_detokenize(
190
+ self.xliterator.transliterate(sent, common_lang, lang), lang
191
+ )
192
+ # outfile.write(outstr + "\n")
193
+ postprocessed_sents.append(outstr)
194
+ return postprocessed_sents
195
+
196
+ def apply_bpe(self, sents):
197
+
198
+ return [self.bpe.process_line(sent) for sent in sents]
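A minimal usage sketch of the Model class above, mirroring how api/api.py and app.py drive it; the expdir is whatever directory contains vocab/, final_bin/ and model/checkpoint_best.pt.

model = Model(expdir="models/v3/en-indic")
# Paragraph-level translation (splits into sentences internally, then batch-translates).
print(model.translate_paragraph("This is a test. It has two sentences.", "en", "hi"))
# Sentence-level batch translation.
print(model.batch_translate(["Hello world.", "How are you?"], "en", "hi"))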
legacy/apply_bpe_test_valid_notag.sh ADDED
@@ -0,0 +1,33 @@
1
+ #!/bin/bash
2
+
3
+ expdir=$1 # EXPDIR
4
+ org_data_dir=$2
5
+ langs=$3
6
+
7
+ #`dirname $0`/env.sh
8
+ SUBWORD_NMT_DIR="subword-nmt"
9
+ echo "Apply to each language"
10
+
11
+ for dset in `echo test dev`
12
+ do
13
+ echo $dset
14
+
15
+ in_dset_dir="$org_data_dir/$dset"
16
+ out_dset_dir="$expdir/bpe/$dset"
17
+
18
+ for lang in $langs
19
+ do
20
+
21
+ echo Apply BPE for $dset "-" $lang
22
+
23
+ mkdir -p $out_dset_dir
24
+
25
+ python $SUBWORD_NMT_DIR/subword_nmt/apply_bpe.py \
26
+ -c $expdir/vocab/bpe_codes.32k.SRC_TGT \
27
+ --vocabulary $expdir/vocab/vocab.SRC \
28
+ --vocabulary-threshold 5 \
29
+ < $in_dset_dir/$dset.$lang \
30
+ > $out_dset_dir/$dset.$lang
31
+
32
+ done
33
+ done
legacy/apply_bpe_train_notag.sh ADDED
@@ -0,0 +1,33 @@
1
+ #!/bin/bash
2
+
3
+ expdir=$1 # EXPDIR
4
+
5
+ #`dirname $0`/env.sh
6
+ SUBWORD_NMT_DIR="subword-nmt"
7
+
8
+ data_dir="$expdir/data"
9
+ train_file=$data_dir/train
10
+ bpe_file=$expdir/bpe/train/train
11
+
12
+ mkdir -p $expdir/bpe/train
13
+
14
+ echo "Apply to SRC corpus"
15
+
16
+ python $SUBWORD_NMT_DIR/subword_nmt/apply_bpe.py \
17
+ -c $expdir/vocab/bpe_codes.32k.SRC_TGT \
18
+ --vocabulary $expdir/vocab/vocab.SRC \
19
+ --vocabulary-threshold 5 \
20
+ --num-workers "-1" \
21
+ < $train_file.SRC \
22
+ > $bpe_file.SRC
23
+
24
+ echo "Apply to TGT corpus"
25
+
26
+ python $SUBWORD_NMT_DIR/subword_nmt/apply_bpe.py \
27
+ -c $expdir/vocab/bpe_codes.32k.SRC_TGT \
28
+ --vocabulary $expdir/vocab/vocab.TGT \
29
+ --vocabulary-threshold 5 \
30
+ --num-workers "-1" \
31
+ < $train_file.TGT \
32
+ > $bpe_file.TGT
33
+
legacy/env.sh ADDED
@@ -0,0 +1,17 @@
1
+
2
+ export SRC=''
3
+
4
+ ## Python env directory where fairseq is installed
5
+ export PYTHON_ENV=''
6
+
7
+ export SUBWORD_NMT_DIR=''
8
+ export INDIC_RESOURCES_PATH=''
9
+ export INDIC_NLP_HOME=''
10
+
11
+ export CUDA_HOME=''
12
+
13
+ export PATH=$CUDA_HOME/bin:$INDIC_NLP_HOME:$PATH
14
+ export LD_LIBRARY_PATH=$CUDA_HOME/lib64
15
+
16
+ # set environment variable to control GPUS visible to the application
17
+ # export CUDA_VISIBLE_DEVICES=""
legacy/indictrans_workflow.ipynb ADDED
@@ -0,0 +1,643 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "import os\n",
10
+ "import random\n",
11
+ "from tqdm.notebook import tqdm\n",
12
+ "from sacremoses import MosesPunctNormalizer\n",
13
+ "from sacremoses import MosesTokenizer\n",
14
+ "from sacremoses import MosesDetokenizer\n",
15
+ "from collections import defaultdict\n",
16
+ "import sacrebleu"
17
+ ]
18
+ },
19
+ {
20
+ "cell_type": "code",
21
+ "execution_count": null,
22
+ "metadata": {},
23
+ "outputs": [],
24
+ "source": [
25
+ "# The path to the local git repo for Indic NLP library\n",
26
+ "INDIC_NLP_LIB_HOME=\"\"\n",
27
+ "\n",
28
+ "# The path to the local git repo for Indic NLP Resources\n",
29
+ "INDIC_NLP_RESOURCES=\"\"\n",
30
+ "\n",
31
+ "import sys\n",
32
+ "sys.path.append(r'{}'.format(INDIC_NLP_LIB_HOME))\n",
33
+ "\n",
34
+ "from indicnlp import common\n",
35
+ "common.set_resources_path(INDIC_NLP_RESOURCES)\n",
36
+ "\n",
37
+ "from indicnlp import loader\n",
38
+ "loader.load()"
39
+ ]
40
+ },
41
+ {
42
+ "cell_type": "code",
43
+ "execution_count": null,
44
+ "metadata": {},
45
+ "outputs": [],
46
+ "source": [
47
+ "import indicnlp\n",
48
+ "from indicnlp.tokenize import indic_tokenize\n",
49
+ "from indicnlp.tokenize import indic_detokenize\n",
50
+ "from indicnlp.normalize import indic_normalize\n",
51
+ "from indicnlp.transliterate import unicode_transliterate"
52
+ ]
53
+ },
54
+ {
55
+ "cell_type": "code",
56
+ "execution_count": null,
57
+ "metadata": {},
58
+ "outputs": [],
59
+ "source": [
60
+ "LANGS=[\n",
61
+ " \"bn\",\n",
62
+ " \"gu\",\n",
63
+ " \"hi\",\n",
64
+ " \"kn\",\n",
65
+ " \"ml\",\n",
66
+ " \"mr\",\n",
67
+ " \"or\",\n",
68
+ " \"pa\",\n",
69
+ " \"ta\",\n",
70
+ " \"te\", \n",
71
+ "]"
72
+ ]
73
+ },
74
+ {
75
+ "cell_type": "code",
76
+ "execution_count": null,
77
+ "metadata": {},
78
+ "outputs": [],
79
+ "source": [
80
+ "def preprocess(infname,outfname,lang):\n",
81
+ " \"\"\"\n",
82
+ " Preparing each corpus file: \n",
83
+ " - Normalization\n",
84
+ " - Tokenization \n",
85
+ " - Script coversion to Devanagari for Indic scripts\n",
86
+ " \"\"\"\n",
87
+ " \n",
88
+ " ### reading \n",
89
+ " with open(infname,'r',encoding='utf-8') as infile, \\\n",
90
+ " open(outfname,'w',encoding='utf-8') as outfile:\n",
91
+ " \n",
92
+ " if lang=='en':\n",
93
+ " en_tok=MosesTokenizer(lang='en')\n",
94
+ " en_normalizer = MosesPunctNormalizer()\n",
95
+ " for line in tqdm(infile): \n",
96
+ " outline=' '.join(\n",
97
+ " en_tok.tokenize( \n",
98
+ " en_normalizer.normalize(line.strip()), \n",
99
+ " escape=False ) )\n",
100
+ " outfile.write(outline+'\\n')\n",
101
+ " \n",
102
+ " else:\n",
103
+ " normfactory=indic_normalize.IndicNormalizerFactory()\n",
104
+ " normalizer=normfactory.get_normalizer(lang)\n",
105
+ " for line in tqdm(infile): \n",
106
+ " outline=unicode_transliterate.UnicodeIndicTransliterator.transliterate(\n",
107
+ " ' '.join(\n",
108
+ " indic_tokenize.trivial_tokenize(\n",
109
+ " normalizer.normalize(line.strip()), lang) ), lang, 'hi').replace(' ् ','्')\n",
110
+ "\n",
111
+ "\n",
112
+ " outfile.write(outline+'\\n')"
113
+ ]
114
+ },
115
+ {
116
+ "cell_type": "code",
117
+ "execution_count": null,
118
+ "metadata": {},
119
+ "outputs": [],
120
+ "source": [
121
+ "def add_token(sent, tag_infos):\n",
122
+ " \"\"\" add special tokens specified by tag_infos to each element in list\n",
123
+ "\n",
124
+ " tag_infos: list of tuples (tag_type,tag)\n",
125
+ "\n",
126
+ " each tag_info results in a token of the form: __{tag_type}__{tag}__\n",
127
+ "\n",
128
+ " \"\"\"\n",
129
+ "\n",
130
+ " tokens=[]\n",
131
+ " for tag_type, tag in tag_infos:\n",
132
+ " token = '__' + tag_type + '__' + tag + '__'\n",
133
+ " tokens.append(token)\n",
134
+ "\n",
135
+ " return ' '.join(tokens) + ' ' + sent \n",
136
+ "\n",
137
+ "\n",
138
+ "def concat_data(data_dir, outdir, lang_pair_list, out_src_lang='SRC', out_trg_lang='TGT'):\n",
139
+ " \"\"\"\n",
140
+ " data_dir: input dir, contains directories for language pairs named l1-l2\n",
141
+ " \"\"\"\n",
142
+ " os.makedirs(outdir,exist_ok=True)\n",
143
+ "\n",
144
+ " out_src_fname='{}/train.{}'.format(outdir,out_src_lang)\n",
145
+ " out_trg_fname='{}/train.{}'.format(outdir,out_trg_lang)\n",
146
+ "# out_meta_fname='{}/metadata.txt'.format(outdir)\n",
147
+ "\n",
148
+ " print()\n",
149
+ " print(out_src_fname)\n",
150
+ " print(out_trg_fname)\n",
151
+ "# print(out_meta_fname)\n",
152
+ "\n",
153
+ " ### concatenate train data \n",
154
+ " if os.path.isfile(out_src_fname):\n",
155
+ " os.unlink(out_src_fname)\n",
156
+ " if os.path.isfile(out_trg_fname):\n",
157
+ " os.unlink(out_trg_fname)\n",
158
+ "# if os.path.isfile(out_meta_fname):\n",
159
+ "# os.unlink(out_meta_fname)\n",
160
+ "\n",
161
+ " for src_lang, trg_lang in tqdm(lang_pair_list):\n",
162
+ " print('src: {}, tgt:{}'.format(src_lang,trg_lang)) \n",
163
+ "\n",
164
+ " in_src_fname='{}/{}-{}/train.{}'.format(data_dir,src_lang,trg_lang,src_lang)\n",
165
+ " in_trg_fname='{}/{}-{}/train.{}'.format(data_dir,src_lang,trg_lang,trg_lang)\n",
166
+ "\n",
167
+ " print(in_src_fname)\n",
168
+ " os.system('cat {} >> {}'.format(in_src_fname,out_src_fname))\n",
169
+ "\n",
170
+ " print(in_trg_fname)\n",
171
+ " os.system('cat {} >> {}'.format(in_trg_fname,out_trg_fname)) \n",
172
+ " \n",
173
+ " \n",
174
+ "# with open('{}/lang_pairs.txt'.format(outdir),'w',encoding='utf-8') as lpfile: \n",
175
+ "# lpfile.write('\\n'.join( [ '-'.join(x) for x in lang_pair_list ] ))\n",
176
+ " \n",
177
+ " corpus_stats(data_dir, outdir, lang_pair_list)\n",
178
+ " \n",
179
+ "def corpus_stats(data_dir, outdir, lang_pair_list):\n",
180
+ " \"\"\"\n",
181
+ " data_dir: input dir, contains directories for language pairs named l1-l2\n",
182
+ " \"\"\"\n",
183
+ "\n",
184
+ " with open('{}/lang_pairs.txt'.format(outdir),'w',encoding='utf-8') as lpfile: \n",
185
+ "\n",
186
+ " for src_lang, trg_lang in tqdm(lang_pair_list):\n",
187
+ " print('src: {}, tgt:{}'.format(src_lang,trg_lang)) \n",
188
+ "\n",
189
+ " in_src_fname='{}/{}-{}/train.{}'.format(data_dir,src_lang,trg_lang,src_lang)\n",
190
+ " # in_trg_fname='{}/{}-{}/train.{}'.format(data_dir,src_lang,trg_lang,trg_lang)\n",
191
+ "\n",
192
+ " print(in_src_fname)\n",
193
+ " corpus_size=0\n",
194
+ " with open(in_src_fname,'r',encoding='utf-8') as infile:\n",
195
+ " corpus_size=sum(map(lambda x:1,infile))\n",
196
+ " \n",
197
+ " lpfile.write('{}\\t{}\\t{}\\n'.format(src_lang,trg_lang,corpus_size))\n",
198
+ " \n",
199
+ "def generate_lang_tag_iterator(infname):\n",
200
+ " with open(infname,'r',encoding='utf-8') as infile:\n",
201
+ " for line in infile:\n",
202
+ " src,tgt,count=line.strip().split('\\t')\n",
203
+ " count=int(count)\n",
204
+ " for _ in range(count):\n",
205
+ " yield (src,tgt) "
206
+ ]
207
+ },
208
+ {
209
+ "cell_type": "code",
210
+ "execution_count": null,
211
+ "metadata": {},
212
+ "outputs": [],
213
+ "source": [
214
+ "#### directory containing all experiments \n",
215
+ "## one directory per experiment \n",
216
+ "EXPBASEDIR=''\n",
217
+ "\n",
218
+ "### directory containing data\n",
219
+ "## contains 3 directories: train test dev\n",
220
+ "## train directory structure: \n",
221
+ "## - There is one directory for each language pair\n",
222
+ "## - Directory naming convention lang1-lang2 (you need another directory/softlink for lang2-lang1)\n",
223
+ "## - Each directory contains 6 files: {train,test,dev}.{lang1,lang2}\n",
224
+ "## test & dev directory structure \n",
225
+ "## - test: contains files {test.l1,test.l2,test.l3} - assumes parallel test files like the wat2021 dataset\n",
226
+ "## - valid: contains files {dev.l1,dev.l2,dev.l3} - assumes parallel test files like the wat2021 dataset\n",
227
+ "## All files are tokenized\n",
228
+ "ORG_DATA_DIR='{d}/consolidated_unique_preprocessed'.format(d=BASEDIR)\n",
229
+ "\n"
230
+ ]
231
+ },
232
+ {
233
+ "cell_type": "markdown",
234
+ "metadata": {},
235
+ "source": [
236
+ "# Exp2 (M2O)\n",
237
+ "\n",
238
+ "- All *-en "
239
+ ]
240
+ },
241
+ {
242
+ "cell_type": "markdown",
243
+ "metadata": {},
244
+ "source": [
245
+ "**Params**"
246
+ ]
247
+ },
248
+ {
249
+ "cell_type": "code",
250
+ "execution_count": null,
251
+ "metadata": {},
252
+ "outputs": [],
253
+ "source": [
254
+ "expname='exp2_m2o_baseline'\n",
255
+ "expdir='{}/{}'.format(EXPBASEDIR,expname)\n",
256
+ "\n",
257
+ "lang_pair_list=[]\n",
258
+ "for lang in LANGS: \n",
259
+ " lang_pair_list.append([lang,'en'])"
260
+ ]
261
+ },
262
+ {
263
+ "cell_type": "markdown",
264
+ "metadata": {},
265
+ "source": [
266
+ "**Create Train Corpus**"
267
+ ]
268
+ },
269
+ {
270
+ "cell_type": "code",
271
+ "execution_count": null,
272
+ "metadata": {},
273
+ "outputs": [],
274
+ "source": [
275
+ "indir='{}/train'.format(ORG_DATA_DIR)\n",
276
+ "outdir='{}/data'.format(expdir)\n",
277
+ "\n",
278
+ "# print(lang_pair_list)\n",
279
+ "concat_data(indir,outdir,lang_pair_list)"
280
+ ]
281
+ },
282
+ {
283
+ "cell_type": "markdown",
284
+ "metadata": {},
285
+ "source": [
286
+ "**Learn BPE**"
287
+ ]
288
+ },
289
+ {
290
+ "cell_type": "code",
291
+ "execution_count": null,
292
+ "metadata": {},
293
+ "outputs": [],
294
+ "source": [
295
+ "!echo ./learn_bpe.sh {expdir}"
296
+ ]
297
+ },
298
+ {
299
+ "cell_type": "code",
300
+ "execution_count": null,
301
+ "metadata": {},
302
+ "outputs": [],
303
+ "source": [
304
+ "!echo ./apply_bpe_train_notag.sh {expdir}"
305
+ ]
306
+ },
307
+ {
308
+ "cell_type": "code",
309
+ "execution_count": null,
310
+ "metadata": {},
311
+ "outputs": [],
312
+ "source": [
313
+ "!echo ./apply_bpe_test_valid_notag.sh {expdir} {ORG_DATA_DIR} {'\"'+' '.join(LANGS+['en'])+'\"'}"
314
+ ]
315
+ },
316
+ {
317
+ "cell_type": "markdown",
318
+ "metadata": {},
319
+ "source": [
320
+ "**Add language tags to train**"
321
+ ]
322
+ },
323
+ {
324
+ "cell_type": "code",
325
+ "execution_count": null,
326
+ "metadata": {},
327
+ "outputs": [],
328
+ "source": [
329
+ "dset='train' \n",
330
+ "\n",
331
+ "src_fname='{expdir}/bpe/train/{dset}.SRC'.format(expdir=expdir,dset=dset)\n",
332
+ "tgt_fname='{expdir}/bpe/train/{dset}.TGT'.format(expdir=expdir,dset=dset)\n",
333
+ "meta_fname='{expdir}/data/lang_pairs.txt'.format(expdir=expdir,dset=dset)\n",
334
+ " \n",
335
+ "out_src_fname='{expdir}/final/{dset}.SRC'.format(expdir=expdir,dset=dset)\n",
336
+ "out_tgt_fname='{expdir}/final/{dset}.TGT'.format(expdir=expdir,dset=dset)\n",
337
+ "\n",
338
+ "lang_tag_iterator=generate_lang_tag_iterator(meta_fname)\n",
339
+ "\n",
340
+ "print(expdir)\n",
341
+ "os.makedirs('{expdir}/final'.format(expdir=expdir),exist_ok=True)\n",
342
+ "\n",
343
+ "with open(src_fname,'r',encoding='utf-8') as srcfile, \\\n",
344
+ " open(tgt_fname,'r',encoding='utf-8') as tgtfile, \\\n",
345
+ " open(out_src_fname,'w',encoding='utf-8') as outsrcfile, \\\n",
346
+ " open(out_tgt_fname,'w',encoding='utf-8') as outtgtfile: \n",
347
+ "\n",
348
+ " for (l1,l2), src_sent, tgt_sent in tqdm(zip(lang_tag_iterator, srcfile, tgtfile)):\n",
349
+ " outsrcfile.write(add_token(src_sent.strip(),[('src',l1),('tgt',l2)]) + '\\n' )\n",
350
+ " outtgtfile.write(tgt_sent.strip()+'\\n')"
351
+ ]
352
+ },
353
+ {
354
+ "cell_type": "markdown",
355
+ "metadata": {},
356
+ "source": [
357
+ "**Add language tags to valid**\n",
358
+ "\n",
359
+ "- add language tags, create parallel corpus\n",
360
+ "- sample 20\\% for validation set \n",
361
+ "- Create final validation set"
362
+ ]
363
+ },
364
+ {
365
+ "cell_type": "code",
366
+ "execution_count": null,
367
+ "metadata": {},
368
+ "outputs": [],
369
+ "source": [
370
+ "dset='dev' \n",
371
+ "out_src_fname='{expdir}/final/{dset}.SRC'.format(\n",
372
+ " expdir=expdir,dset=dset)\n",
373
+ "out_tgt_fname='{expdir}/final/{dset}.TGT'.format(\n",
374
+ " expdir=expdir,dset=dset)\n",
375
+ "\n",
376
+ "os.makedirs('{expdir}/final'.format(expdir=expdir),exist_ok=True)\n",
377
+ "\n",
378
+ "print('Processing validation files') \n",
379
+ "consolidated_dset=[]\n",
380
+ "for l1, l2 in tqdm(lang_pair_list):\n",
381
+ " src_fname='{expdir}/bpe/{dset}/{dset}.{lang}'.format(\n",
382
+ " expdir=expdir,dset=dset,lang=l1)\n",
383
+ " tgt_fname='{expdir}/bpe/{dset}/{dset}.{lang}'.format(\n",
384
+ " expdir=expdir,dset=dset,lang=l2)\n",
385
+ "# print(src_fname)\n",
386
+ "# print(os.path.exists(src_fname))\n",
387
+ " with open(src_fname,'r',encoding='utf-8') as srcfile, \\\n",
388
+ " open(tgt_fname,'r',encoding='utf-8') as tgtfile:\n",
389
+ " for src_sent, tgt_sent in zip(srcfile,tgtfile):\n",
390
+ " consolidated_dset.append(\n",
391
+ " ( add_token(src_sent.strip(),[('src',l1),('tgt',l2)]),\n",
392
+ " tgt_sent.strip() )\n",
393
+ " )\n",
394
+ "\n",
395
+ "print('Create validation set') \n",
396
+ "random.shuffle(consolidated_dset)\n",
397
+ "final_set=consolidated_dset[:len(consolidated_dset)//5] \n",
398
+ "\n",
399
+ "print('Original set size: {}'.format(len(consolidated_dset))) \n",
400
+ "print('Sampled set size: {}'.format(len(final_set))) \n",
401
+ "\n",
402
+ "print('Write validation set')\n",
403
+ "\n",
404
+ "with open(out_src_fname,'w',encoding='utf-8') as srcfile, \\\n",
405
+ " open(out_tgt_fname,'w',encoding='utf-8') as tgtfile:\n",
406
+ " for src_sent, tgt_sent in final_set: \n",
407
+ " srcfile.write(src_sent+'\\n')\n",
408
+ " tgtfile.write(tgt_sent+'\\n')\n"
409
+ ]
410
+ },
411
+ {
412
+ "cell_type": "markdown",
413
+ "metadata": {},
414
+ "source": [
415
+ "**Add language tags to test**\n",
416
+ "\n",
417
+ "- add language tags, create parallel corpus all M2O language pairs \n",
418
+ "- Create final test set"
419
+ ]
420
+ },
421
+ {
422
+ "cell_type": "code",
423
+ "execution_count": null,
424
+ "metadata": {},
425
+ "outputs": [],
426
+ "source": [
427
+ "dset='test' \n",
428
+ "out_src_fname='{expdir}/final/{dset}.SRC'.format(\n",
429
+ " expdir=expdir,dset=dset)\n",
430
+ "out_tgt_fname='{expdir}/final/{dset}.TGT'.format(\n",
431
+ " expdir=expdir,dset=dset)\n",
432
+ "\n",
433
+ "os.makedirs('{expdir}/final'.format(expdir=expdir),exist_ok=True)\n",
434
+ "\n",
435
+ "print('Processing test files') \n",
436
+ "consolidated_dset=[]\n",
437
+ "for l1, l2 in tqdm(lang_pair_list):\n",
438
+ " src_fname='{expdir}/bpe/{dset}/{dset}.{lang}'.format(\n",
439
+ " expdir=expdir,dset=dset,lang=l1)\n",
440
+ " tgt_fname='{expdir}/bpe/{dset}/{dset}.{lang}'.format(\n",
441
+ " expdir=expdir,dset=dset,lang=l2)\n",
442
+ "# print(src_fname)\n",
443
+ "# print(os.path.exists(src_fname))\n",
444
+ " with open(src_fname,'r',encoding='utf-8') as srcfile, \\\n",
445
+ " open(tgt_fname,'r',encoding='utf-8') as tgtfile:\n",
446
+ " for src_sent, tgt_sent in zip(srcfile,tgtfile):\n",
447
+ " consolidated_dset.append(\n",
448
+ " ( add_token(src_sent.strip(),[('src',l1),('tgt',l2)]),\n",
449
+ " tgt_sent.strip() )\n",
450
+ " )\n",
451
+ "\n",
452
+ "print('Final set size: {}'.format(len(consolidated_dset))) \n",
453
+ " \n",
454
+ "print('Write test set')\n",
455
+ "print('testset truncated')\n",
456
+ "\n",
457
+ "with open(out_src_fname,'w',encoding='utf-8') as srcfile, \\\n",
458
+ " open(out_tgt_fname,'w',encoding='utf-8') as tgtfile:\n",
459
+ " for lno, (src_sent, tgt_sent) in enumerate(consolidated_dset,1):\n",
460
+ " \n",
461
+ " s=src_sent.strip().split(' ')\n",
462
+ " t=tgt_sent.strip().split(' ')\n",
463
+ " \n",
464
+ " if len(s) > 200 or len(t) > 200:\n",
465
+ " print('exp: {}, pair: ({},{}), lno: {}: lens: ({},{})'.format(expname,l1,l2,lno,len(s),len(t))) \n",
466
+ " \n",
467
+ " src_sent=' '.join( s[:min(len(s),200)] )\n",
468
+ " tgt_sent=' '.join( t[:min(len(t),200)] )\n",
469
+ " \n",
470
+ " srcfile.write(src_sent+'\\n')\n",
471
+ " tgtfile.write(tgt_sent+'\\n')"
472
+ ]
473
+ },
474
+ {
475
+ "cell_type": "markdown",
476
+ "metadata": {},
477
+ "source": [
478
+ "**Binarize data**"
479
+ ]
480
+ },
481
+ {
482
+ "cell_type": "code",
483
+ "execution_count": null,
484
+ "metadata": {},
485
+ "outputs": [],
486
+ "source": [
487
+ "!echo ./binarize_training_exp.sh {expdir} SRC TGT"
488
+ ]
489
+ },
490
+ {
491
+ "cell_type": "markdown",
492
+ "metadata": {},
493
+ "source": [
494
+ "**Training Command**"
495
+ ]
496
+ },
497
+ {
498
+ "cell_type": "code",
499
+ "execution_count": null,
500
+ "metadata": {},
501
+ "outputs": [],
502
+ "source": [
503
+ "%%bash \n",
504
+ "\n",
505
+ "python train.py {expdir}/final_bin \\\n",
506
+ " --arch transformer \\\n",
507
+ " --optimizer adam --adam-betas '(0.9, 0.98)' --clip-norm 1.0 \\\n",
508
+ " --lr 0.0005 --lr-scheduler inverse_sqrt --warmup-updates 4000 --warmup-init-lr 1e-07 \\\n",
509
+ " --dropout 0.2 \\\n",
510
+ " --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \\\n",
511
+ " --max-tokens 8192 \\\n",
512
+ " --max-update 1000000 \\\n",
513
+ " --max-source-positions 200 \\\n",
514
+ " --max-target-positions 200 \\\n",
515
+ " --tensorboard-logdir {expdir}/tensorboard \\\n",
516
+ " --save-dir {expdir}/model \\\n",
517
+ " --required-batch-size-multiple 8 \\\n",
518
+ " --save-interval 1 \\\n",
519
+ " --keep-last-epochs 5 \\\n",
520
+ " --patience 5 \\\n",
521
+ " --fp16"
522
+ ]
523
+ },
524
+ {
525
+ "cell_type": "markdown",
526
+ "metadata": {},
527
+ "source": [
528
+ "**Cleanup**"
529
+ ]
530
+ },
531
+ {
532
+ "cell_type": "code",
533
+ "execution_count": null,
534
+ "metadata": {},
535
+ "outputs": [],
536
+ "source": [
537
+ "# os.unlink('{}')\n",
538
+ "\n",
539
+ "to_delete=[\n",
540
+ " '{expdir}/data/train.SRC'.format(expdir=expdir,dset=dset),\n",
541
+ " '{expdir}/data/train.TGT'.format(expdir=expdir,dset=dset),\n",
542
+ " '{expdir}/bpe/train/train.SRC'.format(expdir=expdir,dset=dset),\n",
543
+ " '{expdir}/bpe/train/train.TGT'.format(expdir=expdir,dset=dset),\n",
544
+ "]`\n",
545
+ "\n",
546
+ "for fname in to_delete:\n",
547
+ " os.unlink(fname)"
548
+ ]
549
+ },
550
+ {
551
+ "cell_type": "markdown",
552
+ "metadata": {},
553
+ "source": [
554
+ "**Evaluation**"
555
+ ]
556
+ },
557
+ {
558
+ "cell_type": "code",
559
+ "execution_count": null,
560
+ "metadata": {},
561
+ "outputs": [],
562
+ "source": [
563
+ "dset='test' \n",
564
+ "consolidated_testoutput_fname='{expdir}/evaluations/test/default/test.SRC_TGT.TGT'.format(expdir=expdir)\n",
565
+ "consolidated_testoutput_log_fname='{}.log'.format(consolidated_testoutput_fname)\n",
566
+ "metrics_fname='{expdir}/evaluations/test/default/test.metrics.tsv'.format(expdir=expdir)\n",
567
+ " \n",
568
+ "test_set_size=2390\n",
569
+ "\n",
570
+ "consolidated_testoutput=[]\n",
571
+ "with open(consolidated_testoutput_log_fname,'r',encoding='utf-8') as hypfile:\n",
572
+ " consolidated_testoutput= list(map(lambda x: x.strip(), filter(lambda x: x.startswith('H-'),hypfile) ))\n",
573
+ " consolidated_testoutput.sort(key=lambda x: int(x.split('\\t')[0].split('-')[1]))\n",
574
+ " consolidated_testoutput=[ x.split('\\t')[2] for x in consolidated_testoutput ]\n",
575
+ "\n",
576
+ "os.makedirs('{expdir}/evaluations/test/default'.format(expdir=expdir),exist_ok=True)\n",
577
+ "\n",
578
+ "with open(consolidated_testoutput_fname,'w',encoding='utf-8') as finalhypfile:\n",
579
+ " for sent in consolidated_testoutput:\n",
580
+ " finalhypfile.write(sent+'\\n')\n",
581
+ "\n",
582
+ "print('Processing test files') \n",
583
+ "with open(metrics_fname,'w',encoding='utf-8') as metrics_file: \n",
584
+ " for i, (l1, l2) in enumerate(tqdm(lang_pair_list)):\n",
585
+ "\n",
586
+ " start=i*test_set_size\n",
587
+ " end=(i+1)*test_set_size\n",
588
+ " hyps=consolidated_testoutput[start:end]\n",
589
+ " ref_fname='{expdir}/{dset}/{dset}.{lang}'.format(\n",
590
+ " expdir=ORG_DATA_DIR,dset=dset,lang=l2)\n",
591
+ "\n",
592
+ " refs=[]\n",
593
+ " with open(ref_fname,'r',encoding='utf-8') as reffile:\n",
594
+ " refs.extend(map(lambda x:x.strip(),reffile))\n",
595
+ "\n",
596
+ " assert(len(hyps)==len(refs))\n",
597
+ "\n",
598
+ " bleu=sacrebleu.corpus_bleu(hyps,[refs],tokenize='none')\n",
599
+ "\n",
600
+ " print('{} {} {} {}'.format(l1,l2,bleu.score,bleu.prec_str))\n",
601
+ " metrics_file.write('{}\\t{}\\t{}\\t{}\\t{}\\n'.format(expname,l1,l2,bleu.score,bleu.prec_str))\n",
602
+ " "
603
+ ]
604
+ }
605
+ ],
606
+ "metadata": {
607
+ "kernelspec": {
608
+ "display_name": "Python 3",
609
+ "language": "python",
610
+ "name": "python3"
611
+ },
612
+ "language_info": {
613
+ "codemirror_mode": {
614
+ "name": "ipython",
615
+ "version": 3
616
+ },
617
+ "file_extension": ".py",
618
+ "mimetype": "text/x-python",
619
+ "name": "python",
620
+ "nbconvert_exporter": "python",
621
+ "pygments_lexer": "ipython3",
622
+ "version": "3.7.0"
623
+ },
624
+ "toc": {
625
+ "base_numbering": 1,
626
+ "nav_menu": {
627
+ "height": "243.993px",
628
+ "width": "160px"
629
+ },
630
+ "number_sections": true,
631
+ "sideBar": true,
632
+ "skip_h1_title": false,
633
+ "title_cell": "Table of Contents",
634
+ "title_sidebar": "Contents",
635
+ "toc_cell": false,
636
+ "toc_position": {},
637
+ "toc_section_display": true,
638
+ "toc_window_display": false
639
+ }
640
+ },
641
+ "nbformat": 4,
642
+ "nbformat_minor": 4
643
+ }
legacy/install_fairseq.sh ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #NVIDIA CUDA download
2
+ wget "https://developer.nvidia.com/compute/cuda/10.0/Prod/local_installers/cuda_10.0.130_410.48_linux"
3
+ wget "http://developer.download.nvidia.com/compute/cuda/10.0/Prod/patches/1/cuda_10.0.130.1_linux.run"
4
+
5
+ ## do not install drivers (See this: https://docs.nvidia.com/deploy/cuda-compatibility/index.html)
6
+ sudo sh "cuda_10.0.130_410.48_linux"
7
+ sudo sh "cuda_10.0.130.1_linux.run"
8
+
9
+ #Set environment variables
10
+ export CUDA_HOME=/usr/local/cuda-10.0
11
+ export PATH=$CUDA_HOME/bin:$PATH
12
+ export LD_LIBRARY_PATH=$CUDA_HOME/lib64:$LD_LIBRARY_PATH
13
+
14
+ # Install pytorch 1.2
15
+ python3 -m venv pytorch1.2
16
+ source pytorch1.2/bin/activate
17
+ which pip3
18
+ pip3 install torch==1.2.0 torchvision==0.4.0
19
+
20
+ # Install nccl
21
+ git clone https://github.com/NVIDIA/nccl.git
22
+ cd nccl
23
+ make src.build CUDA_HOME=$CUDA_HOME
24
+ sudo apt install build-essential devscripts debhelper fakeroot
25
+ make pkg.debian.build CUDA_HOME=$CUDA_HOME
26
+ sudo dpkg -i build/pkg/deb/libnccl2_2.7.8-1+cuda10.0_amd64.deb
27
+ sudo dpkg -i build/pkg/deb/libnccl-dev_2.7.8-1+cuda10.0_amd64.deb
28
+ sudo apt-get install -f
29
+
30
+ # Install Apex
31
+ git clone https://github.com/NVIDIA/apex
32
+ cd apex
33
+ pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" \
34
+ --global-option="--deprecated_fused_adam" --global-option="--xentropy" \
35
+ --global-option="--fast_multihead_attn" ./
36
+
37
+ # Install PyArrow
38
+ pip install pyarrow
39
+
40
+ # Install fairseq (run this from inside your fairseq source checkout)
41
+ pip install --editable ./
42
+
43
+ # Install other dependencies
44
+ pip install sacrebleu
45
+ pip install tensorboardX --user
legacy/run_inference.sh ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ src_lang=${1:-hi}
2
+ tgt_lang=${2:-en}
3
+ bucket_path=${3:-gs://ai4b-anuvaad-nmt/baselines/transformer-base/baselines-${src_lang}-${tgt_lang}}
4
+
5
+ expdir=../baselines/baselines-${src_lang}-${tgt_lang}
6
+
7
+ if [[ -d $expdir ]]
8
+ then
9
+ echo "$expdir exists on your filesystem. Please delete this if you have made some changes to the bucket files and trying to redownload"
10
+ else
11
+ mkdir -p $expdir
12
+ mkdir -p $expdir/model
13
+ cd ../baselines
14
+ gsutil -m cp -r $bucket_path/vocab $expdir
15
+ gsutil -m cp -r $bucket_path/final_bin $expdir
16
+ gsutil -m cp $bucket_path/model/checkpoint_best.pt $expdir/model
17
+ cd ../indicTrans
18
+ fi
19
+
20
+
21
+ if [ $src_lang == 'hi' ] || [ $tgt_lang == 'hi' ]; then
22
+ #TEST_SETS=( wmt-news wat2021-devtest wat2020-devtest anuvaad-legal tico19 sap-documentation-benchmark all)
23
+ TEST_SETS=( wat2021-devtest wat2020-devtest wat-2018 wmt-news )
24
+ elif [ $src_lang == 'ta' ] || [ $tgt_lang == 'ta' ]; then
25
+ # TEST_SETS=( wmt-news wat2021-devtest wat2020-devtest anuvaad-legal tico19 all)
26
+ TEST_SETS=( wat2021-devtest wat2020-devtest wat-2018 wmt-news ufal-ta)
27
+ elif [ $src_lang == 'bn' ] || [ $tgt_lang == 'bn' ]; then
28
+ # TEST_SETS=( wat2021-devtest wat2020-devtest anuvaad-legal tico19 all)
29
+ TEST_SETS=( wat2021-devtest wat2020-devtest wat-2018)
30
+ elif [ $src_lang == 'gu' ] || [ $tgt_lang == 'gu' ]; then
31
+ # TEST_SETS=( wmt-news wat2021-devtest wat2020-devtest all)
32
+ TEST_SETS=( wat2021-devtest wat2020-devtest wmt-news )
33
+ elif [ $src_lang == 'as' ] || [ $tgt_lang == 'as' ]; then
34
+ TEST_SETS=( pmi )
35
+ elif [ $src_lang == 'kn' ] || [ $tgt_lang == 'kn' ]; then
36
+ # TEST_SETS=( wat2021-devtest anuvaad-legal all)
37
+ TEST_SETS=( wat2021-devtest )
38
+ elif [ $src_lang == 'ml' ] || [ $tgt_lang == 'ml' ]; then
39
+ # TEST_SETS=( wat2021-devtest wat2020-devtest anuvaad-legal all)
40
+ TEST_SETS=( wat2021-devtest wat2020-devtest wat-2018)
41
+ elif [ $src_lang == 'mr' ] || [ $tgt_lang == 'mr' ]; then
42
+ # TEST_SETS=( wat2021-devtest wat2020-devtest all)
43
+ TEST_SETS=( wat2021-devtest wat2020-devtest )
44
+ elif [ $src_lang == 'or' ] || [ $tgt_lang == 'or' ]; then
45
+ TEST_SETS=( wat2021-devtest )
46
+ elif [ $src_lang == 'pa' ] || [ $tgt_lang == 'pa' ]; then
47
+ TEST_SETS=( wat2021-devtest )
48
+ elif [ $src_lang == 'te' ] || [ $tgt_lang == 'te' ]; then
49
+ # TEST_SETS=( wat2021-devtest wat2020-devtest anuvaad-legal all )
50
+ TEST_SETS=( wat2021-devtest wat2020-devtest wat-2018)
51
+ fi
52
+
53
+ if [ $src_lang == 'en' ]; then
54
+ indic_lang=$tgt_lang
55
+ else
56
+ indic_lang=$src_lang
57
+ fi
58
+
59
+
60
+ for tset in ${TEST_SETS[@]};do
61
+ echo $tset $src_lang $tgt_lang
62
+ if [ $tset == 'wat2021-devtest' ]; then
63
+ SRC_FILE=${expdir}/benchmarks/$tset/test.$src_lang
64
+ REF_FILE=${expdir}/benchmarks/$tset/test.$tgt_lang
65
+ else
66
+ SRC_FILE=${expdir}/benchmarks/$tset/en-${indic_lang}/test.$src_lang
67
+ REF_FILE=${expdir}/benchmarks/$tset/en-${indic_lang}/test.$tgt_lang
68
+ fi
69
+ RESULTS_DIR=${expdir}/results/$tset
70
+
71
+ mkdir -p $RESULTS_DIR
72
+
73
+ bash translate.sh $SRC_FILE $RESULTS_DIR/${src_lang}-${tgt_lang} $src_lang $tgt_lang $expdir $REF_FILE
74
+ # for newline between different outputs
75
+ echo
76
+ done
77
+ # send the results to the bucket
78
+ gsutil -m cp -r $expdir/results $bucket_path
79
+ # clear up the space in the instance
80
+ # rm -r $expdir
legacy/run_joint_inference.sh ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ src_lang=${1:-en}
2
+ tgt_lang=${2:-indic}
3
+ bucket_path=${3:-gs://ai4b-anuvaad-nmt/models/transformer-4x/indictrans-${src_lang}-${tgt_lang}}
4
+
5
+ mkdir -p ../baselines
6
+ expdir=../baselines/baselines-${src_lang}-${tgt_lang}
7
+
8
+ if [[ -d $expdir ]]
9
+ then
10
+ echo "$expdir exists on your filesystem."
11
+ else
12
+ cd ../baselines
13
+ mkdir -p baselines-${src_lang}-${tgt_lang}/model
14
+ mkdir -p baselines-${src_lang}-${tgt_lang}/final_bin
15
+ cd baselines-${src_lang}-${tgt_lang}/model
16
+ gsutil -m cp $bucket_path/model/checkpoint_best.pt .
17
+ cd ..
18
+ gsutil -m cp -r $bucket_path/vocab .
19
+ gsutil -m cp $bucket_path/final_bin/dict.* final_bin
20
+ cd ../indicTrans
21
+ fi
22
+
23
+
24
+
25
+
26
+
27
+ if [ $src_lang == 'hi' ] || [ $tgt_lang == 'hi' ]; then
28
+ TEST_SETS=( wmt-news wat2021-devtest wat2020-devtest anuvaad-legal tico19 sap-documentation-benchmark all)
29
+ elif [ $src_lang == 'ta' ] || [ $tgt_lang == 'ta' ]; then
30
+ TEST_SETS=( wmt-news wat2021-devtest wat2020-devtest anuvaad-legal tico19 all)
31
+ elif [ $src_lang == 'bn' ] || [ $tgt_lang == 'bn' ]; then
32
+ TEST_SETS=( wat2021-devtest wat2020-devtest anuvaad-legal tico19 all)
33
+ elif [ $src_lang == 'gu' ] || [ $tgt_lang == 'gu' ]; then
34
+ TEST_SETS=( wmt-news wat2021-devtest wat2020-devtest all)
35
+ elif [ $src_lang == 'as' ] || [ $tgt_lang == 'as' ]; then
36
+ TEST_SETS=( all )
37
+ elif [ $src_lang == 'kn' ] || [ $tgt_lang == 'kn' ]; then
38
+ TEST_SETS=( wat2021-devtest anuvaad-legal all)
39
+ elif [ $src_lang == 'ml' ] || [ $tgt_lang == 'ml' ]; then
40
+ TEST_SETS=( wat2021-devtest wat2020-devtest anuvaad-legal all)
41
+ elif [ $src_lang == 'mr' ] || [ $tgt_lang == 'mr' ]; then
42
+ TEST_SETS=( wat2021-devtest wat2020-devtest all)
43
+ elif [ $src_lang == 'or' ] || [ $tgt_lang == 'or' ]; then
44
+ TEST_SETS=( all )
45
+ elif [ $src_lang == 'pa' ] || [ $tgt_lang == 'pa' ]; then
46
+ TEST_SETS=( all )
47
+ elif [ $src_lang == 'te' ] || [ $tgt_lang == 'te' ]; then
48
+ TEST_SETS=( wat2021-devtest wat2020-devtest anuvaad-legal all )
49
+ fi
50
+
51
+ if [ $src_lang == 'en' ]; then
52
+ indic_lang=$tgt_lang
53
+ else
54
+ indic_lang=$src_lang
55
+ fi
56
+
57
+
58
+ for tset in ${TEST_SETS[@]};do
59
+ echo $tset $src_lang $tgt_lang
60
+ if [ $tset == 'wat2021-devtest' ]; then
61
+ SRC_FILE=${expdir}/devtest/$tset/test.$src_lang
62
+ REF_FILE=${expdir}/devtest/$tset/test.$tgt_lang
63
+ else
64
+ SRC_FILE=${expdir}/devtest/$tset/en-${indic_lang}/test.$src_lang
65
+ REF_FILE=${expdir}/devtest/$tset/en-${indic_lang}/test.$tgt_lang
66
+ fi
67
+ RESULTS_DIR=${expdir}/results/$tset
68
+
69
+ mkdir -p $RESULTS_DIR
70
+
71
+ bash joint_translate.sh $SRC_FILE $RESULTS_DIR/${src_lang}-${tgt_lang} $src_lang $tgt_lang $expdir $REF_FILE
72
+ # for newline between different outputs
73
+ echo
74
+ done
legacy/tpu_training_instructions.md ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ## Instructions to run on Google cloud TPUs
2
+ Before starting these steps, make sure to prepare the dataset (normalization -> bpe -> .. -> binarization) by following the steps in the indicTrans workflow, or run those steps on a CPU instance before launching the TPU instance (to save time and costs).
3
+
4
+ ### Creating TPU instance
5
+
6
+ - Create a cpu instance on gcp with `torch-xla` image like:
7
+ ```bash
8
+ gcloud compute --project=${PROJECT_ID} instances create <name for your instance> \
9
+ --zone=<zone> \
10
+ --machine-type=n1-standard-16 \
11
+ --image-family=torch-xla \
12
+ --image-project=ml-images \
13
+ --boot-disk-size=200GB \
14
+ --scopes=https://www.googleapis.com/auth/cloud-platform
15
+ ```
16
+ - Once the instance is created, launch a Cloud TPU (from your CPU VM instance) using the following command (you can change the `accelerator-type` according to your needs):
17
+ ```bash
18
+ gcloud compute tpus create <name for your TPU> \
19
+ --zone=<zone> \
20
+ --network=default \
21
+ --version=pytorch-1.7 \
22
+ --accelerator-type=v3-8
23
+ ```
24
+ (or)
25
+ Create a new tpu using the GUI in https://console.cloud.google.com/compute/tpus and make sure to select `version` as `pytorch 1.7`.
26
+
27
+ - Once the TPU is launched, identify its IP address:
28
+ ```bash
29
+ # you can run this inside cpu instance and note down the IP address which is located under the NETWORK_ENDPOINTS column
30
+ gcloud compute tpus list --zone=us-central1-a
31
+ ```
32
+ (or)
33
+ Go to https://console.cloud.google.com/compute/tpus and note down the IP address for the created TPU from the `Internal IP` column
34
+
35
+ ### Installing Fairseq, getting data on the cpu instance
36
+
37
+ - Activate the `torch-xla-1.7` conda environment and install the necessary libraries for IndicTrans (**excluding fairseq**):
38
+ ```bash
39
+ conda activate torch-xla-1.7
40
+ pip install sacremoses pandas mock sacrebleu tensorboardX pyarrow
41
+ ```
42
+ - Configure environment variables for TPU:
43
+ ```bash
44
+ export TPU_IP_ADDRESS=ip-address; \
45
+ export XRT_TPU_CONFIG="tpu_worker;0;$TPU_IP_ADDRESS:8470"
46
+ ```
47
+ - Download the prepared binarized data for FairSeq
48
+
49
+ - Clone Fairseq (which supports TPUs) and install it from source. There is an [issue](https://github.com/pytorch/fairseq/issues/3259) with the latest commit, so we check out an older commit before installing (this may have been fixed on the latest master, but we have not tested it):
50
+ ```bash
51
+ git clone https://github.com/pytorch/fairseq.git
+ cd fairseq
52
+ git checkout da9eaba12d82b9bfc1442f0e2c6fc1b895f4d35d
53
+ pip install --editable ./
54
+ ```
55
+
56
+ - Start TPU training
57
+ ```bash
58
+ # this is for using all tpu cores
59
+ export MKL_SERVICE_FORCE_INTEL=1
60
+
61
+ fairseq-train {expdir}/exp2_m2o_baseline/final_bin \
62
+ --max-source-positions=200 \
63
+ --max-target-positions=200 \
64
+ --max-update=1000000 \
65
+ --save-interval=5 \
66
+ --arch=transformer \
67
+ --attention-dropout=0.1 \
68
+ --criterion=label_smoothed_cross_entropy \
69
+ --source-lang=SRC \
70
+ --lr-scheduler=inverse_sqrt \
71
+ --skip-invalid-size-inputs-valid-test \
72
+ --target-lang=TGT \
73
+ --label-smoothing=0.1 \
74
+ --update-freq=1 \
75
+ --optimizer adam \
76
+ --adam-betas '(0.9, 0.98)' \
77
+ --warmup-init-lr 1e-07 \
78
+ --lr 0.0005 \
79
+ --warmup-updates 4000 \
80
+ --dropout 0.2 \
81
+ --weight-decay 0.0 \
82
+ --tpu \
83
+ --distributed-world-size 8 \
84
+ --max-tokens 8192 \
85
+ --num-batch-buckets 8 \
86
+ --tensorboard-logdir {expdir}/exp2_m2o_baseline/tensorboard \
87
+ --save-dir {expdir}/exp2_m2o_baseline/model \
88
+ --keep-last-epochs 5 \
89
+ --patience 5
90
+ ```
91
+
92
+ **Note**: While training, we noticed that training on TPUs was slower than using multiple GPUs. We have documented some issues and [filed an issue](https://github.com/pytorch/fairseq/issues/3317) at the fairseq repo for advice. We'll update this section as we learn more about efficient training on TPUs. Also, feel free to open an issue/pull request if you find a bug or know an efficient method to make the code train faster on TPUs.
legacy/translate.sh ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+ echo `date`
3
+ infname=$1
4
+ outfname=$2
5
+ src_lang=$3
6
+ tgt_lang=$4
7
+ exp_dir=$5
8
+ ref_fname=$6
9
+
10
+ if [ $src_lang == 'en' ]; then
11
+ SRC_PREFIX='TGT'
12
+ TGT_PREFIX='SRC'
13
+ else
14
+ SRC_PREFIX='SRC'
15
+ TGT_PREFIX='TGT'
16
+ fi
17
+
18
+ #`dirname $0`/env.sh
19
+ SUBWORD_NMT_DIR='subword-nmt'
20
+ model_dir=$exp_dir/model
21
+ data_bin_dir=$exp_dir/final_bin
22
+
23
+ ### normalization and script conversion
24
+
25
+ echo "Applying normalization and script conversion"
26
+ input_size=`python preprocess_translate.py $infname $outfname.norm $src_lang`
27
+ echo "Number of sentences in input: $input_size"
28
+
29
+ ### apply BPE to input file
30
+
31
+ echo "Applying BPE"
32
+ python $SUBWORD_NMT_DIR/subword_nmt/apply_bpe.py \
33
+ -c $exp_dir/vocab/bpe_codes.32k.${SRC_PREFIX}_${TGT_PREFIX} \
34
+ --vocabulary $exp_dir/vocab/vocab.$SRC_PREFIX \
35
+ --vocabulary-threshold 5 \
36
+ < $outfname.norm \
37
+ > $outfname.bpe
38
+
39
+ # not needed for joint training
40
+ # echo "Adding language tags"
41
+ # python add_tags_translate.py $outfname._bpe $outfname.bpe $src_lang $tgt_lang
42
+
43
+ ### run decoder
44
+
45
+ echo "Decoding"
46
+
47
+ src_input_bpe_fname=$outfname.bpe
48
+ tgt_output_fname=$outfname
49
+ fairseq-interactive $data_bin_dir \
50
+ -s $SRC_PREFIX -t $TGT_PREFIX \
51
+ --distributed-world-size 1 \
52
+ --path $model_dir/checkpoint_best.pt \
53
+ --batch-size 64 --buffer-size 2500 --beam 5 --remove-bpe \
54
+ --skip-invalid-size-inputs-valid-test \
55
+ --input $src_input_bpe_fname > $tgt_output_fname.log 2>&1
56
+
57
+
58
+ echo "Extracting translations, script conversion and detokenization"
59
+ python postprocess_translate.py $tgt_output_fname.log $tgt_output_fname $input_size $tgt_lang
60
+ if [ $src_lang == 'en' ]; then
61
+ # indicnlp tokenize the output files before evaluation
62
+ input_size=`python preprocess_translate.py $ref_fname $ref_fname.tok $tgt_lang`
63
+ input_size=`python preprocess_translate.py $tgt_output_fname $tgt_output_fname.tok $tgt_lang`
64
+ sacrebleu --tokenize none $ref_fname.tok < $tgt_output_fname.tok
65
+ else
66
+ # indic to en models
67
+ sacrebleu $ref_fname < $tgt_output_fname
68
+ fi
69
+ echo `date`
70
+ echo "Translation completed"
model_configs/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ from . import custom_transformer
model_configs/custom_transformer.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fairseq.models import register_model_architecture
2
+ from fairseq.models.transformer import base_architecture
3
+
4
+
5
+ @register_model_architecture("transformer", "transformer_2x")
6
+ def transformer_big(args):
7
+ args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 1024)
8
+ args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 4096)
9
+ args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 16)
10
+ args.encoder_normalize_before = getattr(args, "encoder_normalize_before", False)
11
+ args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 1024)
12
+ args.decoder_ffn_embed_dim = getattr(args, "decoder_ffn_embed_dim", 4096)
13
+ args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 16)
14
+ base_architecture(args)
15
+
16
+
17
+ @register_model_architecture("transformer", "transformer_4x")
18
+ def transformer_huge(args):
19
+ args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 1536)
20
+ args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 4096)
21
+ args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 16)
22
+ args.encoder_normalize_before = getattr(args, "encoder_normalize_before", False)
23
+ args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 1536)
24
+ args.decoder_ffn_embed_dim = getattr(args, "decoder_ffn_embed_dim", 4096)
25
+ args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 16)
26
+ base_architecture(args)
27
+
28
+
29
+ @register_model_architecture("transformer", "transformer_9x")
30
+ def transformer_xlarge(args):
31
+ args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 2048)
32
+ args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 8192)
33
+ args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 16)
34
+ args.encoder_normalize_before = getattr(args, "encoder_normalize_before", False)
35
+ args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 2048)
36
+ args.decoder_ffn_embed_dim = getattr(args, "decoder_ffn_embed_dim", 8192)
37
+ args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 16)
38
+ base_architecture(args)
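+ 
+ 
+ # These architectures become available to fairseq once this package is imported,
+ # e.g. (a sketch, assuming the repo root is the working directory so that `model_configs` is importable):
+ #   fairseq-train <data-bin> --user-dir model_configs --arch transformer_4x ...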
requirements.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ sacremoses
2
+ pandas
3
+ mock
4
+ sacrebleu
5
+ pyarrow
6
+ indic-nlp-library
7
+ mosestokenizer
8
+ subword-nmt
9
+ numpy
10
+ tensorboardX
11
+ git+https://github.com/pytorch/fairseq.git
scripts/__init__.py ADDED
File without changes
scripts/add_joint_tags_translate.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ from tqdm import tqdm
3
+ import os
4
+
5
+
6
+ def add_token(sent, tag_infos):
7
+ """ add special tokens specified by tag_infos to each element in list
8
+
9
+ tag_infos: list of tuples (tag_type,tag)
10
+
11
+ each tag_info results in a token of the form: __{tag_type}__{tag}__
12
+
13
+ """
14
+
15
+ tokens = []
16
+ for tag_type, tag in tag_infos:
17
+ token = '__' + tag_type + '__' + tag + '__'
18
+ tokens.append(token)
19
+
20
+ return ' '.join(tokens) + ' ' + sent
21
+
22
+
23
+ def generate_lang_tag_iterator(infname):
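+ # infname is a TSV with one line per language pair, "<src_lang>\t<tgt_lang>\t<num_sentences>"
+ # (e.g. as written by scripts/concat_joint_data.py); yields (src, tgt) once per sentence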
24
+ with open(infname, 'r', encoding='utf-8') as infile:
25
+ for line in infile:
26
+ src, tgt, count = line.strip().split('\t')
27
+ count = int(count)
28
+ for _ in range(count):
29
+ yield (src, tgt)
30
+
31
+
32
+ if __name__ == '__main__':
33
+
34
+ expdir = sys.argv[1]
35
+ dset = sys.argv[2]
36
+
37
+ src_fname = '{expdir}/bpe/{dset}.SRC'.format(
38
+ expdir=expdir, dset=dset)
39
+ tgt_fname = '{expdir}/bpe/{dset}.TGT'.format(
40
+ expdir=expdir, dset=dset)
41
+ meta_fname = '{expdir}/data/{dset}_lang_pairs.txt'.format(
42
+ expdir=expdir, dset=dset)
43
+
44
+ out_src_fname = '{expdir}/final/{dset}.SRC'.format(
45
+ expdir=expdir, dset=dset)
46
+ out_tgt_fname = '{expdir}/final/{dset}.TGT'.format(
47
+ expdir=expdir, dset=dset)
48
+ lang_tag_iterator = generate_lang_tag_iterator(meta_fname)
49
+
50
+ os.makedirs('{expdir}/final'.format(expdir=expdir), exist_ok=True)
51
+
52
+ with open(src_fname, 'r', encoding='utf-8') as srcfile, \
53
+ open(tgt_fname, 'r', encoding='utf-8') as tgtfile, \
54
+ open(out_src_fname, 'w', encoding='utf-8') as outsrcfile, \
55
+ open(out_tgt_fname, 'w', encoding='utf-8') as outtgtfile:
56
+
57
+ for (l1, l2), src_sent, tgt_sent in tqdm(zip(lang_tag_iterator,
58
+ srcfile, tgtfile)):
59
+ outsrcfile.write(add_token(src_sent.strip(), [
60
+ ('src', l1), ('tgt', l2)]) + '\n')
61
+ outtgtfile.write(tgt_sent.strip() + '\n')
scripts/add_tags_translate.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+
3
+
4
+ def add_token(sent, tag_infos):
5
+ """ add special tokens specified by tag_infos to each element in list
6
+
7
+ tag_infos: list of tuples (tag_type,tag)
8
+
9
+ each tag_info results in a token of the form: __{tag_type}__{tag}__
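+ 
+ e.g. add_token('hello world', [('src', 'en'), ('tgt', 'hi')])
+ -> '__src__en__ __tgt__hi__ hello world'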
10
+
11
+ """
12
+
13
+ tokens = []
14
+ for tag_type, tag in tag_infos:
15
+ token = '__' + tag_type + '__' + tag + '__'
16
+ tokens.append(token)
17
+
18
+ return ' '.join(tokens) + ' ' + sent
19
+
20
+
21
+ if __name__ == '__main__':
22
+
23
+ infname = sys.argv[1]
24
+ outfname = sys.argv[2]
25
+ src_lang = sys.argv[3]
26
+ tgt_lang = sys.argv[4]
27
+
28
+ with open(infname, 'r', encoding='utf-8') as infile, \
29
+ open(outfname, 'w', encoding='utf-8') as outfile:
30
+ for line in infile:
31
+ outstr = add_token(
32
+ line.strip(), [('src', src_lang), ('tgt', tgt_lang)])
33
+ outfile.write(outstr + '\n')
scripts/clean_vocab.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ import codecs
3
+
4
+ def clean_vocab(in_vocab_fname, out_vocab_fname):
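+ # keep only well-formed "<token> <count>" vocabulary lines; malformed lines are dropped
+ # and printed (with each character as a hex code point) for debugging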
5
+ with codecs.open(in_vocab_fname, "r", encoding="utf-8") as infile, codecs.open(
6
+ out_vocab_fname, "w", encoding="utf-8"
7
+ ) as outfile:
8
+ for i, line in enumerate(infile):
9
+ fields = line.strip("\r\n ").split(" ")
10
+ if len(fields) == 2:
11
+ outfile.write(line)
12
+ if len(fields) != 2:
13
+ print("{}: {}".format(i, line.strip()))
14
+ for c in line:
15
+ print("{}:{}".format(c, hex(ord(c))))
16
+
17
+
18
+ if __name__ == "__main__":
19
+ clean_vocab(sys.argv[1], sys.argv[2])
scripts/concat_joint_data.py ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from tqdm import tqdm
3
+ import sys
4
+
5
+ LANGS = [
6
+ "as",
7
+ "bn",
8
+ "gu",
9
+ "hi",
10
+ "kn",
11
+ "ml",
12
+ "mr",
13
+ "or",
14
+ "pa",
15
+ "ta",
16
+ "te",
17
+ #"ur"
18
+ ]
19
+
20
+
21
+ def add_token(sent, tag_infos):
22
+ """ add special tokens specified by tag_infos to each element in list
23
+
24
+ tag_infos: list of tuples (tag_type,tag)
25
+
26
+ each tag_info results in a token of the form: __{tag_type}__{tag}__
27
+
28
+ """
29
+
30
+ tokens = []
31
+ for tag_type, tag in tag_infos:
32
+ token = '__' + tag_type + '__' + tag + '__'
33
+ tokens.append(token)
34
+
35
+ return ' '.join(tokens) + ' ' + sent
36
+
37
+
38
+ def concat_data(data_dir, outdir, lang_pair_list,
39
+ out_src_lang='SRC', out_trg_lang='TGT', split='train'):
40
+ """
41
+ data_dir: input dir, contains directories for language pairs named l1-l2
42
+ """
43
+ os.makedirs(outdir, exist_ok=True)
44
+
45
+ out_src_fname = '{}/{}.{}'.format(outdir, split, out_src_lang)
46
+ out_trg_fname = '{}/{}.{}'.format(outdir, split, out_trg_lang)
47
+ # out_meta_fname='{}/metadata.txt'.format(outdir)
48
+
49
+ print()
50
+ print(out_src_fname)
51
+ print(out_trg_fname)
52
+ # print(out_meta_fname)
53
+
54
+ # concatenate train data
55
+ if os.path.isfile(out_src_fname):
56
+ os.unlink(out_src_fname)
57
+ if os.path.isfile(out_trg_fname):
58
+ os.unlink(out_trg_fname)
59
+ # if os.path.isfile(out_meta_fname):
60
+ # os.unlink(out_meta_fname)
61
+
62
+ for src_lang, trg_lang in tqdm(lang_pair_list):
63
+ print('src: {}, tgt:{}'.format(src_lang, trg_lang))
64
+
65
+ in_src_fname = '{}/{}-{}/{}.{}'.format(
66
+ data_dir, src_lang, trg_lang, split, src_lang)
67
+ in_trg_fname = '{}/{}-{}/{}.{}'.format(
68
+ data_dir, src_lang, trg_lang, split, trg_lang)
69
+
70
+ if not os.path.exists(in_src_fname):
71
+ continue
72
+ if not os.path.exists(in_trg_fname):
73
+ continue
74
+
75
+ print(in_src_fname)
76
+ os.system('cat {} >> {}'.format(in_src_fname, out_src_fname))
77
+
78
+ print(in_trg_fname)
79
+ os.system('cat {} >> {}'.format(in_trg_fname, out_trg_fname))
80
+
81
+
82
+ # with open('{}/lang_pairs.txt'.format(outdir),'w',encoding='utf-8') as lpfile:
83
+ # lpfile.write('\n'.join( [ '-'.join(x) for x in lang_pair_list ] ))
84
+
85
+ corpus_stats(data_dir, outdir, lang_pair_list, split)
86
+
87
+
88
+ def corpus_stats(data_dir, outdir, lang_pair_list, split):
89
+ """
90
+ data_dir: input dir, contains directories for language pairs named l1-l2
91
+ """
92
+
93
+ with open('{}/{}_lang_pairs.txt'.format(outdir, split), 'w', encoding='utf-8') as lpfile:
94
+
95
+ for src_lang, trg_lang in tqdm(lang_pair_list):
96
+ print('src: {}, tgt:{}'.format(src_lang, trg_lang))
97
+
98
+ in_src_fname = '{}/{}-{}/{}.{}'.format(
99
+ data_dir, src_lang, trg_lang, split, src_lang)
100
+ # in_trg_fname='{}/{}-{}/train.{}'.format(data_dir,src_lang,trg_lang,trg_lang)
101
+ if not os.path.exists(in_src_fname):
102
+ continue
103
+
104
+ print(in_src_fname)
105
+ corpus_size = 0
106
+ with open(in_src_fname, 'r', encoding='utf-8') as infile:
107
+ corpus_size = sum(map(lambda x: 1, infile))
108
+
109
+ lpfile.write('{}\t{}\t{}\n'.format(
110
+ src_lang, trg_lang, corpus_size))
111
+
112
+
113
+ if __name__ == '__main__':
114
+
115
+ in_dir = sys.argv[1]
116
+ out_dir = sys.argv[2]
117
+ src_lang = sys.argv[3]
118
+ tgt_lang = sys.argv[4]
119
+ split = sys.argv[5]
120
+ lang_pair_list = []
121
+
122
+ if src_lang == 'en':
123
+ for lang in LANGS:
124
+ lang_pair_list.append(['en', lang])
125
+ else:
126
+ for lang in LANGS:
127
+ lang_pair_list.append([lang, 'en'])
128
+
129
+ concat_data(in_dir, out_dir, lang_pair_list, split=split)
130
+
scripts/extract_non_english_pairs.py ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from tqdm import tqdm
2
+ import os
3
+ from collections import defaultdict
4
+
5
+
6
+ def read_file(fname):
7
+ with open(fname, "r", encoding="utf-8") as infile:
8
+ for line in infile:
9
+ yield line.strip()
10
+
11
+
12
+ def extract_non_english_pairs(indir, outdir, LANGS):
13
+ """
14
+ Extracts non-English parallel corpora (xx-yy pairs) by pivoting on shared English sentences
15
+
16
+ indir: contains english centric data in the following form:
17
+ - directory named en-xx for language xx
18
+ - each directory contains a train.en and train.xx
19
+ outdir: output directory to store mined data for each pair.
20
+ One directory is created for each pair.
21
+ LANGS: list of languages in the corpus (other than English).
22
+ The language codes must correspond to the ones used in the
23
+ files and directories in indir. Preferably, sort the languages
24
+ in this list in alphabetic order. outdir will contain data for xx-yy,
25
+ but not for yy-xx, so it will be convenient to have this list in sorted order.
26
+ """
27
+
28
+ for i in tqdm(range(len(LANGS) - 1)):
29
+ print()
30
+ for j in range(i + 1, len(LANGS)):
31
+ lang1 = LANGS[i]
32
+ lang2 = LANGS[j]
33
+ # print()
34
+ print("{} {}".format(lang1, lang2))
35
+
36
+ fname1 = "{}/en-{}/train.en".format(indir, lang1)
37
+ fname2 = "{}/en-{}/train.en".format(indir, lang2)
38
+ # print(fname1)
39
+ # print(fname2)
40
+ enset_l1 = set(read_file(fname1))
41
+ common_en_set = enset_l1.intersection(read_file(fname2))
42
+
43
+ ## this block should be used if you want to consider multiple translations.
44
+ # il_fname1 = "{}/en-{}/train.{}".format(indir, lang1, lang1)
45
+ # en_lang1_dict = defaultdict(list)
46
+ # for en_line, il_line in zip(read_file(fname1), read_file(il_fname1)):
47
+ # if en_line in common_en_set:
48
+ # en_lang1_dict[en_line].append(il_line)
49
+
50
+ # # this block should be used if you DON'T want to consider multiple translations.
51
+ il_fname1='{}/en-{}/train.{}'.format(indir,lang1,lang1)
52
+ en_lang1_dict={}
53
+ for en_line,il_line in zip(read_file(fname1),read_file(il_fname1)):
54
+ if en_line in common_en_set:
55
+ en_lang1_dict[en_line]=il_line
56
+
57
+ os.makedirs("{}/{}-{}".format(outdir, lang1, lang2), exist_ok=True)
58
+ out_l1_fname = "{o}/{l1}-{l2}/train.{l1}".format(
59
+ o=outdir, l1=lang1, l2=lang2
60
+ )
61
+ out_l2_fname = "{o}/{l1}-{l2}/train.{l2}".format(
62
+ o=outdir, l1=lang1, l2=lang2
63
+ )
64
+
65
+ il_fname2 = "{}/en-{}/train.{}".format(indir, lang2, lang2)
66
+ with open(out_l1_fname, "w", encoding="utf-8") as out_l1_file, open(
67
+ out_l2_fname, "w", encoding="utf-8"
68
+ ) as out_l2_file:
69
+ for en_line, il_line in zip(read_file(fname2), read_file(il_fname2)):
70
+ if en_line in en_lang1_dict:
71
+
72
+ # this block should be used if you want to consider multiple translations.
73
+ # for il_line_lang1 in en_lang1_dict[en_line]:
74
+ # lang1_line, lang2_line = il_line_lang1, il_line
75
+ # out_l1_file.write(lang1_line + "\n")
76
+ # out_l2_file.write(lang2_line + "\n")
77
+
78
+ # this block should be used if you DON'T want to consider multiple translations.
79
+ lang1_line, lang2_line = en_lang1_dict[en_line], il_line
80
+ out_l1_file.write(lang1_line+'\n')
81
+ out_l2_file.write(lang2_line+'\n')
82
+
83
+
84
+ def get_extracted_stats(outdir, LANGS):
85
+ """
86
+ gathers stats from the extracted directories
87
+
88
+ outdir: output directory to store mined data for each pair.
89
+ One directory is created for each pair.
90
+ LANGS: list of languages in the corpus (other than English).
91
+ The language codes must correspond to the ones used in the
92
+ files and directories in indir. Preferably, sort the languages
93
+ in this list in alphabetic order. outdir will contain data for xx-yy, but not for yy-xx.
94
+ """
95
+ common_stats = []
96
+ for i in tqdm(range(len(LANGS) - 1)):
97
+ for j in range(i + 1, len(LANGS)):
98
+ lang1 = LANGS[i]
99
+ lang2 = LANGS[j]
100
+
101
+ out_l1_fname = "{o}/{l1}-{l2}/train.{l1}".format(
102
+ o=outdir, l1=lang1, l2=lang2
103
+ )
104
+
105
+ cnt = sum([1 for _ in read_file(out_l1_fname)])
106
+ common_stats.append((lang1, lang2, cnt))
107
+ common_stats.append((lang2, lang1, cnt))
108
+ return common_stats
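+ 
+ 
+ # Example usage (hypothetical paths and language list):
+ # extract_non_english_pairs('data/en-centric', 'data/non-en-pairs', ['bn', 'hi', 'ta'])
+ # for l1, l2, cnt in get_extracted_stats('data/non-en-pairs', ['bn', 'hi', 'ta']):
+ #     print(l1, l2, cnt)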
scripts/postprocess_score.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+
3
+ def postprocess(
4
+ infname, outfname, input_size
5
+ ):
6
+ """
7
+ parse fairseq interactive output, convert script back to native Indic script (in case of Indic languages) and detokenize.
8
+
9
+ infname: fairseq log file
10
+ outfname: output file of translation (sentences not translated contain the dummy string 'DUMMY_OUTPUT'
11
+ input_size: expected number of output sentences
12
+ """
13
+
14
+ consolidated_testoutput = []
15
+ # with open(infname,'r',encoding='utf-8') as infile:
16
+ # consolidated_testoutput= list(map(lambda x: x.strip(), filter(lambda x: x.startswith('H-'),infile) ))
17
+ # consolidated_testoutput.sort(key=lambda x: int(x.split('\t')[0].split('-')[1]))
18
+ # consolidated_testoutput=[ x.split('\t')[2] for x in consolidated_testoutput ]
19
+
20
+ consolidated_testoutput = [(x, 0.0, "") for x in range(input_size)]
21
+ temp_testoutput = []
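+     # fairseq logs each hypothesis as a line of the form "H-<sentence_id>\t<score>\t<hypothesis>"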
22
+ with open(infname, "r", encoding="utf-8") as infile:
23
+ temp_testoutput = list(
24
+ map(
25
+ lambda x: x.strip().split("\t"),
26
+ filter(lambda x: x.startswith("H-"), infile),
27
+ )
28
+ )
29
+ temp_testoutput = list(
30
+ map(lambda x: (int(x[0].split("-")[1]), float(x[1]), x[2]), temp_testoutput)
31
+ )
32
+ for sid, score, hyp in temp_testoutput:
33
+ consolidated_testoutput[sid] = (sid, score, hyp)
34
+ #consolidated_testoutput = [x[2] for x in consolidated_testoutput]
35
+
36
+ with open(outfname, "w", encoding="utf-8") as outfile:
37
+ for (sid, score, hyp) in consolidated_testoutput:
38
+ outfile.write("{}\n".format(score))
39
+
40
+ if __name__ == "__main__":
41
+
42
+ infname = sys.argv[1]
43
+ outfname = sys.argv[2]
44
+ input_size = int(sys.argv[3])
45
+
46
+ postprocess(
47
+ infname, outfname, input_size
48
+ )
scripts/postprocess_translate.py ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ INDIC_NLP_LIB_HOME = "indic_nlp_library"
2
+ INDIC_NLP_RESOURCES = "indic_nlp_resources"
3
+ import sys
4
+
5
+ from indicnlp import transliterate
6
+
7
+ sys.path.append(r"{}".format(INDIC_NLP_LIB_HOME))
8
+ from indicnlp import common
9
+
10
+ common.set_resources_path(INDIC_NLP_RESOURCES)
11
+ from indicnlp import loader
12
+
13
+ loader.load()
14
+ from sacremoses import MosesPunctNormalizer
15
+ from sacremoses import MosesTokenizer
16
+ from sacremoses import MosesDetokenizer
17
+ from collections import defaultdict
18
+
19
+ import indicnlp
20
+ from indicnlp.tokenize import indic_tokenize
21
+ from indicnlp.tokenize import indic_detokenize
22
+ from indicnlp.normalize import indic_normalize
23
+ from indicnlp.transliterate import unicode_transliterate
24
+
25
+
26
+ def postprocess(
27
+ infname, outfname, input_size, lang, common_lang="hi", transliterate=False
28
+ ):
29
+ """
30
+ parse fairseq interactive output, convert script back to native Indic script (in case of Indic languages) and detokenize.
31
+
32
+ infname: fairseq log file
33
+ outfname: output file of translation (sentences that were not translated are written as empty lines)
34
+ input_size: expected number of output sentences
35
+ lang: language
36
+ """
37
+
38
+ consolidated_testoutput = []
39
+ # with open(infname,'r',encoding='utf-8') as infile:
40
+ # consolidated_testoutput= list(map(lambda x: x.strip(), filter(lambda x: x.startswith('H-'),infile) ))
41
+ # consolidated_testoutput.sort(key=lambda x: int(x.split('\t')[0].split('-')[1]))
42
+ # consolidated_testoutput=[ x.split('\t')[2] for x in consolidated_testoutput ]
43
+
44
+ consolidated_testoutput = [(x, 0.0, "") for x in range(input_size)]
45
+ temp_testoutput = []
46
+ with open(infname, "r", encoding="utf-8") as infile:
47
+ temp_testoutput = list(
48
+ map(
49
+ lambda x: x.strip().split("\t"),
50
+ filter(lambda x: x.startswith("H-"), infile),
51
+ )
52
+ )
53
+ temp_testoutput = list(
54
+ map(lambda x: (int(x[0].split("-")[1]), float(x[1]), x[2]), temp_testoutput)
55
+ )
56
+ for sid, score, hyp in temp_testoutput:
57
+ consolidated_testoutput[sid] = (sid, score, hyp)
58
+ consolidated_testoutput = [x[2] for x in consolidated_testoutput]
59
+
60
+ if lang == "en":
61
+ en_detok = MosesDetokenizer(lang="en")
62
+ with open(outfname, "w", encoding="utf-8") as outfile:
63
+ for sent in consolidated_testoutput:
64
+ outfile.write(en_detok.detokenize(sent.split(" ")) + "\n")
65
+ else:
66
+ xliterator = unicode_transliterate.UnicodeIndicTransliterator()
67
+ with open(outfname, "w", encoding="utf-8") as outfile:
68
+ for sent in consolidated_testoutput:
69
+ if transliterate:
70
+ outstr = indic_detokenize.trivial_detokenize(
71
+ xliterator.transliterate(sent, common_lang, lang), lang
72
+ )
73
+ else:
74
+ outstr = indic_detokenize.trivial_detokenize(sent, lang)
75
+ outfile.write(outstr + "\n")
76
+
77
+
78
+ if __name__ == "__main__":
79
+ # # The path to the local git repo for Indic NLP library
80
+ # INDIC_NLP_LIB_HOME="indic_nlp_library"
81
+ # INDIC_NLP_RESOURCES = "indic_nlp_resources"
82
+ # sys.path.append('{}'.format(INDIC_NLP_LIB_HOME))
83
+ # common.set_resources_path(INDIC_NLP_RESOURCES)
84
+ # # The path to the local git repo for Indic NLP Resources
85
+ # INDIC_NLP_RESOURCES=""
86
+
87
+ # sys.path.append('{}'.format(INDIC_NLP_LIB_HOME))
88
+ # common.set_resources_path(INDIC_NLP_RESOURCES)
89
+
90
+ # loader.load()
91
+
92
+ infname = sys.argv[1]
93
+ outfname = sys.argv[2]
94
+ input_size = int(sys.argv[3])
95
+ lang = sys.argv[4]
96
+ if len(sys.argv) == 5:
97
+ transliterate = False
98
+ elif len(sys.argv) == 6:
99
+ transliterate = sys.argv[5]
100
+ if transliterate.lower() == "true":
101
+ transliterate = True
102
+ else:
103
+ transliterate = False
104
+ else:
105
+ print(f"Invalid arguments: {sys.argv}")
106
+ exit()
107
+
108
+ postprocess(
109
+ infname, outfname, input_size, lang, common_lang="hi", transliterate=transliterate
110
+ )
scripts/preprocess_translate.py ADDED
@@ -0,0 +1,172 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ INDIC_NLP_LIB_HOME = "indic_nlp_library"
2
+ INDIC_NLP_RESOURCES = "indic_nlp_resources"
3
+ import sys
4
+
5
+ sys.path.append(r"{}".format(INDIC_NLP_LIB_HOME))
6
+ from indicnlp import common
7
+
8
+ common.set_resources_path(INDIC_NLP_RESOURCES)
9
+ from indicnlp import loader
10
+
11
+ loader.load()
12
+ from sacremoses import MosesPunctNormalizer
13
+ from sacremoses import MosesTokenizer
14
+ from sacremoses import MosesDetokenizer
15
+ from collections import defaultdict
16
+
17
+ from tqdm import tqdm
18
+ from joblib import Parallel, delayed
19
+
20
+ from indicnlp.tokenize import indic_tokenize
21
+ from indicnlp.tokenize import indic_detokenize
22
+ from indicnlp.normalize import indic_normalize
23
+ from indicnlp.transliterate import unicode_transliterate
24
+
25
+
26
+ en_tok = MosesTokenizer(lang="en")
27
+ en_normalizer = MosesPunctNormalizer()
28
+
29
+
30
+ def preprocess_line(line, normalizer, lang, transliterate=False):
31
+ if lang == "en":
32
+ return " ".join(
33
+ en_tok.tokenize(en_normalizer.normalize(line.strip()), escape=False)
34
+ )
35
+ elif transliterate:
36
+ # line = indic_detokenize.trivial_detokenize(line.strip(), lang)
37
+ return unicode_transliterate.UnicodeIndicTransliterator.transliterate(
38
+ " ".join(
39
+ indic_tokenize.trivial_tokenize(
40
+ normalizer.normalize(line.strip()), lang
41
+ )
42
+ ),
43
+ lang,
44
+ "hi",
45
+ ).replace(" ् ", "्")
46
+ else:
47
+ # we only need to transliterate for joint training
48
+ return " ".join(
49
+ indic_tokenize.trivial_tokenize(normalizer.normalize(line.strip()), lang)
50
+ )
51
+
52
+
53
+ def preprocess(infname, outfname, lang, transliterate=False):
54
+ """
55
+ Normalize, tokenize and script convert(for Indic)
56
+ return number of sentences input file
57
+
58
+ """
59
+
60
+ n = 0
61
+ num_lines = sum(1 for line in open(infname, "r"))
62
+ if lang == "en":
63
+ with open(infname, "r", encoding="utf-8") as infile, open(
64
+ outfname, "w", encoding="utf-8"
65
+ ) as outfile:
66
+
67
+ out_lines = Parallel(n_jobs=-1, backend="multiprocessing")(
68
+ delayed(preprocess_line)(line, None, lang)
69
+ for line in tqdm(infile, total=num_lines)
70
+ )
71
+
72
+ for line in out_lines:
73
+ outfile.write(line + "\n")
74
+ n += 1
75
+
76
+ else:
77
+ normfactory = indic_normalize.IndicNormalizerFactory()
78
+ normalizer = normfactory.get_normalizer(lang)
79
+ # reading
80
+ with open(infname, "r", encoding="utf-8") as infile, open(
81
+ outfname, "w", encoding="utf-8"
82
+ ) as outfile:
83
+
84
+ out_lines = Parallel(n_jobs=-1, backend="multiprocessing")(
85
+ delayed(preprocess_line)(line, normalizer, lang, transliterate)
86
+ for line in tqdm(infile, total=num_lines)
87
+ )
88
+
89
+ for line in out_lines:
90
+ outfile.write(line + "\n")
91
+ n += 1
92
+ return n
93
+
94
+
95
+ def old_preprocess(infname, outfname, lang):
96
+ """
97
+ Preparing each corpus file:
98
+ - Normalization
99
+ - Tokenization
100
+ - Script coversion to Devanagari for Indic scripts
101
+ """
102
+ n = 0
103
+ num_lines = sum(1 for line in open(infname, "r"))
104
+ # reading
105
+ with open(infname, "r", encoding="utf-8") as infile, open(
106
+ outfname, "w", encoding="utf-8"
107
+ ) as outfile:
108
+
109
+ if lang == "en":
110
+ en_tok = MosesTokenizer(lang="en")
111
+ en_normalizer = MosesPunctNormalizer()
112
+ for line in tqdm(infile, total=num_lines):
113
+ outline = " ".join(
114
+ en_tok.tokenize(en_normalizer.normalize(line.strip()), escape=False)
115
+ )
116
+ outfile.write(outline + "\n")
117
+ n += 1
118
+
119
+ else:
120
+ normfactory = indic_normalize.IndicNormalizerFactory()
121
+ normalizer = normfactory.get_normalizer(lang)
122
+ for line in tqdm(infile, total=num_lines):
123
+ outline = (
124
+ unicode_transliterate.UnicodeIndicTransliterator.transliterate(
125
+ " ".join(
126
+ indic_tokenize.trivial_tokenize(
127
+ normalizer.normalize(line.strip()), lang
128
+ )
129
+ ),
130
+ lang,
131
+ "hi",
132
+ ).replace(" ् ", "्")
133
+ )
134
+
135
+ outfile.write(outline + "\n")
136
+ n += 1
137
+ return n
138
+
139
+
140
+ if __name__ == "__main__":
141
+
142
+ # INDIC_NLP_LIB_HOME = "indic_nlp_library"
143
+ # INDIC_NLP_RESOURCES = "indic_nlp_resources"
144
+ # sys.path.append(r'{}'.format(INDIC_NLP_LIB_HOME))
145
+ # common.set_resources_path(INDIC_NLP_RESOURCES)
146
+
147
+ # data_dir = '../joint_training/v1'
148
+ # new_dir = data_dir + '.norm'
149
+ # for path, subdirs, files in os.walk(data_dir):
150
+ # for name in files:
151
+ # infile = os.path.join(path, name)
152
+ # lang = infile.split('.')[-1]
153
+ # outfile = os.path.join(path.replace(data_dir, new_dir), name)
154
+ # preprocess(infile, outfile, lang)
155
+ # loader.load()
156
+
157
+ infname = sys.argv[1]
158
+ outfname = sys.argv[2]
159
+ lang = sys.argv[3]
160
+
161
+ if len(sys.argv) == 4:
162
+ transliterate = False
163
+ elif len(sys.argv) == 5:
164
+ transliterate = sys.argv[4]
165
+ if transliterate.lower() == "true":
166
+ transliterate = True
167
+ else:
168
+ transliterate = False
169
+ else:
170
+ print(f"Invalid arguments: {sys.argv}")
171
+ exit()
172
+ print(preprocess(infname, outfname, lang, transliterate))
scripts/remove_large_sentences.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from tqdm import tqdm
2
+ import sys
3
+
4
+
5
+ def remove_large_sentences(src_path, tgt_path):
6
+ count = 0
7
+ new_src_lines = []
8
+ new_tgt_lines = []
9
+ src_num_lines = sum(1 for line in open(src_path, "r", encoding="utf-8"))
10
+ tgt_num_lines = sum(1 for line in open(tgt_path, "r", encoding="utf-8"))
11
+ assert src_num_lines == tgt_num_lines
12
+ with open(src_path, encoding="utf-8") as f1, open(tgt_path, encoding="utf-8") as f2:
13
+ for src_line, tgt_line in tqdm(zip(f1, f2), total=src_num_lines):
14
+ src_tokens = src_line.strip().split(" ")
15
+ tgt_tokens = tgt_line.strip().split(" ")
16
+ if len(src_tokens) > 200 or len(tgt_tokens) > 200:
17
+ count += 1
18
+ continue
19
+ new_src_lines.append(src_line)
20
+ new_tgt_lines.append(tgt_line)
21
+ return count, new_src_lines, new_tgt_lines
22
+
23
+
24
+ def create_txt(outFile, lines, add_newline=False):
25
+ outfile = open("{0}".format(outFile), "w", encoding="utf-8")
26
+ for line in lines:
27
+ if add_newline:
28
+ outfile.write(line + "\n")
29
+ else:
30
+ outfile.write(line)
31
+ outfile.close()
32
+
33
+
34
+ if __name__ == "__main__":
35
+
36
+ src_path = sys.argv[1]
37
+ tgt_path = sys.argv[2]
38
+ new_src_path = sys.argv[3]
39
+ new_tgt_path = sys.argv[4]
40
+
41
+ count, new_src_lines, new_tgt_lines = remove_large_sentences(src_path, tgt_path)
42
+ print(f'{count} lines removed due to seq_len > 200')
43
+ create_txt(new_src_path, new_src_lines)
44
+ create_txt(new_tgt_path, new_tgt_lines)
scripts/remove_train_devtest_overlaps.py ADDED
@@ -0,0 +1,265 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import string
3
+ import shutil
4
+ from itertools import permutations, chain
5
+ from collections import defaultdict
6
+ from tqdm import tqdm
7
+ import sys
8
+
9
+ INDIC_LANGS = ["as", "bn", "gu", "hi", "kn", "ml", "mr", "or", "pa", "ta", "te"]
10
+ # we will be testing the overlaps of training data with all these benchmarks
11
+ # benchmarks = ['wat2021-devtest', 'wat2020-devtest', 'wat-2018', 'wmt-news', 'ufal-ta', 'pmi']
12
+
13
+
14
+ def read_lines(path):
15
+ # if path doesnt exist, return empty list
16
+ if not os.path.exists(path):
17
+ return []
18
+ with open(path, "r") as f:
19
+ lines = f.readlines()
20
+ return lines
21
+
22
+
23
+ def create_txt(outFile, lines):
24
+ add_newline = not "\n" in lines[0]
25
+ outfile = open("{0}".format(outFile), "w")
26
+ for line in lines:
27
+ if add_newline:
28
+ outfile.write(line + "\n")
29
+ else:
30
+ outfile.write(line)
31
+
32
+ outfile.close()
33
+
34
+
35
+ def pair_dedup_files(src_file, tgt_file):
36
+ src_lines = read_lines(src_file)
37
+ tgt_lines = read_lines(tgt_file)
38
+ len_before = len(src_lines)
39
+
40
+ src_dedupped, tgt_dedupped = pair_dedup_lists(src_lines, tgt_lines)
41
+
42
+ len_after = len(src_dedupped)
43
+ num_duplicates = len_before - len_after
44
+
45
+ print(f"Dropped duplicate pairs in {src_file} Num duplicates -> {num_duplicates}")
46
+ create_txt(src_file, src_dedupped)
47
+ create_txt(tgt_file, tgt_dedupped)
48
+
49
+
50
+ def pair_dedup_lists(src_list, tgt_list):
51
+ src_tgt = list(set(zip(src_list, tgt_list)))
52
+ src_deduped, tgt_deduped = zip(*src_tgt)
53
+ return src_deduped, tgt_deduped
54
+
55
+
56
+ def strip_and_normalize(line):
57
+ # lowercase line, remove spaces and strip punctuation
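+     # e.g. 'Hello, World !' -> 'helloworld' (the Devanagari danda U+0964 is also stripped)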
58
+
59
+ # one of the fastest way to add an exclusion list and remove that
60
+ # list of characters from a string
61
+ # https://towardsdatascience.com/how-to-efficiently-remove-punctuations-from-a-string-899ad4a059fb
62
+ exclist = string.punctuation + "\u0964"
63
+ table_ = str.maketrans("", "", exclist)
64
+
65
+ line = line.replace(" ", "").lower()
66
+ # dont use this method, it is painfully slow
67
+ # line = "".join([i for i in line if i not in string.punctuation])
68
+ line = line.translate(table_)
69
+ return line
70
+
71
+
72
+ def expand_tupled_list(list_of_tuples):
73
+ # convert list of tuples into two lists
74
+ # https://stackoverflow.com/questions/8081545/how-to-convert-list-of-tuples-to-multiple-lists
75
+ # [(en, as), (as, bn), (bn, gu)] - > [en, as, bn], [as, bn, gu]
76
+ list_a, list_b = map(list, zip(*list_of_tuples))
77
+ return list_a, list_b
78
+
79
+
80
+ def get_src_tgt_lang_lists(many2many=False):
81
+ if many2many is False:
82
+ SRC_LANGS = ["en"]
83
+ TGT_LANGS = INDIC_LANGS
84
+ else:
85
+ all_languages = INDIC_LANGS + ["en"]
86
+ # lang_pairs = list(permutations(all_languages, 2))
87
+
88
+ SRC_LANGS, TGT_LANGS = all_languages, all_languages
89
+
90
+ return SRC_LANGS, TGT_LANGS
91
+
92
+
93
+ def normalize_and_gather_all_benchmarks(devtest_dir, many2many=False):
94
+
95
+ # This is a dict of dict of lists
96
+ # the first keys are for lang-pair, the second keys are for src/tgt
97
+ # the values are the devtest lines.
98
+ # so devtest_pairs_normalized[en-as][src] will store src(en lines)
99
+ # so devtest_pairs_normalized[en-as][tgt] will store tgt(as lines)
100
+ devtest_pairs_normalized = defaultdict(lambda: defaultdict(list))
101
+ SRC_LANGS, TGT_LANGS = get_src_tgt_lang_lists(many2many)
102
+ benchmarks = os.listdir(devtest_dir)
103
+ for dataset in benchmarks:
104
+ for src_lang in SRC_LANGS:
105
+ for tgt_lang in TGT_LANGS:
106
+ if src_lang == tgt_lang:
107
+ continue
108
+ if dataset == "wat2021-devtest":
109
+ # wat2021 dev and test sets have a different folder structure
110
+ src_dev = read_lines(f"{devtest_dir}/{dataset}/dev.{src_lang}")
111
+ tgt_dev = read_lines(f"{devtest_dir}/{dataset}/dev.{tgt_lang}")
112
+ src_test = read_lines(f"{devtest_dir}/{dataset}/test.{src_lang}")
113
+ tgt_test = read_lines(f"{devtest_dir}/{dataset}/test.{tgt_lang}")
114
+ else:
115
+ src_dev = read_lines(
116
+ f"{devtest_dir}/{dataset}/{src_lang}-{tgt_lang}/dev.{src_lang}"
117
+ )
118
+ tgt_dev = read_lines(
119
+ f"{devtest_dir}/{dataset}/{src_lang}-{tgt_lang}/dev.{tgt_lang}"
120
+ )
121
+ src_test = read_lines(
122
+ f"{devtest_dir}/{dataset}/{src_lang}-{tgt_lang}/test.{src_lang}"
123
+ )
124
+ tgt_test = read_lines(
125
+ f"{devtest_dir}/{dataset}/{src_lang}-{tgt_lang}/test.{tgt_lang}"
126
+ )
127
+
128
+ # if the tgt_pair data doesnt exist for a particular test set,
129
+ # it will be an empty list
130
+ if tgt_test == [] or tgt_dev == []:
131
+ # print(f'{dataset} does not have {src_lang}-{tgt_lang} data')
132
+ continue
133
+
134
+ # combine both dev and test sets into one
135
+ src_devtest = src_dev + src_test
136
+ tgt_devtest = tgt_dev + tgt_test
137
+
138
+ src_devtest = [strip_and_normalize(line) for line in src_devtest]
139
+ tgt_devtest = [strip_and_normalize(line) for line in tgt_devtest]
140
+
141
+ devtest_pairs_normalized[f"{src_lang}-{tgt_lang}"]["src"].extend(
142
+ src_devtest
143
+ )
144
+ devtest_pairs_normalized[f"{src_lang}-{tgt_lang}"]["tgt"].extend(
145
+ tgt_devtest
146
+ )
147
+
148
+ # dedup merged benchmark datasets
149
+ for src_lang in SRC_LANGS:
150
+ for tgt_lang in TGT_LANGS:
151
+ if src_lang == tgt_lang:
152
+ continue
153
+ src_devtest, tgt_devtest = (
154
+ devtest_pairs_normalized[f"{src_lang}-{tgt_lang}"]["src"],
155
+ devtest_pairs_normalized[f"{src_lang}-{tgt_lang}"]["tgt"],
156
+ )
157
+ # if the devtest data doesnt exist for the src-tgt pair then continue
158
+ if src_devtest == [] or tgt_devtest == []:
159
+ continue
160
+ src_devtest, tgt_devtest = pair_dedup_lists(src_devtest, tgt_devtest)
161
+ (
162
+ devtest_pairs_normalized[f"{src_lang}-{tgt_lang}"]["src"],
163
+ devtest_pairs_normalized[f"{src_lang}-{tgt_lang}"]["tgt"],
164
+ ) = (
165
+ src_devtest,
166
+ tgt_devtest,
167
+ )
168
+
169
+ return devtest_pairs_normalized
170
+
171
+
172
+ def remove_train_devtest_overlaps(train_dir, devtest_dir, many2many=False):
173
+
174
+ devtest_pairs_normalized = normalize_and_gather_all_benchmarks(
175
+ devtest_dir, many2many
176
+ )
177
+
178
+ SRC_LANGS, TGT_LANGS = get_src_tgt_lang_lists(many2many)
179
+
180
+ if not many2many:
181
+ all_src_sentences_normalized = []
182
+ for key in devtest_pairs_normalized:
183
+ all_src_sentences_normalized.extend(devtest_pairs_normalized[key]["src"])
184
+ # remove all duplicates. Now this contains all the normalized
185
+ # english sentences in all test benchmarks across all lang pair
186
+ all_src_sentences_normalized = list(set(all_src_sentences_normalized))
187
+ else:
188
+ all_src_sentences_normalized = None
189
+
190
+ src_overlaps = []
191
+ tgt_overlaps = []
192
+ for src_lang in SRC_LANGS:
193
+ for tgt_lang in TGT_LANGS:
194
+ if src_lang == tgt_lang:
195
+ continue
196
+ new_src_train = []
197
+ new_tgt_train = []
198
+
199
+ pair = f"{src_lang}-{tgt_lang}"
200
+ src_train = read_lines(f"{train_dir}/{pair}/train.{src_lang}")
201
+ tgt_train = read_lines(f"{train_dir}/{pair}/train.{tgt_lang}")
202
+
203
+ len_before = len(src_train)
204
+ if len_before == 0:
205
+ continue
206
+
207
+ src_train_normalized = [strip_and_normalize(line) for line in src_train]
208
+ tgt_train_normalized = [strip_and_normalize(line) for line in tgt_train]
209
+
210
+ if all_src_sentences_normalized:
211
+ src_devtest_normalized = all_src_sentences_normalized
212
+ else:
213
+ src_devtest_normalized = devtest_pairs_normalized[pair]["src"]
214
+
215
+ tgt_devtest_normalized = devtest_pairs_normalized[pair]["tgt"]
216
+
217
+ # compute all src and tgt super strict overlaps for a lang pair
218
+ overlaps = set(src_train_normalized) & set(src_devtest_normalized)
219
+ src_overlaps.extend(list(overlaps))
220
+
221
+ overlaps = set(tgt_train_normalized) & set(tgt_devtest_normalized)
222
+ tgt_overlaps.extend(list(overlaps))
223
+ # dictionaries offer o(1) lookup
224
+ src_overlaps_dict = {}
225
+ tgt_overlaps_dict = {}
226
+ for line in src_overlaps:
227
+ src_overlaps_dict[line] = 1
228
+ for line in tgt_overlaps:
229
+ tgt_overlaps_dict[line] = 1
230
+
231
+ # loop to remove the overlapped data
232
+ idx = -1
233
+ for src_line_norm, tgt_line_norm in tqdm(
234
+ zip(src_train_normalized, tgt_train_normalized), total=len_before
235
+ ):
236
+ idx += 1
237
+ if src_overlaps_dict.get(src_line_norm, None):
238
+ continue
239
+ if tgt_overlaps_dict.get(tgt_line_norm, None):
240
+ continue
241
+ new_src_train.append(src_train[idx])
242
+ new_tgt_train.append(tgt_train[idx])
243
+
244
+ len_after = len(new_src_train)
245
+ print(
246
+ f"Detected overlaps between train and devetest for {pair} is {len_before - len_after}"
247
+ )
248
+ print(f"saving new files at {train_dir}/{pair}/")
249
+ create_txt(f"{train_dir}/{pair}/train.{src_lang}", new_src_train)
250
+ create_txt(f"{train_dir}/{pair}/train.{tgt_lang}", new_tgt_train)
251
+
252
+
253
+ if __name__ == "__main__":
254
+ train_data_dir = sys.argv[1]
255
+ # benchmarks directory should contains all the test sets
256
+ devtest_data_dir = sys.argv[2]
257
+ if len(sys.argv) == 3:
258
+ many2many = False
259
+ elif len(sys.argv) == 4:
260
+ many2many = sys.argv[3]
261
+ if many2many.lower() == "true":
262
+ many2many = True
263
+ else:
264
+ many2many = False
265
+ remove_train_devtest_overlaps(train_data_dir, devtest_data_dir, many2many)