In [None]:
import os
import random
from tqdm.notebook import tqdm
from sacremoses import MosesPunctNormalizer
from sacremoses import MosesTokenizer
from sacremoses import MosesDetokenizer
from collections import defaultdict
import sacrebleu

In [None]:
# The path to the local git repo for Indic NLP library
# NOTE(review): empty placeholder - presumably to be filled in before running
INDIC_NLP_LIB_HOME=""

# The path to the local git repo for Indic NLP Resources
# NOTE(review): empty placeholder - presumably to be filled in before running
INDIC_NLP_RESOURCES=""

import sys
# Add the library checkout to the module search path before the indicnlp imports below
sys.path.append(r'{}'.format(INDIC_NLP_LIB_HOME))

from indicnlp import common
# The resources path is registered before loader.load() is called
common.set_resources_path(INDIC_NLP_RESOURCES)

from indicnlp import loader
loader.load()

In [None]:
import indicnlp
from indicnlp.tokenize import indic_tokenize
from indicnlp.tokenize import indic_detokenize
from indicnlp.normalize import indic_normalize
from indicnlp.transliterate import unicode_transliterate

In [None]:
# Language codes handled by this notebook; the experiment cells below pair
# each of them with 'en'.
LANGS=[
    "bn", "gu", "hi", "kn", "ml",
    "mr", "or", "pa", "ta", "te",
]

In [None]:
def preprocess(infname,outfname,lang):
    """
    Prepare one corpus file, writing one processed line per input line:
     - Normalization
     - Tokenization
     - Script conversion to Devanagari for Indic scripts

    infname: path of the UTF-8 input file
    outfname: path of the UTF-8 output file (opened for writing)
    lang: language code; 'en' goes through the Moses normalizer/tokenizer,
          every other value through the Indic NLP normalizer/tokenizer
          followed by transliteration to 'hi'
    """

    with open(infname,'r',encoding='utf-8') as infile, \
         open(outfname,'w',encoding='utf-8') as outfile:

        # Pick the per-line pipeline once; the helpers are created only after
        # both files have been opened.
        if lang=='en':
            en_tok=MosesTokenizer(lang='en')
            en_normalizer = MosesPunctNormalizer()

            def process_line(text):
                normalized=en_normalizer.normalize(text)
                return ' '.join(en_tok.tokenize(normalized, escape=False))

        else:
            normfactory=indic_normalize.IndicNormalizerFactory()
            normalizer=normfactory.get_normalizer(lang)

            def process_line(text):
                normalized=normalizer.normalize(text)
                tokenized=' '.join(indic_tokenize.trivial_tokenize(normalized, lang))
                converted=unicode_transliterate.UnicodeIndicTransliterator.transliterate(
                    tokenized, lang, 'hi')
                # drop the spaces on both sides of a free-standing '्'
                return converted.replace(' ् ','्')

        for line in tqdm(infile):
            outfile.write(process_line(line.strip())+'\n')

In [None]:
def add_token(sent, tag_infos):
    """ Prefix the sentence `sent` with the special tokens described by tag_infos.

    tag_infos: list of tuples (tag_type,tag)

    each tag_info results in a token of the form: __{tag_type}__{tag}__

    Returns the tokens joined by single spaces, followed by one space and
    `sent` (with an empty tag_infos the result is ' ' + sent).
    """

    # '__'.join(('', a, b, '')) builds '__a__b__'
    prefix=' '.join(
        '__'.join(('', tag_type, tag, ''))
        for tag_type, tag in tag_infos
    )

    return prefix + ' ' + sent


def concat_data(data_dir, outdir, lang_pair_list, out_src_lang='SRC', out_trg_lang='TGT'):
    """
    Concatenate the training files of all language pairs into a single
    source file and a single target file, then write the corpus statistics.

    data_dir: input dir, contains directories for language pairs named l1-l2;
              each is expected to hold train.l1 and train.l2
    outdir: output dir (created if missing); receives train.<out_src_lang>,
            train.<out_trg_lang> and, via corpus_stats, lang_pairs.txt
    lang_pair_list: sequence of (src_lang, trg_lang); files are appended in
                    this order
    out_src_lang, out_trg_lang: suffixes of the two output files

    Raises OSError (e.g. FileNotFoundError) if an input file cannot be read.
    The files are copied in Python rather than through a shell `cat`, so
    paths containing spaces or shell metacharacters work and a missing input
    is reported instead of being silently skipped.
    """
    import shutil  # needed only for the byte-wise copy below

    os.makedirs(outdir,exist_ok=True)

    out_src_fname='{}/train.{}'.format(outdir,out_src_lang)
    out_trg_fname='{}/train.{}'.format(outdir,out_trg_lang)

    print()
    print(out_src_fname)
    print(out_trg_fname)

    ### concatenate train data
    # 'wb' truncates any output left over from a previous run; bytes are
    # copied unchanged, exactly as `cat` would do.
    with open(out_src_fname,'wb') as out_src_file, \
         open(out_trg_fname,'wb') as out_trg_file:

        for src_lang, trg_lang in tqdm(lang_pair_list):
            print('src: {}, tgt:{}'.format(src_lang,trg_lang))

            in_src_fname='{}/{}-{}/train.{}'.format(data_dir,src_lang,trg_lang,src_lang)
            in_trg_fname='{}/{}-{}/train.{}'.format(data_dir,src_lang,trg_lang,trg_lang)

            for in_fname, out_file in ((in_src_fname,out_src_file),
                                       (in_trg_fname,out_trg_file)):
                print(in_fname)
                with open(in_fname,'rb') as in_file:
                    shutil.copyfileobj(in_file,out_file)

    corpus_stats(data_dir, outdir, lang_pair_list)
 
def corpus_stats(data_dir, outdir, lang_pair_list):
    """
    Write the training corpus size of every language pair to
    <outdir>/lang_pairs.txt.

    data_dir: input dir, contains directories for language pairs named l1-l2
    outdir: directory in which lang_pairs.txt is written
    lang_pair_list: sequence of (src_lang, trg_lang)

    Each output line is "src_lang<TAB>trg_lang<TAB>number of lines in the
    source-side training file".
    """

    stats_fname='{}/lang_pairs.txt'.format(outdir)

    with open(stats_fname,'w',encoding='utf-8') as lpfile:

        for src_lang, trg_lang in tqdm(lang_pair_list):
            print('src: {}, tgt:{}'.format(src_lang,trg_lang))

            in_src_fname='{}/{}-{}/train.{}'.format(data_dir,src_lang,trg_lang,src_lang)
            print(in_src_fname)

            # only the source side is counted
            corpus_size=0
            with open(in_src_fname,'r',encoding='utf-8') as infile:
                for _ in infile:
                    corpus_size+=1

            lpfile.write('{}\t{}\t{}\n'.format(src_lang,trg_lang,corpus_size))
 
def generate_lang_tag_iterator(infname):
    """
    Yield one (src, tgt) tuple per sentence described by the file `infname`.

    infname: UTF-8 file with lines of the form "src<TAB>tgt<TAB>count";
             each line contributes `count` consecutive (src, tgt) tuples.
    """
    with open(infname,'r',encoding='utf-8') as infile:
        for record in infile:
            src,tgt,count=record.strip().split('\t')
            remaining=int(count)
            while remaining>0:
                yield (src,tgt)
                remaining-=1

In [None]:
#### directory containing all experiments
## one directory per experiment
EXPBASEDIR=''

#### base directory of the data
## ORG_DATA_DIR below is built from BASEDIR, which was never defined in this
## notebook (NameError on a fresh kernel). It is declared here as an empty
## placeholder like the other paths.
## NOTE(review): confirm the intended location and fill it in.
BASEDIR=''

### directory containing data
## contains 3 directories: train test dev
## train directory structure:
##   - There is one directory for each language pair
##   - Directory naming convention lang1-lang2 (you need another directory/softlink for lang2-lang1)
##   - Each directory contains 6 files: {train,test,dev}.{lang1,lang2}
## test & dev directory structure
##   - test: contains files {test.l1,test.l2,test.l3} - assumes parallel test files like the wat2021 dataset
##   - dev: contains files {dev.l1,dev.l2,dev.l3} - assumes parallel dev files like the wat2021 dataset
## All files are tokenized
ORG_DATA_DIR='{d}/consolidated_unique_preprocessed'.format(d=BASEDIR)



# Exp2 (M2O)

- All \*-en language pairs, i.e. many-to-one (M2O): each language in `LANGS` → English

**Params**

In [None]:
# Name of this experiment and its working directory below EXPBASEDIR
expname='exp2_m2o_baseline'
expdir='{}/{}'.format(EXPBASEDIR,expname)

# One [source, 'en'] pair for every language in LANGS
lang_pair_list=[ [lang,'en'] for lang in LANGS ]

**Create Train Corpus**

In [None]:
# Per-pair training data is read from <ORG_DATA_DIR>/train and the
# concatenated corpus is written to <expdir>/data
indir='{}/train'.format(ORG_DATA_DIR)
outdir='{}/data'.format(expdir)

concat_data(data_dir=indir, outdir=outdir, lang_pair_list=lang_pair_list)

**Learn BPE**

In [None]:
# `echo` only prints the command line (with {expdir} filled in by IPython); learn_bpe.sh itself is not run here
!echo ./learn_bpe.sh {expdir}

In [None]:
# `echo` only prints the command line (with {expdir} filled in by IPython); apply_bpe_train_notag.sh itself is not run here
!echo ./apply_bpe_train_notag.sh {expdir}

In [None]:
# Prints (does not run) the command; the last argument is the languages in LANGS plus 'en', space-separated and wrapped in double quotes
!echo ./apply_bpe_test_valid_notag.sh {expdir} {ORG_DATA_DIR} {'"'+' '.join(LANGS+['en'])+'"'}

**Add language tags to train**

In [None]:
dset='train'

# Inputs: BPE-segmented training corpus and the per-pair line counts
src_fname='{expdir}/bpe/train/{dset}.SRC'.format(expdir=expdir,dset=dset)
tgt_fname='{expdir}/bpe/train/{dset}.TGT'.format(expdir=expdir,dset=dset)
meta_fname='{expdir}/data/lang_pairs.txt'.format(expdir=expdir,dset=dset)

# Outputs: tagged source side and unchanged target side
out_src_fname='{expdir}/final/{dset}.SRC'.format(expdir=expdir,dset=dset)
out_tgt_fname='{expdir}/final/{dset}.TGT'.format(expdir=expdir,dset=dset)

# Yields the (src, tgt) language pair of each training line, in file order
lang_tag_iterator=generate_lang_tag_iterator(meta_fname)

print(expdir)
os.makedirs('{expdir}/final'.format(expdir=expdir),exist_ok=True)

with open(src_fname,'r',encoding='utf-8') as srcfile, \
     open(tgt_fname,'r',encoding='utf-8') as tgtfile, \
     open(out_src_fname,'w',encoding='utf-8') as outsrcfile, \
     open(out_tgt_fname,'w',encoding='utf-8') as outtgtfile:

    # zip stops at the shortest of the three streams
    tagged_stream=zip(lang_tag_iterator, srcfile, tgtfile)

    for (l1,l2), src_sent, tgt_sent in tqdm(tagged_stream):
        tagged_src=add_token(src_sent.strip(),[('src',l1),('tgt',l2)])
        outsrcfile.write(tagged_src + '\n' )
        outtgtfile.write(tgt_sent.strip()+'\n')

**Add language tags to valid**

- add language tags, create parallel corpus
- sample 20\% for validation set 
- Create final validation set

In [None]:
dset='dev'

# Fixed seed so that the sampled validation subset is identical on every run
VALID_SAMPLE_SEED=1234

out_src_fname='{expdir}/final/{dset}.SRC'.format(
    expdir=expdir,dset=dset)
out_tgt_fname='{expdir}/final/{dset}.TGT'.format(
    expdir=expdir,dset=dset)

os.makedirs('{expdir}/final'.format(expdir=expdir),exist_ok=True)

print('Processing validation files')
# (tagged source sentence, target sentence) for all language pairs
consolidated_dset=[]
for l1, l2 in tqdm(lang_pair_list):
    src_fname='{expdir}/bpe/{dset}/{dset}.{lang}'.format(
        expdir=expdir,dset=dset,lang=l1)
    tgt_fname='{expdir}/bpe/{dset}/{dset}.{lang}'.format(
        expdir=expdir,dset=dset,lang=l2)
    with open(src_fname,'r',encoding='utf-8') as srcfile, \
         open(tgt_fname,'r',encoding='utf-8') as tgtfile:
        for src_sent, tgt_sent in zip(srcfile,tgtfile):
            consolidated_dset.append(
                ( add_token(src_sent.strip(),[('src',l1),('tgt',l2)]),
                  tgt_sent.strip() )
            )

print('Create validation set')
# A private, seeded Random instance makes the shuffle reproducible without
# touching the state of the module-level random generator.
random.Random(VALID_SAMPLE_SEED).shuffle(consolidated_dset)
# keep the first fifth (20%) of the shuffled pairs
final_set=consolidated_dset[:len(consolidated_dset)//5]

print('Original set size: {}'.format(len(consolidated_dset)))
print('Sampled set size: {}'.format(len(final_set)))

print('Write validation set')

with open(out_src_fname,'w',encoding='utf-8') as srcfile, \
     open(out_tgt_fname,'w',encoding='utf-8') as tgtfile:
    for src_sent, tgt_sent in final_set:
        srcfile.write(src_sent+'\n')
        tgtfile.write(tgt_sent+'\n')


**Add language tags to test**

- add language tags, create parallel corpus for all M2O language pairs 
- Create final test set

In [None]:
dset='test'

# Maximum number of space-separated tokens kept per test sentence; the
# training command in this notebook uses the same value for
# --max-source-positions / --max-target-positions.
MAX_TEST_TOKENS=200

out_src_fname='{expdir}/final/{dset}.SRC'.format(
    expdir=expdir,dset=dset)
out_tgt_fname='{expdir}/final/{dset}.TGT'.format(
    expdir=expdir,dset=dset)

os.makedirs('{expdir}/final'.format(expdir=expdir),exist_ok=True)

print('Processing test files')
# (tagged source sentence, target sentence) for all language pairs ...
consolidated_dset=[]
# ... and, in parallel, the language pair each entry came from. Without it
# the over-length report below would always print the last pair of the loop.
consolidated_pairs=[]
for l1, l2 in tqdm(lang_pair_list):
    src_fname='{expdir}/bpe/{dset}/{dset}.{lang}'.format(
        expdir=expdir,dset=dset,lang=l1)
    tgt_fname='{expdir}/bpe/{dset}/{dset}.{lang}'.format(
        expdir=expdir,dset=dset,lang=l2)
    with open(src_fname,'r',encoding='utf-8') as srcfile, \
         open(tgt_fname,'r',encoding='utf-8') as tgtfile:
        for src_sent, tgt_sent in zip(srcfile,tgtfile):
            consolidated_dset.append(
                ( add_token(src_sent.strip(),[('src',l1),('tgt',l2)]),
                  tgt_sent.strip() )
            )
            consolidated_pairs.append((l1,l2))

print('Final set size: {}'.format(len(consolidated_dset)))

print('Write test set')
print('testset truncated')

with open(out_src_fname,'w',encoding='utf-8') as srcfile, \
     open(out_tgt_fname,'w',encoding='utf-8') as tgtfile:
    for lno, ((pair_src,pair_tgt), (src_sent, tgt_sent)) in enumerate(
            zip(consolidated_pairs,consolidated_dset),1):

        s=src_sent.strip().split(' ')
        t=tgt_sent.strip().split(' ')

        # report every sentence that is about to be truncated
        if len(s) > MAX_TEST_TOKENS or len(t) > MAX_TEST_TOKENS:
            print('exp: {}, pair: ({},{}), lno: {}: lens: ({},{})'.format(
                expname,pair_src,pair_tgt,lno,len(s),len(t)))

        # slicing leaves sentences within the limit unchanged
        src_sent=' '.join( s[:MAX_TEST_TOKENS] )
        tgt_sent=' '.join( t[:MAX_TEST_TOKENS] )

        srcfile.write(src_sent+'\n')
        tgtfile.write(tgt_sent+'\n')

**Binarize data**

In [None]:
# `echo` only prints the command line (with {expdir} filled in by IPython); binarize_training_exp.sh itself is not run here
!echo ./binarize_training_exp.sh {expdir} SRC TGT

**Training Command**

In [None]:
%%bash -s "$expdir"
# IPython expands Python variables only on the %%bash line itself, not in the
# cell body, so the experiment directory is handed to the script as $1.
expdir="$1"

python train.py "${expdir}/final_bin" \
    --arch transformer \
    --optimizer adam --adam-betas '(0.9, 0.98)' --clip-norm 1.0 \
    --lr 0.0005 --lr-scheduler inverse_sqrt --warmup-updates 4000 --warmup-init-lr 1e-07 \
    --dropout 0.2 \
    --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \
    --max-tokens 8192 \
    --max-update 1000000 \
    --max-source-positions 200 \
    --max-target-positions 200 \
    --tensorboard-logdir "${expdir}/tensorboard" \
    --save-dir "${expdir}/model" \
    --required-batch-size-multiple 8 \
    --save-interval 1 \
    --keep-last-epochs 5 \
    --patience 5 \
    --fp16

**Cleanup**

In [None]:
# Delete the concatenated and the BPE-segmented training files of this
# experiment.
to_delete=[
    '{expdir}/data/train.SRC'.format(expdir=expdir),
    '{expdir}/data/train.TGT'.format(expdir=expdir),
    '{expdir}/bpe/train/train.SRC'.format(expdir=expdir),
    '{expdir}/bpe/train/train.TGT'.format(expdir=expdir),
]

for fname in to_delete:
    # Files that are already gone are skipped so the cell can be re-run
    # (same isfile-then-unlink pattern as concat_data).
    if os.path.isfile(fname):
        os.unlink(fname)

**Evaluation**

In [None]:
dset='test'
consolidated_testoutput_fname='{expdir}/evaluations/test/default/test.SRC_TGT.TGT'.format(expdir=expdir)
consolidated_testoutput_log_fname='{}.log'.format(consolidated_testoutput_fname)
metrics_fname='{expdir}/evaluations/test/default/test.metrics.tsv'.format(expdir=expdir)

# Number of hypotheses per language pair: the consolidated output is cut into
# consecutive slices of this size, one per entry of lang_pair_list.
test_set_size=2390

# Collect the hypothesis lines ("H-<id><TAB><field><TAB><text>") from the log.
# The line is split into fields BEFORE whitespace is trimmed: stripping the
# whole line first would remove the trailing tab of an empty hypothesis and
# make the access to the third field fail with an IndexError.
indexed_hyps=[]
with open(consolidated_testoutput_log_fname,'r',encoding='utf-8') as hypfile:
    for line in hypfile:
        if not line.startswith('H-'):
            continue
        fields=line.rstrip('\n').split('\t')
        sent_id=int(fields[0].split('-')[1])
        hyp_text=fields[2].rstrip() if len(fields)>2 else ''
        indexed_hyps.append((sent_id,hyp_text))

# restore the order given by the numeric sentence id
indexed_hyps.sort(key=lambda x: x[0])
consolidated_testoutput=[ hyp_text for _, hyp_text in indexed_hyps ]

os.makedirs('{expdir}/evaluations/test/default'.format(expdir=expdir),exist_ok=True)

with open(consolidated_testoutput_fname,'w',encoding='utf-8') as finalhypfile:
    for sent in consolidated_testoutput:
        finalhypfile.write(sent+'\n')

print('Processing test files')
with open(metrics_fname,'w',encoding='utf-8') as metrics_file:
    for i, (l1, l2) in enumerate(tqdm(lang_pair_list)):

        start=i*test_set_size
        end=(i+1)*test_set_size
        hyps=consolidated_testoutput[start:end]

        # references are read from the target-language file of the original data
        ref_fname='{expdir}/{dset}/{dset}.{lang}'.format(
            expdir=ORG_DATA_DIR,dset=dset,lang=l2)

        with open(ref_fname,'r',encoding='utf-8') as reffile:
            refs=[ ref.strip() for ref in reffile ]

        assert len(hyps)==len(refs), \
            '{}-{}: {} hypotheses but {} references'.format(l1,l2,len(hyps),len(refs))

        # sacrebleu's own tokenization is switched off (tokenize='none')
        bleu=sacrebleu.corpus_bleu(hyps,[refs],tokenize='none')

        print('{} {} {} {}'.format(l1,l2,bleu.score,bleu.prec_str))
        metrics_file.write('{}\t{}\t{}\t{}\t{}\n'.format(expname,l1,l2,bleu.score,bleu.prec_str))
 