{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import os\n", "import random\n", "from tqdm.notebook import tqdm\n", "from sacremoses import MosesPunctNormalizer\n", "from sacremoses import MosesTokenizer\n", "from sacremoses import MosesDetokenizer\n", "from collections import defaultdict\n", "import sacrebleu" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# The path to the local git repo for Indic NLP library\n", "INDIC_NLP_LIB_HOME=\"\"\n", "\n", "# The path to the local git repo for Indic NLP Resources\n", "INDIC_NLP_RESOURCES=\"\"\n", "\n", "import sys\n", "sys.path.append(r'{}'.format(INDIC_NLP_LIB_HOME))\n", "\n", "from indicnlp import common\n", "common.set_resources_path(INDIC_NLP_RESOURCES)\n", "\n", "from indicnlp import loader\n", "loader.load()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import indicnlp\n", "from indicnlp.tokenize import indic_tokenize\n", "from indicnlp.tokenize import indic_detokenize\n", "from indicnlp.normalize import indic_normalize\n", "from indicnlp.transliterate import unicode_transliterate" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "LANGS=[\n", " \"bn\",\n", " \"gu\",\n", " \"hi\",\n", " \"kn\",\n", " \"ml\",\n", " \"mr\",\n", " \"or\",\n", " \"pa\",\n", " \"ta\",\n", " \"te\", \n", "]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def preprocess(infname,outfname,lang):\n", " \"\"\"\n", " Preparing each corpus file: \n", " - Normalization\n", " - Tokenization \n", " - Script coversion to Devanagari for Indic scripts\n", " \"\"\"\n", " \n", " ### reading \n", " with open(infname,'r',encoding='utf-8') as infile, \\\n", " open(outfname,'w',encoding='utf-8') as outfile:\n", " \n", " if lang=='en':\n", " en_tok=MosesTokenizer(lang='en')\n", " en_normalizer = MosesPunctNormalizer()\n", " for line in tqdm(infile): \n", " outline=' '.join(\n", " en_tok.tokenize( \n", " en_normalizer.normalize(line.strip()), \n", " escape=False ) )\n", " outfile.write(outline+'\\n')\n", " \n", " else:\n", " normfactory=indic_normalize.IndicNormalizerFactory()\n", " normalizer=normfactory.get_normalizer(lang)\n", " for line in tqdm(infile): \n", " outline=unicode_transliterate.UnicodeIndicTransliterator.transliterate(\n", " ' '.join(\n", " indic_tokenize.trivial_tokenize(\n", " normalizer.normalize(line.strip()), lang) ), lang, 'hi').replace(' ् ','्')\n", "\n", "\n", " outfile.write(outline+'\\n')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def add_token(sent, tag_infos):\n", " \"\"\" add special tokens specified by tag_infos to each element in list\n", "\n", " tag_infos: list of tuples (tag_type,tag)\n", "\n", " each tag_info results in a token of the form: __{tag_type}__{tag}__\n", "\n", " \"\"\"\n", "\n", " tokens=[]\n", " for tag_type, tag in tag_infos:\n", " token = '__' + tag_type + '__' + tag + '__'\n", " tokens.append(token)\n", "\n", " return ' '.join(tokens) + ' ' + sent \n", "\n", "\n", "def concat_data(data_dir, outdir, lang_pair_list, out_src_lang='SRC', out_trg_lang='TGT'):\n", " \"\"\"\n", " data_dir: input dir, contains directories for language pairs named l1-l2\n", " \"\"\"\n", " os.makedirs(outdir,exist_ok=True)\n", "\n", " out_src_fname='{}/train.{}'.format(outdir,out_src_lang)\n", " out_trg_fname='{}/train.{}'.format(outdir,out_trg_lang)\n", 
"# out_meta_fname='{}/metadata.txt'.format(outdir)\n", "\n", " print()\n", " print(out_src_fname)\n", " print(out_trg_fname)\n", "# print(out_meta_fname)\n", "\n", " ### concatenate train data \n", " if os.path.isfile(out_src_fname):\n", " os.unlink(out_src_fname)\n", " if os.path.isfile(out_trg_fname):\n", " os.unlink(out_trg_fname)\n", "# if os.path.isfile(out_meta_fname):\n", "# os.unlink(out_meta_fname)\n", "\n", " for src_lang, trg_lang in tqdm(lang_pair_list):\n", " print('src: {}, tgt:{}'.format(src_lang,trg_lang)) \n", "\n", " in_src_fname='{}/{}-{}/train.{}'.format(data_dir,src_lang,trg_lang,src_lang)\n", " in_trg_fname='{}/{}-{}/train.{}'.format(data_dir,src_lang,trg_lang,trg_lang)\n", "\n", " print(in_src_fname)\n", " os.system('cat {} >> {}'.format(in_src_fname,out_src_fname))\n", "\n", " print(in_trg_fname)\n", " os.system('cat {} >> {}'.format(in_trg_fname,out_trg_fname)) \n", " \n", " \n", "# with open('{}/lang_pairs.txt'.format(outdir),'w',encoding='utf-8') as lpfile: \n", "# lpfile.write('\\n'.join( [ '-'.join(x) for x in lang_pair_list ] ))\n", " \n", " corpus_stats(data_dir, outdir, lang_pair_list)\n", " \n", "def corpus_stats(data_dir, outdir, lang_pair_list):\n", " \"\"\"\n", " data_dir: input dir, contains directories for language pairs named l1-l2\n", " \"\"\"\n", "\n", " with open('{}/lang_pairs.txt'.format(outdir),'w',encoding='utf-8') as lpfile: \n", "\n", " for src_lang, trg_lang in tqdm(lang_pair_list):\n", " print('src: {}, tgt:{}'.format(src_lang,trg_lang)) \n", "\n", " in_src_fname='{}/{}-{}/train.{}'.format(data_dir,src_lang,trg_lang,src_lang)\n", " # in_trg_fname='{}/{}-{}/train.{}'.format(data_dir,src_lang,trg_lang,trg_lang)\n", "\n", " print(in_src_fname)\n", " corpus_size=0\n", " with open(in_src_fname,'r',encoding='utf-8') as infile:\n", " corpus_size=sum(map(lambda x:1,infile))\n", " \n", " lpfile.write('{}\\t{}\\t{}\\n'.format(src_lang,trg_lang,corpus_size))\n", " \n", "def generate_lang_tag_iterator(infname):\n", " with open(infname,'r',encoding='utf-8') as infile:\n", " for line in infile:\n", " src,tgt,count=line.strip().split('\\t')\n", " count=int(count)\n", " for _ in range(count):\n", " yield (src,tgt) " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#### directory containing all experiments \n", "## one directory per experiment \n", "EXPBASEDIR=''\n", "\n", "### directory containing data\n", "## contains 3 directories: train test dev\n", "## train directory structure: \n", "## - There is one directory for each language pair\n", "## - Directory naming convention lang1-lang2 (you need another directory/softlink for lang2-lang1)\n", "## - Each directory contains 6 files: {train,test,dev}.{lang1,lang2}\n", "## test & dev directory structure \n", "## - test: contains files {test.l1,test.l2,test.l3} - assumes parallel test files like the wat2021 dataset\n", "## - valid: contains files {dev.l1,dev.l2,dev.l3} - assumes parallel test files like the wat2021 dataset\n", "## All files are tokenized\n", "ORG_DATA_DIR='{d}/consolidated_unique_preprocessed'.format(d=BASEDIR)\n", "\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Exp2 (M2O)\n", "\n", "- All *-en " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Params**" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "expname='exp2_m2o_baseline'\n", "expdir='{}/{}'.format(EXPBASEDIR,expname)\n", "\n", "lang_pair_list=[]\n", "for lang in LANGS: \n", " 
lang_pair_list.append([lang,'en'])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Create Train Corpus**" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "indir='{}/train'.format(ORG_DATA_DIR)\n", "outdir='{}/data'.format(expdir)\n", "\n", "# print(lang_pair_list)\n", "concat_data(indir,outdir,lang_pair_list)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Learn BPE**" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!echo ./learn_bpe.sh {expdir}" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!echo ./apply_bpe_train_notag.sh {expdir}" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!echo ./apply_bpe_test_valid_notag.sh {expdir} {ORG_DATA_DIR} {'\"'+' '.join(LANGS+['en'])+'\"'}" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Add language tags to train**" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "dset='train' \n", "\n", "src_fname='{expdir}/bpe/train/{dset}.SRC'.format(expdir=expdir,dset=dset)\n", "tgt_fname='{expdir}/bpe/train/{dset}.TGT'.format(expdir=expdir,dset=dset)\n", "meta_fname='{expdir}/data/lang_pairs.txt'.format(expdir=expdir,dset=dset)\n", " \n", "out_src_fname='{expdir}/final/{dset}.SRC'.format(expdir=expdir,dset=dset)\n", "out_tgt_fname='{expdir}/final/{dset}.TGT'.format(expdir=expdir,dset=dset)\n", "\n", "lang_tag_iterator=generate_lang_tag_iterator(meta_fname)\n", "\n", "print(expdir)\n", "os.makedirs('{expdir}/final'.format(expdir=expdir),exist_ok=True)\n", "\n", "with open(src_fname,'r',encoding='utf-8') as srcfile, \\\n", " open(tgt_fname,'r',encoding='utf-8') as tgtfile, \\\n", " open(out_src_fname,'w',encoding='utf-8') as outsrcfile, \\\n", " open(out_tgt_fname,'w',encoding='utf-8') as outtgtfile: \n", "\n", " for (l1,l2), src_sent, tgt_sent in tqdm(zip(lang_tag_iterator, srcfile, tgtfile)):\n", " outsrcfile.write(add_token(src_sent.strip(),[('src',l1),('tgt',l2)]) + '\\n' )\n", " outtgtfile.write(tgt_sent.strip()+'\\n')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Add language tags to valid**\n", "\n", "- add language tags, create parallel corpus\n", "- sample 20\\% for validation set \n", "- Create final validation set" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "dset='dev' \n", "out_src_fname='{expdir}/final/{dset}.SRC'.format(\n", " expdir=expdir,dset=dset)\n", "out_tgt_fname='{expdir}/final/{dset}.TGT'.format(\n", " expdir=expdir,dset=dset)\n", "\n", "os.makedirs('{expdir}/final'.format(expdir=expdir),exist_ok=True)\n", "\n", "print('Processing validation files') \n", "consolidated_dset=[]\n", "for l1, l2 in tqdm(lang_pair_list):\n", " src_fname='{expdir}/bpe/{dset}/{dset}.{lang}'.format(\n", " expdir=expdir,dset=dset,lang=l1)\n", " tgt_fname='{expdir}/bpe/{dset}/{dset}.{lang}'.format(\n", " expdir=expdir,dset=dset,lang=l2)\n", "# print(src_fname)\n", "# print(os.path.exists(src_fname))\n", " with open(src_fname,'r',encoding='utf-8') as srcfile, \\\n", " open(tgt_fname,'r',encoding='utf-8') as tgtfile:\n", " for src_sent, tgt_sent in zip(srcfile,tgtfile):\n", " consolidated_dset.append(\n", " ( add_token(src_sent.strip(),[('src',l1),('tgt',l2)]),\n", " tgt_sent.strip() )\n", " )\n", "\n", "print('Create validation set') \n", "random.shuffle(consolidated_dset)\n", 
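"# the pooled dev pairs were shuffled above; keep the first 20% (len//5) as the final validation set\n",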
"final_set=consolidated_dset[:len(consolidated_dset)//5] \n", "\n", "print('Original set size: {}'.format(len(consolidated_dset))) \n", "print('Sampled set size: {}'.format(len(final_set))) \n", "\n", "print('Write validation set')\n", "\n", "with open(out_src_fname,'w',encoding='utf-8') as srcfile, \\\n", " open(out_tgt_fname,'w',encoding='utf-8') as tgtfile:\n", " for src_sent, tgt_sent in final_set: \n", " srcfile.write(src_sent+'\\n')\n", " tgtfile.write(tgt_sent+'\\n')\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Add language tags to test**\n", "\n", "- add language tags, create parallel corpus all M2O language pairs \n", "- Create final test set" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "dset='test' \n", "out_src_fname='{expdir}/final/{dset}.SRC'.format(\n", " expdir=expdir,dset=dset)\n", "out_tgt_fname='{expdir}/final/{dset}.TGT'.format(\n", " expdir=expdir,dset=dset)\n", "\n", "os.makedirs('{expdir}/final'.format(expdir=expdir),exist_ok=True)\n", "\n", "print('Processing test files') \n", "consolidated_dset=[]\n", "for l1, l2 in tqdm(lang_pair_list):\n", " src_fname='{expdir}/bpe/{dset}/{dset}.{lang}'.format(\n", " expdir=expdir,dset=dset,lang=l1)\n", " tgt_fname='{expdir}/bpe/{dset}/{dset}.{lang}'.format(\n", " expdir=expdir,dset=dset,lang=l2)\n", "# print(src_fname)\n", "# print(os.path.exists(src_fname))\n", " with open(src_fname,'r',encoding='utf-8') as srcfile, \\\n", " open(tgt_fname,'r',encoding='utf-8') as tgtfile:\n", " for src_sent, tgt_sent in zip(srcfile,tgtfile):\n", " consolidated_dset.append(\n", " ( add_token(src_sent.strip(),[('src',l1),('tgt',l2)]),\n", " tgt_sent.strip() )\n", " )\n", "\n", "print('Final set size: {}'.format(len(consolidated_dset))) \n", " \n", "print('Write test set')\n", "print('testset truncated')\n", "\n", "with open(out_src_fname,'w',encoding='utf-8') as srcfile, \\\n", " open(out_tgt_fname,'w',encoding='utf-8') as tgtfile:\n", " for lno, (src_sent, tgt_sent) in enumerate(consolidated_dset,1):\n", " \n", " s=src_sent.strip().split(' ')\n", " t=tgt_sent.strip().split(' ')\n", " \n", " if len(s) > 200 or len(t) > 200:\n", " print('exp: {}, pair: ({},{}), lno: {}: lens: ({},{})'.format(expname,l1,l2,lno,len(s),len(t))) \n", " \n", " src_sent=' '.join( s[:min(len(s),200)] )\n", " tgt_sent=' '.join( t[:min(len(t),200)] )\n", " \n", " srcfile.write(src_sent+'\\n')\n", " tgtfile.write(tgt_sent+'\\n')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Binarize data**" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!echo ./binarize_training_exp.sh {expdir} SRC TGT" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Training Command**" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "%%bash \n", "\n", "python train.py {expdir}/final_bin \\\n", " --arch transformer \\\n", " --optimizer adam --adam-betas '(0.9, 0.98)' --clip-norm 1.0 \\\n", " --lr 0.0005 --lr-scheduler inverse_sqrt --warmup-updates 4000 --warmup-init-lr 1e-07 \\\n", " --dropout 0.2 \\\n", " --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \\\n", " --max-tokens 8192 \\\n", " --max-update 1000000 \\\n", " --max-source-positions 200 \\\n", " --max-target-positions 200 \\\n", " --tensorboard-logdir {expdir}/tensorboard \\\n", " --save-dir {expdir}/model \\\n", " --required-batch-size-multiple 8 \\\n", " --save-interval 1 \\\n", " --keep-last-epochs 5 \\\n", " --patience 5 \\\n", " 
--fp16" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Cleanup**" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# os.unlink('{}')\n", "\n", "to_delete=[\n", " '{expdir}/data/train.SRC'.format(expdir=expdir,dset=dset),\n", " '{expdir}/data/train.TGT'.format(expdir=expdir,dset=dset),\n", " '{expdir}/bpe/train/train.SRC'.format(expdir=expdir,dset=dset),\n", " '{expdir}/bpe/train/train.TGT'.format(expdir=expdir,dset=dset),\n", "]`\n", "\n", "for fname in to_delete:\n", " os.unlink(fname)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Evaluation**" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "dset='test' \n", "consolidated_testoutput_fname='{expdir}/evaluations/test/default/test.SRC_TGT.TGT'.format(expdir=expdir)\n", "consolidated_testoutput_log_fname='{}.log'.format(consolidated_testoutput_fname)\n", "metrics_fname='{expdir}/evaluations/test/default/test.metrics.tsv'.format(expdir=expdir)\n", " \n", "test_set_size=2390\n", "\n", "consolidated_testoutput=[]\n", "with open(consolidated_testoutput_log_fname,'r',encoding='utf-8') as hypfile:\n", " consolidated_testoutput= list(map(lambda x: x.strip(), filter(lambda x: x.startswith('H-'),hypfile) ))\n", " consolidated_testoutput.sort(key=lambda x: int(x.split('\\t')[0].split('-')[1]))\n", " consolidated_testoutput=[ x.split('\\t')[2] for x in consolidated_testoutput ]\n", "\n", "os.makedirs('{expdir}/evaluations/test/default'.format(expdir=expdir),exist_ok=True)\n", "\n", "with open(consolidated_testoutput_fname,'w',encoding='utf-8') as finalhypfile:\n", " for sent in consolidated_testoutput:\n", " finalhypfile.write(sent+'\\n')\n", "\n", "print('Processing test files') \n", "with open(metrics_fname,'w',encoding='utf-8') as metrics_file: \n", " for i, (l1, l2) in enumerate(tqdm(lang_pair_list)):\n", "\n", " start=i*test_set_size\n", " end=(i+1)*test_set_size\n", " hyps=consolidated_testoutput[start:end]\n", " ref_fname='{expdir}/{dset}/{dset}.{lang}'.format(\n", " expdir=ORG_DATA_DIR,dset=dset,lang=l2)\n", "\n", " refs=[]\n", " with open(ref_fname,'r',encoding='utf-8') as reffile:\n", " refs.extend(map(lambda x:x.strip(),reffile))\n", "\n", " assert(len(hyps)==len(refs))\n", "\n", " bleu=sacrebleu.corpus_bleu(hyps,[refs],tokenize='none')\n", "\n", " print('{} {} {} {}'.format(l1,l2,bleu.score,bleu.prec_str))\n", " metrics_file.write('{}\\t{}\\t{}\\t{}\\t{}\\n'.format(expname,l1,l2,bleu.score,bleu.prec_str))\n", " " ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.0" }, "toc": { "base_numbering": 1, "nav_menu": { "height": "243.993px", "width": "160px" }, "number_sections": true, "sideBar": true, "skip_h1_title": false, "title_cell": "Table of Contents", "title_sidebar": "Contents", "toc_cell": false, "toc_position": {}, "toc_section_display": true, "toc_window_display": false } }, "nbformat": 4, "nbformat_minor": 4 }