import sys from indicnlp import langinfo from indicnlp import loader if __name__ == '__main__': """ This script corrects the incorrect tokenization done by Moses tokenizer. The Moses tokenizer splits on nukta and halant characters Usage: python correct_moses_tokenizer.py """ loader.load() infname=sys.argv[1] outfname=sys.argv[2] lang=sys.argv[3] halant_char=langinfo.offset_to_char(langinfo.HALANTA_OFFSET,lang) nukta_char=langinfo.offset_to_char(langinfo.NUKTA_OFFSET,lang) with open(infname,'r',encoding='utf-8') as infile, \ open(outfname,'w',encoding='utf-8') as outfile: for line in infile: outfile.write( line.replace( ' {} '.format(halant_char), halant_char).replace( ' {} '.format(nukta_char), nukta_char).replace( ' {}{}'.format(nukta_char,halant_char),'{}{}'.format(nukta_char,halant_char)) )