Spaces:
Runtime error
Runtime error
import sys | |
from indicnlp import langinfo | |
from indicnlp import loader | |
if __name__ == '__main__':
    # This script corrects the incorrect tokenization done by the Moses
    # tokenizer, which splits on nukta and halant characters.
    #
    # Usage: python correct_moses_tokenizer.py <infname> <outfname> <langcode>

    # Exit with the usage message instead of a bare IndexError traceback when
    # arguments are missing. Extra trailing arguments are ignored, as before.
    if len(sys.argv) < 4:
        sys.exit(
            'Usage: python correct_moses_tokenizer.py '
            '<infname> <outfname> <langcode>'
        )

    # NOTE(review): presumably initialises the indicnlp resources that the
    # langinfo lookups below rely on -- confirm against the library docs.
    loader.load()

    infname, outfname, lang = sys.argv[1:4]

    # Language-specific halant and nukta characters for the given lang code.
    halant_char = langinfo.offset_to_char(langinfo.HALANTA_OFFSET, lang)
    nukta_char = langinfo.offset_to_char(langinfo.NUKTA_OFFSET, lang)

    # Build the search/replace strings once rather than for every line.
    spaced_halant = ' {} '.format(halant_char)
    spaced_nukta = ' {} '.format(nukta_char)
    nukta_halant = '{}{}'.format(nukta_char, halant_char)
    spaced_nukta_halant = ' {}'.format(nukta_halant)

    with open(infname, 'r', encoding='utf-8') as infile, \
            open(outfname, 'w', encoding='utf-8') as outfile:
        for line in infile:
            # Order matters: joining " <halant> " first turns a split
            # " <nukta> <halant> " into " <nukta><halant>", which the second
            # replacement (it needs a space after the nukta) cannot match.
            # The third replacement removes that remaining leading space.
            fixed = line.replace(spaced_halant, halant_char)
            fixed = fixed.replace(spaced_nukta, nukta_char)
            fixed = fixed.replace(spaced_nukta_halant, nukta_halant)
            outfile.write(fixed)