# harveen
# Adding code
# 9bbf386
# raw history blame
# No virus
# 1.05 kB
import sys
from indicnlp import langinfo
from indicnlp import loader
def correct_moses_tokenization(line, halant_char, nukta_char):
    """Rejoin halant and nukta characters that were split off as tokens.

    The Moses tokenizer splits on nukta and halant characters, leaving them
    surrounded by spaces. This undoes that split for a single line of text.

    Args:
        line: One line of tokenized text (a trailing newline is preserved).
        halant_char: The halant character of the line's language.
        nukta_char: The nukta character of the line's language.

    Returns:
        The line with ' <halant> ' and ' <nukta> ' collapsed to the bare
        character, and the space before a '<nukta><halant>' pair removed.
    """
    nukta_halant = nukta_char + halant_char
    # The order matters: collapsing ' <halant> ' first can leave a
    # ' <nukta><halant>' sequence behind, which only the last replacement
    # cleans up.
    return (
        line.replace(' ' + halant_char + ' ', halant_char)
        .replace(' ' + nukta_char + ' ', nukta_char)
        .replace(' ' + nukta_halant, nukta_halant)
    )


def main(argv=None):
    """Correct the incorrect tokenization done by the Moses tokenizer.

    Usage: python correct_moses_tokenizer.py <infname> <outfname> <langcode>

    Args:
        argv: Argument list in ``sys.argv`` layout (program name first).
            Defaults to ``sys.argv``. Arguments beyond the third are ignored.

    Returns:
        0 on success, 1 when fewer than three arguments were supplied.
    """
    args = sys.argv if argv is None else argv
    # Validate before doing any work so a bad invocation produces a usage
    # message instead of an IndexError traceback.
    if len(args) < 4:
        sys.stderr.write(
            'Usage: python correct_moses_tokenizer.py '
            '<infname> <outfname> <langcode>\n'
        )
        return 1
    infname, outfname, lang = args[1:4]

    # The original script called loader.load() before using langinfo;
    # presumably it initialises the library's resources.
    loader.load()
    halant_char = langinfo.offset_to_char(langinfo.HALANTA_OFFSET, lang)
    nukta_char = langinfo.offset_to_char(langinfo.NUKTA_OFFSET, lang)

    with open(infname, 'r', encoding='utf-8') as infile, \
            open(outfname, 'w', encoding='utf-8') as outfile:
        outfile.writelines(
            correct_moses_tokenization(line, halant_char, nukta_char)
            for line in infile
        )
    return 0


if __name__ == '__main__':
    sys.exit(main())