|
import sys

from indicnlp import common

common.set_resources_path(INDIC_NLP_RESOURCES)

from indicnlp import loader
from indicnlp.normalize import indic_normalize
from indicnlp.script import indic_scripts as isc
from indicnlp.transliterate import unicode_transliterate
|
|
|
# Transliterate Hindi text to Kannada, one line at a time.
#
# Characters found only in Hindi are removed or remapped first, and a halant
# is appended to every word that ends in a consonant, as is the convention in
# Kannada.
#
# Usage: <script> INFILE OUTFILE
#   INFILE   UTF-8 Hindi text; words are separated by single spaces.
#   OUTFILE  UTF-8 file that receives the Kannada output.

# Devanagari code points remapped (or dropped) before transliteration.
_HINDI_ONLY_CHAR_MAP = {
    0x0900: 0x0902,  # inverted candrabindu -> anusvara
    0x0901: 0x0902,  # candrabindu -> anusvara
    0x0945: 0x093E,  # vowel sign candra E -> vowel sign AA
    0x0949: 0x093E,  # vowel sign candra O -> vowel sign AA
    0x093C: None,    # nukta is removed
}

# Devanagari virama (halant).
_HALANT = '\u094d'


def remap_hindi_only_chars(line):
    """Return *line* with the Hindi-only characters remapped or removed.

    Candrabindus become anusvara, the candra E/O vowel signs become the AA
    vowel sign, and nuktas are deleted. None of the replacement targets is
    itself remapped, so a single ``str.translate`` pass gives the same result
    as replacing the characters one after another.
    """
    return line.translate(_HINDI_ONLY_CHAR_MAP)


def add_halant_to_final_consonants(line, is_consonant_char):
    """Append a halant to each space-separated word ending in a consonant.

    Args:
        line: Text whose words are separated by single spaces.
        is_consonant_char: Callable taking one character and returning a
            truthy value when that character is a consonant.

    Returns:
        The line with the same spacing, where every word whose last character
        is a consonant has a halant appended.
    """
    outwords = []
    for word in line.split(' '):
        # Empty tokens come from blank lines or consecutive spaces; they have
        # no last character to inspect, so pass them through unchanged.
        if word and is_consonant_char(word[-1]):
            word = word + _HALANT
        outwords.append(word)
    return ' '.join(outwords)


if __name__ == '__main__':
    if len(sys.argv) < 3:
        sys.exit('usage: {} <hindi_infile> <kannada_outfile>'.format(sys.argv[0]))

    infname = sys.argv[1]
    outfname = sys.argv[2]
    loader.load()

    normalizer_factory = indic_normalize.IndicNormalizerFactory()
    normalizer = normalizer_factory.get_normalizer('hi')

    def is_hindi_consonant(char):
        """Tell whether *char* is a consonant according to the 'hi' phonetic features."""
        return isc.is_consonant(isc.get_phonetic_feature_vector(char, 'hi'))

    with open(infname, 'r', encoding='utf-8') as infile, \
            open(outfname, 'w', encoding='utf-8') as outfile:
        for line in infile:
            line = normalizer.normalize(line.strip())
            line = remap_hindi_only_chars(line)
            line = add_halant_to_final_consonants(line, is_hindi_consonant)
            line = unicode_transliterate.UnicodeIndicTransliterator.transliterate(
                line, 'hi', 'kn')
            outfile.write(line + '\n')
|
|
|
|
|
|