en_to_indic_translation / indic_nlp_library /contrib /hindi_to_kannada_transliterator.py
harveen
Adding code
9bbf386
raw history blame
No virus
2.21 kB
import sys
from indicnlp import common
common.set_resources_path(INDIC_NLP_RESOURCES)
from indicnlp import loader
from indicnlp.normalize import indic_normalize
from indicnlp.transliterate import unicode_transliterate
if __name__ == '__main__':
"""
This script transliterates Hindi to Kannada. It removes/remaps
characters only found in Hindi. It also adds halanta to words ending
with consonant - as is the convention in Kannada
"""
infname=sys.argv[1] # one sentence/word per line. Sentences should be space-tokenized
outfname=sys.agv[2]
loader.load()
normalizer_factory=indic_normalize.IndicNormalizerFactory()
normalizer=normalizer_factory.get_normalizer('hi')
with open(infname,'r',encoding='utf-8') as infile, \
open(outfname,'w',encoding='utf-8') as outfile:
for line in infile:
line=line.strip()
line=normalizer.normalize(line)
## replace chandrabindus with anusvara
line=line.replace('\u0900','\u0902')
line=line.replace('\u0901','\u0902')
### replace chandra e and o diacritics with e and o respectively
#line=line.replace('\u0945','\u0947')
#line=line.replace('\u0949','\u094b')
### replace chandra e and o diacritics with a diacritic
## this seems to be general usage
line=line.replace('\u0945','\u093e')
line=line.replace('\u0949','\u093e')
## remove nukta
line=line.replace('\u093c','')
## add halant if word ends with consonant
#if isc.is_consonant(isc.get_phonetic_feature_vector(line[-1],'hi')):
# line=line+'\u094d'
words=line.split(' ')
outwords=[]
for word in line.split(' '):
if isc.is_consonant(isc.get_phonetic_feature_vector(word[-1],'hi')):
word=word+'\u094d'
outwords.append(word)
line=' '.join(outwords)
## script conversion
line=unicode_transliterate.UnicodeIndicTransliterator.transliterate(line,'hi','kn')
outfile.write(line+'\n')