harveen
Adding code
9bbf386
raw history blame
No virus
6.44 kB
#
# Copyright (c) 2013-present, Anoop Kunchukuttan
# All rights reserved.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
#
# Program for script unification of text written in Unicode: the text is normalized
# and then transliterated to a common script. This is mainly geared towards Indic scripts.
#
# @author Anoop Kunchukuttan
#
import sys
from indicnlp.normalize import indic_normalize
from indicnlp.transliterate import unicode_transliterate
from indicnlp import loader
class AggressiveScriptUnifier():
    """Normalize text with per-language normalizers, then transliterate it.

    One normalizer per language code is built up front and kept in
    ``normalizer_map``. Every normalizer is requested with
    ``do_normalize_chandras``, ``do_normalize_vowel_ending`` and
    ``remove_nuktas`` set to True, together with the caller's ``nasals_mode``;
    'pa', 'or', 'as' and 'ml' additionally receive language-specific flags.
    """

    def __init__(self, common_lang='hi', nasals_mode='to_nasal_consonants'):
        """Store the configuration and build the normalizers.

        Args:
            common_lang: language code handed to the transliterator as the
                target of every ``transform`` call.
            nasals_mode: value forwarded unchanged to every normalizer.
        """
        self.common_lang = common_lang
        self.nasals_mode = nasals_mode
        self.do_normalize_chandras = True
        self.do_normalize_vowel_ending = True
        self.remove_nuktas = True
        self.normalizer_map = {}
        self._init_normalizers()

    def _init_normalizers(self):
        """Populate ``normalizer_map`` with one normalizer per language code."""
        factory = indic_normalize.IndicNormalizerFactory()

        # Options every language receives.
        shared_options = dict(
            nasals_mode=self.nasals_mode,
            do_normalize_chandras=self.do_normalize_chandras,
            remove_nuktas=self.remove_nuktas,
            do_normalize_vowel_ending=self.do_normalize_vowel_ending,
        )

        ## languages that only need the shared options
        for code in ('hi', 'mr', 'sa', 'kK', 'ne', 'sd', 'bn', 'gu', 'ta', 'te', 'kn'):
            self.normalizer_map[code] = factory.get_normalizer(code, **shared_options)

        ## languages that need extra, language-specific options on top
        language_specific_options = (
            ('pa', dict(do_canonicalize_addak=True, do_canonicalize_tippi=True,
                        do_replace_vowel_bases=True)),
            ('or', dict(do_remap_wa=True)),
            ('as', dict(do_remap_assamese_chars=True)),
            ('ml', dict(do_canonicalize_chillus=True, do_correct_geminated_T=True)),
        )
        for code, extras in language_specific_options:
            options = dict(shared_options, **extras)
            self.normalizer_map[code] = factory.get_normalizer(code, **options)

    def transform(self, text, lang):
        """Normalize ``text`` for ``lang`` and transliterate it to ``common_lang``.

        Args:
            text: the string to convert.
            lang: language code of ``text``; must be a key of ``normalizer_map``.

        Returns:
            The result of transliterating the normalized text.

        Raises:
            KeyError: if no normalizer was registered for ``lang``.
        """
        normalized = self.normalizer_map[lang].normalize(text)
        return unicode_transliterate.UnicodeIndicTransliterator.transliterate(
            normalized, lang, self.common_lang)
class BasicScriptUnifier():
    """Apply default normalization where available, then transliterate.

    A normalizer is built for each known language code using only the
    ``nasals_mode`` option. Languages without a registered normalizer skip
    the normalization step and are transliterated as-is.
    """

    def __init__(self, common_lang='hi', nasals_mode='do_nothing'):
        """Store the configuration and build the normalizers.

        Args:
            common_lang: language code handed to the transliterator as the
                target of every ``transform`` call.
            nasals_mode: value forwarded unchanged to every normalizer.
        """
        self.common_lang = common_lang
        self.nasals_mode = nasals_mode
        self.normalizer_map = {}
        self._init_normalizers()

    def _init_normalizers(self):
        """Populate ``normalizer_map`` with one normalizer per language code."""
        factory = indic_normalize.IndicNormalizerFactory()
        codes = ('hi', 'mr', 'sa', 'kK', 'ne', 'sd', 'bn', 'gu', 'ta', 'te', 'kn',
                 'pa', 'or', 'as', 'ml')
        self.normalizer_map.update(
            (code, factory.get_normalizer(code, nasals_mode=self.nasals_mode))
            for code in codes
        )

    def transform(self, text, lang):
        """Normalize ``text`` if ``lang`` has a normalizer, then transliterate.

        Args:
            text: the string to convert.
            lang: language code of ``text``.

        Returns:
            The result of transliterating the (possibly normalized) text to
            ``common_lang``.
        """
        try:
            normalizer = self.normalizer_map[lang]
        except KeyError:
            # No normalizer registered: pass the text through untouched.
            prepared = text
        else:
            prepared = normalizer.normalize(text)
        return unicode_transliterate.UnicodeIndicTransliterator.transliterate(
            prepared, lang, self.common_lang)
class NaiveScriptUnifier():
    """Transliterate text to a common language's script without normalizing it."""

    def __init__(self, common_lang='hi'):
        """Remember the language code used as the transliteration target.

        Args:
            common_lang: language code handed to the transliterator as the
                target of every ``transform`` call.
        """
        self.common_lang = common_lang

    def transform(self, text, lang):
        """Return ``text`` transliterated from ``lang`` to ``common_lang``.

        Args:
            text: the string to convert.
            lang: language code of ``text``.
        """
        transliterator = unicode_transliterate.UnicodeIndicTransliterator
        return transliterator.transliterate(text, lang, self.common_lang)
if __name__ == '__main__':
    # Command-line entry point:
    #     python script_unifier <command> <infile> <outfile> <language>
    # <command> selects the unifier: 'aggressive', 'moderate', 'basic' or 'naive'.
    # Every line of <infile> is stripped of surrounding whitespace, transformed
    # and written to <outfile> followed by a newline. Both files are UTF-8.

    loader.load()

    usage = "Usage: python script_unifier <command> <infile> <outfile> <language>"

    if len(sys.argv)<=4:
        print(usage)
        sys.exit(1)

    command, in_path, out_path, language = sys.argv[1:5]

    # Zero-argument factories, so only the requested unifier is constructed.
    unifier_factories = {
        'aggressive': lambda: AggressiveScriptUnifier(nasals_mode='to_nasal_consonants'),
        'moderate': lambda: AggressiveScriptUnifier(nasals_mode='do_nothing'),
        'basic': BasicScriptUnifier,
        'naive': NaiveScriptUnifier,
    }

    # Reject an unrecognised command explicitly (and before touching any file)
    # instead of silently exiting with success and producing no output.
    if command not in unifier_factories:
        print("Unknown command '{}'; expected one of: {}".format(
            command, ', '.join(unifier_factories)))
        print(usage)
        sys.exit(1)

    unifier = unifier_factories[command]()

    with open(in_path, 'r', encoding='utf-8') as ifile, \
            open(out_path, 'w', encoding='utf-8') as ofile:
        # Iterate the file object directly so the input is streamed line by line
        # rather than loaded into memory in one go.
        for line in ifile:
            ofile.write(unifier.transform(line.strip(), language) + '\n')