File size: 6,439 Bytes
e50fe35
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
# 
#  Copyright (c) 2013-present, Anoop Kunchukuttan
#  All rights reserved.
#  
#  This source code is licensed under the MIT license found in the
#  LICENSE file in the root directory of this source tree.
# 

# Program for unifying the script of Unicode text: the text is optionally normalized and then transliterated into a common script. This is mainly geared towards Indic scripts
#
# @author Anoop Kunchukuttan 
#

import sys
from indicnlp.normalize import indic_normalize
from indicnlp.transliterate import unicode_transliterate
from indicnlp import loader

class AggressiveScriptUnifier():
    """
    Normalize text with the full set of normalizer options enabled and then
    transliterate it into the script of a common language.

    One normalizer is created per language in `_init_normalizers`. All of them
    share `nasals_mode`, `do_normalize_chandras`, `remove_nuktas` and
    `do_normalize_vowel_ending`; 'pa', 'or', 'as' and 'ml' receive additional
    language-specific options.
    """

    def __init__(self,common_lang='hi',nasals_mode='to_nasal_consonants'):
        """
        common_lang: language code whose script all text is converted to
        nasals_mode: forwarded unchanged to every normalizer as `nasals_mode`
        """
        self.common_lang=common_lang
        self.nasals_mode=nasals_mode
        # These options are always switched on for this unifier.
        self.do_normalize_chandras=True
        self.do_normalize_vowel_ending=True
        self.remove_nuktas=True
        # language code -> normalizer instance
        self.normalizer_map={}
        self._init_normalizers()

    def _init_normalizers(self):
        """Populate `self.normalizer_map` with one normalizer per language."""
        normalizer_factory=indic_normalize.IndicNormalizerFactory()

        ## parameters shared by the normalizers of all languages
        common_params=dict(
            nasals_mode=self.nasals_mode,
            do_normalize_chandras=self.do_normalize_chandras,
            remove_nuktas=self.remove_nuktas,
            do_normalize_vowel_ending=self.do_normalize_vowel_ending,
        )

        ## for languages with common parameters
        for lang in ['hi','mr','sa','kK','ne','sd','bn','gu','ta','te','kn']:
            self.normalizer_map[lang]=normalizer_factory.get_normalizer(lang, **common_params)

        ## for languages with language specific parameters
        lang_specific_params={
            'pa': dict(do_canonicalize_addak=True, do_canonicalize_tippi=True,
                       do_replace_vowel_bases=True),
            'or': dict(do_remap_wa=True),
            'as': dict(do_remap_assamese_chars=True),
            'ml': dict(do_canonicalize_chillus=True, do_correct_geminated_T=True),
        }
        for lang, extra_params in lang_specific_params.items():
            self.normalizer_map[lang]=normalizer_factory.get_normalizer(
                    lang, **common_params, **extra_params)

    def transform(self,text,lang):
        """
        Normalize `text` (written in language `lang`) and transliterate it
        from `lang` to `self.common_lang`.

        If no normalizer is registered for `lang`, the normalization step is
        skipped instead of raising KeyError; this is the same behaviour as
        BasicScriptUnifier.transform.

        Returns the transliterated text.
        """
        if lang in self.normalizer_map:
            text=self.normalizer_map[lang].normalize(text)

        text=unicode_transliterate.UnicodeIndicTransliterator.transliterate(text, lang, self.common_lang)
        return text

class BasicScriptUnifier():
    """
    Normalize text with a normalizer configured only by `nasals_mode`, then
    transliterate it into the script of a common language.
    """

    def __init__(self,common_lang='hi',nasals_mode='do_nothing'):
        """
        common_lang: language code whose script all text is converted to
        nasals_mode: forwarded unchanged to every normalizer as `nasals_mode`
        """
        self.common_lang=common_lang
        self.nasals_mode=nasals_mode
        # language code -> normalizer instance
        self.normalizer_map={}
        self._init_normalizers()

    def _init_normalizers(self):
        """Register one normalizer in `self.normalizer_map` per language code."""
        factory=indic_normalize.IndicNormalizerFactory()
        language_codes=['hi','mr','sa','kK','ne','sd','bn','gu','ta','te','kn','pa','or','as','ml']

        for code in language_codes:
            normalizer=factory.get_normalizer(code, nasals_mode=self.nasals_mode)
            self.normalizer_map[code]=normalizer

    def transform(self,text,lang):
        """
        Return `text` transliterated from `lang` to `self.common_lang`.

        The text is normalized first when a normalizer is registered for
        `lang`; otherwise it is transliterated as given.
        """
        # Normalization is optional: languages without a normalizer pass through.
        normalized=self.normalizer_map[lang].normalize(text) if lang in self.normalizer_map else text

        transliterator=unicode_transliterate.UnicodeIndicTransliterator
        return transliterator.transliterate(normalized, lang, self.common_lang)

class NaiveScriptUnifier():
    """
    Transliterate text into the script of a common language without applying
    any normalization beforehand.
    """

    def __init__(self,common_lang='hi'):
        """common_lang: language code whose script all text is converted to"""
        self.common_lang=common_lang

    def transform(self,text,lang):
        """Return `text` transliterated from `lang` to `self.common_lang`."""
        transliterator=unicode_transliterate.UnicodeIndicTransliterator
        return transliterator.transliterate(text, lang, self.common_lang)

if __name__ == '__main__': 

    # Command-line driver: read <infile> line by line, unify the script of
    # each line with the unifier selected by <command>, and write the result
    # to <outfile>. Both files are read/written as UTF-8.

    usage="Usage: python script_unifier <command> <infile> <outfile> <language>"

    loader.load()

    if len(sys.argv)<=4:
        print(usage)
        sys.exit(1)

    command, infile, outfile, language = sys.argv[1:5]

    # command name -> zero-argument factory, so that only the requested
    # unifier (and its normalizers) is constructed.
    unifier_factories={
        'aggressive': lambda: AggressiveScriptUnifier(nasals_mode='to_nasal_consonants'),
        'moderate': lambda: AggressiveScriptUnifier(nasals_mode='do_nothing'),
        'basic': lambda: BasicScriptUnifier(),
        'naive': lambda: NaiveScriptUnifier(),
    }

    # Reject unknown commands explicitly, before any file is opened, instead
    # of exiting silently with a success status and no output.
    if command not in unifier_factories:
        print("Unknown command: {}. Expected one of: {}".format(
                command, ', '.join(unifier_factories)))
        print(usage)
        sys.exit(1)

    unifier=unifier_factories[command]()

    with open(infile,'r',encoding='utf-8') as ifile, \
            open(outfile,'w',encoding='utf-8') as ofile:
        # Iterate over the file object directly so the input is streamed
        # rather than loaded into memory all at once.
        for line in ifile:
            transliterated_line=unifier.transform(line.strip(),language)
            ofile.write(transliterated_line+'\n')