# 
#  Copyright (c) 2013-present, Anoop Kunchukuttan
#  All rights reserved.
#  
#  This source code is licensed under the MIT license found in the
#  LICENSE file in the root directory of this source tree.
# 

import codecs, sys, re, os
import morfessor

from functools import lru_cache

from indicnlp import langinfo
from indicnlp import common
from indicnlp.tokenize import indic_tokenize

# Unsupervised Morphological Analyser for Indian languages. 
#
# @author Anoop Kunchukuttan 
#

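# A minimal usage sketch (illustrative; assumes this file is importable as
# indicnlp.morph.unsupervised_morph and that a trained Morfessor model for the
# language exists at <INDIC_RESOURCES_PATH>/morph/morfessor/<lang>.model):
#
#   from indicnlp import common
#   from indicnlp.morph.unsupervised_morph import UnsupervisedMorphAnalyzer
#
#   common.INDIC_RESOURCES_PATH = '/path/to/indic_nlp_resources'
#   analyzer = UnsupervisedMorphAnalyzer('hi', add_marker=False)
#   morphemes = analyzer.morph_analyze_document(u'यह एक वाक्य है'.split())
#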
class MorphAnalyzerI(object):
    """
    Interface for a morphological analyzer
    """

    def morph_analyze(self, word):
        pass

    def morph_analyze_document(self, tokens):
        pass

class UnsupervisedMorphAnalyzer(MorphAnalyzerI): 
    """
    Unsupervised Morphological analyser built using Morfessor 2.0
    """

    def __init__(self,lang,add_marker=False): 
        self.lang=lang
        self.add_marker=add_marker

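        # Load the pre-trained Morfessor model for this language from the
        # resources directory (INDIC_RESOURCES_PATH/morph/morfessor/<lang>.model).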
        io = morfessor.MorfessorIO()
        self._morfessor_model=io.read_any_model(os.path.join(common.INDIC_RESOURCES_PATH,'morph','morfessor','{}.model'.format(lang)))        

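        # Regex matching words written entirely in this language's Unicode block
        # (per langinfo.SCRIPT_RANGES); only such words are candidates for segmentation.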
        self._script_range_pat=r'^[{}-{}]+$'.format(chr(langinfo.SCRIPT_RANGES[lang][0]),chr(langinfo.SCRIPT_RANGES[lang][1]))
        self._script_check_re=re.compile(self._script_range_pat)

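    # Returns True if the text contains a digit from the language's script
    # (offsets NUMERIC_OFFSET_START..NUMERIC_OFFSET_END within its Unicode block).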
    def _contains_number(self,text):
        if self.lang in langinfo.SCRIPT_RANGES: 
            for c in text: 
                offset=ord(c)-langinfo.SCRIPT_RANGES[self.lang][0]
                if offset >=langinfo.NUMERIC_OFFSET_START and offset <= langinfo.NUMERIC_OFFSET_END:
                    return True  
        return False     

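    # A word is segmented only if it lies entirely within the language's script
    # range and does not contain any script digits.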
    def _morphanalysis_needed(self,word):
        return self._script_check_re.match(word) and not self._contains_number(word)

    @lru_cache(maxsize=16384)
    def morph_analyze(self,word):
        """
        Morphanalyzes a single word and returns a list of component morphemes

        @param word: string input word 
        """
        m_list=[]
        if self._morphanalysis_needed(word): 
            val=self._morfessor_model.viterbi_segment(word)
            m_list=val[0]
            if self.add_marker:
                m_list= [ '{}_S_'.format(m) if i>0 else '{}_R_'.format(m)  for i,m in enumerate(m_list)]
        else:
            if self.add_marker:
                word='{}_E_'.format(word)
            m_list=[word]
        return m_list 

        ### Older implementation
        #val=self._morfessor_model.viterbi_segment(word)
        #m_list=val[0]
        #if self.add_marker:
        #    m_list= [ u'{}_S_'.format(m) if i>0 else u'{}_R_'.format(m)  for i,m in enumerate(m_list)]
        #return m_list
    

    def morph_analyze_document(self,tokens):
        """
        Morphanalyzes a document, represented as a list of tokens
        Each word  is morphanalyzed and result is a list of morphemes constituting the document 

        @param tokens: string sequence of words 

        @return list of segments in the document after morph analysis 
        """

        out_tokens=[]
        for token in tokens: 
            morphs=self.morph_analyze(token)
            out_tokens.extend(morphs)
        return out_tokens    

        #### Older implementation
        #out_tokens=[]
        #for token in tokens: 
        #    if self._morphanalysis_needed(token): 
        #        morphs=self.morph_analyze(token)
        #        out_tokens.extend(morphs)
        #    else:
        #        if self.add_marker:
        #            token=u'{}_E_'.format(token)
        #        out_tokens.append(token)
        #return out_tokens    


if __name__ == '__main__': 

    if len(sys.argv)<5:
        print("Usage: python unsupervised_morph.py <infile> <outfile> <language> <indic_resources_path> [<add_marker>]")
        sys.exit(1)
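    # Example invocation (illustrative file names and resource path):
    #   python unsupervised_morph.py input.txt output.txt hi /path/to/indic_nlp_resources True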

    language=sys.argv[3]
    common.INDIC_RESOURCES_PATH=sys.argv[4]

    add_marker=False

    if len(sys.argv)==6:
        add_marker = (sys.argv[5] == 'True')

    print('Loading morph analyser for ' + language) 
    analyzer=UnsupervisedMorphAnalyzer(language,add_marker)
    print('Loaded morph analyser for ' + language) 

    with codecs.open(sys.argv[1],'r','utf-8') as ifile:
        with codecs.open(sys.argv[2],'w','utf-8') as ofile:
            for line in ifile:
                line=line.strip()
                tokens=indic_tokenize.trivial_tokenize(line)
                morph_tokens=analyzer.morph_analyze_document(tokens)
                ofile.write(' '.join(morph_tokens))
                ofile.write('\n')