#
#  Copyright (c) 2013-present, Anoop Kunchukuttan
#  All rights reserved.
#
#  This source code is licensed under the MIT license found in the
#  LICENSE file in the root directory of this source tree.
#

import codecs
import itertools
import os
import re
import sys
from functools import lru_cache

import morfessor

from indicnlp import langinfo
from indicnlp import common
from indicnlp.tokenize import indic_tokenize

#
#   Unsupervised Morphological Analyser for Indian languages.
#
#   @author Anoop Kunchukuttan
#


class MorphAnalyzerI(object):
    """
    Interface for Morph Analyzer
    """

    def morph_analyze(self, word):
        """
        Morphanalyze a single word. Implemented by subclasses.

        @param word: string input word
        """
        pass

    def morph_analyze_document(self, tokens):
        """
        Morphanalyze a document given as a sequence of tokens.
        Implemented by subclasses.

        @param tokens: string sequence of words
        """
        pass


class UnsupervisedMorphAnalyzer(MorphAnalyzerI):
    """
    Unsupervised Morphological analyser built using Morfessor 2.0
    """

    def __init__(self, lang, add_marker=False):
        """
        Load the Morfessor model for a language and build the script check.

        The model is read from
        <common.INDIC_RESOURCES_PATH>/morph/morfessor/<lang>.model, so
        common.INDIC_RESOURCES_PATH must be set before constructing.

        @param lang: language code; must be a key of langinfo.SCRIPT_RANGES
        @param add_marker: if True, output morphemes carry the suffixes
            '_R_' (first morpheme), '_S_' (later morphemes) or '_E_'
            (word that was not segmented)
        """
        self.lang = lang
        self.add_marker = add_marker

        io = morfessor.MorfessorIO()
        self._morfessor_model = io.read_any_model(
            os.path.join(common.INDIC_RESOURCES_PATH, 'morph', 'morfessor',
                         '{}.model'.format(lang)))

        # Matches only words made up entirely of characters in the
        # language's script range.
        self._script_range_pat = r'^[{}-{}]+$'.format(
            chr(langinfo.SCRIPT_RANGES[lang][0]),
            chr(langinfo.SCRIPT_RANGES[lang][1]))
        self._script_check_re = re.compile(self._script_range_pat)

    def _contains_number(self, text):
        """
        Return True if any character of text falls in the numeric offset
        range (langinfo.NUMERIC_OFFSET_START..NUMERIC_OFFSET_END) of the
        language's script block; False otherwise, including when the
        language has no entry in langinfo.SCRIPT_RANGES.
        """
        if self.lang not in langinfo.SCRIPT_RANGES:
            return False
        script_start = langinfo.SCRIPT_RANGES[self.lang][0]
        return any(
            langinfo.NUMERIC_OFFSET_START
            <= ord(c) - script_start
            <= langinfo.NUMERIC_OFFSET_END
            for c in text)

    def _morphanalysis_needed(self, word):
        """
        Return True if word consists only of characters of the language's
        script and contains none of its numeric characters.
        """
        return (bool(self._script_check_re.match(word))
                and not self._contains_number(word))

    @lru_cache(maxsize=16384)
    def _morph_analyze_cached(self, word):
        """
        Cached worker for morph_analyze; returns the morphemes as a tuple.

        A tuple is cached (rather than a list) so that callers can never
        mutate the cached value. The cache key is (self, word): it does not
        notice a later change of self.add_marker, and it keeps the instance
        referenced for as long as its entries stay in the cache.
        """
        if self._morphanalysis_needed(word):
            # viterbi_segment returns a pair whose first item is the
            # list of segments.
            segments = self._morfessor_model.viterbi_segment(word)[0]
            if self.add_marker:
                segments = ['{}_S_'.format(m) if i > 0 else '{}_R_'.format(m)
                            for i, m in enumerate(segments)]
        else:
            if self.add_marker:
                word = '{}_E_'.format(word)
            segments = [word]
        return tuple(segments)

    def morph_analyze(self, word):
        """
        Morphanalyzes a single word and returns a list of component morphemes

        Words that are not purely in the language's script, or that contain
        its numeric characters, are returned unsegmented as a one-element
        list. Results are memoised per (analyzer, word); every call returns
        a fresh list, so the caller may modify it freely.

        @param word: string input word
        @return list of morphemes (with markers if add_marker is set)
        """
        return list(self._morph_analyze_cached(word))

    def morph_analyze_document(self, tokens):
        """
        Morphanalyzes a document, represented as a list of tokens
        Each word is morphanalyzed and result is a list of morphemes
        constituting the document

        @param tokens: string sequence of words

        @return list of segments in the document after morph analysis
        """
        out_tokens = []
        for token in tokens:
            out_tokens.extend(self.morph_analyze(token))
        return out_tokens


if __name__ == '__main__':

    # argv[1..4] are all read below, so at least 5 entries are required.
    if len(sys.argv) < 5:
        print("Usage: python unsupervised_morph.py <infile> <outfile> "
              "<language> <indic_resources_path> [<add_marker>]")
        sys.exit(1)

    language = sys.argv[3]
    common.INDIC_RESOURCES_PATH = sys.argv[4]

    # The optional 5th argument enables markers only when it is the
    # literal string 'True'.
    add_marker = len(sys.argv) == 6 and sys.argv[5] == 'True'

    print('Loading morph analyser for ' + language)
    analyzer = UnsupervisedMorphAnalyzer(language, add_marker)
    print('Loaded morph analyser for ' + language)

    with codecs.open(sys.argv[1], 'r', 'utf-8') as ifile:
        with codecs.open(sys.argv[2], 'w', 'utf-8') as ofile:
            # Stream the input line by line instead of loading it whole.
            for line in ifile:
                line = line.strip()
                tokens = indic_tokenize.trivial_tokenize(line)
                morph_tokens = analyzer.morph_analyze_document(tokens)
                ofile.write(' '.join(morph_tokens))
                ofile.write('\n')