# # Copyright (c) 2013-present, Anoop Kunchukuttan # All rights reserved. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. # import codecs, sys from indicnlp.script import indic_scripts as si import re chillu_char_map= { '\u0d7a': '\u0d23', '\u0d7b': '\u0d28', '\u0d7c': '\u0d30', '\u0d7d': '\u0d32', '\u0d7e': '\u0d33', '\u0d7f': '\u0d15', } char_chillu_map= {} for k,v in chillu_char_map.items(): char_chillu_map[v]=k def normalize_malayalam(word): word_mask=re.sub(r'[0-9]','0',word) # instead of chillu characters, use consonant+halant for chillu,char in chillu_char_map.items(): word=word.replace(chillu,'{}\u0d4d'.format(char)) word_mask=word_mask.replace(chillu,'41') word_mask=re.sub(r'[^0-9]','0',word_mask) return word, word_mask def denormalize_malayalam(word, word_mask): word=list(word) word_mask=list(word_mask) ## pattern 4 idx=0 while idx>=0: try: idx=word_mask.index('4',idx) word[idx:idx+2]=char_chillu_map[word[idx]] word_mask[idx:idx+2]='0' start=idx except ValueError as e: break return ''.join(word) def normalize_punjabi(word): word_mask=re.sub(r'[0-9]','0',word) ## replace tippi with anusvaar word=word.replace('\u0a70','\u0a02') word_mask=word_mask.replace('\u0a70','2') ## replace addak+consonant with consonat+halant+consonant word=re.sub(r'\u0a71(.)','\\1\u0a4d\\1',word) word_mask=re.sub(r'\u0a71(.)','311',word_mask) word_mask=re.sub(r'[^0-9]','0',word_mask) return word, word_mask def denormalize_punjabi(word, word_mask): word=list(word) word_mask=list(word_mask) ## pattern 2 idx=0 while idx>=0: try: idx=word_mask.index('2',idx) word[idx]='\u0a70' word_mask[idx]='0' start=idx except ValueError as e: break ## pattern 3 idx=0 while idx>=0: try: idx=word_mask.index('3',idx) word[idx:idx+3]='\u0a71{}'.format(word[idx]) word_mask[idx:idx+3]='00' start=idx except ValueError as e: break return ''.join(word) def char_backoff(syllables_list,vocab): syllables_final=[] if vocab is None: syllables_final=syllables_list else: for s in syllables_list: if s in vocab: syllables_final.append(s) else: for x in s: syllables_final.append(x) return syllables_final def orthographic_syllabify_improved(word,lang,vocab=None): word_mask=['0']*len(word) if lang=='ml': word, word_mask = normalize_malayalam(word) word=word elif lang=='pa': word, word_mask = normalize_punjabi(word) p_vectors=[si.get_phonetic_feature_vector(c,lang) for c in word] syllables=[] syllables_mask=[] for i in range(len(word)): v=p_vectors[i] syllables.append(word[i]) syllables_mask.append(word_mask[i]) ### simplified syllabification #if i+1= 0: print('Warning') if lang=='ml': syllables = denormalize_malayalam(syllables,syllables_mask) elif lang=='pa': syllables = denormalize_punjabi(syllables,syllables_mask) syllables_list = syllables.strip().split(' ') return(char_backoff(syllables_list,vocab)) def orthographic_syllabify(word,lang,vocab=None): p_vectors=[si.get_phonetic_feature_vector(c,lang) for c in word] syllables=[] for i in range(len(word)): v=p_vectors[i] syllables.append(word[i]) ### simplified syllabification #if i+1