# Copyright (c) 2013-present, Anoop Kunchukuttan
# All rights reserved.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
#

import pandas as pd
import numpy as np
import os

from indicnlp import common
from indicnlp.common import IndicNlpException
from indicnlp import langinfo as li

###
# Phonetic Information about script characters
###

""" Phonetic data about all languages except Tamil """
ALL_PHONETIC_DATA=None

""" Phonetic data for Tamil """
TAMIL_PHONETIC_DATA=None

""" Phonetic vector for all languages except Tamil """
ALL_PHONETIC_VECTORS=None

""" Phonetic vector for Tamil """
TAMIL_PHONETIC_VECTORS=None

""" Length of phonetic vector """
PHONETIC_VECTOR_LENGTH=38

""" Start offset for the phonetic feature vector in the phonetic data vector """
PHONETIC_VECTOR_START_OFFSET=6

## PHONETIC PROPERTIES in order in which they occur in the vector
## This list must be in sync with the keys in the PV_PROP_RANGES dictionary
PV_PROP=['basic_type',
    'vowel_length',
    'vowel_strength',
    'vowel_status',
    'consonant_type',
    'articulation_place',
    'aspiration',
    'voicing',
    'nasalization',
    'vowel_horizontal',
    'vowel_vertical',
    'vowel_roundness',
    ]

###
# Bit vector ranges for various properties
###

PV_PROP_RANGES={
        'basic_type': [0,6],
        'vowel_length': [6,8],
        'vowel_strength': [8,11],
        'vowel_status': [11,13],
        'consonant_type': [13,18],
        'articulation_place': [18,23],
        'aspiration': [23,25],
        'voicing': [25,27],
        'nasalization': [27,29],
        'vowel_horizontal': [29,32],
        'vowel_vertical': [32,36],
        'vowel_roundness': [36,38],
    }

####
# Indexes into the Phonetic Vector
####
PVIDX_BT_VOWEL=0
PVIDX_BT_CONSONANT=1
PVIDX_BT_NUKTA=2
PVIDX_BT_HALANT=3
PVIDX_BT_ANUSVAAR=4
PVIDX_BT_MISC=5
PVIDX_BT_S=PVIDX_BT_VOWEL
PVIDX_BT_E=PVIDX_BT_MISC+1

# Index of the 'dependent vowel' bit within the 'vowel_status' range [11,13)
PVIDX_VSTAT_DEP=12

#####
# Unicode information about characters
#####

SCRIPT_OFFSET_START=0
SCRIPT_OFFSET_RANGE=0x80


def init():
    """
    To be called by library loader, do not call it in your program

    Loads the phonetic data CSVs from the resource directory and derives the
    feature-vector matrices (columns from PHONETIC_VECTOR_START_OFFSET onward).
    """
    global ALL_PHONETIC_DATA, ALL_PHONETIC_VECTORS, TAMIL_PHONETIC_DATA, \
           TAMIL_PHONETIC_VECTORS, PHONETIC_VECTOR_LENGTH, PHONETIC_VECTOR_START_OFFSET

    ALL_PHONETIC_DATA=pd.read_csv(os.path.join(common.get_resources_path(),'script','all_script_phonetic_data.csv'),encoding='utf-8')
    TAMIL_PHONETIC_DATA=pd.read_csv(os.path.join(common.get_resources_path(),'script','tamil_script_phonetic_data.csv'),encoding='utf-8')

    ALL_PHONETIC_VECTORS= ALL_PHONETIC_DATA.iloc[:,PHONETIC_VECTOR_START_OFFSET:].values
    TAMIL_PHONETIC_VECTORS=TAMIL_PHONETIC_DATA.iloc[:,PHONETIC_VECTOR_START_OFFSET:].values

    PHONETIC_VECTOR_LENGTH=ALL_PHONETIC_VECTORS.shape[1]


def is_supported_language(lang):
    """Return True if `lang` has a Unicode script range registered in langinfo."""
    return lang in list(li.SCRIPT_RANGES.keys())


def get_offset(c,lang):
    """Return the offset of character `c` from the start of its script's Unicode block.

    Raises IndicNlpException if `lang` is not supported.
    """
    if not is_supported_language(lang):
        raise IndicNlpException('Language {} not supported'.format(lang))
    return ord(c)-li.SCRIPT_RANGES[lang][0]


def offset_to_char(off,lang):
    """
    Applicable to Brahmi derived Indic scripts

    Inverse of get_offset: map a script-relative offset back to a character.
    """
    if not is_supported_language(lang):
        raise IndicNlpException('Language {} not supported'.format(lang))
    return chr(off+li.SCRIPT_RANGES[lang][0])


def is_indiclang_char(c,lang):
    """
    Applicable to Brahmi derived Indic scripts
    Note that DANDA and DOUBLE_DANDA have the same Unicode codepoint for all Indic scripts

    True iff `c` falls within the 128-codepoint block of the script of `lang`.
    """
    if not is_supported_language(lang):
        raise IndicNlpException('Language {} not supported'.format(lang))
    o=get_offset(c,lang)
    # BUGFIX: the source text had lost the '<' comparisons here, fusing this
    # return statement with the (also lost) definition of
    # in_coordinated_range_offset below. Both are reconstructed.
    return (o>=SCRIPT_OFFSET_START and o<SCRIPT_OFFSET_RANGE)


def in_coordinated_range_offset(c_offset):
    """
    Applicable to Brahmi derived Indic scripts

    True iff the script-relative offset lies in the coordinated range, i.e. the
    span of codepoints that is aligned across Brahmi-derived scripts.
    """
    return (c_offset>=li.COORDINATED_RANGE_START_INCLUSIVE and c_offset<=li.COORDINATED_RANGE_END_INCLUSIVE)


def in_coordinated_range(c,lang):
    """Character-level convenience wrapper over in_coordinated_range_offset."""
    if not is_supported_language(lang):
        raise IndicNlpException('Language {} not supported'.format(lang))
    return in_coordinated_range_offset(get_offset(c,lang))


def get_phonetic_info(lang):
    """Return (phonetic_data, phonetic_vectors) for `lang`.

    Tamil has its own tables; every other supported language shares one set.
    """
    if not is_supported_language(lang):
        raise IndicNlpException('Language {} not supported'.format(lang))
    phonetic_data= ALL_PHONETIC_DATA if lang!=li.LC_TA else TAMIL_PHONETIC_DATA
    phonetic_vectors= ALL_PHONETIC_VECTORS if lang!=li.LC_TA else TAMIL_PHONETIC_VECTORS

    return (phonetic_data, phonetic_vectors)


def invalid_vector():
    """All-zero phonetic vector, returned for characters without phonetic data."""
    ## TODO: check if np datatype is correct?
    return np.array([0]*PHONETIC_VECTOR_LENGTH)


def get_phonetic_feature_vector(c,lang):
    """Phonetic feature vector for character `c` of language `lang`.

    Returns invalid_vector() when the character is outside the coordinated
    range or its row is flagged as having no valid vector representation.
    """
    offset=get_offset(c,lang)

    if not in_coordinated_range_offset(offset):
        return invalid_vector()

    phonetic_data, phonetic_vectors= get_phonetic_info(lang)

    if phonetic_data.iloc[offset]['Valid Vector Representation']==0:
        return invalid_vector()

    return phonetic_vectors[offset]


def get_phonetic_feature_vector_offset(offset,lang):
    """Same as get_phonetic_feature_vector, but takes a script offset directly."""
    if not in_coordinated_range_offset(offset):
        return invalid_vector()

    phonetic_data, phonetic_vectors= get_phonetic_info(lang)

    if phonetic_data.iloc[offset]['Valid Vector Representation']==0:
        return invalid_vector()

    return phonetic_vectors[offset]


### Unary operations on vectors

def is_valid(v):
    """A vector is valid iff any bit is set (invalid_vector() is all zeros)."""
    return np.sum(v)>0

def is_vowel(v):
    return v[PVIDX_BT_VOWEL]==1

def is_consonant(v):
    return v[PVIDX_BT_CONSONANT]==1

def is_halant(v):
    return v[PVIDX_BT_HALANT]==1

def is_nukta(v):
    return v[PVIDX_BT_NUKTA]==1

def is_anusvaar(v):
    return v[PVIDX_BT_ANUSVAAR]==1

def is_misc(v):
    return v[PVIDX_BT_MISC]==1

def is_dependent_vowel(v):
    return is_vowel(v) and v[PVIDX_VSTAT_DEP]==1

def is_plosive(v):
    # First bit of the 'consonant_type' sub-vector marks plosives.
    return is_consonant(v) and get_property_vector(v,'consonant_type')[0]==1


### Binary operations on phonetic vectors

def or_vectors(v1,v2):
    """Element-wise OR of two binary vectors."""
    return np.array([ 1 if (b1+b2)>=1 else 0 for b1,b2 in zip(v1,v2) ])

def xor_vectors(v1,v2):
    """Element-wise XOR of two binary vectors."""
    return np.array([ 1 if b1!=b2 else 0 for b1,b2 in zip(v1,v2) ])


### Getting properties from phonetic vectors

def get_property_vector(v,prop_name):
    """Slice of `v` covering the named property (see PV_PROP_RANGES)."""
    return v[PV_PROP_RANGES[prop_name][0]:PV_PROP_RANGES[prop_name][1]]

def get_property_value(v,prop_name):
    """Interpret the property's bit slice as a big-endian binary integer."""
    factor_bits=get_property_vector(v,prop_name).tolist()

    v=0
    c=1
    for b in factor_bits[::-1]:
        v+=(c*b)
        c=c*2.0
    return int(v)


def lcsr_indic(srcw,tgtw,slang,tlang):
    """
    compute the Longest Common Subsequence Ratio (LCSR) between two strings at the character level.
    This works for Indic scripts by mapping both languages to a common script

    srcw: source language string
    tgtw: target language string
    slang: source language
    tlang: target language
    """
    score_mat=np.zeros((len(srcw)+1,len(tgtw)+1))

    for si,sc in enumerate(srcw,1):
        for ti,tc in enumerate(tgtw,1):
            so=get_offset(sc,slang)
            to=get_offset(tc,tlang)

            if in_coordinated_range_offset(so) and in_coordinated_range_offset(to) and so==to:
                # Same character under the common (offset) script
                score_mat[si,ti]=score_mat[si-1,ti-1]+1.0
            elif not (in_coordinated_range_offset(so) or in_coordinated_range_offset(to)) and sc==tc:
                # Outside the coordinated range, fall back to exact match
                score_mat[si,ti]=score_mat[si-1,ti-1]+1.0
            else:
                score_mat[si,ti]= max(
                    score_mat[si,ti-1],
                    score_mat[si-1,ti])

    return (score_mat[-1,-1]/float(max(len(srcw),len(tgtw))),float(len(srcw)),float(len(tgtw)))


def lcsr_any(srcw,tgtw):
    """
    LCSR computation if both languages have the same script
    """
    score_mat=np.zeros((len(srcw)+1,len(tgtw)+1))

    for si,sc in enumerate(srcw,1):
        for ti,tc in enumerate(tgtw,1):
            if sc==tc:
                score_mat[si,ti]=score_mat[si-1,ti-1]+1.0
            else:
                score_mat[si,ti]= max(
                    score_mat[si,ti-1],
                    score_mat[si-1,ti])

    return (score_mat[-1,-1]/float(max(len(srcw),len(tgtw))),float(len(srcw)),float(len(tgtw)))


def lcsr(srcw,tgtw,slang,tlang):
    """
    compute the Longest Common Subsequence Ratio (LCSR) between two strings at the character level.

    srcw: source language string
    tgtw: target language string
    slang: source language
    tlang: target language
    """
    if slang==tlang or not is_supported_language(slang) or not is_supported_language(tlang):
        # BUGFIX: the original swapped the argument lists of the two helpers
        # (lcsr_any got 4 args, lcsr_indic got 2), raising TypeError on every
        # call. lcsr_any needs only the strings; lcsr_indic needs the languages.
        return lcsr_any(srcw,tgtw)
    else:
        return lcsr_indic(srcw,tgtw,slang,tlang)