# harveen
# Adding code
# 9bbf386
# raw history blame
# No virus
# 3.97 kB
#
# Copyright (c) 2013-present, Anoop Kunchukuttan
# All rights reserved.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
#
import pandas as pd
import numpy as np
from indicnlp import common
from indicnlp.common import IndicNlpException
# ---------------------------------------------------------------------------
# ARPABET phoneme <-> internal id lookup tables.
# Both start out empty; init() fills them from english_arpabet_list.csv.
# ---------------------------------------------------------------------------
ARPABET_ID_MAP = {}   # ARPABET phoneme name -> internal id
ID_ARPABET_MAP = {}   # internal id -> ARPABET phoneme name

# ---------------------------------------------------------------------------
# Phonetic information about script characters.
# ---------------------------------------------------------------------------
# Phonetic data for English (None until init() has loaded it)
ENGLISH_PHONETIC_DATA = None
# Phonetic vectors for English (None until init() has loaded them)
ENGLISH_PHONETIC_VECTORS = None
# Length of a phonetic vector; init() overwrites it with the loaded width
PHONETIC_VECTOR_LENGTH = 38
# Start offset of the phonetic feature vector within the phonetic data vector
PHONETIC_VECTOR_START_OFFSET = 6

# Layout of the phonetic vector: (property, range start, range end), in the
# order in which the properties occur in the vector.  PV_PROP and
# PV_PROP_RANGES are both derived from this one table so that they are always
# in sync.
# NOTE(review): the ranges are contiguous and the last one ends at 38
# (== PHONETIC_VECTOR_LENGTH), which looks like half-open [start, end) slices;
# nothing in this file slices with them, so confirm against the callers.
_PV_LAYOUT = (
    ('basic_type', 0, 6),
    ('vowel_length', 6, 8),
    ('vowel_strength', 8, 11),
    ('vowel_status', 11, 13),
    ('consonant_type', 13, 18),
    ('articulation_place', 18, 23),
    ('aspiration', 23, 25),
    ('voicing', 25, 27),
    ('nasalization', 27, 29),
    ('vowel_horizontal', 29, 32),
    ('vowel_vertical', 32, 36),
    ('vowel_roundness', 36, 38),
)

# Phonetic properties in the order in which they occur in the vector
PV_PROP = [prop for prop, _start, _end in _PV_LAYOUT]

# Bit vector ranges for the properties: property name -> [start, end]
PV_PROP_RANGES = {prop: [start, end] for prop, start, end in _PV_LAYOUT}

# ---------------------------------------------------------------------------
# Indexes into the phonetic vector.
# The PVIDX_BT_* values cover 0..5, i.e. the 'basic_type' range above.
# ---------------------------------------------------------------------------
(
    PVIDX_BT_VOWEL,
    PVIDX_BT_CONSONANT,
    PVIDX_BT_NUKTA,
    PVIDX_BT_HALANT,
    PVIDX_BT_ANUSVAAR,
    PVIDX_BT_MISC,
) = range(6)

PVIDX_BT_S = PVIDX_BT_VOWEL      # first PVIDX_BT_* index
PVIDX_BT_E = PVIDX_BT_MISC + 1   # one past the last PVIDX_BT_* index

# Index 12 lies inside the 'vowel_status' range [11, 13]
PVIDX_VSTAT_DEP = 12

# ---------------------------------------------------------------------------
# Code points used to encode a phoneme as a single character:
# chr(SCRIPT_RANGE_START + internal id), see phoneme_to_enc().
# NOTE(review): U+0D00.. is the Unicode Malayalam block; presumably it is
# borrowed here as an internal encoding for English phonemes -- confirm.
# ---------------------------------------------------------------------------
SCRIPT_RANGE_START = 0x0D00
## TBD
SCRIPT_RANGE_END = 0x0D2E
def init():
    """
    Load the English phonetic resources into the module-level globals.

    To be called by library loader, do not call it in your program

    Two files below ``common.get_resources_path()`` are read:

    * ``/script/english_script_phonetic_data.csv`` fills
      ENGLISH_PHONETIC_DATA, ENGLISH_PHONETIC_VECTORS (the columns from
      PHONETIC_VECTOR_START_OFFSET onwards) and PHONETIC_VECTOR_LENGTH
      (the number of those columns).
    * ``/script/english_arpabet_list.csv`` fills ARPABET_ID_MAP and
      ID_ARPABET_MAP: one phoneme per line, surrounding whitespace stripped,
      internal id = 0-based line number.

    Both files are read completely before any global is touched, so a
    failed load leaves the previous module state as it was.  The two ARPABET
    maps are emptied before they are refilled, so calling init() again (for
    example after the resources path changed) never keeps entries from an
    earlier load.

    Raises:
        Whatever ``pandas.read_csv`` / ``open`` raise when a resource file
        is missing or unreadable.
    """
    global ENGLISH_PHONETIC_DATA, ENGLISH_PHONETIC_VECTORS, PHONETIC_VECTOR_LENGTH

    resources_path = common.get_resources_path()

    # --- read everything into locals first ---------------------------------
    phonetic_data = pd.read_csv(
        resources_path + '/script/english_script_phonetic_data.csv',
        encoding='utf-8',
    )
    phonetic_vectors = phonetic_data.iloc[:, PHONETIC_VECTOR_START_OFFSET:].values

    ### Load mapping from ARPABET representation of phoneme to internal ID
    # NOTE(review): blank lines are not skipped (a blank line maps '' to its
    # line number), exactly as before, so that ids keep matching line
    # numbers -- confirm the resource file contains none.
    with open(resources_path + '/script/english_arpabet_list.csv', 'r', encoding='utf-8') as infile:
        phonemes = [line.strip() for line in infile]

    # --- publish only once both files were read successfully ---------------
    ENGLISH_PHONETIC_DATA = phonetic_data
    ENGLISH_PHONETIC_VECTORS = phonetic_vectors
    PHONETIC_VECTOR_LENGTH = phonetic_vectors.shape[1]

    # The dicts are mutated in place, never rebound, so references obtained
    # with `from ... import ARPABET_ID_MAP` keep seeing the current content.
    ARPABET_ID_MAP.clear()
    ARPABET_ID_MAP.update((name, ph_id) for ph_id, name in enumerate(phonemes))
    ID_ARPABET_MAP.clear()
    ID_ARPABET_MAP.update(enumerate(phonemes))
def phoneme_to_offset(ph: str) -> int:
    """
    Return the internal id (offset) of the ARPABET phoneme *ph*.

    Args:
        ph: ARPABET phoneme name, as a whitespace-stripped line of
            english_arpabet_list.csv.

    Raises:
        KeyError: if *ph* is not a key of ARPABET_ID_MAP.  The map is empty
            until init() has run.
    """
    offset = ARPABET_ID_MAP[ph]
    return offset
def offset_to_phoneme(ph_id: int) -> str:
    """
    Return the ARPABET phoneme that has the internal id *ph_id*.

    Inverse of phoneme_to_offset().

    Raises:
        KeyError: if *ph_id* is not a key of ID_ARPABET_MAP.  The map is
            empty until init() has run.
    """
    phoneme = ID_ARPABET_MAP[ph_id]
    return phoneme
def phoneme_to_enc(ph: str) -> str:
    """
    Encode the ARPABET phoneme *ph* as a single character.

    The character is ``chr(SCRIPT_RANGE_START + internal id)``.

    NOTE(review): the result is not checked against SCRIPT_RANGE_END; an id
    of 0x2E or more yields a character for which in_range() is False.

    Raises:
        KeyError: from phoneme_to_offset() for an unknown phoneme.
    """
    code_point = SCRIPT_RANGE_START + phoneme_to_offset(ph)
    return chr(code_point)
def enc_to_phoneme(ph: str) -> str:
    """
    Decode a character produced by phoneme_to_enc() back to its phoneme.

    Args:
        ph: despite its name this is the *encoded character*, not a phoneme.

    Raises:
        KeyError: from offset_to_phoneme() when the character's offset is
            not a known internal id.
        TypeError: from ord() when *ph* is not a single character.
    """
    offset = enc_to_offset(ph)
    return offset_to_phoneme(offset)
def enc_to_offset(c: str) -> int:
    """
    Return the offset of the encoded character *c* from SCRIPT_RANGE_START.

    No range check is done: a character below SCRIPT_RANGE_START gives a
    negative offset.

    Raises:
        TypeError: from ord() when *c* is not a single character.
    """
    code_point = ord(c)
    return code_point - SCRIPT_RANGE_START
def in_range(offset: int) -> bool:
    """
    Tell whether *offset* lies in [SCRIPT_RANGE_START, SCRIPT_RANGE_END).

    NOTE(review): the parameter is called ``offset`` but it is compared with
    the absolute code points SCRIPT_RANGE_START (0x0D00) and
    SCRIPT_RANGE_END (0x0D2E).  A relative offset such as the result of
    enc_to_offset() (0, 1, 2, ...) is therefore never "in range"; pass the
    code point, i.e. ``ord(c)``.
    """
    return SCRIPT_RANGE_START <= offset < SCRIPT_RANGE_END
def get_phonetic_info(lang):
    """
    Return the pair (ENGLISH_PHONETIC_DATA, ENGLISH_PHONETIC_VECTORS).

    Args:
        lang: not used; the English data is returned whatever is passed.

    Returns:
        A 2-tuple (phonetic data, phonetic vectors) holding the current
        values of the two module globals -- both are None until init() has
        loaded them.
    """
    phonetic_data = ENGLISH_PHONETIC_DATA
    phonetic_vectors = ENGLISH_PHONETIC_VECTORS
    return phonetic_data, phonetic_vectors
def invalid_vector() -> np.ndarray:
    """
    Return a new all-zero vector of PHONETIC_VECTOR_LENGTH elements.

    get_phonetic_feature_vector() returns it for characters that have no
    valid phonetic vector.  A fresh array is built on every call.
    """
    ## TODO: check if np datatype is correct?
    zeros = [0 for _ in range(PHONETIC_VECTOR_LENGTH)]
    return np.array(zeros)
def get_phonetic_feature_vector(p: str, lang) -> np.ndarray:
    """
    Return the phonetic feature vector of the encoded phoneme character *p*.

    Args:
        p: a single character as produced by phoneme_to_enc().
        lang: passed through to get_phonetic_info(), which ignores it.

    Returns:
        ``invalid_vector()`` (all zeros) when the code point of *p* is
        outside [SCRIPT_RANGE_START, SCRIPT_RANGE_END) or when the row's
        'Valid Vector Representation' column is 0; otherwise row
        ``enc_to_offset(p)`` of the phonetic vectors.  That row is the
        stored array row itself, not a copy, so do not modify it in place.

    Raises:
        TypeError: from ord() when *p* is not a single character.

    NOTE(review): init() must have run, otherwise the phonetic data is None
    and the lookup below fails.  The data is also indexed with offsets up to
    SCRIPT_RANGE_END - SCRIPT_RANGE_START - 1 -- confirm the CSV has that
    many rows (SCRIPT_RANGE_END is marked TBD).
    """
    # in_range() compares its argument with SCRIPT_RANGE_START / _END, which
    # are absolute code points, so it has to be given the code point itself.
    # Passing the relative offset (0, 1, 2, ...) made this test fail for
    # every validly encoded character.
    if not in_range(ord(p)):
        return invalid_vector()

    offset = enc_to_offset(p)
    phonetic_data, phonetic_vectors = get_phonetic_info(lang)

    # Select the column first: this reads one cell instead of materialising
    # the whole (mixed-dtype) row just to look at a single flag.
    if phonetic_data['Valid Vector Representation'].iloc[offset] == 0:
        return invalid_vector()

    return phonetic_vectors[offset]