#
# Copyright (c) 2013-present, Anoop Kunchukuttan
# All rights reserved.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
#

import os

import pandas as pd
import numpy as np

from indicnlp import common

#### Maps from ARPABET to Internal Id
ARPABET_ID_MAP = {}
ID_ARPABET_MAP = {}

###
# Phonetic Information about script characters
###

""" Phonetic data for English """
ENGLISH_PHONETIC_DATA = None

""" Phonetic vector for English """
ENGLISH_PHONETIC_VECTORS = None

""" Length of phonetic vector """
PHONETIC_VECTOR_LENGTH = 38

""" Start offset for the phonetic feature vector in the phonetic data vector """
PHONETIC_VECTOR_START_OFFSET = 6

## PHONETIC PROPERTIES in the order in which they occur in the vector
## This list must be in sync with the keys in the PV_PROP_RANGES dictionary
PV_PROP = [
    "basic_type",
    "vowel_length",
    "vowel_strength",
    "vowel_status",
    "consonant_type",
    "articulation_place",
    "aspiration",
    "voicing",
    "nasalization",
    "vowel_horizontal",
    "vowel_vertical",
    "vowel_roundness",
]

###
# Bit vector ranges for various properties
###
PV_PROP_RANGES = {
    "basic_type": [0, 6],
    "vowel_length": [6, 8],
    "vowel_strength": [8, 11],
    "vowel_status": [11, 13],
    "consonant_type": [13, 18],
    "articulation_place": [18, 23],
    "aspiration": [23, 25],
    "voicing": [25, 27],
    "nasalization": [27, 29],
    "vowel_horizontal": [29, 32],
    "vowel_vertical": [32, 36],
    "vowel_roundness": [36, 38],
}

####
# Indexes into the Phonetic Vector
####
PVIDX_BT_VOWEL = 0
PVIDX_BT_CONSONANT = 1
PVIDX_BT_NUKTA = 2
PVIDX_BT_HALANT = 3
PVIDX_BT_ANUSVAAR = 4
PVIDX_BT_MISC = 5
PVIDX_BT_S = PVIDX_BT_VOWEL
PVIDX_BT_E = PVIDX_BT_MISC + 1

PVIDX_VSTAT_DEP = 12

#### Code point range used to encode ARPABET phonemes as single characters
SCRIPT_RANGE_START = 0x0D00
## TBD
SCRIPT_RANGE_END = 0x0D2E


def init():
    """
    To be called by the library loader; do not call it in your program.

    Loads the English phonetic data table and the ARPABET phoneme list from
    the Indic NLP resources directory.
    """

    global ENGLISH_PHONETIC_DATA, ENGLISH_PHONETIC_VECTORS, PHONETIC_VECTOR_LENGTH, PHONETIC_VECTOR_START_OFFSET

    ENGLISH_PHONETIC_DATA = pd.read_csv(
        os.path.join(
            common.get_resources_path(), "script", "english_script_phonetic_data.csv"
        ),
        encoding="utf-8",
    )

    ENGLISH_PHONETIC_VECTORS = ENGLISH_PHONETIC_DATA.iloc[
        :, PHONETIC_VECTOR_START_OFFSET:
    ].values

    PHONETIC_VECTOR_LENGTH = ENGLISH_PHONETIC_VECTORS.shape[1]

    ### Load mapping from ARPABET representation of phoneme to internal ID
    global ARPABET_ID_MAP, ID_ARPABET_MAP

    with open(
        os.path.join(common.get_resources_path(), "script", "english_arpabet_list.csv"),
        "r",
        encoding="utf-8",
    ) as infile:
        for ph_id, name in enumerate(infile):
            name = name.strip()
            ARPABET_ID_MAP[name] = ph_id
            ID_ARPABET_MAP[ph_id] = name


def phoneme_to_offset(ph):
    """Map an ARPABET phoneme symbol to its internal integer id."""
    return ARPABET_ID_MAP[ph]


def offset_to_phoneme(ph_id):
    """Map an internal integer id back to its ARPABET phoneme symbol."""
    return ID_ARPABET_MAP[ph_id]


def phoneme_to_enc(ph):
    """Encode an ARPABET phoneme as a single character in the script range."""
    return chr(SCRIPT_RANGE_START + phoneme_to_offset(ph))


def enc_to_phoneme(ph):
    """Decode a script-range character back to its ARPABET phoneme symbol."""
    return offset_to_phoneme(enc_to_offset(ph))


def enc_to_offset(c):
    """Offset of an encoded character relative to the start of the script range."""
    return ord(c) - SCRIPT_RANGE_START


def in_range(offset):
    """Check whether an offset (relative to SCRIPT_RANGE_START, as returned by
    enc_to_offset) falls within the script range."""
    return 0 <= offset < (SCRIPT_RANGE_END - SCRIPT_RANGE_START)


def get_phonetic_info(lang):
    """Return the (phonetic data, phonetic vectors) pair; `lang` is ignored
    since only English is handled by this module."""
    return (ENGLISH_PHONETIC_DATA, ENGLISH_PHONETIC_VECTORS)


def invalid_vector():
    ## TODO: check if np datatype is correct?
    return np.array([0] * PHONETIC_VECTOR_LENGTH)


def get_phonetic_feature_vector(p, lang):
    """Return the phonetic feature vector for the encoded phoneme character `p`,
    or an all-zero vector if `p` has no valid representation."""

    offset = enc_to_offset(p)

    if not in_range(offset):
        return invalid_vector()

    phonetic_data, phonetic_vectors = get_phonetic_info(lang)

    if phonetic_data.iloc[offset]["Valid Vector Representation"] == 0:
        return invalid_vector()

    return phonetic_vectors[offset]
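

## ---------------------------------------------------------------------------
## Usage sketch (illustrative, not part of the original module): this assumes
## the Indic NLP resources have been made available, e.g. via
## common.set_resources_path(), before init() is called, and that the ARPABET
## symbol 'AA' appears in english_arpabet_list.csv (it is a standard ARPABET
## vowel, but the actual inventory comes from that file). It round-trips a
## phoneme through the character encoding and looks up its phonetic feature
## vector.
if __name__ == "__main__":
    init()                                        # load the CSV resources
    enc = phoneme_to_enc("AA")                    # ARPABET symbol -> encoded character
    assert enc_to_phoneme(enc) == "AA"            # encoded character -> ARPABET symbol
    fv = get_phonetic_feature_vector(enc, "en")   # feature vector for the phoneme
    print(PHONETIC_VECTOR_LENGTH, fv[PVIDX_BT_VOWEL])  # vector length and the 'vowel' bit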