Spaces:
Sleeping
Sleeping
# | |
# Copyright (c) 2013-present, Anoop Kunchukuttan | |
# All rights reserved. | |
# | |
# This source code is licensed under the MIT license found in the | |
# LICENSE file in the root directory of this source tree. | |
# | |
import os | |
import pandas as pd | |
import numpy as np | |
from indicnlp import common | |
#### Maps from ARPABET to Internal Id
# Both dictionaries start empty; init() fills them in place
# (ARPABET symbol -> id, and id -> ARPABET symbol).
ARPABET_ID_MAP = dict()
ID_ARPABET_MAP = dict()

###
# Phonetic Information about script characters
###

# Phonetic data for English (None until init() loads the CSV)
ENGLISH_PHONETIC_DATA = None

# Phonetic vectors for English (None until init() extracts them)
ENGLISH_PHONETIC_VECTORS = None

# Length of phonetic vector; init() overwrites this with the number of
# feature columns actually found in the loaded data
PHONETIC_VECTOR_LENGTH = 38

# Start offset for the phonetic feature vector in the phonetic data vector
PHONETIC_VECTOR_START_OFFSET = 6
###
# Bit vector ranges for various properties
###
# Each value is a [start, end] pair of positions in the phonetic vector.
# The entries are written in the order the properties occur in the vector.
PV_PROP_RANGES = {
    "basic_type": [0, 6],
    "vowel_length": [6, 8],
    "vowel_strength": [8, 11],
    "vowel_status": [11, 13],
    "consonant_type": [13, 18],
    "articulation_place": [18, 23],
    "aspiration": [23, 25],
    "voicing": [25, 27],
    "nasalization": [27, 29],
    "vowel_horizontal": [29, 32],
    "vowel_vertical": [32, 36],
    "vowel_roundness": [36, 38],
}

## PHONETIC PROPERTIES in order in which they occur in the vector
## Taken from the keys of PV_PROP_RANGES (dicts keep insertion order), so
## the list and the dictionary cannot drift out of sync.
PV_PROP = list(PV_PROP_RANGES)
####
# Indexes into the Phonetic Vector
####
# The six PVIDX_BT_* positions are consecutive, starting at 0; they fall
# inside the "basic_type" range [0, 6] of PV_PROP_RANGES.
(
    PVIDX_BT_VOWEL,
    PVIDX_BT_CONSONANT,
    PVIDX_BT_NUKTA,
    PVIDX_BT_HALANT,
    PVIDX_BT_ANUSVAAR,
    PVIDX_BT_MISC,
) = range(6)

# Start (inclusive) and end (exclusive) of the PVIDX_BT_* positions
PVIDX_BT_S = PVIDX_BT_VOWEL
PVIDX_BT_E = PVIDX_BT_MISC + 1

# Position 12 lies inside the "vowel_status" range [11, 13] of PV_PROP_RANGES
PVIDX_VSTAT_DEP = 12

####
# Code points used for encoded phonemes: phoneme_to_enc() adds a phoneme id
# to SCRIPT_RANGE_START and enc_to_offset() subtracts it again.
SCRIPT_RANGE_START = 0x0D00
## TBD
SCRIPT_RANGE_END = SCRIPT_RANGE_START + 0x2E
def init():
    """
    Load the English phonetic table and the ARPABET <-> id maps.

    To be called by library loader, do not call it in your program

    Side effects on module globals:
      * ENGLISH_PHONETIC_DATA    - DataFrame read from
        "script/english_script_phonetic_data.csv" under the resources path.
      * ENGLISH_PHONETIC_VECTORS - values of every column from
        PHONETIC_VECTOR_START_OFFSET onwards.
      * PHONETIC_VECTOR_LENGTH   - number of those columns.
      * ARPABET_ID_MAP / ID_ARPABET_MAP - filled in place from
        "script/english_arpabet_list.csv"; each stripped line is mapped to
        its 0-based line number and back.
    """
    global ENGLISH_PHONETIC_DATA, ENGLISH_PHONETIC_VECTORS
    global PHONETIC_VECTOR_LENGTH, PHONETIC_VECTOR_START_OFFSET
    global ARPABET_ID_MAP, ID_ARPABET_MAP

    def _script_resource(filename):
        # Resolve a file inside the "script" folder of the resources path.
        return os.path.join(common.get_resources_path(), "script", filename)

    phonetic_csv = _script_resource("english_script_phonetic_data.csv")
    ENGLISH_PHONETIC_DATA = pd.read_csv(phonetic_csv, encoding="utf-8")

    # Leading columns are skipped; the remaining ones form the vectors.
    feature_columns = ENGLISH_PHONETIC_DATA.iloc[:, PHONETIC_VECTOR_START_OFFSET:]
    ENGLISH_PHONETIC_VECTORS = feature_columns.values
    PHONETIC_VECTOR_LENGTH = ENGLISH_PHONETIC_VECTORS.shape[1]

    ### Load mapping from ARPABET representation of phoneme to internal ID
    arpabet_list = _script_resource("english_arpabet_list.csv")
    with open(arpabet_list, "r", encoding="utf-8") as arpabet_file:
        for ph_id, line in enumerate(arpabet_file):
            symbol = line.strip()
            # Mutate the existing dicts so references held elsewhere stay valid.
            ARPABET_ID_MAP[symbol] = ph_id
            ID_ARPABET_MAP[ph_id] = symbol
def phoneme_to_offset(ph):
    """
    Return the internal id stored for ARPABET symbol `ph`.

    Looks the symbol up in ARPABET_ID_MAP; raises KeyError when it is absent
    (including before init() has filled the map).
    """
    ph_id = ARPABET_ID_MAP[ph]
    return ph_id
def offset_to_phoneme(ph_id):
    """
    Return the ARPABET symbol stored for internal id `ph_id`.

    Looks the id up in ID_ARPABET_MAP; raises KeyError when it is absent
    (including before init() has filled the map).
    """
    symbol = ID_ARPABET_MAP[ph_id]
    return symbol
def phoneme_to_enc(ph):
    """
    Encode ARPABET symbol `ph` as a single character.

    The character's code point is SCRIPT_RANGE_START plus the symbol's
    internal id (see phoneme_to_offset).
    """
    code_point = SCRIPT_RANGE_START + phoneme_to_offset(ph)
    return chr(code_point)
def enc_to_phoneme(ph):
    """
    Decode an encoded character `ph` back to its ARPABET symbol.

    Converts the character to an internal id with enc_to_offset and looks
    that id up with offset_to_phoneme.
    """
    ph_id = enc_to_offset(ph)
    return offset_to_phoneme(ph_id)
def enc_to_offset(c):
    """
    Return the distance of character `c` from SCRIPT_RANGE_START.

    `c` must be a single character (it is passed to ord()). The result is
    negative for code points below SCRIPT_RANGE_START.
    """
    code_point = ord(c)
    return code_point - SCRIPT_RANGE_START
def in_range(offset):
    """
    Return True when SCRIPT_RANGE_START <= offset < SCRIPT_RANGE_END.

    NOTE(review): despite the parameter name, the bounds are the raw
    SCRIPT_RANGE_* values (0x0D00-based code points), so a 0-based value
    from enc_to_offset() for an encoded phoneme never satisfies this check.
    Confirm the intended units with callers.
    """
    return SCRIPT_RANGE_START <= offset < SCRIPT_RANGE_END
def get_phonetic_info(lang):
    """
    Return the (phonetic data, phonetic vectors) pair for English.

    `lang` is accepted but not used: the English tables are returned
    regardless. Both elements are None until init() has run.
    """
    data, vectors = ENGLISH_PHONETIC_DATA, ENGLISH_PHONETIC_VECTORS
    return data, vectors
def invalid_vector():
    """
    Return a fresh all-zero vector of PHONETIC_VECTOR_LENGTH entries.

    Used as the result for characters that have no phonetic representation.
    """
    ## TODO: check if np datatype is correct?
    zeros = [0] * PHONETIC_VECTOR_LENGTH
    return np.array(zeros)
def get_phonetic_feature_vector(p, lang):
    """
    Return the phonetic feature vector for the encoded character `p`.

    Parameters:
      p    - a single character (it is passed to ord()); presumably one
             produced by phoneme_to_enc().
      lang - forwarded to get_phonetic_info().

    Returns invalid_vector() when the code point of `p` lies outside
    [SCRIPT_RANGE_START, SCRIPT_RANGE_END) or when the character's row has
    "Valid Vector Representation" equal to 0; otherwise the row of the
    phonetic-vector table at the character's offset.
    """
    # in_range() compares against the absolute SCRIPT_RANGE_* code points,
    # so it must receive the code point itself. Passing the 0-based offset
    # (as before) rejected every encoded phoneme and let unrelated code
    # points through to an out-of-bounds table lookup.
    if not in_range(ord(p)):
        return invalid_vector()

    offset = enc_to_offset(p)
    phonetic_data, phonetic_vectors = get_phonetic_info(lang)

    # Rows flagged 0 have no usable vector.
    if phonetic_data.iloc[offset]["Valid Vector Representation"] == 0:
        return invalid_vector()

    return phonetic_vectors[offset]