# | |
# Copyright (c) 2013-present, Anoop Kunchukuttan | |
# All rights reserved. | |
# | |
# This source code is licensed under the MIT license found in the | |
# LICENSE file in the root directory of this source tree. | |
# | |
import pandas as pd | |
import numpy as np | |
import os | |
from indicnlp import common | |
from indicnlp.common import IndicNlpException | |
from indicnlp import langinfo as li | |
###
# Phonetic Information about script characters
###

""" Phonetic data about all languages except Tamil """
ALL_PHONETIC_DATA = None  # pandas DataFrame, populated by init()

""" Phonetic data for Tamil """
TAMIL_PHONETIC_DATA = None  # pandas DataFrame, populated by init()

""" Phonetic vector for all languages except Tamil """
ALL_PHONETIC_VECTORS = None  # numpy array, populated by init()

""" Phonetic vector for Tamil """
TAMIL_PHONETIC_VECTORS = None  # numpy array, populated by init()

""" Length of phonetic vector """
PHONETIC_VECTOR_LENGTH = 38  # default; recomputed in init() from the loaded CSV

""" Start offset for the phonetic feature vector in the phonetic data vector """
PHONETIC_VECTOR_START_OFFSET = 6  # columns before this index are metadata, not feature bits

## PHONETIC PROPERTIES in order in which they occur in the vector
## This list must be in sync with the keys in the PV_PROP_RANGES dictionary
PV_PROP = [
    "basic_type",
    "vowel_length",
    "vowel_strength",
    "vowel_status",
    "consonant_type",
    "articulation_place",
    "aspiration",
    "voicing",
    "nasalization",
    "vowel_horizontal",
    "vowel_vertical",
    "vowel_roundness",
]

###
# Bit vector ranges for various properties
###
# Each value is a [start, end) half-open slice into the phonetic feature vector.
PV_PROP_RANGES = {
    "basic_type": [0, 6],
    "vowel_length": [6, 8],
    "vowel_strength": [8, 11],
    "vowel_status": [11, 13],
    "consonant_type": [13, 18],
    "articulation_place": [18, 23],
    "aspiration": [23, 25],
    "voicing": [25, 27],
    "nasalization": [27, 29],
    "vowel_horizontal": [29, 32],
    "vowel_vertical": [32, 36],
    "vowel_roundness": [36, 38],
}

####
# Indexes into the Phonetic Vector
####
# Positions of the one-hot "basic type" bits (first segment of the vector).
PVIDX_BT_VOWEL = 0
PVIDX_BT_CONSONANT = 1
PVIDX_BT_NUKTA = 2
PVIDX_BT_HALANT = 3
PVIDX_BT_ANUSVAAR = 4
PVIDX_BT_MISC = 5
# [start, end) bounds of the basic-type segment.
PVIDX_BT_S = PVIDX_BT_VOWEL
PVIDX_BT_E = PVIDX_BT_MISC + 1

# Bit flagging a dependent vowel (matra); lies inside the vowel_status range [11, 13).
PVIDX_VSTAT_DEP = 12

#####
# Unicode information about characters
#####

SCRIPT_OFFSET_START = 0
SCRIPT_OFFSET_RANGE = 0x80  # each Brahmi-derived script block spans 0x80 codepoints
def init():
    """
    To be called by library loader, do not call it in your program
    """
    global ALL_PHONETIC_DATA, ALL_PHONETIC_VECTORS, TAMIL_PHONETIC_DATA, TAMIL_PHONETIC_VECTORS, PHONETIC_VECTOR_LENGTH, PHONETIC_VECTOR_START_OFFSET

    def _load_script_csv(filename):
        # Read one phonetic-data table from the resources/script directory.
        return pd.read_csv(
            os.path.join(common.get_resources_path(), "script", filename),
            encoding="utf-8",
        )

    ALL_PHONETIC_DATA = _load_script_csv("all_script_phonetic_data.csv")
    TAMIL_PHONETIC_DATA = _load_script_csv("tamil_script_phonetic_data.csv")

    # The feature-bit columns start after the metadata columns.
    ALL_PHONETIC_VECTORS = ALL_PHONETIC_DATA.iloc[:, PHONETIC_VECTOR_START_OFFSET:].values
    TAMIL_PHONETIC_VECTORS = TAMIL_PHONETIC_DATA.iloc[:, PHONETIC_VECTOR_START_OFFSET:].values

    # Keep the recorded vector length in sync with the loaded data.
    PHONETIC_VECTOR_LENGTH = ALL_PHONETIC_VECTORS.shape[1]
def is_supported_language(lang):
    """Return True if `lang` has a script range defined (i.e. is supported)."""
    # Membership test directly on the dict; materializing list(keys()) was
    # an O(n) anti-idiom with identical semantics.
    return lang in li.SCRIPT_RANGES
def get_offset(c, lang):
    """
    Offset of character `c` within the Unicode block of `lang`'s script.

    Raises IndicNlpException if `lang` is not supported.
    """
    if not is_supported_language(lang):
        raise IndicNlpException("Language {} not supported".format(lang))
    block_start = li.SCRIPT_RANGES[lang][0]
    return ord(c) - block_start
def offset_to_char(off, lang):
    """
    Map a script offset back to the character in `lang`'s Unicode block.

    Applicable to Brahmi derived Indic scripts
    """
    if not is_supported_language(lang):
        raise IndicNlpException("Language {} not supported".format(lang))
    block_start = li.SCRIPT_RANGES[lang][0]
    return chr(block_start + off)
def is_indiclang_char(c, lang):
    """
    Applicable to Brahmi derived Indic scripts
    Note that DANDA and DOUBLE_DANDA have the same Unicode codepoint for all Indic scripts
    """
    if not is_supported_language(lang):
        raise IndicNlpException("Language {} not supported".format(lang))
    offset = get_offset(c, lang)
    # Inside the script's own 0x80-wide block?
    if SCRIPT_OFFSET_START <= offset < SCRIPT_OFFSET_RANGE:
        return True
    # DANDA / DOUBLE_DANDA live outside the per-script blocks but are shared
    # punctuation across Indic scripts.
    codepoint = ord(c)
    return codepoint == li.DANDA or codepoint == li.DOUBLE_DANDA
def in_coordinated_range_offset(c_offset):
    """
    Applicable to Brahmi derived Indic scripts
    """
    # Chained comparison over the inclusive coordinated range bounds.
    return (
        li.COORDINATED_RANGE_START_INCLUSIVE
        <= c_offset
        <= li.COORDINATED_RANGE_END_INCLUSIVE
    )
def in_coordinated_range(c, lang):
    """True if character `c` lies in the coordinated range of `lang`'s script."""
    if not is_supported_language(lang):
        raise IndicNlpException("Language {} not supported".format(lang))
    offset = get_offset(c, lang)
    return in_coordinated_range_offset(offset)
def get_phonetic_info(lang):
    """
    Return the (phonetic_data, phonetic_vectors) pair for `lang`.

    Tamil has its own tables; all other supported languages share one set.
    """
    if not is_supported_language(lang):
        raise IndicNlpException("Language {} not supported".format(lang))
    if lang == li.LC_TA:
        return (TAMIL_PHONETIC_DATA, TAMIL_PHONETIC_VECTORS)
    return (ALL_PHONETIC_DATA, ALL_PHONETIC_VECTORS)
def invalid_vector():
    """
    Return an all-zero phonetic vector, used for characters that have no
    valid phonetic representation.
    """
    # Resolves the old "check if np datatype is correct?" TODO: zeros with
    # dtype=int matches the dtype of np.array([0] * n) without the temp list.
    return np.zeros(PHONETIC_VECTOR_LENGTH, dtype=int)
def get_phonetic_feature_vector(c, lang):
    """
    Phonetic feature vector for character `c` in language `lang`.

    Returns the all-zero invalid vector when the character lies outside the
    coordinated range or its data row is flagged as having no valid
    vector representation.
    """
    # Identical logic to the offset-based variant; delegate instead of
    # duplicating the range/validity checks.
    return get_phonetic_feature_vector_offset(get_offset(c, lang), lang)
def get_phonetic_feature_vector_offset(offset, lang):
    """
    Phonetic feature vector for a script `offset` in language `lang`.

    Returns the all-zero invalid vector when the offset is outside the
    coordinated range or the row is flagged as not validly representable.
    """
    if not in_coordinated_range_offset(offset):
        return invalid_vector()
    phonetic_data, phonetic_vectors = get_phonetic_info(lang)
    row = phonetic_data.iloc[offset]
    if row["Valid Vector Representation"] == 0:
        return invalid_vector()
    return phonetic_vectors[offset]
### Unary operations on vectors | |
def is_valid(v):
    """A phonetic vector is valid iff at least one feature bit is set."""
    total = np.sum(v)
    return total > 0
def is_vowel(v):
    """True if the vector's basic type is vowel."""
    bit = v[PVIDX_BT_VOWEL]
    return bit == 1
def is_consonant(v):
    """True if the vector's basic type is consonant."""
    bit = v[PVIDX_BT_CONSONANT]
    return bit == 1
def is_halant(v):
    """True if the vector's basic type is halant (virama)."""
    bit = v[PVIDX_BT_HALANT]
    return bit == 1
def is_nukta(v):
    """True if the vector's basic type is nukta."""
    bit = v[PVIDX_BT_NUKTA]
    return bit == 1
def is_anusvaar(v):
    """True if the vector's basic type is anusvaar."""
    bit = v[PVIDX_BT_ANUSVAAR]
    return bit == 1
def is_misc(v):
    """True if the vector's basic type is miscellaneous."""
    bit = v[PVIDX_BT_MISC]
    return bit == 1
def is_dependent_vowel(v):
    """True for a dependent vowel (matra): a vowel with the dependent-status bit set."""
    if not is_vowel(v):
        return False
    return v[PVIDX_VSTAT_DEP] == 1
def is_plosive(v):
    """True for plosives: a consonant whose first consonant-type bit is set."""
    if not is_consonant(v):
        return False
    consonant_type = get_property_vector(v, "consonant_type")
    return consonant_type[0] == 1
### Binary operations on phonetic vectors | |
def or_vectors(v1, v2):
    """
    Elementwise OR of two equal-length binary phonetic vectors.

    Returns an integer array with 1 wherever either input has a 1.
    """
    # Vectorized replacement for the per-element Python loop
    # ([1 if (b1 + b2) >= 1 else 0 ...]); same values and int dtype.
    return ((np.asarray(v1) + np.asarray(v2)) >= 1).astype(int)
def xor_vectors(v1, v2):
    """
    Elementwise XOR of two equal-length binary phonetic vectors.

    Returns an integer array with 1 wherever the inputs differ.
    """
    # Vectorized replacement for the per-element Python loop
    # ([1 if b1 != b2 else 0 ...]); same values and int dtype.
    return (np.asarray(v1) != np.asarray(v2)).astype(int)
### Getting properties from phonetic vectors | |
def get_property_vector(v, prop_name):
    """Slice of phonetic vector `v` covering the bits of property `prop_name`."""
    start, end = PV_PROP_RANGES[prop_name]
    return v[start:end]
def get_property_value(v, prop_name):
    """
    Integer encoded by the bits of property `prop_name` in vector `v`,
    reading the first bit of the property slice as most significant.
    """
    bits = get_property_vector(v, prop_name).tolist()
    # Horner-style accumulation in integer arithmetic. The original built the
    # same value with a float place-value multiplier (c * 2.0) and shadowed
    # the parameter `v` with its accumulator.
    value = 0
    for bit in bits:
        value = value * 2 + int(bit)
    return value
def lcsr_indic(srcw, tgtw, slang, tlang):
    """
    compute the Longest Common Subsequence Ratio (LCSR) between two strings at the character level.
    This works for Indic scripts by mapping both languages to a common script

    srcw: source language string
    tgtw: target language string
    slang: source language
    tlang: target language
    """
    # Standard LCS dynamic program: score_mat[i, j] is the LCS length of
    # srcw[:i] and tgtw[:j]; row/column 0 stay zero as the empty-prefix base.
    score_mat = np.zeros((len(srcw) + 1, len(tgtw) + 1))
    for si, sc in enumerate(srcw, 1):
        for ti, tc in enumerate(tgtw, 1):
            # Compare via script offsets so the "same" character matches
            # across two different Indic scripts.
            so = get_offset(sc, slang)
            to = get_offset(tc, tlang)
            if (
                in_coordinated_range_offset(so)
                and in_coordinated_range_offset(to)
                and so == to
            ):
                # Both offsets in the coordinated range and equal: a match.
                score_mat[si, ti] = score_mat[si - 1, ti - 1] + 1.0
            elif (
                not (in_coordinated_range_offset(so) or in_coordinated_range_offset(to))
                and sc == tc
            ):
                # Neither offset is coordinated (e.g. shared punctuation):
                # fall back to exact codepoint equality.
                score_mat[si, ti] = score_mat[si - 1, ti - 1] + 1.0
            else:
                score_mat[si, ti] = max(score_mat[si, ti - 1], score_mat[si - 1, ti])
    # (LCSR = LCS length / longer string length, len(srcw), len(tgtw))
    return (
        score_mat[-1, -1] / float(max(len(srcw), len(tgtw))),
        float(len(srcw)),
        float(len(tgtw)),
    )
def lcsr_any(srcw, tgtw):
    """
    LCSR computation if both languages have the same script
    """
    # dp[i, j] holds the LCS length of srcw[:i] and tgtw[:j].
    dp = np.zeros((len(srcw) + 1, len(tgtw) + 1))
    for row, s_char in enumerate(srcw, 1):
        for col, t_char in enumerate(tgtw, 1):
            if s_char == t_char:
                dp[row, col] = dp[row - 1, col - 1] + 1.0
            else:
                dp[row, col] = max(dp[row, col - 1], dp[row - 1, col])
    longer = float(max(len(srcw), len(tgtw)))
    return (dp[-1, -1] / longer, float(len(srcw)), float(len(tgtw)))
def lcsr(srcw, tgtw, slang, tlang):
    """
    compute the Longest Common Subsequence Ratio (LCSR) between two strings at the character level.

    srcw: source language string
    tgtw: target language string
    slang: source language
    tlang: target language
    """
    if (
        slang == tlang
        or not is_supported_language(slang)
        or not is_supported_language(tlang)
    ):
        # Same script (or unsupported codes): plain codepoint-level LCSR.
        # BUG FIX: the two calls below previously had their argument lists
        # swapped (lcsr_any was given 4 args, lcsr_indic only 2), so every
        # invocation raised a TypeError.
        return lcsr_any(srcw, tgtw)
    else:
        # Different scripts: compare via the common offset space.
        return lcsr_indic(srcw, tgtw, slang, tlang)