# | |
# Copyright (c) 2013-present, Anoop Kunchukuttan | |
# All rights reserved. | |
# | |
# This source code is licensed under the MIT license found in the | |
# LICENSE file in the root directory of this source tree. | |
# | |
import pandas as pd | |
import numpy as np | |
import os | |
from indicnlp import common | |
from indicnlp.common import IndicNlpException | |
from indicnlp import langinfo as li | |
###
# Phonetic Information about script characters
###

""" Phonetic data about all languages except Tamil """
ALL_PHONETIC_DATA = None  # pandas DataFrame, populated by init()

""" Phonetic data for Tamil """
TAMIL_PHONETIC_DATA = None  # pandas DataFrame, populated by init()

""" Phonetic vector for all languages except Tamil """
ALL_PHONETIC_VECTORS = None  # numpy array, populated by init()

""" Phonetic vector for Tamil """
TAMIL_PHONETIC_VECTORS = None  # numpy array, populated by init()

""" Length of phonetic vector """
PHONETIC_VECTOR_LENGTH = 38  # default; recomputed in init() from the loaded CSV

""" Start offset for the phonetic feature vector in the phonetic data vector """
PHONETIC_VECTOR_START_OFFSET = 6  # columns before this index are metadata, not feature bits

## PHONETIC PROPERTIES in order in which they occur in the vector
## This list must be in sync with the keys in the PV_PROP_RANGES dictionary
PV_PROP = [
    "basic_type",
    "vowel_length",
    "vowel_strength",
    "vowel_status",
    "consonant_type",
    "articulation_place",
    "aspiration",
    "voicing",
    "nasalization",
    "vowel_horizontal",
    "vowel_vertical",
    "vowel_roundness",
]

###
# Bit vector ranges for various properties
###
# Each value is a [start, end) half-open slice into the phonetic feature vector.
PV_PROP_RANGES = {
    "basic_type": [0, 6],
    "vowel_length": [6, 8],
    "vowel_strength": [8, 11],
    "vowel_status": [11, 13],
    "consonant_type": [13, 18],
    "articulation_place": [18, 23],
    "aspiration": [23, 25],
    "voicing": [25, 27],
    "nasalization": [27, 29],
    "vowel_horizontal": [29, 32],
    "vowel_vertical": [32, 36],
    "vowel_roundness": [36, 38],
}

####
# Indexes into the Phonetic Vector
####
# Positions of the one-hot "basic type" bits (first segment of the vector).
PVIDX_BT_VOWEL = 0
PVIDX_BT_CONSONANT = 1
PVIDX_BT_NUKTA = 2
PVIDX_BT_HALANT = 3
PVIDX_BT_ANUSVAAR = 4
PVIDX_BT_MISC = 5
# [start, end) bounds of the basic-type segment.
PVIDX_BT_S = PVIDX_BT_VOWEL
PVIDX_BT_E = PVIDX_BT_MISC + 1

# Bit flagging a dependent vowel (matra); lies inside the vowel_status range [11, 13).
PVIDX_VSTAT_DEP = 12

#####
# Unicode information about characters
#####

SCRIPT_OFFSET_START = 0
SCRIPT_OFFSET_RANGE = 0x80  # each Brahmi-derived script block spans 0x80 codepoints
def init():
    """
    To be called by library loader, do not call it in your program
    """
    global ALL_PHONETIC_DATA, ALL_PHONETIC_VECTORS, TAMIL_PHONETIC_DATA, TAMIL_PHONETIC_VECTORS, PHONETIC_VECTOR_LENGTH, PHONETIC_VECTOR_START_OFFSET

    def _load_script_csv(filename):
        # Read one phonetic-data table from the resources/script directory.
        return pd.read_csv(
            os.path.join(common.get_resources_path(), "script", filename),
            encoding="utf-8",
        )

    ALL_PHONETIC_DATA = _load_script_csv("all_script_phonetic_data.csv")
    TAMIL_PHONETIC_DATA = _load_script_csv("tamil_script_phonetic_data.csv")

    # The feature-bit columns start after the metadata columns.
    ALL_PHONETIC_VECTORS = ALL_PHONETIC_DATA.iloc[:, PHONETIC_VECTOR_START_OFFSET:].values
    TAMIL_PHONETIC_VECTORS = TAMIL_PHONETIC_DATA.iloc[:, PHONETIC_VECTOR_START_OFFSET:].values

    # Keep the recorded vector length in sync with the loaded data.
    PHONETIC_VECTOR_LENGTH = ALL_PHONETIC_VECTORS.shape[1]
def is_supported_language(lang):
    """Return True if `lang` has a script range defined (i.e. is supported)."""
    # Membership test directly on the dict; materializing list(keys()) was
    # an O(n) anti-idiom with identical semantics.
    return lang in li.SCRIPT_RANGES
def get_offset(c, lang):
    """
    Offset of character `c` within the Unicode block of `lang`'s script.

    Raises IndicNlpException if `lang` is not supported.
    """
    if not is_supported_language(lang):
        raise IndicNlpException("Language {} not supported".format(lang))
    block_start = li.SCRIPT_RANGES[lang][0]
    return ord(c) - block_start
def offset_to_char(off, lang):
    """
    Map a script offset back to the character in `lang`'s Unicode block.

    Applicable to Brahmi derived Indic scripts
    """
    if not is_supported_language(lang):
        raise IndicNlpException("Language {} not supported".format(lang))
    block_start = li.SCRIPT_RANGES[lang][0]
    return chr(block_start + off)
def is_indiclang_char(c, lang):
    """
    Applicable to Brahmi derived Indic scripts
    Note that DANDA and DOUBLE_DANDA have the same Unicode codepoint for all Indic scripts
    """
    if not is_supported_language(lang):
        raise IndicNlpException("Language {} not supported".format(lang))
    offset = get_offset(c, lang)
    # Inside the script's own 0x80-wide block?
    if SCRIPT_OFFSET_START <= offset < SCRIPT_OFFSET_RANGE:
        return True
    # DANDA / DOUBLE_DANDA live outside the per-script blocks but are shared
    # punctuation across Indic scripts.
    codepoint = ord(c)
    return codepoint == li.DANDA or codepoint == li.DOUBLE_DANDA
def in_coordinated_range_offset(c_offset):
    """
    Applicable to Brahmi derived Indic scripts
    """
    # Chained comparison over the inclusive coordinated range bounds.
    return (
        li.COORDINATED_RANGE_START_INCLUSIVE
        <= c_offset
        <= li.COORDINATED_RANGE_END_INCLUSIVE
    )
def in_coordinated_range(c, lang):
    """True if character `c` lies in the coordinated range of `lang`'s script."""
    if not is_supported_language(lang):
        raise IndicNlpException("Language {} not supported".format(lang))
    offset = get_offset(c, lang)
    return in_coordinated_range_offset(offset)
def get_phonetic_info(lang):
    """
    Return the (phonetic_data, phonetic_vectors) pair for `lang`.

    Tamil has its own tables; all other supported languages share one set.
    """
    if not is_supported_language(lang):
        raise IndicNlpException("Language {} not supported".format(lang))
    if lang == li.LC_TA:
        return (TAMIL_PHONETIC_DATA, TAMIL_PHONETIC_VECTORS)
    return (ALL_PHONETIC_DATA, ALL_PHONETIC_VECTORS)
def invalid_vector():
    """
    Return an all-zero phonetic vector, used for characters that have no
    valid phonetic representation.
    """
    # Resolves the old "check if np datatype is correct?" TODO: zeros with
    # dtype=int matches the dtype of np.array([0] * n) without the temp list.
    return np.zeros(PHONETIC_VECTOR_LENGTH, dtype=int)
def get_phonetic_feature_vector(c, lang):
    """
    Phonetic feature vector for character `c` in language `lang`.

    Returns the all-zero invalid vector when the character lies outside the
    coordinated range or its data row is flagged as having no valid
    vector representation.
    """
    # Identical logic to the offset-based variant; delegate instead of
    # duplicating the range/validity checks.
    return get_phonetic_feature_vector_offset(get_offset(c, lang), lang)
def get_phonetic_feature_vector_offset(offset, lang):
    """
    Phonetic feature vector for a script `offset` in language `lang`.

    Returns the all-zero invalid vector when the offset is outside the
    coordinated range or the row is flagged as not validly representable.
    """
    if not in_coordinated_range_offset(offset):
        return invalid_vector()
    phonetic_data, phonetic_vectors = get_phonetic_info(lang)
    row = phonetic_data.iloc[offset]
    if row["Valid Vector Representation"] == 0:
        return invalid_vector()
    return phonetic_vectors[offset]
### Unary operations on vectors | |
def is_valid(v):
    """A phonetic vector is valid iff at least one feature bit is set."""
    total = np.sum(v)
    return total > 0
def is_vowel(v):
    """True if the vector's basic type is vowel."""
    bit = v[PVIDX_BT_VOWEL]
    return bit == 1
def is_consonant(v):
    """True if the vector's basic type is consonant."""
    bit = v[PVIDX_BT_CONSONANT]
    return bit == 1
def is_halant(v):
    """True if the vector's basic type is halant (virama)."""
    bit = v[PVIDX_BT_HALANT]
    return bit == 1
def is_nukta(v):
    """True if the vector's basic type is nukta."""
    bit = v[PVIDX_BT_NUKTA]
    return bit == 1
def is_anusvaar(v):
    """True if the vector's basic type is anusvaar."""
    bit = v[PVIDX_BT_ANUSVAAR]
    return bit == 1
def is_misc(v):
    """True if the vector's basic type is miscellaneous."""
    bit = v[PVIDX_BT_MISC]
    return bit == 1
def is_dependent_vowel(v):
    """True for a dependent vowel (matra): a vowel with the dependent-status bit set."""
    if not is_vowel(v):
        return False
    return v[PVIDX_VSTAT_DEP] == 1
def is_plosive(v):
    """True for plosives: a consonant whose first consonant-type bit is set."""
    if not is_consonant(v):
        return False
    consonant_type = get_property_vector(v, "consonant_type")
    return consonant_type[0] == 1
### Binary operations on phonetic vectors | |
def or_vectors(v1, v2):
    """
    Elementwise OR of two equal-length binary phonetic vectors.

    Returns an integer array with 1 wherever either input has a 1.
    """
    # Vectorized replacement for the per-element Python loop
    # ([1 if (b1 + b2) >= 1 else 0 ...]); same values and int dtype.
    return ((np.asarray(v1) + np.asarray(v2)) >= 1).astype(int)
def xor_vectors(v1, v2):
    """
    Elementwise XOR of two equal-length binary phonetic vectors.

    Returns an integer array with 1 wherever the inputs differ.
    """
    # Vectorized replacement for the per-element Python loop
    # ([1 if b1 != b2 else 0 ...]); same values and int dtype.
    return (np.asarray(v1) != np.asarray(v2)).astype(int)
### Getting properties from phonetic vectors | |
def get_property_vector(v, prop_name):
    """Slice of phonetic vector `v` covering the bits of property `prop_name`."""
    start, end = PV_PROP_RANGES[prop_name]
    return v[start:end]
def get_property_value(v, prop_name):
    """
    Integer encoded by the bits of property `prop_name` in vector `v`,
    reading the first bit of the property slice as most significant.
    """
    bits = get_property_vector(v, prop_name).tolist()
    # Horner-style accumulation in integer arithmetic. The original built the
    # same value with a float place-value multiplier (c * 2.0) and shadowed
    # the parameter `v` with its accumulator.
    value = 0
    for bit in bits:
        value = value * 2 + int(bit)
    return value
def lcsr_indic(srcw, tgtw, slang, tlang):
    """
    compute the Longest Common Subsequence Ratio (LCSR) between two strings at the character level.
    This works for Indic scripts by mapping both languages to a common script

    srcw: source language string
    tgtw: target language string
    slang: source language
    tlang: target language
    """
    # Standard LCS dynamic program: score_mat[i, j] is the LCS length of
    # srcw[:i] and tgtw[:j]; row/column 0 stay zero as the empty-prefix base.
    score_mat = np.zeros((len(srcw) + 1, len(tgtw) + 1))
    for si, sc in enumerate(srcw, 1):
        for ti, tc in enumerate(tgtw, 1):
            # Compare via script offsets so the "same" character matches
            # across two different Indic scripts.
            so = get_offset(sc, slang)
            to = get_offset(tc, tlang)
            if (
                in_coordinated_range_offset(so)
                and in_coordinated_range_offset(to)
                and so == to
            ):
                # Both offsets in the coordinated range and equal: a match.
                score_mat[si, ti] = score_mat[si - 1, ti - 1] + 1.0
            elif (
                not (in_coordinated_range_offset(so) or in_coordinated_range_offset(to))
                and sc == tc
            ):
                # Neither offset is coordinated (e.g. shared punctuation):
                # fall back to exact codepoint equality.
                score_mat[si, ti] = score_mat[si - 1, ti - 1] + 1.0
            else:
                score_mat[si, ti] = max(score_mat[si, ti - 1], score_mat[si - 1, ti])
    # (LCSR = LCS length / longer string length, len(srcw), len(tgtw))
    return (
        score_mat[-1, -1] / float(max(len(srcw), len(tgtw))),
        float(len(srcw)),
        float(len(tgtw)),
    )
def lcsr_any(srcw, tgtw):
    """
    LCSR computation if both languages have the same script
    """
    # dp[i, j] holds the LCS length of srcw[:i] and tgtw[:j].
    dp = np.zeros((len(srcw) + 1, len(tgtw) + 1))
    for row, s_char in enumerate(srcw, 1):
        for col, t_char in enumerate(tgtw, 1):
            if s_char == t_char:
                dp[row, col] = dp[row - 1, col - 1] + 1.0
            else:
                dp[row, col] = max(dp[row, col - 1], dp[row - 1, col])
    longer = float(max(len(srcw), len(tgtw)))
    return (dp[-1, -1] / longer, float(len(srcw)), float(len(tgtw)))
def lcsr(srcw, tgtw, slang, tlang):
    """
    compute the Longest Common Subsequence Ratio (LCSR) between two strings at the character level.

    srcw: source language string
    tgtw: target language string
    slang: source language
    tlang: target language
    """
    if (
        slang == tlang
        or not is_supported_language(slang)
        or not is_supported_language(tlang)
    ):
        # Same script (or unsupported codes): plain codepoint-level LCSR.
        # BUG FIX: the two calls below previously had their argument lists
        # swapped (lcsr_any was given 4 args, lcsr_indic only 2), so every
        # invocation raised a TypeError.
        return lcsr_any(srcw, tgtw)
    else:
        # Different scripts: compare via the common offset space.
        return lcsr_indic(srcw, tgtw, slang, tlang)