Spaces:

Rajendransp133
/

microservice-NMT

Sleeping

App Files Files Community

microservice-NMT / libs /indic_nlp_library /indicnlp /script /english_script.py

Rajendransp133

Upload 86 files

ac901c7 verified 3 months ago

raw

history blame contribute delete

4.09 kB

	#
	# Copyright (c) 2013-present, Anoop Kunchukuttan
	# All rights reserved.
	#
	# This source code is licensed under the MIT license found in the
	# LICENSE file in the root directory of this source tree.
	#

	import os
	import pandas as pd
	import numpy as np

	from indicnlp import common


	#### Lookup tables between ARPABET phoneme symbols and internal ids.
	#### Both start out empty; init() fills them from the ARPABET list file.
	#### ARPABET_ID_MAP: symbol -> id, ID_ARPABET_MAP: id -> symbol.
	ARPABET_ID_MAP, ID_ARPABET_MAP = {}, {}


	###
	# Phonetic information about script characters
	###

	# Phonetic data and phonetic vectors for English.
	# Both stay None until init() loads them from the resource CSV.
	ENGLISH_PHONETIC_DATA = ENGLISH_PHONETIC_VECTORS = None

	# Length of a phonetic vector; init() overwrites this with the number of
	# vector columns it actually loaded.
	PHONETIC_VECTOR_LENGTH = 38

	# Start offset of the phonetic feature vector within a phonetic data row.
	PHONETIC_VECTOR_START_OFFSET = 6

	## Phonetic properties, in the order in which they occur in the vector.
	## Keep this list in sync with the keys of the PV_PROP_RANGES dictionary.
	PV_PROP = [
	    "basic_type", "vowel_length", "vowel_strength",
	    "vowel_status", "consonant_type", "articulation_place",
	    "aspiration", "voicing", "nasalization",
	    "vowel_horizontal", "vowel_vertical", "vowel_roundness",
	]

	###
	# Bit vector ranges for the phonetic properties.
	# Each value is a [start, end] pair of vector indexes; every end equals the
	# start of the next property, so the end index reads as exclusive.
	###

	PV_PROP_RANGES = dict(
	    basic_type=[0, 6],
	    vowel_length=[6, 8],
	    vowel_strength=[8, 11],
	    vowel_status=[11, 13],
	    consonant_type=[13, 18],
	    articulation_place=[18, 23],
	    aspiration=[23, 25],
	    voicing=[25, 27],
	    nasalization=[27, 29],
	    vowel_horizontal=[29, 32],
	    vowel_vertical=[32, 36],
	    vowel_roundness=[36, 38],
	)


	####
	# Indexes into the phonetic vector
	####
	# One index per basic-type bit, numbered consecutively from 0.
	(
	    PVIDX_BT_VOWEL,
	    PVIDX_BT_CONSONANT,
	    PVIDX_BT_NUKTA,
	    PVIDX_BT_HALANT,
	    PVIDX_BT_ANUSVAAR,
	    PVIDX_BT_MISC,
	) = range(6)
	# Start (first index) and end (one past the last index) of the basic-type bits.
	PVIDX_BT_S = PVIDX_BT_VOWEL
	PVIDX_BT_E = PVIDX_BT_MISC + 1

	# Index 12 lies inside the "vowel_status" range [11, 13] of PV_PROP_RANGES.
	PVIDX_VSTAT_DEP = 12

	####
	# Code point range used to encode phonemes as single characters:
	# phoneme_to_enc() adds SCRIPT_RANGE_START to a phoneme id and
	# enc_to_offset() subtracts it again.
	SCRIPT_RANGE_START = 0x0D00
	## TBD
	SCRIPT_RANGE_END = SCRIPT_RANGE_START + 0x2E


	def init():
	    """
	    Load the English phonetic resources into the module-level globals.

	    To be called by library loader, do not call it in your program.

	    Reads ``script/english_script_phonetic_data.csv`` under the resources
	    path into ENGLISH_PHONETIC_DATA, takes the columns from
	    PHONETIC_VECTOR_START_OFFSET onwards as ENGLISH_PHONETIC_VECTORS, and
	    sets PHONETIC_VECTOR_LENGTH to the number of those columns. Then reads
	    ``script/english_arpabet_list.csv`` line by line and registers each
	    stripped line in ARPABET_ID_MAP / ID_ARPABET_MAP, using the zero-based
	    line number as the internal id.
	    """
	    global ENGLISH_PHONETIC_DATA, ENGLISH_PHONETIC_VECTORS
	    global PHONETIC_VECTOR_LENGTH, PHONETIC_VECTOR_START_OFFSET
	    global ARPABET_ID_MAP, ID_ARPABET_MAP

	    ### Load the phonetic data table and slice out the feature vectors
	    phonetic_csv = os.path.join(
	        common.get_resources_path(), "script", "english_script_phonetic_data.csv"
	    )
	    ENGLISH_PHONETIC_DATA = pd.read_csv(phonetic_csv, encoding="utf-8")

	    vector_columns = ENGLISH_PHONETIC_DATA.iloc[:, PHONETIC_VECTOR_START_OFFSET:]
	    ENGLISH_PHONETIC_VECTORS = vector_columns.values
	    PHONETIC_VECTOR_LENGTH = ENGLISH_PHONETIC_VECTORS.shape[1]

	    ### Load mapping from ARPABET representation of phoneme to internal ID
	    arpabet_csv = os.path.join(
	        common.get_resources_path(), "script", "english_arpabet_list.csv"
	    )
	    with open(arpabet_csv, "r", encoding="utf-8") as listing:
	        for index, line in enumerate(listing):
	            symbol = line.strip()
	            ARPABET_ID_MAP[symbol] = index
	            ID_ARPABET_MAP[index] = symbol


	def phoneme_to_offset(ph):
	    """Return the internal id stored in ARPABET_ID_MAP for the phoneme ``ph``.

	    Raises KeyError when ``ph`` is not a key of ARPABET_ID_MAP.
	    """
	    internal_id = ARPABET_ID_MAP[ph]
	    return internal_id


	def offset_to_phoneme(ph_id):
	    """Return the ARPABET symbol stored in ID_ARPABET_MAP for the id ``ph_id``.

	    Raises KeyError when ``ph_id`` is not a key of ID_ARPABET_MAP.
	    """
	    symbol = ID_ARPABET_MAP[ph_id]
	    return symbol


	def phoneme_to_enc(ph):
	    """Encode the ARPABET phoneme ``ph`` as a single character.

	    The character's code point is SCRIPT_RANGE_START plus the phoneme's
	    internal id as returned by phoneme_to_offset().
	    """
	    code_point = SCRIPT_RANGE_START + phoneme_to_offset(ph)
	    return chr(code_point)


	def enc_to_phoneme(ph):
	    """Decode the encoded character ``ph`` back to its ARPABET symbol.

	    Converts the character to an internal id with enc_to_offset() and looks
	    that id up with offset_to_phoneme().
	    """
	    internal_id = enc_to_offset(ph)
	    return offset_to_phoneme(internal_id)


	def enc_to_offset(c):
	    """Return the code point of character ``c`` relative to SCRIPT_RANGE_START."""
	    code_point = ord(c)
	    return code_point - SCRIPT_RANGE_START


	def in_range(offset):
	    """Return True when SCRIPT_RANGE_START <= ``offset`` < SCRIPT_RANGE_END.

	    NOTE(review): despite the parameter name, the bounds are the absolute
	    SCRIPT_RANGE_START/SCRIPT_RANGE_END values, not offsets relative to
	    SCRIPT_RANGE_START; callers need to pass a value on that scale.
	    """
	    return SCRIPT_RANGE_START <= offset < SCRIPT_RANGE_END


	def get_phonetic_info(lang):
	    """Return the ``(ENGLISH_PHONETIC_DATA, ENGLISH_PHONETIC_VECTORS)`` pair.

	    ``lang`` is accepted but not used; the English tables are always returned.
	    """
	    info = ENGLISH_PHONETIC_DATA, ENGLISH_PHONETIC_VECTORS
	    return info


	def invalid_vector():
	    """Return a new all-zero NumPy array of PHONETIC_VECTOR_LENGTH entries."""
	    ## TODO: check if np datatype is correct?
	    zeros = [0] * PHONETIC_VECTOR_LENGTH
	    return np.array(zeros)


	def get_phonetic_feature_vector(p, lang):
	    """Return the phonetic feature vector for the encoded character ``p``.

	    Args:
	        p: a single character, as produced by phoneme_to_enc().
	        lang: forwarded to get_phonetic_info().

	    Returns:
	        The row of the phonetic vector table for ``p``, or invalid_vector()
	        when ``p`` lies outside the script range, has no row in the loaded
	        table, or is flagged with "Valid Vector Representation" == 0.
	    """
	    # in_range() compares against the absolute SCRIPT_RANGE_START/END code
	    # points, so it must receive the code point itself. Passing the relative
	    # offset (as was done before) rejected every character in the range.
	    if not in_range(ord(p)):
	        return invalid_vector()

	    offset = enc_to_offset(p)
	    phonetic_data, phonetic_vectors = get_phonetic_info(lang)

	    # SCRIPT_RANGE_END is marked TBD, so an in-range code point may still
	    # have no row in the loaded table; treat that as invalid, not a crash.
	    if offset >= len(phonetic_data):
	        return invalid_vector()

	    if phonetic_data.iloc[offset]["Valid Vector Representation"] == 0:
	        return invalid_vector()

	    return phonetic_vectors[offset]