Spaces:

ILAD
/

rhg-script-converter-ui

Sleeping

App Files Files Community

rhg-script-converter-ui / epitran /_epitran.py

micahg

Initial file upload

609216a 11 months ago

raw

history blame

6.44 kB

	# -*- coding: utf-8 -*-
	import logging
	from typing import Union

	import panphon.featuretable
	from epitran.epihan import Epihan, EpihanTraditional
	from epitran.flite import FliteLexLookup
	from epitran.puncnorm import PuncNorm
	from epitran.simple import SimpleEpitran
	from epitran.xsampa import XSampa

	# Package-level logger named 'epitran'. Its threshold is set to WARNING here,
	# so DEBUG and INFO records sent to this logger are dropped unless the level
	# is changed later.
	logger = logging.getLogger('epitran')
	logger.setLevel(logging.WARNING)

	class Epitran:
	    """Unified interface for IPA transliteration/transcription

	    An instance wraps one backend object (``self.epi``) and forwards the
	    transliteration calls to it. Codes listed in ``special`` get a dedicated
	    backend class; every other code is handled by ``SimpleEpitran``.

	    :param code str: ISO 639-3 plus "-" plus ISO 15924 code of the language/script pair that should be loaded
	    :param preproc bool: apply preprocessors
	    :param postproc bool: apply postprocessors
	    :param ligatures bool: use precomposed ligatures instead of standard IPA
	    :param cedict_file str: path to file containing the CC-CEDict dictionary
	    :param rev boolean: use reverse transliteration
	    :param rev_preproc bool: if True, apply preprocessors when reverse transliterating
	    :param rev_postproc bool: if True, apply postprocessors when reverse transliterating
	    :param tones bool: forwarded unchanged to the backend (meaning is defined by the backend)
	    """

	    # Language/script codes that need a dedicated backend class instead of
	    # the generic SimpleEpitran.
	    special = {'eng-Latn': FliteLexLookup,
	               'cmn-Hans': Epihan,
	               'cmn-Hant': EpihanTraditional}

	    def __init__(self, code: str, preproc: bool=True, postproc: bool=True, ligatures: bool=False,
	                 cedict_file: Union[str, None]=None, rev: bool=False,
	                 rev_preproc: bool=True, rev_postproc: bool=True, tones: bool=False):
	        """Constructor method

	        Selects the backend for `code` and creates the helper objects used by
	        the other methods (feature table, X-SAMPA converter, punctuation
	        normalizer).
	        """
	        if code in self.special:
	            # Special backends only receive `ligatures`, `cedict_file` and
	            # `tones`; the pre/post-processing and reverse options are not
	            # forwarded to them.
	            self.epi = self.special[code](ligatures=ligatures, cedict_file=cedict_file, tones=tones)
	        else:
	            self.epi = SimpleEpitran(code, preproc, postproc, ligatures, rev, rev_preproc, rev_postproc, tones=tones)
	        self.ft = panphon.featuretable.FeatureTable()
	        self.xsampa = XSampa()
	        self.puncnorm = PuncNorm()

	    def transliterate(self, word: str, normpunc: bool=False, ligatures: bool=False) -> str:
	        """Transliterates/transcribes a word into IPA

	        :param word str: word to transcribe
	        :param normpunc bool: if True, normalize punctuation
	        :param ligatures bool: if True, use precomposed ligatures instead of standard IPA
	        :return: An IPA string corresponding to the input orthographic string
	        :rtype: str
	        """
	        return self.epi.transliterate(word, normpunc, ligatures)

	    def reverse_transliterate(self, ipa: str) -> str:
	        """Reconstructs word from IPA. Does the reverse of transliterate()

	        :param ipa str: An IPA representation of a word
	        :return: An orthographic representation of the word
	        :rtype: str
	        """
	        return self.epi.reverse_transliterate(ipa)

	    def strict_trans(self, word: str, normpunc: bool=False, ligatures: bool=False) -> str:
	        """Transliterate a word into IPA, ignoring all characters that cannot be recognized.

	        :param word str: word to transcribe
	        :param normpunc bool, optional: if True, normalize punctuation
	        :param ligatures bool, optional: if True, use precomposed ligatures instead of standard IPA
	        :return: An IPA string corresponding to the input orthographic string, with all unconverted characters omitted
	        :rtype: str
	        """
	        return self.epi.strict_trans(word, normpunc, ligatures)

	    def trans_list(self, word: str, normpunc: bool=False, ligatures: bool=False) -> "list[str]":
	        """Transliterates/transcribes a word into list of IPA phonemes

	        The word is transliterated by the backend and the resulting string is
	        split into segments with the feature table's ``segs_safe``.

	        :param word str: word to transcribe
	        :param normpunc bool, optional: if True, normalize punctuation
	        :param ligatures bool, optional: if True, use precomposed ligatures instead of standard IPA
	        :return: list of IPA strings, each corresponding to a segment
	        :rtype: list[str]
	        """
	        return self.ft.segs_safe(self.epi.transliterate(word, normpunc, ligatures))

	    def trans_delimiter(self, text: str, delimiter: str=' ', normpunc: bool=False, ligatures: bool=False) -> str:
	        """Return IPA transliteration with a delimiter between segments

	        :param text str: An orthographic text
	        :param delimiter str, optional: A string to insert between segments
	        :param normpunc bool, optional: If True, normalize punctuation
	        :param ligatures bool, optional: If True, use precomposed ligatures instead of standard IPA
	        :return: String of IPA phonemes separated by `delimiter`
	        :rtype: str
	        """
	        return delimiter.join(self.trans_list(text, normpunc=normpunc,
	                                              ligatures=ligatures))

	    def xsampa_list(self, word: str, normpunc: bool=False, ligaturize: bool=False) -> "list[str]":
	        """Transliterates/transcribes a word as X-SAMPA

	        Uses the backend's ``strict_trans``, splits the result into IPA
	        segments, and converts each segment with ``XSampa.ipa2xs``.

	        :param word str: An orthographic word
	        :param normpunc bool, optional: If True, normalize punctuation
	        :param ligaturize bool, optional: If True, use precomposed ligatures instead of standard IPA
	        :return: List of X-SAMPA strings corresponding to `word`
	        :rtype: list[str]
	        """
	        ipa_segs = self.ft.ipa_segs(self.epi.strict_trans(word, normpunc,
	                                                          ligaturize))
	        return [self.xsampa.ipa2xs(seg) for seg in ipa_segs]

	    def word_to_tuples(self, word: str, normpunc: bool=False, _ligaturize: bool=False):
	        """Given a word, returns a list of tuples corresponding to IPA segments. The "feature
	        vectors" form a list consisting of (segment, vector) pairs.
	        For IPA segments, segment is a substring of phonetic_form such that the
	        concatenation of all segments in the list is equal to the phonetic_form.
	        The vectors are a sequence of integers drawn from the set {-1, 0, 1}
	        where -1 corresponds to '-', 0 corresponds to '0', and 1 corresponds to
	        '+'.

	        :param word str: An orthographic word
	        :param normpunc bool, optional: If True, normalize punctuation
	        :param _ligaturize bool, optional: Unused; kept so the signature stays compatible
	        :return: A list of tuples corresponding to IPA segments
	        :rtype: list[tuple[str, str, str, str, list[int]]]
	        :raises AttributeError: if the backend does not provide ``word_to_tuples``
	        """
	        # Only the attribute lookup is guarded: an AttributeError raised
	        # *inside* a backend's implementation is a genuine bug and must not be
	        # reported as "not yet implemented".
	        try:
	            backend_method = self.epi.word_to_tuples
	        except AttributeError as err:
	            # Chain to the caught exception instance (not the exception class)
	            # so the traceback keeps the original cause.
	            raise AttributeError('Method word_to_tuples not yet implemented for this language-script pair!') from err
	        return backend_method(word, normpunc)