Spaces:

ILAD
/

rhg-script-converter-ui

Running

App Files Files Community

rhg-script-converter-ui / epitran /backoff.py

micahg

Initial file upload

609216a 7 months ago

raw

history blame contribute delete

No virus

3.14 kB

	# -*- coding: utf-8 -*-
	from __future__ import (print_function, absolute_import,
	unicode_literals)

	import regex as re
	from . import _epitran
	import panphon.featuretable
	from epitran.puncnorm import PuncNorm
	from epitran.xsampa import XSampa
	from epitran.stripdiacritics import StripDiacritics


	class Backoff(object):
	    """Implements rudimentary language ID and backoff.

	    A token is consumed from left to right. At each position every
	    configured language is tried in priority order: text matched by a
	    language's ``epi.regexp`` is handed to that language's
	    ``transliterate``; otherwise a run of Unicode number characters is
	    copied through unchanged; otherwise a single character is copied
	    through unchanged.
	    """

	    def __init__(self, lang_script_codes, cedict_file=None):
	        """Construct a Backoff object.

	        Args:
	            lang_script_codes (list): codes for languages to try, starting
	                                      with the highest priority languages
	            cedict_file (str): path to the CC-CEdict dictionary file
	                               (necessary only when cmn-Hans or cmn-Hant
	                               are used)
	        """
	        self.langs = [_epitran.Epitran(c, cedict_file=cedict_file)
	                      for c in lang_script_codes]
	        # Compiled once here and reused by transliterate() and
	        # xsampa_list() instead of re-specifying the pattern per call.
	        self.num_re = re.compile(r'\p{Number}+')
	        self.ft = panphon.featuretable.FeatureTable()
	        self.xsampa = XSampa()
	        self.puncnorm = PuncNorm()
	        # One diacritic stripper per language, index-aligned with self.langs.
	        self.dias = [StripDiacritics(c) for c in lang_script_codes]

	    def transliterate(self, token):
	        """Return IPA transliteration given by first acceptable mode.

	        Args:
	            token (unicode): orthographic text

	        Returns:
	            unicode: transliteration as Unicode IPA string; number runs and
	                     characters matched by no language are passed through
	                     unchanged
	        """
	        tr_list = []
	        while token:
	            is_outside_lang = True
	            for dia, lang in zip(self.dias, self.langs):
	                source = ''
	                while True:
	                    m = lang.epi.regexp.match(dia.process(token))
	                    # A zero-length match consumes nothing, so it is
	                    # treated as "no match"; otherwise the token would
	                    # never shrink and this loop would never terminate.
	                    if not m or not m.group():
	                        break
	                    s = m.group()
	                    # NOTE(review): len(s) is measured on the output of
	                    # dia.process() but applied to the raw token; if
	                    # processing changes the length, this slice may be
	                    # misaligned -- confirm against StripDiacritics.
	                    token = token[len(s):]
	                    source += s
	                    is_outside_lang = False
	                # Called even when nothing matched (source == ''), as
	                # before; presumably this yields an empty string.
	                tr_list.append(lang.transliterate(source))
	            if is_outside_lang:
	                # No language claimed the head of the token: pass through
	                # a whole number run, or else exactly one character.
	                m = self.num_re.match(token)
	                if m:
	                    source = m.group()
	                    tr_list.append(source)
	                    token = token[len(source):]
	                else:
	                    tr_list.append(token[0])
	                    token = token[1:]
	        return ''.join(tr_list)

	    def trans_list(self, token):
	        """Transliterate/transcribe a word into list of IPA phonemes.

	        Args:
	            token (unicode): word to transcribe; unicode string

	        Returns:
	            list: list of IPA unicode strings, each corresponding to a
	                  segment
	        """
	        return self.ft.segs_safe(self.transliterate(token))

	    def xsampa_list(self, token):
	        """Transcribe a word into a list of X-SAMPA phonemes.

	        Args:
	            token (unicode): word to transcribe; unicode strings

	        Returns:
	            list: list of X-SAMPA strings, each corresponding to a segment;
	                  an empty list when the token consists only of number
	                  characters
	        """
	        if self.num_re.fullmatch(token):
	            # Return an empty list (not an empty string) so the return
	            # type is the same on every path.
	            return []
	        ipa_segs = self.ft.ipa_segs(self.transliterate(token))
	        return [self.xsampa.ipa2xs(seg) for seg in ipa_segs]