Spaces:

ILAD
/

rhg-script-converter-ui

Sleeping

App Files Files Community

rhg-script-converter-ui / epitran /epihan.py

micahg

Initial file upload

609216a 11 months ago

raw

history blame

4.61 kB

	# -*- coding: utf-8 -*-
	from __future__ import print_function, unicode_literals, division, absolute_import

	import os.path

	import pkg_resources
	import regex as re

	from . import cedict
	from . import rules
	from . import download
	from epitran.ligaturize import ligaturize


	class MissingData(Exception):
	    """Signal that data this module needs (the CC-CEDICT file) is missing."""


	class Epihan(object):
	    """Transcribe Chinese text to IPA by way of CC-CEDICT pinyin.

	    Text is tokenized with a ``cedict.CEDictTrie``; tokens found in its
	    ``hanzi`` table are mapped to pinyin and then rewritten to IPA with a
	    ``rules.Rules`` object. Any other token is passed through unchanged
	    (optionally with its punctuation normalized).
	    """

	    # Pairs of (full-width/CJK punctuation mark, Western replacement) used
	    # by `normalize_punc`.
	    punc = [(u'\uff0c', u','),
	            (u'\uff01', u'!'),
	            (u'\uff1f', u'?'),
	            (u'\uff1b', u';'),
	            (u'\uff1a', u':'),
	            (u'\uff08', u'('),
	            (u'\uff09', u')'),
	            (u'\uff3b', u'['),
	            (u'\uff3d', u']'),
	            (u'\u3010', u'['),
	            (u'\u3011', u']'),
	            ]

	    # Class-level default so that instances built without running this
	    # class's __init__ (e.g. by a subclass with its own constructor) can
	    # still be used with `transliterate`.
	    ligatures = False

	    def __init__(self, ligatures=False, cedict_file=None,
	                 rules_file='pinyin-to-ipa.txt', tones=False):
	        """Construct epitran object for Chinese

	        Args:
	            ligatures (bool): if True, use ligatures instead of standard IPA
	            cedict_file (str): path to CC-CEDict dictionary file
	            rules_file (str): name of file with rules for converting pinyin to
	                              IPA
	            tones (bool): if True, output tones as Chao tone numbers; overrides
	                          `rules_file`

	        Raises:
	            MissingData: if no `cedict_file` is given and
	                         `download.cedict_exists()` reports that none is
	                         available
	        """
	        # Remember the constructor-level preference; `transliterate` combines
	        # it with its own per-call `ligatures` argument.
	        self.ligatures = ligatures
	        # If no cedict_file is specified, fall back to the downloaded copy
	        # or raise an error when there is none.
	        if not cedict_file:
	            if download.cedict_exists():
	                cedict_file = download.get_cedict_file()
	            else:
	                raise MissingData('Download CC-CEDICT with "epitran.download.cedict()".')
	        if tones:
	            # The tone-aware rule set replaces whatever `rules_file` named.
	            rules_file = os.path.join('data', 'rules', 'pinyin-to-ipa-tones.txt')
	        else:
	            rules_file = os.path.join('data', 'rules', rules_file)
	        # Resolve the package-relative rule path to a filesystem path.
	        rules_file = pkg_resources.resource_filename(__name__, rules_file)
	        self.cedict = cedict.CEDictTrie(cedict_file)
	        self.rules = rules.Rules([rules_file])
	        self.regexp = re.compile(r'\p{Han}')

	    def normalize_punc(self, text):
	        """Normalize punctuation in a string

	        Args:
	            text (unicode): an orthographic string

	        Return:
	            unicode: an orthographic string with punctuation normalized to
	                     Western equivalents
	        """
	        for cjk_mark, western_mark in self.punc:
	            text = text.replace(cjk_mark, western_mark)
	        return text

	    def transliterate(self, text, normpunc=False, ligatures=False):
	        """Transliterates/transcribes a text into IPA

	        Args:
	            text (str): text to transcribe; Unicode string
	            normpunc (bool): normalize punctuation
	            ligatures (bool): use precomposed ligatures instead of standard
	                              IPA; ligatures are also used when the object
	                              was constructed with `ligatures=True`

	        Returns:
	            str: Unicode IPA string
	        """
	        use_ligatures = ligatures or self.ligatures
	        hanzi = self.cedict.hanzi
	        ipa_tokens = []
	        for token in self.cedict.tokenize(text):
	            if token in hanzi:
	                # Only the first element of the entry (the pinyin pieces)
	                # is used; the second is ignored.
	                (pinyin, _) = hanzi[token]
	                pinyin = u''.join(pinyin).lower()
	                ipa = self.rules.apply(pinyin)
	                # Commas in the rule output are dropped.
	                ipa_tokens.append(ipa.replace(u',', u''))
	            else:
	                # Token unknown to the dictionary: keep it as is.
	                if normpunc:
	                    token = self.normalize_punc(token)
	                ipa_tokens.append(token)
	        if use_ligatures:
	            ipa_tokens = [ligaturize(token) for token in ipa_tokens]
	        return u''.join(ipa_tokens)

	    def strict_trans(self, text, normpunc=False, ligatures=False):
	        """Transcribe `text` into IPA; delegates directly to `transliterate`

	        Args:
	            text (str): text to transcribe; Unicode string
	            normpunc (bool): normalize punctuation
	            ligatures (bool): use precomposed ligatures instead of standard IPA

	        Returns:
	            str: Unicode IPA string
	        """
	        return self.transliterate(text, normpunc, ligatures)


	class EpihanTraditional(Epihan):
	    """Variant of `Epihan` whose dictionary trie is built with
	    ``traditional=True``.

	    Only the constructor differs; every other method is inherited.
	    """

	    def __init__(self, ligatures=False, cedict_file=None, tones=False, rules_file='pinyin-to-ipa.txt'):
	        """Construct epitran object for Traditional Chinese

	        This constructor does not call the parent constructor; it sets up
	        the instance attributes itself.

	        Args:
	            ligatures (bool): if True, use ligatures instead of standard IPA;
	                              recorded on the instance as `self.ligatures`
	            cedict_file (str): path to CC-CEDict dictionary file
	            tones (bool): if True, use the tone-aware rule file
	                          'pinyin-to-ipa-tones.txt'; overrides `rules_file`
	            rules_file (str): name of file with rules for converting pinyin to
	                              IPA

	        Raises:
	            MissingData: if no `cedict_file` is given and
	                         `download.cedict_exists()` reports that none is
	                         available
	        """
	        # Keep the caller's preference instead of silently discarding it.
	        self.ligatures = ligatures
	        # Without an explicit dictionary path, fall back to the downloaded
	        # copy or fail with instructions for obtaining it.
	        if not cedict_file:
	            if download.cedict_exists():
	                cedict_file = download.get_cedict_file()
	            else:
	                raise MissingData('Download CC-CEDICT with "epitran.download.cedict()".')
	        if tones:
	            rules_file = os.path.join('data', 'rules', 'pinyin-to-ipa-tones.txt')
	        else:
	            rules_file = os.path.join('data', 'rules', rules_file)
	        # Resolve the package-relative rule path to a filesystem path.
	        rules_file = pkg_resources.resource_filename(__name__, rules_file)
	        self.cedict = cedict.CEDictTrie(cedict_file, traditional=True)
	        self.rules = rules.Rules([rules_file])
	        self.regexp = re.compile(r'\p{Han}')