# -*- coding: utf-8 -*-
from __future__ import (absolute_import, division, print_function,
unicode_literals)
import logging
import os.path
import string
import sys
import unicodedata
import pkg_resources
import regex as re
import panphon
import unicodecsv as csv
from epitran.ligaturize import ligaturize
from epitran.puncnorm import PuncNorm
if os.name == 'posix' and sys.version_info[0] < 3:
import subprocess32 as subprocess
else:
import subprocess
# Suppress all but critical log output by default; applications that import
# this module can reconfigure the 'epitran' logger afterwards.
logging.basicConfig(level=logging.CRITICAL)
logger = logging.getLogger('epitran')
if sys.version_info[0] == 3:
    # Python 3 has no separate ``unicode`` type: every ``str`` is already
    # unicode, so provide an identity shim for Python-2-style calls below.
    def unicode(s):
        return s
class Flite(object):
    """English G2P using the Flite speech synthesis system.

    Concrete backends (e.g. the ``t2p`` or ``lex_lookup`` command line
    tools shipped with flite) are provided by subclasses overriding
    :meth:`english_g2p`.
    """

    def __init__(self, arpabet='arpabet', ligatures=False, **kwargs):
        """Construct a Flite "wrapper"

        Args:
            arpabet (str): file containing ARPAbet to IPA mapping
            ligatures (bool): if True, use non-standard ligatures instead of
                              standard IPA
        """
        arpabet = pkg_resources.resource_filename(__name__, os.path.join('data', arpabet + '.csv'))
        self.arpa_map = self._read_arpabet(arpabet)
        # Alternating runs of letter (incl. apostrophe) and non-letter text.
        self.chunk_re = re.compile(r"([A-Za-z'’]+|[^A-Za-z'’]+)", re.U)
        self.letter_re = re.compile(r"[A-Za-z'’]+")
        self.regexp = re.compile(r'[A-Za-z]')
        self.puncnorm = PuncNorm()
        self.ligatures = ligatures
        self.ft = panphon.FeatureTable()
        self.num_panphon_fts = len(self.ft.names)

    def _read_arpabet(self, arpabet):
        """Read a two-column (ARPAbet, IPA) CSV file into a dict.

        Args:
            arpabet (str): path to the mapping CSV

        Returns:
            dict: mapping from ARPAbet symbol to IPA string
        """
        arpa_map = {}
        # unicodecsv expects a binary file handle and decodes itself.
        with open(arpabet, 'rb') as f:
            reader = csv.reader(f, encoding='utf-8')
            for arpa, ipa in reader:
                arpa_map[arpa] = ipa
        return arpa_map

    def normalize(self, text):
        """Decompose *text* (NFD) and strip all non-ASCII-printable chars."""
        text = unicode(text)
        text = unicodedata.normalize('NFD', text)
        # NFD first so that e.g. 'é' decomposes to 'e' + combining accent and
        # the base letter survives the printable filter.
        return ''.join(c for c in text if c in string.printable)

    def arpa_text_to_list(self, arpa_text):
        """Split backend output into ARPAbet symbols, dropping the first and
        last space-delimited tokens (backend delimiters)."""
        return arpa_text.split(' ')[1:-1]

    def arpa_to_ipa(self, arpa_text, ligatures=False):
        """Convert a string of ARPAbet symbols to an IPA string.

        Args:
            arpa_text (unicode): space-delimited ARPAbet transcription
            ligatures (bool): accepted for interface symmetry; unused here

        Returns:
            unicode: concatenated IPA segments

        Raises:
            KeyError: if a symbol is absent from the ARPAbet-to-IPA map
        """
        arpa_text = arpa_text.strip()
        arpa_list = self.arpa_text_to_list(arpa_text)
        # Strip stress digits (e.g. 'AH1' -> 'AH') before the map lookup.
        arpa_list = [re.sub(r'\d', '', d) for d in arpa_list]
        return ''.join(self.arpa_map[d] for d in arpa_list)

    def english_g2p(self, english):
        """Stub for English G2P function to be overwritten by subclasses"""
        return ""

    def transliterate(self, text, normpunc=False, ligatures=False):
        """Convert English text to IPA transcription

        Args:
            text (unicode): English text
            normpunc (bool): if True, normalize punctuation downward
            ligatures (bool): if True, use non-standard ligatures instead of
                              standard IPA
        """
        text = unicodedata.normalize('NFC', text)
        acc = []
        # Only letter chunks go through G2P; punctuation/whitespace chunks
        # pass through unchanged.
        for chunk in self.chunk_re.findall(text):
            if self.letter_re.match(chunk):
                acc.append(self.english_g2p(chunk))
            else:
                acc.append(chunk)
        text = ''.join(acc)
        text = self.puncnorm.norm(text) if normpunc else text
        text = ligaturize(text) if (ligatures or self.ligatures) else text
        return text

    def strict_trans(self, text, normpunc=False, ligatures=False):
        """Alias for :meth:`transliterate` (Flite has no loose/strict split)."""
        return self.transliterate(text, normpunc, ligatures)

    def word_to_tuples(self, word, normpunc=False):
        """Given a word, returns a list of tuples corresponding to IPA segments.

        Args:
            word (unicode): word to transliterate
            normpunc (bool): If True, normalizes punctuation to ASCII inventory

        Returns:
            list: A list of (category, lettercase, orthographic_form,
                  phonetic_form, feature_vectors) tuples.

        The "feature vectors" form a list consisting of (segment, vector) pairs.
        For IPA segments, segment is a substring of phonetic_form such that the
        concatenation of all segments in the list is equal to the phonetic_form.
        The vectors are a sequence of integers drawn from the set {-1, 0, 1}
        where -1 corresponds to '-', 0 corresponds to '0', and 1 corresponds to
        '+'.
        """
        def cat_and_cap(c):
            # Unicode category is two chars, e.g. 'Lu' -> ('L', upper-case).
            cat, case = tuple(unicodedata.category(c))
            case = 1 if case == 'u' else 0
            return unicode(cat), case

        def recode_ft(ft):
            try:
                return {'+': 1, '0': 0, '-': -1}[ft]
            except KeyError:
                return None

        def vec2bin(vec):
            # Materialize as a list: under Python 3 a bare ``map`` object is a
            # single-use iterator, so the vectors inside the returned tuples
            # would be exhausted after one pass.
            return [recode_ft(ft) for ft in vec]

        def to_vector(seg):
            return seg, vec2bin(self.ft.segment_to_vector(seg))

        def to_vectors(phon):
            if phon == '':
                # Sentinel (-1) with an all-zero vector for non-phonetic spans.
                return [(-1, [0] * self.num_panphon_fts)]
            else:
                return [to_vector(seg) for seg in self.ft.ipa_segs(phon)]

        tuples = []
        word = unicode(word)
        # word = self.strip_diacritics.process(word)
        word = unicodedata.normalize('NFKD', word)
        word = unicodedata.normalize('NFC', word)
        while word:
            match = re.match('[A-Za-z]+', word)
            if match:
                span = match.group(0)
                cat, case = cat_and_cap(span[0])
                phonword = self.transliterate(span)
                phonsegs = self.ft.ipa_segs(phonword)
                # Pad the shorter of (letters, segments) so zip drops nothing.
                maxlen = max(len(phonsegs), len(span))
                orth = list(span) + [''] * (maxlen - len(span))
                phonsegs += [''] * (maxlen - len(phonsegs))
                for p, o in zip(phonsegs, orth):
                    tuples.append(('L', case, o, p, to_vectors(p)))
                word = word[len(span):]
            else:
                span = word[0]
                span = self.puncnorm.norm(span) if normpunc else span
                cat, case = cat_and_cap(span)
                cat = 'P' if normpunc and cat in self.puncnorm else cat
                phon = ''
                vecs = to_vectors(phon)
                tuples.append((cat, case, span, phon, vecs))
                word = word[1:]
        return tuples
class FliteT2P(Flite):
    """Flite G2P using t2p."""

    def english_g2p(self, text):
        """Transcribe *text* to IPA by shelling out to flite's ``t2p``."""
        normalized = self.normalize(text)
        arpa = ''
        try:
            raw = subprocess.check_output(['t2p', '"{}"'.format(normalized)])
            arpa = raw.decode('utf-8')
        except OSError:
            # t2p binary not found on PATH; fall through with empty output.
            logger.warning('t2p (from flite) is not installed.')
        except subprocess.CalledProcessError:
            logger.warning('Non-zero exit status from t2p.')
        return self.arpa_to_ipa(arpa)
class FliteLexLookup(Flite):
    """Flite G2P using lex_lookup."""

    def arpa_text_to_list(self, arpa_text):
        """Strip the surrounding delimiter chars from lex_lookup output and
        split the remainder into ARPAbet symbols."""
        return arpa_text[1:-1].split(' ')

    def english_g2p(self, text):
        """Transcribe *text* to IPA by shelling out to flite's ``lex_lookup``.

        Args:
            text (unicode): English text (a single chunk/word)

        Returns:
            unicode: IPA transcription, or '' if lex_lookup is unavailable,
                     fails, or produces no output
        """
        text = self.normalize(text).lower()
        try:
            arpa_text = subprocess.check_output(['lex_lookup', text])
            arpa_text = arpa_text.decode('utf-8')
        except OSError:
            logger.warning('lex_lookup (from flite) is not installed.')
            arpa_text = ''
        except subprocess.CalledProcessError:
            logger.warning('Non-zero exit status from lex_lookup.')
            arpa_text = ''
        # Split on newlines and take the first element (in case lex_lookup
        # returns multiple lines). Guard the empty case: the error handlers
        # above set arpa_text to '', and ''.splitlines() is [], so the
        # original unconditional [0] raised IndexError.
        lines = arpa_text.splitlines()
        if not lines:
            return ''
        return self.arpa_to_ipa(lines[0])