# -*- coding: utf-8 -*-
from __future__ import (print_function, absolute_import, unicode_literals)

import regex as re

from . import _epitran
import panphon.featuretable
from epitran.puncnorm import PuncNorm
from epitran.xsampa import XSampa
from epitran.stripdiacritics import StripDiacritics


class Backoff(object):
    """Implements rudimentary language ID and backoff.

    One Epitran instance and one StripDiacritics instance are created per
    language/script code. Input text is consumed from left to right, and each
    language is offered the front of the remaining text in priority order.
    """

    def __init__(self, lang_script_codes, cedict_file=None):
        """Construct a Backoff object.

        Args:
            lang_script_codes (list): codes for languages to try, starting
                                      with the highest priority languages
            cedict_file (str): path to the CC-CEdict dictionary file
                               (necessary only when cmn-Hans or cmn-Hant are
                               used)
        """
        # self.langs and self.dias are parallel lists: index i of each
        # belongs to lang_script_codes[i].
        self.langs = [_epitran.Epitran(c, cedict_file=cedict_file)
                      for c in lang_script_codes]
        # Matches a run of Unicode Number characters; used by transliterate()
        # to pass numerals through untouched.
        self.num_re = re.compile(r'\p{Number}+')
        self.ft = panphon.featuretable.FeatureTable()
        self.xsampa = XSampa()
        self.puncnorm = PuncNorm()
        self.dias = [StripDiacritics(c) for c in lang_script_codes]

    def transliterate(self, token):
        """Return IPA transliteration given by first acceptable mode.

        On every pass over the remaining text, each language (in priority
        order) greedily consumes whatever its regexp matches at the front of
        the text, and the consumed material is transliterated by that
        language. If no language consumed anything during a pass, a leading
        run of numbers, or failing that a single character, is copied to the
        output unchanged.

        Args:
            token (unicode): orthographic text

        Returns:
            unicode: transliteration as Unicode IPA string
        """
        tr_list = []
        while token:
            is_outside_lang = True
            for dia, lang in zip(self.dias, self.langs):
                matched = []
                while True:
                    # NOTE(review): the match is taken on the
                    # diacritic-stripped text but its length is used to slice
                    # the raw token; this lines up only if dia.process()
                    # preserves the length of the matched prefix -- confirm.
                    m = lang.epi.regexp.match(dia.process(token))
                    # A zero-width match consumes nothing, so it is treated
                    # as a miss; otherwise this loop would never terminate.
                    if not m or not m.group():
                        break
                    s = m.group()
                    token = token[len(s):]
                    matched.append(s)
                    is_outside_lang = False
                # Called even when nothing matched, as before, so the output
                # of lang.transliterate('') is preserved whatever it is.
                tr_list.append(lang.transliterate(''.join(matched)))
            if is_outside_lang:
                # No language recognised the front of the text: copy numbers
                # through as a unit, anything else one character at a time.
                m = self.num_re.match(token)
                if m:
                    source = m.group()
                    tr_list.append(source)
                    token = token[len(source):]
                else:
                    tr_list.append(token[0])
                    token = token[1:]
        return ''.join(tr_list)

    def trans_list(self, token):
        """Transliterate/transcribe a word into list of IPA phonemes.

        Args:
            token (unicode): word to transcribe; unicode string

        Returns:
            list: list of IPA unicode strings, each corresponding to a segment
        """
        return self.ft.segs_safe(self.transliterate(token))

    def xsampa_list(self, token):
        """Transcribe a word into a list of X-SAMPA phonemes.

        Args:
            token (unicode): word to transcribe; unicode strings

        Returns:
            list: list of X-SAMPA strings, each corresponding to a segment;
                  empty when the token consists solely of numbers
        """
        if re.match(r'^\p{Number}+$', token):
            # Return an empty list (not an empty string) so the return type
            # is the same on every path.
            return []
        ipa_segs = self.ft.ipa_segs(self.transliterate(token))
        return [self.xsampa.ipa2xs(seg) for seg in ipa_segs]