micahg's picture
Initial file upload
609216a
raw
history blame contribute delete
No virus
3.14 kB
# -*- coding: utf-8 -*-
from __future__ import (print_function, absolute_import,
unicode_literals)
import regex as re
from . import _epitran
import panphon.featuretable
from epitran.puncnorm import PuncNorm
from epitran.xsampa import XSampa
from epitran.stripdiacritics import StripDiacritics
class Backoff(object):
    """Implements rudimentary language ID and backoff.

    A token is consumed left to right. On every pass each configured
    language, in priority order, takes the run of leading text that its
    orthography regex matches and transliterates it; text that no language
    claims is copied to the output unchanged.
    """

    def __init__(self, lang_script_codes, cedict_file=None):
        """Construct a Backoff object.

        Args:
            lang_script_codes (list): codes for languages to try, starting
                                      with the highest priority languages
            cedict_file (str): path to the CC-CEdict dictionary file
                               (necessary only when cmn-Hans or cmn-Hant
                               are used)
        """
        self.langs = [_epitran.Epitran(c, cedict_file=cedict_file)
                      for c in lang_script_codes]
        # Compiled once here and reused by transliterate() and xsampa_list().
        self.num_re = re.compile(r'\p{Number}+')
        self.ft = panphon.featuretable.FeatureTable()
        self.xsampa = XSampa()
        self.puncnorm = PuncNorm()
        # One diacritic stripper per language, index-aligned with self.langs.
        self.dias = [StripDiacritics(c) for c in lang_script_codes]

    def transliterate(self, token):
        """Return IPA transliteration given by first acceptable mode.

        Args:
            token (unicode): orthographic text

        Returns:
            unicode: transliteration as Unicode IPA string; runs of digits
                     and characters matched by no language are passed
                     through as they are
        """
        tr_list = []
        while token:
            matched_any_lang = False
            for dia, lang in zip(self.dias, self.langs):
                pieces = []
                while True:
                    # NOTE(review): the match is taken on the
                    # diacritic-stripped text but its length is cut from the
                    # unstripped token; this presumes dia.process() keeps the
                    # length of the matched prefix -- TODO confirm.
                    m = lang.epi.regexp.match(dia.process(token))
                    # A match object is truthy even for a zero-length match,
                    # which would consume nothing and loop forever; treat it
                    # the same as "no match".
                    if not m or not m.group():
                        break
                    chunk = m.group()
                    token = token[len(chunk):]
                    pieces.append(chunk)
                    matched_any_lang = True
                # Called even when nothing matched (with an empty string), as
                # the output order depends only on non-empty contributions.
                tr_list.append(lang.transliterate(''.join(pieces)))
            if not matched_any_lang:
                # No language claimed the head of the token: copy a whole
                # run of digits, or else a single character, so that the
                # outer loop always makes progress.
                m = self.num_re.match(token)
                passthrough = m.group() if m else token[0]
                tr_list.append(passthrough)
                token = token[len(passthrough):]
        return ''.join(tr_list)

    def trans_list(self, token):
        """Transliterate/transcribe a word into list of IPA phonemes.

        Args:
            token (unicode): word to transcribe; unicode string

        Returns:
            list: list of IPA unicode strings, each corresponding to a segment
        """
        return self.ft.segs_safe(self.transliterate(token))

    def xsampa_list(self, token):
        """Transcribe a word into a list of X-SAMPA phonemes.

        Args:
            token (unicode): word to transcribe; unicode strings

        Returns:
            list: list of X-SAMPA strings, each corresponding to a segment;
                  an empty list when the token consists only of digits
        """
        # Purely numeric tokens have no segments. Return an empty list (not
        # an empty string) so the return type is the same on every path.
        if self.num_re.fullmatch(token):
            return []
        ipa_segs = self.ft.ipa_segs(self.transliterate(token))
        return [self.xsampa.ipa2xs(seg) for seg in ipa_segs]