micahg's picture
Initial file upload
609216a
raw
history blame
4.61 kB
# -*- coding: utf-8 -*-
from __future__ import print_function, unicode_literals, division, absolute_import
import os.path
import pkg_resources
import regex as re
from . import cedict
from . import rules
from . import download
from epitran.ligaturize import ligaturize
class MissingData(Exception):
    """Signal that the CC-CEDICT dictionary data required here is unavailable."""
class Epihan(object):
    """Transliterate Chinese text to IPA using CC-CEDICT pinyin and rewrite rules."""

    # Full-width / CJK punctuation marks paired with the ASCII character that
    # `normalize_punc` substitutes for each of them.
    punc = [(u'\uff0c', u','),
            (u'\uff01', u'!'),
            (u'\uff1f', u'?'),
            (u'\uff1b', u';'),
            (u'\uff1a', u':'),
            (u'\uff08', u'('),
            (u'\uff09', u')'),
            (u'\uff3b', u'['),
            (u'\uff3d', u']'),
            (u'\u3010', u'['),
            (u'\u3011', u']'),
            ]

    # Class-level default so that instances whose constructor never assigns
    # `self.ligatures` (e.g. a subclass with its own `__init__`) can still
    # call `transliterate`.
    ligatures = False

    def __init__(self, ligatures=False, cedict_file=None,
                 rules_file='pinyin-to-ipa.txt', tones=False):
        """Construct epitran object for Chinese

        Args:
            ligatures (bool): if True, use ligatures instead of standard IPA
            cedict_file (str): path to CC-CEDict dictionary file; when
                               omitted, the file reported by
                               `download.get_cedict_file()` is used
            rules_file (str): name of file with rules for converting pinyin to
                              IPA
            tones (bool): if True, output tones as Chao tone numbers; overrides
                          `rules_file`

        Raises:
            MissingData: if `cedict_file` is not given and
                         `download.cedict_exists()` is false
        """
        # Without an explicit dictionary path, fall back to the downloaded
        # copy and fail loudly when there is none.
        if not cedict_file:
            if not download.cedict_exists():
                raise MissingData('Download CC-CEDICT with "epitran.download.cedict()".')
            cedict_file = download.get_cedict_file()
        # `tones` selects a fixed rule file and therefore wins over
        # whatever `rules_file` the caller passed.
        if tones:
            rules_file = os.path.join('data', 'rules', 'pinyin-to-ipa-tones.txt')
        else:
            rules_file = os.path.join('data', 'rules', rules_file)
        rules_file = pkg_resources.resource_filename(__name__, rules_file)
        # Remember the constructor flag so `transliterate` can honor it.
        self.ligatures = ligatures
        self.cedict = cedict.CEDictTrie(cedict_file)
        self.rules = rules.Rules([rules_file])
        self.regexp = re.compile(r'\p{Han}')

    def normalize_punc(self, text):
        """Normalize punctuation in a string

        Args:
            text (unicode): an orthographic string

        Return:
            unicode: an orthographic string with punctuation normalized to
                     Western equivalents
        """
        for a, b in self.punc:
            text = text.replace(a, b)
        return text

    def transliterate(self, text, normpunc=False, ligatures=False):
        """Transliterates/transcribes text into IPA

        Args:
            text (str): text to transcribe; Unicode string
            normpunc (bool): normalize punctuation
            ligatures (bool): use precomposed ligatures instead of standard
                              IPA; also enabled when the object was
                              constructed with `ligatures=True`

        Returns:
            str: Unicode IPA string
        """
        hanzi = self.cedict.hanzi
        ipa_tokens = []
        for token in self.cedict.tokenize(text):
            if token in hanzi:
                # Dictionary hit: join the pinyin syllables, lowercase them
                # and run them through the pinyin-to-IPA rules.
                pinyin, _ = hanzi[token]
                ipa = self.rules.apply(u''.join(pinyin).lower())
                # Commas are stripped from the rule output before it is kept.
                ipa_tokens.append(ipa.replace(u',', u''))
            elif normpunc:
                ipa_tokens.append(self.normalize_punc(token))
            else:
                # Tokens unknown to the dictionary pass through unchanged.
                ipa_tokens.append(token)
        if ligatures or self.ligatures:
            ipa_tokens = [ligaturize(tok) for tok in ipa_tokens]
        return u''.join(ipa_tokens)

    def strict_trans(self, text, normpunc=False, ligatures=False):
        """Delegate to `transliterate` with the same arguments

        Args:
            text (str): text to transcribe; Unicode string
            normpunc (bool): normalize punctuation
            ligatures (bool): use precomposed ligatures instead of standard IPA

        Returns:
            str: Unicode IPA string
        """
        return self.transliterate(text, normpunc, ligatures)
class EpihanTraditional(Epihan):
    """Epihan variant whose CC-CEDICT trie is built with `traditional=True`."""

    def __init__(self, ligatures=False, cedict_file=None, tones=False,
                 rules_file='pinyin-to-ipa.txt'):
        """Construct epitran object for Traditional Chinese

        Args:
            ligatures (bool): if True, use ligatures instead of standard IPA
            cedict_file (str): path to CC-CEDict dictionary file; when
                               omitted, the file reported by
                               `download.get_cedict_file()` is used
            tones (bool): if True, use the tone-aware rule file; overrides
                          `rules_file`
            rules_file (str): name of file with rules for converting pinyin to
                              IPA

        Raises:
            MissingData: if `cedict_file` is not given and
                         `download.cedict_exists()` is false
        """
        # Without an explicit dictionary path, fall back to the downloaded
        # copy and fail loudly when there is none.
        if not cedict_file:
            if not download.cedict_exists():
                raise MissingData('Download CC-CEDICT with "epitran.download.cedict()".')
            cedict_file = download.get_cedict_file()
        # `tones` selects a fixed rule file and therefore wins over
        # whatever `rules_file` the caller passed.
        if tones:
            rules_file = os.path.join('data', 'rules', 'pinyin-to-ipa-tones.txt')
        else:
            rules_file = os.path.join('data', 'rules', rules_file)
        rules_file = pkg_resources.resource_filename(__name__, rules_file)
        # Keep the constructor flag on the instance instead of discarding it.
        self.ligatures = ligatures
        self.cedict = cedict.CEDictTrie(cedict_file, traditional=True)
        self.rules = rules.Rules([rules_file])
        self.regexp = re.compile(r'\p{Han}')