Spaces:
Sleeping
Sleeping
# -*- coding: utf-8 -*-
from __future__ import print_function, unicode_literals, division, absolute_import | |
import os.path | |
import pkg_resources | |
import regex as re | |
from . import cedict | |
from . import rules | |
from . import download | |
from epitran.ligaturize import ligaturize | |
class MissingData(Exception):
    """Raised when a required data file (CC-CEDICT) has not been downloaded."""
class Epihan(object):
    """Transliterate Chinese text to IPA using CC-CEDICT and rewrite rules.

    Text is tokenized with a CC-CEDICT trie. Tokens found in the dictionary
    are mapped to pinyin and then rewritten to IPA with the rules file; all
    other tokens are passed through, optionally with punctuation normalized.
    """

    # Pairs of (full-width / CJK punctuation mark, Western replacement) used
    # by `normalize_punc`.
    punc = [(u'\uff0c', u','),
            (u'\uff01', u'!'),
            (u'\uff1f', u'?'),
            (u'\uff1b', u';'),
            (u'\uff1a', u':'),
            (u'\uff08', u'('),
            (u'\uff09', u')'),
            (u'\uff3b', u'['),
            (u'\uff3d', u']'),
            (u'\u3010', u'['),
            (u'\u3011', u']'),
            ]

    # Class-level default so that subclasses which do not call
    # `Epihan.__init__` still expose the attribute read by `transliterate`.
    ligatures = False

    def __init__(self, ligatures=False, cedict_file=None,
                 rules_file='pinyin-to-ipa.txt', tones=False):
        """Construct epitran object for Chinese

        Args:
            ligatures (bool): if True, use ligatures instead of standard IPA
                              in every call to `transliterate`
            cedict_file (str): path to CC-CEDict dictionary file; when not
                               given, the previously downloaded copy is used
            rules_file (str): name of file with rules for converting pinyin to
                              IPA
            tones (bool): if True, output tones as Chao tone numbers; overrides
                          `rules_file`

        Raises:
            MissingData: if no `cedict_file` is given and CC-CEDICT has not
                         been downloaded
        """
        # Remember the constructor-level preference; previously this argument
        # was accepted but silently ignored.
        self.ligatures = ligatures
        # If no cedict_file is specified, fall back to the downloaded copy and
        # raise an error when there is none.
        if not cedict_file:
            if not download.cedict_exists():
                raise MissingData('Download CC-CEDICT with "epitran.download.cedict()"')
            cedict_file = download.get_cedict_file()
        # `tones` takes precedence over any caller-supplied rules file name.
        rules_name = 'pinyin-to-ipa-tones.txt' if tones else rules_file
        rules_path = pkg_resources.resource_filename(
            __name__, os.path.join('data', 'rules', rules_name))
        self.cedict = cedict.CEDictTrie(cedict_file)
        self.rules = rules.Rules([rules_path])
        # Pattern matching a single Han character (`re` is the `regex` module).
        self.regexp = re.compile(r'\p{Han}')

    def normalize_punc(self, text):
        """Normalize punctuation in a string

        Args:
            text (unicode): an orthographic string

        Return:
            unicode: an orthographic string with punctuation normalized to
                     Western equivalents
        """
        for cjk_mark, western_mark in self.punc:
            text = text.replace(cjk_mark, western_mark)
        return text

    def transliterate(self, text, normpunc=False, ligatures=False):
        """Transliterates/transcribes a text into IPA

        Args:
            text (str): text to transcribe; Unicode string
            normpunc (bool): normalize punctuation in tokens that are not
                             found in the dictionary
            ligatures (bool): use precomposed ligatures instead of standard
                              IPA; also enabled when the object was
                              constructed with `ligatures=True`

        Returns:
            str: Unicode IPA string
        """
        use_ligatures = ligatures or self.ligatures
        ipa_tokens = []
        for token in self.cedict.tokenize(text):
            if token in self.cedict.hanzi:
                pinyin, _ = self.cedict.hanzi[token]
                ipa = self.rules.apply(u''.join(pinyin).lower())
                # Commas in the rule output are dropped from the result.
                ipa_tokens.append(ipa.replace(u',', u''))
            elif normpunc:
                ipa_tokens.append(self.normalize_punc(token))
            else:
                ipa_tokens.append(token)
        if use_ligatures:
            ipa_tokens = [ligaturize(tok) for tok in ipa_tokens]
        return u''.join(ipa_tokens)

    def strict_trans(self, text, normpunc=False, ligatures=False):
        """Alias of `transliterate`; takes and returns the same values."""
        return self.transliterate(text, normpunc, ligatures)
class EpihanTraditional(Epihan):
    """Epihan variant whose CC-CEDICT trie is built with `traditional=True`."""

    def __init__(self, ligatures=False, cedict_file=None, tones=False, rules_file='pinyin-to-ipa.txt'):
        """Construct epitran object for Traditional Chinese

        Args:
            ligatures (bool): if True, use ligatures instead of standard IPA
                              (recorded on the instance as `self.ligatures`)
            cedict_file (str): path to CC-CEDict dictionary file; when not
                               given, the previously downloaded copy is used
            tones (bool): if True, use the rules file
                          'pinyin-to-ipa-tones.txt'; overrides `rules_file`
            rules_file (str): name of file with rules for converting pinyin to
                              IPA

        Raises:
            MissingData: if no `cedict_file` is given and CC-CEDICT has not
                         been downloaded
        """
        # Keep the constructor-level preference instead of discarding it.
        self.ligatures = ligatures
        if not cedict_file:
            if not download.cedict_exists():
                raise MissingData('Download CC-CEDICT with "epitran.download.cedict()".')
            cedict_file = download.get_cedict_file()
        # `tones` takes precedence over any caller-supplied rules file name.
        rules_name = 'pinyin-to-ipa-tones.txt' if tones else rules_file
        rules_path = pkg_resources.resource_filename(
            __name__, os.path.join('data', 'rules', rules_name))
        # Same setup as the base class, except for `traditional=True`.
        self.cedict = cedict.CEDictTrie(cedict_file, traditional=True)
        self.rules = rules.Rules([rules_path])
        # Pattern matching a single Han character (`re` is the `regex` module).
        self.regexp = re.compile(r'\p{Han}')