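# NOTE: the next three lines are residue from the file-hosting page,
# kept as comments so the module remains importable.
# micahg's picture
# Initial file upload
# 609216a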
from __future__ import print_function, unicode_literals, division, absolute_import
import os.path
import sys
from unicodedata import normalize
import pkg_resources
import epitran
import unicodecsv as csv
class ReRomanizer(object):
    """Converts IPA representations to a readable roman form.

    Orthographic text is first transliterated into a list of segments by an
    ``epitran.Epitran`` instance. Each segment is then looked up in a
    re-romanization table loaded from ``data/reromanize/<table>.csv``;
    segments without an entry in the table are passed through unchanged.
    """

    def __init__(self, code, table, decompose=True, cedict_file=None):
        """Construct object for re-romanizing Epitran output.

        This class converts orthographic input, via Epitran, to a more
        conventional romanization that should be more readable to most humans.

        Args:
            code (str): ISO 639-3 code and ISO 15924 code joined with a hyphen
            table (str): Name of re-romanization table
            decompose (bool): apply decomposing normalization (NFD) to the
                romanized forms; when false, NFC is applied instead
            cedict_file (str): forwarded unchanged to ``epitran.Epitran``
        """
        self.epi = epitran.Epitran(code, cedict_file=cedict_file)
        self.mapping = self._load_reromanizer(table, decompose)

    def _load_reromanizer(self, table, decompose):
        """Load the segment-to-roman mapping for a re-romanization table.

        The table is read from ``data/reromanize/<table>.csv`` inside this
        package. The first row is treated as a header and skipped; every
        following non-empty row must hold exactly two fields, the IPA
        segment and its romanized form.

        Args:
            table (str): Name of re-romanization table (file name without
                the ``.csv`` extension)
            decompose (bool): normalize romanized forms to NFD if true,
                to NFC otherwise

        Returns:
            dict: maps each IPA segment to its normalized romanized form.
                Empty when the table file does not exist (a message is
                written to stderr in that case).

        Raises:
            ValueError: if a non-empty row does not have exactly two fields.
        """
        path = os.path.join('data', 'reromanize', table + '.csv')
        path = pkg_resources.resource_filename(__name__, path)
        if not os.path.isfile(path):
            # Best effort: a missing table degrades to "no re-romanization"
            # instead of failing construction.
            print('File {} does not exist.'.format(path), file=sys.stderr)
            return {}
        # The normalization form does not depend on the row; pick it once.
        form = 'NFD' if decompose else 'NFC'
        mapping = {}
        # Opened in binary mode: the reader is handed the encoding and does
        # the decoding itself.
        with open(path, 'rb') as f:
            reader = csv.reader(f, encoding='utf-8')
            # Skip the header row; the default keeps an empty file from
            # leaking StopIteration out of the constructor.
            next(reader, None)
            for row in reader:
                if not row:
                    # Blank lines (e.g. a trailing newline) carry no data.
                    continue
                ipa, rom = row
                mapping[ipa] = normalize(form, rom)
        return mapping

    def reromanize_ipa(self, tr_list):
        """Map each segment through the re-romanization table.

        Args:
            tr_list (list): segments as produced by Epitran's ``trans_list``

        Returns:
            list: one entry per input segment, in order; the romanized form
                when the segment is in the table, otherwise the segment
                itself
        """
        return [self.mapping.get(seg, seg) for seg in tr_list]

    def reromanize(self, text):
        """Convert orthographic text to romanized text

        Arg:
            text (unicode): orthographic text

        Returns:
            unicode: romanized text
        """
        tr_list = self.epi.trans_list(text)
        return ''.join(self.reromanize_ipa(tr_list))