micahg's picture
Initial file upload
609216a
raw
history blame
1.68 kB
from __future__ import (absolute_import, division, print_function,
unicode_literals)
import logging
import os.path
import unicodedata
import pkg_resources
from epitran.rules import Rules
# Set up the root logger with a DEBUG threshold as soon as this module is
# imported (basicConfig is a no-op if the root logger already has handlers).
# NOTE(review): this is an import-time side effect in library code; confirm it
# is intended rather than leaving logging configuration to the application.
logging.basicConfig(level=logging.DEBUG)
class PrePostProcessor(object):
    """Applies a set of rewrite rules loaded from a packaged data file.

    The rule file is selected by language/script code, processor kind
    ('pre' or 'post') and direction (forward or reverse). If the file cannot
    be located, ``Rules`` is constructed with an empty list of files.
    """

    def __init__(self, code, fix, rev):
        """Constructs a pre/post-processor for orthographic/IPA strings

        This class reads processor files consisting of context-sensitive rules
        and compiles them into regular expression objects that can then be used
        to perform regex replacements in cascades that capture feeding and
        bleeding.

        Args:
            code (str): ISO 639-3 code and ISO 15924 code joined with a hyphen
            fix (str): 'pre' for preprocessors, 'post' for postprocessors
            rev (boolean): True for reverse transliterating pre/post-processors
        """
        self.rules = self._read_rules(code, fix, rev)

    def _read_rules(self, code, fix, rev):
        """Locate the rule file for this processor and wrap it in ``Rules``.

        The resource looked up is ``data/<fix>/<code>.txt``, or
        ``data/<fix>/<code>_rev.txt`` when ``rev`` is true, relative to this
        module's package.

        Args:
            code (str): language/script code used as the file stem
            fix (str): 'pre' or 'post'; selects the data sub-directory
            rev (boolean): True to select the ``_rev`` variant of the file

        Returns:
            Rules: built from the located file, or from an empty list when
                the resource cannot be resolved or is not a regular file

        Raises:
            AssertionError: if ``fix`` is neither 'pre' nor 'post'
        """
        # Raised explicitly (rather than via `assert`) so the check is not
        # stripped under `python -O`; the exception type is unchanged.
        if fix not in ('pre', 'post'):
            raise AssertionError(
                "fix must be 'pre' or 'post', got {!r}".format(fix))
        if rev:
            code += '_rev'
        # pkg_resources resource names are always '/'-separated and are not
        # filesystem paths, so os.path.join must not be used to build them
        # (it would insert backslashes on Windows).
        resource = '/'.join(('data', fix, code + '.txt'))
        try:
            abs_fn = pkg_resources.resource_filename(__name__, resource)
        except KeyError:
            # The provider could not find the resource: fall back to no rules.
            return Rules([])
        if os.path.isfile(abs_fn):
            return Rules([abs_fn])
        return Rules([])

    def process(self, word):
        """Apply processor to an input string

        Args:
            word (unicode): input string (orthographic or IPA)

        Returns:
            unicode: output string with all rules applied in order
        """
        return self.rules.apply(word)