micahg's picture
Initial file upload
609216a
raw
history blame
No virus
2.39 kB
# -*- coding: utf-8 -*-
from __future__ import (absolute_import, division, print_function,
unicode_literals)
import os
import pkg_resources
import unicodecsv as csv
from epitran import Epitran
class Space(object):
    """Integer index space over segment strings.

    Wraps a ``{segment: integer}`` mapping built from the packaged
    ``data/space`` CSV files and exposes it through indexing and iteration.
    """

    def __init__(self, code, space_names):
        """Build the segment-to-integer mapping for the given spaces.

        Each segment string is assigned an integer, so that segments can be
        placed in an integer space suitable for conversion to one-hot
        vectors. The instance can be indexed by segment and iterated over
        its known segments, much like a dictionary.

        Args:
            code (str): ISO 639-3 code joined to ISO 15924 code with "-";
                passed to ``Epitran``
            space_names (list): space names, each an ISO 639-3 code joined
                to an ISO 15924 code with "-"
        """
        self.epi = Epitran(code)
        self.dict = self._load_space(space_names)

    def _load_space(self, space_names):
        """Read the punctuation and space files and number their segments.

        Args:
            space_names (list): space names of the form "<lang>-<script>";
                the part after the first "-" selects the punctuation file

        Returns:
            dict: each collected segment mapped to its position in the
                sorted sequence of all collected segments (starting at 0)
        """
        collected = set()

        # One punctuation file per distinct script; a name without "-"
        # raises IndexError here, before any file is opened.
        script_codes = {name.split('-')[1] for name in space_names}
        for script in script_codes:
            punc_path = pkg_resources.resource_filename(
                __name__,
                os.path.join('data', 'space', 'punc-{}.csv'.format(script)))
            with open(punc_path, 'rb') as punc_file:
                # Every row must hold exactly one field: the mark itself.
                collected.update(
                    mark
                    for (mark,) in csv.reader(punc_file, encoding='utf-8'))

        for space_name in space_names:
            space_path = pkg_resources.resource_filename(
                __name__,
                os.path.join('data', 'space', space_name + '.csv'))
            with open(space_path, 'rb') as space_file:
                # Rows are two-column; only the second column is used, and
                # it is split into segments by the Epitran feature table.
                for _, target in csv.reader(space_file, encoding='utf-8'):
                    collected.update(self.epi.ft.ipa_segs(target))

        # Sorting makes the numbering independent of set iteration order.
        return {seg: index for index, seg in enumerate(sorted(collected))}

    def __iter__(self):
        """Iterate over the segments (keys) known to this space."""
        return iter(self.dict)

    def __getitem__(self, key):
        """Look up the integer assigned to a segment.

        Args:
            key (unicode): a unicode string corresponding to a segment

        Returns:
            int: the integer for `key`; for a segment that is not in the
                space, the number of known segments (one past the largest
                assigned integer)
        """
        return self.dict.get(key, len(self.dict))