Spaces:
Sleeping
Sleeping
# -*- coding: utf-8 -*- | |
from __future__ import (absolute_import, division, print_function, | |
unicode_literals) | |
import os.path | |
import unicodedata | |
import pkg_resources | |
import marisa_trie | |
import panphon | |
import unicodecsv as csv | |
class XSampa(object):
    """Convert IPA strings to X-SAMPA with a longest-prefix trie lookup.

    The IPA-to-X-SAMPA table is read from a CSV file in this package's
    ``data`` directory and stored in a ``marisa_trie.BytesTrie`` whose keys
    are IPA strings and whose values are UTF-8 encoded X-SAMPA strings.
    """

    # File name of the mapping table, resolved relative to the ``data``
    # directory of this package.
    ipa2xs_fn = 'ipa-xsampa.csv'

    def __init__(self):
        """Construct an IPA-XSampa conversion object."""
        self.trie = self._read_ipa2xs()
        # Not used by any method of this class; kept as a public attribute
        # so existing callers that read it keep working.
        self.ft = panphon.FeatureTable()

    @staticmethod
    def _pairs_from_rows(rows):
        """Turn CSV rows into ``(ipa, xsampa_bytes)`` pairs for the trie.

        Args:
            rows (iterable): rows of unicode fields; the first field is the
                IPA string and the second the X-SAMPA string. Further
                fields are ignored and empty rows are skipped.

        Returns:
            list: ``(ipa, xsampa_bytes)`` tuples. Every key is present as
            written in the table; in addition its NFD form is indexed when
            that differs and does not collide with a key from the table.

        Raises:
            ValueError: if a non-empty row has fewer than two fields.
        """
        pairs = []
        for row in rows:
            if not row:
                # Blank lines come back from the CSV reader as empty rows.
                continue
            ipa, xs = row[:2]
            pairs.append((ipa, xs.encode('utf-8')))
        # ipa2xs() normalizes its input to NFD, so a key stored in composed
        # form (e.g. U+00E7) could never be matched. Index the decomposed
        # form as well, without overriding keys that the table defines.
        table_keys = set(ipa for ipa, _ in pairs)
        for ipa, value in list(pairs):
            decomposed = unicodedata.normalize('NFD', ipa)
            if decomposed not in table_keys:
                pairs.append((decomposed, value))
        return pairs

    def _read_ipa2xs(self):
        """Load the mapping table and build the lookup trie.

        Returns:
            marisa_trie.BytesTrie: trie built from the table's pairs.
        """
        path = os.path.join('data', self.ipa2xs_fn)
        path = pkg_resources.resource_filename(__name__, path)
        with open(path, 'rb') as f:
            reader = csv.reader(f, encoding='utf-8')
            # Discard the first row (treated as a header); tolerate an
            # empty file instead of raising StopIteration.
            next(reader, None)
            pairs = self._pairs_from_rows(reader)
        return marisa_trie.BytesTrie(pairs)

    def prefixes(self, s):
        """Return the trie keys that are prefixes of `s`.

        Args:
            s (unicode): string to look up; it is not normalized here.

        Returns:
            list: keys of the trie that `s` starts with.
        """
        return self.trie.prefixes(s)

    def longest_prefix(self, s):
        """Return the longest trie key that is a prefix of `s`.

        Args:
            s (unicode): string to look up; it is not normalized here.

        Returns:
            unicode: the longest matching key, or ``''`` if no key matches.
        """
        prefixes = self.prefixes(s)
        if not prefixes:
            return ''
        return max(prefixes, key=len)

    def ipa2xs(self, ipa):
        """Convert IPA string (unicode) to X-SAMPA string

        Args:
            ipa (unicode): An IPA string as unicode

        Returns:
            unicode: the X-SAMPA segments for the matched IPA segments,
            concatenated into one string. Characters that do not start any
            known IPA segment are skipped.
        """
        xsampa = []
        ipa = unicodedata.normalize('NFD', ipa)
        while ipa:
            token = self.longest_prefix(ipa)
            if token:
                # The trie stores a list of values per key; use the first.
                xs = self.trie[token][0]
                xsampa.append(xs.decode('utf-8'))
                ipa = ipa[len(token):]
            else:
                # No segment starts here: drop one character and retry.
                ipa = ipa[1:]
        return ''.join(xsampa)