File size: 3,138 Bytes
609216a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
# -*- coding: utf-8 -*-
from __future__ import (print_function, absolute_import,
                        unicode_literals)

import regex as re
from . import _epitran
import panphon.featuretable
from epitran.puncnorm import PuncNorm
from epitran.xsampa import XSampa
from epitran.stripdiacritics import StripDiacritics


class Backoff(object):
    """Implements rudimentary language ID and backoff.

    A token is consumed from left to right. At each position every
    configured language gets a chance, in priority order, to claim the
    prefix its grapheme regexp accepts. Text that no language claims is
    copied to the output unchanged: a run of numeric characters as one unit,
    any other character one at a time.
    """

    def __init__(self, lang_script_codes, cedict_file=None):
        """Construct a Backoff object.

        Args:
            lang_script_codes (list): codes for languages to try, starting
            with the highest priority languages
            cedict_file (str): path to the CC-CEdict dictionary file
            (necessary only when cmn-Hans or cmn-Hant are used)
        """
        # One transliterator and one diacritic stripper per language code;
        # the two lists are kept parallel and zipped in `transliterate`.
        self.langs = [_epitran.Epitran(c, cedict_file=cedict_file)
                      for c in lang_script_codes]
        # Compiled once and shared by `transliterate` and `xsampa_list`.
        self.num_re = re.compile(r'\p{Number}+')
        self.ft = panphon.featuretable.FeatureTable()
        self.xsampa = XSampa()
        self.puncnorm = PuncNorm()
        self.dias = [StripDiacritics(c) for c in lang_script_codes]

    def transliterate(self, token):
        """Return IPA transliteration given by first acceptable mode.

        Args:
            token (unicode): orthographic text

        Returns:
            unicode: transliteration as Unicode IPA string; material that no
            configured language accepts is passed through unchanged
        """
        tr_list = []
        while token:
            is_outside_lang = True
            for dia, lang in zip(self.dias, self.langs):
                source = ''
                while token:
                    # NOTE(review): the match length is measured on the
                    # diacritic-stripped text but removed from the raw
                    # token; this assumes `dia.process` does not change the
                    # length of the matched prefix -- confirm.
                    m = lang.epi.regexp.match(dia.process(token))
                    # An empty match consumes nothing, so it is treated as
                    # "no match"; otherwise this loop would never end.
                    if not m or not m.group():
                        break
                    s = m.group()
                    token = token[len(s):]
                    source += s
                    is_outside_lang = False
                # Only call the transliterator when this language actually
                # claimed some text.
                if source:
                    tr_list.append(lang.transliterate(source))
            if is_outside_lang:
                # No language claimed anything, so `token` is unchanged and
                # still non-empty: copy a numeric run or a single character.
                m = self.num_re.match(token)
                if m:
                    source = m.group()
                    tr_list.append(source)
                    token = token[len(source):]
                else:
                    tr_list.append(token[0])
                    token = token[1:]
        return ''.join(tr_list)

    def trans_list(self, token):
        """Transliterate/transcribe a word into list of IPA phonemes.

        Args:
            token (unicode): word to transcribe; unicode string

        Returns:
            list: list of IPA unicode strings, each corresponding to a segment
        """
        return self.ft.segs_safe(self.transliterate(token))

    def xsampa_list(self, token):
        """Transcribe a word into a list of X-SAMPA phonemes.

        Args:
            token (unicode): word to transcribe; unicode strings

        Returns:
            list: list of X-SAMPA strings, each corresponding to a segment;
            an empty list when the token consists solely of numeric
            characters
        """
        # Purely numeric tokens have no phonemic content. Return an empty
        # list (not an empty string) so the return type is always a list.
        if self.num_re.fullmatch(token):
            return []
        ipa_segs = self.ft.ipa_segs(self.transliterate(token))
        return list(map(self.xsampa.ipa2xs, ipa_segs))