File size: 2,626 Bytes
48c6ce0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
# modified from https://github.com/feng-yufei/shared_debugging_code/blob/main/text_processing/phonemizer.py
import itertools
import re
from typing import Dict
from typing import List

import regex
from gruut import sentences
from gruut.const import Sentence
from gruut.const import Word
from AR.text_processing.symbols import SYMBOL_TO_ID


class GruutPhonemizer:
    """Turn raw text into a phoneme string via gruut and map symbols to ids.

    Typical use is ``phonemize(text)`` followed by ``transform(phonemes)``;
    the id table is ``SYMBOL_TO_ID`` from ``AR.text_processing.symbols``.
    """

    def __init__(self, language: str):
        """Store the language and build the punctuation character class.

        Args:
            language: Language code forwarded to gruut as ``lang`` by
                :meth:`phonemize` (the demo at the bottom of this file
                uses ``"en-us"``).
        """
        self._phonemizer = sentences
        self.lang = language
        self.symbol_to_id = SYMBOL_TO_ID
        # Keys are regex fragments (hence the escaped "." and "?").  Within
        # this class only the keys are used: they are concatenated into one
        # character class below.  The values are not read here.
        self._special_cases_dict: Dict[str, str] = {
            r"\.\.\.": "... ",
            ";": "; ",
            ":": ": ",
            ",": ", ",
            r"\.": ". ",
            "!": "! ",
            r"\?": "? ",
            "—": "—",
            "…": "… ",
            "«": "«",
            "»": "»"
        }
        # Single capturing group matching any one punctuation character.
        self._punctuation_regexp: str = rf"([{''.join(self._special_cases_dict)}])"

    def _normalize_punctuation(self, text: str) -> str:
        """Normalize the whitespace around punctuation marks.

        Uses the third-party ``regex`` module because the patterns rely on
        Unicode property classes (``\\pZ`` separators, ``\\pL`` letters).

        Args:
            text: Raw input text.

        Returns:
            The text with separators before punctuation removed, a space
            inserted between punctuation and a directly following letter,
            separator runs collapsed to one space, and both ends stripped.
        """
        # Drop separators that precede a punctuation mark: "word ?" -> "word?"
        text = regex.sub(fr"\pZ+{self._punctuation_regexp}", r"\1", text)
        # Separate punctuation from a letter glued to it: "a,b" -> "a, b"
        text = regex.sub(fr"{self._punctuation_regexp}(\pL)", r"\1 \2", text)
        # Collapse any run of separator characters into one plain space.
        text = regex.sub(r"\pZ+", r" ", text)
        return text.strip()

    def _convert_punctuation(self, word: "Word") -> str:
        """Render one gruut word as a phoneme string.

        Args:
            word: Object exposing ``phonemes`` (sequence of strings) and
                ``text``.

        Returns:
            ``''`` when the word has no phonemes; the stripped original text
            when its first phoneme is ``'‖'`` or ``'|'`` (presumably gruut's
            break markers for punctuation -- confirm against gruut docs);
            otherwise the concatenated phonemes with stress marks, the
            length mark and the tie bar removed.
        """
        if not word.phonemes:
            return ''
        if word.phonemes[0] in ('‖', '|'):
            # Keep the literal punctuation instead of the break symbol.
            return word.text.strip()

        phonemes = ''.join(word.phonemes)
        # Remove stress marks (ˈ ˌ), the length mark (ː) and the combining
        # tie bar (U+0361).
        phonemes = re.sub(r'[ˈˌː͡]', '', phonemes)
        return phonemes.strip()

    def phonemize(self, text: str, espeak: bool=False) -> str:
        """Convert text to a space-separated phoneme string.

        Args:
            text: Raw input text; punctuation spacing is normalized first.
            espeak: Forwarded unchanged to gruut's ``sentences``.

        Returns:
            One entry per gruut word, joined with single spaces.  Words
            without phonemes contribute an empty entry, so consecutive
            spaces can appear in the result.
        """
        text_to_phonemize: str = self._normalize_punctuation(text)
        # Use the language given to the constructor rather than a fixed code.
        sents = self._phonemizer(
            text_to_phonemize, lang=self.lang, espeak=espeak)
        words: List[str] = [
            self._convert_punctuation(word)
            for word in itertools.chain.from_iterable(sents)
        ]
        return ' '.join(words)

    def transform(self, phonemes):
        """Map each symbol of ``phonemes`` to its id.

        Args:
            phonemes: Iterable of symbols; iterating a string yields its
                individual characters.

        Returns:
            List of ``self.symbol_to_id`` values in input order.  Symbols
            missing from the table are skipped silently.
        """
        table = self.symbol_to_id
        return [table[p] for p in phonemes if p in table]


if __name__ == "__main__":
    # Demo: text -> IPA phoneme string -> symbol ids, printing each stage
    # together with its length.
    demo = GruutPhonemizer("en-us")

    ipa = demo.phonemize("Hello, wor-ld ?")
    print("phonemes:", ipa)
    print("len(phonemes):", len(ipa))

    ids = demo.transform(ipa)
    print("phoneme_ids:", ids)
    print("len(phoneme_ids):", len(ids))