import re
from typing import Iterable, List, Tuple
import cn2an
from english_utils.abbreviations import expand_abbreviations
from english_utils.time_norm import expand_time_english
from english_utils.number_norm import normalize_numbers as replace_numbers_en


def merge_short_sentences_zh(sens):
    """Avoid short sentences by merging them with the following sentence.

    Args:
        sens (List[str]): list of input sentences.

    Returns:
        List[str]: list of output sentences.
    """
    sens_out = []
    for s in sens:
        # If the previous sentence is too short, merge it with
        # the current sentence.
        if len(sens_out) > 0 and len(sens_out[-1]) <= 2:
            sens_out[-1] = sens_out[-1] + " " + s
        else:
            sens_out.append(s)
    # If the last sentence is still too short, fold it into the
    # previous one.
    if len(sens_out) >= 2 and len(sens_out[-1]) <= 2:
        sens_out[-2] = sens_out[-2] + " " + sens_out[-1]
        sens_out.pop(-1)
    return sens_out
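
# e.g. merge_short_sentences_zh(["你好.", "嗯", "今天天气不错."])
#      -> ["你好.", "嗯 今天天气不错."]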


def split_sentences_zh(text, min_len=10):
    # Normalize full-width sentence terminators to '.' and full-width
    # commas to ','.
    text = re.sub('[。!?;]', '.', text)
    text = re.sub('[,]', ',', text)
    # Collapse newlines, tabs, and runs of spaces into a single space.
    text = re.sub('[\n\t ]+', ' ', text)
    # Insert a sentinel marker after each punctuation mark.
    text = re.sub('([,.!?;])', r'\1 $#!', text)
    # Split on the sentinel and strip surrounding whitespace.
    sentences = [s.strip() for s in text.split('$#!')]
    if len(sentences[-1]) == 0:
        del sentences[-1]

    new_sentences = []
    new_sent = []
    count_len = 0
    for ind, sent in enumerate(sentences):
        new_sent.append(sent)
        count_len += len(sent)
        if count_len > min_len or ind == len(sentences) - 1:
            count_len = 0
            new_sentences.append(' '.join(new_sent))
            new_sent = []
    return merge_short_sentences_zh(new_sentences)
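
# e.g. split_sentences_zh("你好。今天天气不错,我们出去走走吧!")
#      -> ["你好. 今天天气不错, 我们出去走走吧."]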


def intersperse(lst, item):
    result = [item] * (len(lst) * 2 + 1)
    result[1::2] = lst
    return result
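
# e.g. intersperse([1, 2, 3], 0) -> [0, 1, 0, 2, 0, 3, 0]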


def replace_numbers_zh(text):
    # Convert each Arabic number (including decimals) to Chinese
    # characters in place. Using re.sub avoids the pitfall of
    # str.replace matching the same digits at an unrelated position.
    return re.sub(r"\d+(?:\.\d+)?", lambda m: cn2an.an2cn(m.group()), text)
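
# e.g. replace_numbers_zh("共123人") -> "共一百二十三人"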


def replace_punctuation(text):
    rep_map = {
        ":": ",",
        ";": ",",
        ",": ",",
        "。": ".",
        "!": "!",
        "?": "?",
        "\n": ".",
        "·": ",",
        "、": ",",
        "...": "…",
        "$": ".",
        "“": "'",
        "”": "'",
        "‘": "'",
        "’": "'",
        "(": "'",
        ")": "'",
        "(": "'",
        ")": "'",
        "《": "'",
        "》": "'",
        "【": "'",
        "】": "'",
        "[": "'",
        "]": "'",
        "—": "-",
        "~": "-",
        "~": "-",
        "「": "'",
        "」": "'",
    }

    for k, v in rep_map.items():
        text = text.replace(k, v)
    return text
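
# e.g. replace_punctuation("你好!《再见》") -> "你好!'再见'"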


class Lexicon:
    def __init__(self, lexicon_filename: str, tokens_filename: str):
        # Map each phone symbol to its integer token id.
        tokens = dict()
        with open(tokens_filename, encoding="utf-8") as f:
            for line in f:
                s, i = line.split()
                tokens[s] = int(i)

        lexicon = dict()
        with open(lexicon_filename, encoding="utf-8") as f:
            for line in f:
                splits = line.split()
                word_or_phrase = splits[0]
                phone_tone_list = splits[1:]
                # The first half of an entry is phones, the second half
                # the matching tones, so the length must be even.
                assert len(phone_tone_list) % 2 == 0, len(phone_tone_list)
                phone_str = phone_tone_list[: len(phone_tone_list) // 2]
                phones = [tokens[p] for p in phone_str]

                tones = phone_tone_list[len(phone_tone_list) // 2 :]
                tones = [int(t) for t in tones]

                lexicon[word_or_phrase] = (phone_str, phones, tones)
        # Fallback pronunciations for characters missing from the lexicon.
        lexicon["呣"] = lexicon["母"]
        lexicon["嗯"] = lexicon["恩"]
        self.lexicon = lexicon

        punctuation = ["!", "?", "…", ",", ".", "'", "-"]
        for p in punctuation:
            i = tokens[p]
            tone = 0
            self.lexicon[p] = ([p], [i], [tone])
        self.lexicon[" "] = ([" "], [tokens["_"]], [0])

    def g2p_zh_mix_en(self, text: str) -> Tuple[List[str], List[int], List[int]]:
        phone_str = []
        phones = []
        tones = []

        if text not in self.lexicon:
            # Unknown word or phrase: fall back to converting it
            # character by character.
            if len(text) > 1:
                for w in text:
                    s, _, p, t = self.convert(w)
                    if p:
                        phone_str += s
                        phones += p
                        tones += t
            return phone_str, phones, tones

        phone_str, phones, tones = self.lexicon[text]
        return phone_str, phones, tones
    
    
    def split_zh_en(self, text):
        # Split mixed text into alternating Chinese and English chunks.
        if re.search(r'[a-zA-Z]+', text):
            splitter = '#$&^!@'
            # Wrap every English word in a sentinel, then split on it.
            text = re.sub(r'[a-zA-Z]+', lambda x: f'{splitter}{x.group()}{splitter}', text)
            texts = text.split(splitter)
            return [t for t in texts if len(t) > 0]
        else:
            return [text]

    def normalize_english(self, text):
        text = text.lower()
        text = expand_time_english(text)
        text = replace_numbers_en(text)
        text = expand_abbreviations(text)
        return text

    def normalize_chinese(self, text):
        text = replace_numbers_zh(text)
        return text
    

    def is_english(self, text):
        # 1 if the whole chunk is only English letters/whitespace, else 0.
        return 1 if re.fullmatch(r'[a-zA-Z\s]+', text) else 0

    def convert(self, text: Iterable[str]) -> Tuple[List[str], List[int], List[int], List[int]]:
        phone_str = []
        yinjie_num = []  # number of phone symbols produced per token (syllable count)
        phones = []
        tones = []

        text = replace_punctuation(text)
        # Decide the dominant language, normalize numbers/abbreviations
        # accordingly, then re-split into single-language chunks.
        texts_zh_en = self.split_zh_en(text)
        en_num = sum(self.is_english(i) for i in texts_zh_en)
        if en_num * 2 >= len(texts_zh_en):
            texts_zh_en = self.split_zh_en(self.normalize_english(text))
        else:
            texts_zh_en = self.split_zh_en(self.normalize_chinese(text))
        for text_one_lang in texts_zh_en:
            if self.is_english(text_one_lang):
                # English: look up the whole word or phrase.
                s, p, t = self.g2p_zh_mix_en(text_one_lang)

                phone_str += s
                yinjie_num.append(len(s))
                phones += p
                tones += t
            else:
                # Chinese: convert character by character.
                for tl in text_one_lang:
                    s, p, t = self.g2p_zh_mix_en(tl)

                    phone_str += s
                    yinjie_num.append(len(s))
                    phones += p
                    tones += t

        return phone_str, yinjie_num, phones, tones
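

if __name__ == "__main__":
    # Minimal usage sketch. "lexicon.txt" and "tokens.txt" are
    # placeholder paths for the model's lexicon and token tables.
    lexicon = Lexicon("lexicon.txt", "tokens.txt")
    for sentence in split_sentences_zh("你好。今天是2024年, hello world!"):
        phone_str, yinjie_num, phones, tones = lexicon.convert(sentence)
        print(sentence, phone_str, phones, tones)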