ElesisSiegherts committed on
Commit
a2521ef
1 Parent(s): 164dba8

Upload 36 files

Browse files
text/__init__.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from text.symbols import *
2
+
3
+ _symbol_to_id = {s: i for i, s in enumerate(symbols)}
4
+
5
+
6
def cleaned_text_to_sequence(cleaned_text, tones, language):
    """Convert cleaned symbols, tones and a language key into ID sequences.

    Args:
      cleaned_text: iterable of symbols; each must be a key of ``_symbol_to_id``
      tones: iterable of tone values, shifted by the language's tone offset
      language: key into ``language_tone_start_map`` and ``language_id_map``
    Returns:
      Tuple ``(phones, tones, lang_ids)``: symbol IDs, offset tones, and the
      language ID repeated once per symbol
    """
    phones = []
    for symbol in cleaned_text:
        phones.append(_symbol_to_id[symbol])
    tone_offset = language_tone_start_map[language]
    shifted_tones = [tone + tone_offset for tone in tones]
    lang_ids = [language_id_map[language]] * len(phones)
    return phones, shifted_tones, lang_ids
19
+
20
+
21
def get_bert(norm_text, word2ph, language, device):
    """Return BERT features from the backend registered for ``language``.

    The backends are imported lazily inside the function. ``language`` must be
    one of ``"ZH"``, ``"EN"`` or ``"JP"``; any other key raises ``KeyError``.
    """
    from .chinese_bert import get_bert_feature as zh_bert
    from .english_bert_mock import get_bert_feature as en_bert
    from .japanese_bert import get_bert_feature as jp_bert

    backends = {"ZH": zh_bert, "EN": en_bert, "JP": jp_bert}
    extract = backends[language]
    return extract(norm_text, word2ph, device)
29
+
30
+
31
def check_bert_models():
    """Run ``_check_bert`` for every model listed in ``./bert/bert_models.json``.

    When ``config.mirror`` is ``"openi"`` (case-insensitive) the function logs
    in to openi first, passing ``config.openi_token`` only if it is truthy.
    """
    import json
    from pathlib import Path

    from config import config
    from .bert_utils import _check_bert

    if config.mirror.lower() == "openi":
        import openi

        login_kwargs = {}
        if config.openi_token:
            login_kwargs["token"] = config.openi_token
        openi.login(**login_kwargs)

    with open("./bert/bert_models.json", "r") as fp:
        models = json.load(fp)
        for name, spec in models.items():
            # Each model is checked against its own sub-directory of ./bert.
            _check_bert(spec["repo_id"], spec["files"], Path("./bert").joinpath(name))
49
+
50
+
51
def init_openjtalk():
    """Warm up pyopenjtalk on Linux by running one throw-away ``g2p`` call.

    The OS check uses ``platform.system()``, which returns the bare OS name
    (``"Linux"``). ``platform.platform()`` returns a longer descriptive string
    (kernel version, architecture, ...) that never equals ``"Linux"``, so a
    comparison against it would never run the warm-up.
    """
    import platform

    if platform.system() == "Linux":
        import pyopenjtalk

        pyopenjtalk.g2p("こんにちは,世界。")
58
+
59
+
60
+ init_openjtalk()
61
+ check_bert_models()
text/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (2.45 kB). View file
 
text/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (2.42 kB). View file
 
text/__pycache__/bert_utils.cpython-310.pyc ADDED
Binary file (737 Bytes). View file
 
text/__pycache__/bert_utils.cpython-38.pyc ADDED
Binary file (743 Bytes). View file
 
text/__pycache__/chinese.cpython-310.pyc ADDED
Binary file (4.6 kB). View file
 
text/__pycache__/chinese.cpython-38.pyc ADDED
Binary file (4.53 kB). View file
 
text/__pycache__/chinese_bert.cpython-310.pyc ADDED
Binary file (1.75 kB). View file
 
text/__pycache__/chinese_bert.cpython-38.pyc ADDED
Binary file (1.69 kB). View file
 
text/__pycache__/cleaner.cpython-310.pyc ADDED
Binary file (984 Bytes). View file
 
text/__pycache__/cleaner.cpython-38.pyc ADDED
Binary file (974 Bytes). View file
 
text/__pycache__/english.cpython-310.pyc ADDED
Binary file (9.5 kB). View file
 
text/__pycache__/english.cpython-38.pyc ADDED
Binary file (9.63 kB). View file
 
text/__pycache__/english_bert_mock.cpython-310.pyc ADDED
Binary file (1.25 kB). View file
 
text/__pycache__/english_bert_mock.cpython-38.pyc ADDED
Binary file (1.23 kB). View file
 
text/__pycache__/japanese.cpython-310.pyc ADDED
Binary file (11.4 kB). View file
 
text/__pycache__/japanese.cpython-38.pyc ADDED
Binary file (11 kB). View file
 
text/__pycache__/japanese_bert.cpython-310.pyc ADDED
Binary file (1.32 kB). View file
 
text/__pycache__/japanese_bert.cpython-38.pyc ADDED
Binary file (1.31 kB). View file
 
text/__pycache__/symbols.cpython-310.pyc ADDED
Binary file (1.48 kB). View file
 
text/__pycache__/symbols.cpython-38.pyc ADDED
Binary file (1.83 kB). View file
 
text/__pycache__/tone_sandhi.cpython-310.pyc ADDED
Binary file (13.4 kB). View file
 
text/__pycache__/tone_sandhi.cpython-38.pyc ADDED
Binary file (15.6 kB). View file
 
text/bert_utils.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+
3
+ from huggingface_hub import hf_hub_download
4
+
5
+ from config import config
6
+
7
+
8
+ MIRROR: str = config.mirror
9
+
10
+
11
def _check_bert(repo_id, files, local_path):
    """Download each entry of ``files`` that does not exist under ``local_path``.

    With the ``"openi"`` mirror the model named by the last path component of
    ``repo_id`` is downloaded into ``./bert``; otherwise the single missing
    file is fetched with ``hf_hub_download`` into ``local_path``.
    """
    target_dir = Path(local_path)
    for file in files:
        if target_dir.joinpath(file).exists():
            continue
        if MIRROR.lower() == "openi":
            import openi

            model_name = repo_id.split("/")[-1]
            openi.model.download_model("Stardust_minus/Bert-VITS2", model_name, "./bert")
        else:
            hf_hub_download(
                repo_id, file, local_dir=local_path, local_dir_use_symlinks=False
            )
text/chinese.py ADDED
@@ -0,0 +1,199 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+
4
+ import cn2an
5
+ from pypinyin import lazy_pinyin, Style
6
+
7
+ from text.symbols import punctuation
8
+ from text.tone_sandhi import ToneSandhi
9
+
10
+ current_file_path = os.path.dirname(__file__)
11
# Build the pinyin -> symbol table from the tab-separated file that sits next
# to this module (first column is the key, second column the value). The file
# is opened in a ``with`` block so the handle is closed once the table is built.
with open(os.path.join(current_file_path, "opencpop-strict.txt")) as _pinyin_table:
    pinyin_to_symbol_map = {
        line.split("\t")[0]: line.strip().split("\t")[1] for line in _pinyin_table
    }
15
+
16
+ import jieba.posseg as psg
17
+
18
+
19
+ rep_map = {
20
+ ":": ",",
21
+ ";": ",",
22
+ ",": ",",
23
+ "。": ".",
24
+ "!": "!",
25
+ "?": "?",
26
+ "\n": ".",
27
+ "·": ",",
28
+ "、": ",",
29
+ "...": "…",
30
+ "$": ".",
31
+ "“": "'",
32
+ "”": "'",
33
+ '"': "'",
34
+ "‘": "'",
35
+ "’": "'",
36
+ "(": "'",
37
+ ")": "'",
38
+ "(": "'",
39
+ ")": "'",
40
+ "《": "'",
41
+ "》": "'",
42
+ "【": "'",
43
+ "】": "'",
44
+ "[": "'",
45
+ "]": "'",
46
+ "—": "-",
47
+ "~": "-",
48
+ "~": "-",
49
+ "「": "'",
50
+ "」": "'",
51
+ }
52
+
53
+ tone_modifier = ToneSandhi()
54
+
55
+
56
def replace_punctuation(text):
    """Normalize punctuation and drop every character that is not kept.

    Steps, in order:
      1. Replace "嗯" with "恩" and "呣" with "母".
      2. Replace every key of ``rep_map`` found in the text with its value.
      3. Remove every run of characters that is neither in the range
         U+4E00-U+9FA5 nor an entry of ``punctuation``.

    Returns:
      The cleaned string.
    """
    text = text.replace("嗯", "恩").replace("呣", "母")
    pattern = re.compile("|".join(re.escape(p) for p in rep_map.keys()))

    replaced_text = pattern.sub(lambda x: rep_map[x.group()], text)

    # Escape each punctuation mark before placing it in the character class so
    # that marks such as "-", "]" or "^" cannot be read as class syntax.
    kept_punctuation = "".join(re.escape(p) for p in punctuation)
    replaced_text = re.sub(
        r"[^\u4e00-\u9fa5" + kept_punctuation + r"]+", "", replaced_text
    )

    return replaced_text
67
+
68
+
69
def g2p(text):
    """Convert normalized Chinese text into phones, tones and word2ph.

    The text is split after every punctuation mark, the non-blank pieces are
    passed to ``_g2p``, and a "_" phone with tone 0 and a word2ph entry of 1
    is added at both ends of the result.

    Returns:
      Tuple ``(phones, tones, word2ph)``.
    Raises:
      AssertionError: if ``sum(word2ph) != len(phones)`` or ``word2ph`` does
        not have one entry per character of ``text``.
    """
    # Escape the marks so that none of them is read as character-class syntax.
    marks = "".join(re.escape(p) for p in punctuation)
    pattern = r"(?<=[{0}])\s*".format(marks)
    sentences = [piece for piece in re.split(pattern, text) if piece.strip() != ""]
    phones, tones, word2ph = _g2p(sentences)
    assert sum(word2ph) == len(phones)
    # This may fail for some inputs; callers can wrap the call in try/except.
    assert len(word2ph) == len(text)
    phones = ["_"] + phones + ["_"]
    tones = [0] + tones + [0]
    word2ph = [1] + word2ph + [1]
    return phones, tones, word2ph
79
+
80
+
81
def _get_initials_finals(word):
    """Return ``(initials, finals)`` for ``word`` as two parallel lists.

    Both lists come from ``lazy_pinyin`` (``Style.INITIALS`` and
    ``Style.FINALS_TONE3``, neutral tone written as 5) and are cut to the
    length of the shorter one.
    """
    orig_initials = lazy_pinyin(word, neutral_tone_with_five=True, style=Style.INITIALS)
    orig_finals = lazy_pinyin(
        word, neutral_tone_with_five=True, style=Style.FINALS_TONE3
    )
    pairs = list(zip(orig_initials, orig_finals))
    initials = [initial for initial, _ in pairs]
    finals = [final for _, final in pairs]
    return initials, finals
92
+
93
+
94
def _g2p(segments):
    """Convert text segments into flat phone, tone and word2ph lists.

    For each segment, ASCII letters are removed, the rest is cut into
    ``(word, pos)`` pairs with jieba, tone sandhi is applied through
    ``tone_modifier``, and every (initial, final) pair is looked up in
    ``pinyin_to_symbol_map``.

    Returns:
      Tuple ``(phones_list, tones_list, word2ph)`` where ``word2ph`` holds the
      number of phones produced for each (initial, final) pair.
    Raises:
      AssertionError: if a pair with identical initial and final is not an
        entry of ``punctuation``, if a tone is not one of "12345", or if the
        rewritten pinyin is missing from ``pinyin_to_symbol_map``.
    """
    # The rewrite tables are constant, so build them once per call instead of
    # once per syllable.
    # Finals rewritten when the syllable has an initial.
    v_rep_map = {
        "uei": "ui",
        "iou": "iu",
        "uen": "un",
    }
    # Whole-syllable rewrites when the syllable has no initial.
    pinyin_rep_map = {
        "ing": "ying",
        "i": "yi",
        "in": "yin",
        "u": "wu",
    }
    # First-letter rewrites when the syllable has no initial.
    single_rep_map = {
        "v": "yu",
        "e": "e",
        "i": "y",
        "u": "w",
    }

    phones_list = []
    tones_list = []
    word2ph = []
    for seg in segments:
        # Remove all English words from the sentence.
        seg = re.sub("[a-zA-Z]+", "", seg)
        seg_cut = psg.lcut(seg)
        initials = []
        finals = []
        seg_cut = tone_modifier.pre_merge_for_modify(seg_cut)
        for word, pos in seg_cut:
            if pos == "eng":
                continue
            sub_initials, sub_finals = _get_initials_finals(word)
            sub_finals = tone_modifier.modified_tone(word, pos, sub_finals)
            # extend() keeps the lists flat; the previous append + sum(..., [])
            # re-copied the accumulated list for every word.
            initials.extend(sub_initials)
            finals.extend(sub_finals)

        for c, v in zip(initials, finals):
            raw_pinyin = c + v
            # NOTE: post process for pypinyin outputs
            # we discriminate i, ii and iii
            if c == v:
                # Identical initial and final: the item must be punctuation.
                assert c in punctuation
                phone = [c]
                tone = "0"
                word2ph.append(1)
            else:
                # The last character of the final is the tone digit.
                v_without_tone = v[:-1]
                tone = v[-1]

                pinyin = c + v_without_tone
                assert tone in "12345"

                if c:
                    if v_without_tone in v_rep_map:
                        pinyin = c + v_rep_map[v_without_tone]
                elif pinyin in pinyin_rep_map:
                    pinyin = pinyin_rep_map[pinyin]
                elif pinyin[0] in single_rep_map:
                    pinyin = single_rep_map[pinyin[0]] + pinyin[1:]

                assert pinyin in pinyin_to_symbol_map.keys(), (pinyin, seg, raw_pinyin)
                phone = pinyin_to_symbol_map[pinyin].split(" ")
                word2ph.append(len(phone))

            phones_list += phone
            tones_list += [int(tone)] * len(phone)
    return phones_list, tones_list, word2ph
169
+
170
+
171
def text_normalize(text):
    """Spell out numbers with ``cn2an.an2cn`` and then normalize punctuation.

    Numbers are found with the pattern ``\\d+(?:\\.?\\d+)?`` and replaced one
    at a time, each replacement touching only the first remaining occurrence.
    """
    for number in re.findall(r"\d+(?:\.?\d+)?", text):
        spoken = cn2an.an2cn(number)
        text = text.replace(number, spoken, 1)
    return replace_punctuation(text)
177
+
178
+
179
def get_bert_feature(text, word2ph):
    """Forward to ``text.chinese_bert.get_bert_feature`` (imported lazily)."""
    from text.chinese_bert import get_bert_feature as _chinese_bert_feature

    return _chinese_bert_feature(text, word2ph)
183
+
184
+
185
+ if __name__ == "__main__":
186
+ from text.chinese_bert import get_bert_feature
187
+
188
+ text = "啊!但是《原神》是由,米哈\游自主, [研发]的一款全.新开放世界.冒险游戏"
189
+ text = text_normalize(text)
190
+ print(text)
191
+ phones, tones, word2ph = g2p(text)
192
+ bert = get_bert_feature(text, word2ph)
193
+
194
+ print(phones, tones, word2ph, bert.shape)
195
+
196
+
197
+ # # 示例用法
198
+ # text = "这是一个示例文本:,你好!这是一个测试...."
199
+ # print(g2p_paddle(text)) # 输出: 这是一个示例文本你好这是一个测试
text/chinese_bert.py ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+
3
+ import torch
4
+ from transformers import AutoModelForMaskedLM, AutoTokenizer
5
+
6
+ from config import config
7
+
8
+ LOCAL_PATH = "./bert/chinese-roberta-wwm-ext-large"
9
+
10
+ tokenizer = AutoTokenizer.from_pretrained(LOCAL_PATH)
11
+
12
+ models = dict()
13
+
14
+
15
def get_bert_feature(text, word2ph, device=config.bert_gen_config.device):
    """Return phone-level features for ``text`` from the Chinese BERT model.

    One model per device is loaded on first use and cached in ``models``.
    The third-from-last hidden state of the first batch item is taken, row
    ``i`` is repeated ``word2ph[i]`` times, and the result is transposed.

    Args:
      text: input string passed to the tokenizer
      word2ph: repeat count per feature row; must have ``len(text) + 2`` items
      device: torch device; "cpu" is swapped for "mps" on macOS when MPS is
        available, and a falsy value falls back to "cuda"
    Raises:
      AssertionError: if ``len(word2ph) != len(text) + 2``.
    """
    if (
        sys.platform == "darwin"
        and torch.backends.mps.is_available()
        and device == "cpu"
    ):
        device = "mps"
    if not device:
        device = "cuda"
    if device not in models:
        models[device] = AutoModelForMaskedLM.from_pretrained(LOCAL_PATH).to(device)
    model = models[device]
    with torch.no_grad():
        inputs = tokenizer(text, return_tensors="pt")
        for key in inputs:
            inputs[key] = inputs[key].to(device)
        outputs = model(**inputs, output_hidden_states=True)
        res = torch.cat(outputs["hidden_states"][-3:-2], -1)[0].cpu()

    assert len(word2ph) == len(text) + 2
    phone_level_feature = [
        res[index].repeat(count, 1) for index, count in enumerate(word2ph)
    ]
    return torch.cat(phone_level_feature, dim=0).T
43
+
44
+
45
+ if __name__ == "__main__":
46
+ word_level_feature = torch.rand(38, 1024) # 12个词,每个词1024维特征
47
+ word2phone = [
48
+ 1,
49
+ 2,
50
+ 1,
51
+ 2,
52
+ 2,
53
+ 1,
54
+ 2,
55
+ 2,
56
+ 1,
57
+ 2,
58
+ 2,
59
+ 1,
60
+ 2,
61
+ 2,
62
+ 2,
63
+ 2,
64
+ 2,
65
+ 1,
66
+ 1,
67
+ 2,
68
+ 2,
69
+ 1,
70
+ 2,
71
+ 2,
72
+ 2,
73
+ 2,
74
+ 1,
75
+ 2,
76
+ 2,
77
+ 2,
78
+ 2,
79
+ 2,
80
+ 1,
81
+ 2,
82
+ 2,
83
+ 2,
84
+ 2,
85
+ 1,
86
+ ]
87
+
88
+ # 计算总帧数
89
+ total_frames = sum(word2phone)
90
+ print(word_level_feature.shape)
91
+ print(word2phone)
92
+ phone_level_feature = []
93
+ for i in range(len(word2phone)):
94
+ print(word_level_feature[i].shape)
95
+
96
+ # 对每个词重复word2phone[i]次
97
+ repeat_feature = word_level_feature[i].repeat(word2phone[i], 1)
98
+ phone_level_feature.append(repeat_feature)
99
+
100
+ phone_level_feature = torch.cat(phone_level_feature, dim=0)
101
+ print(phone_level_feature.shape) # torch.Size([36, 1024])
text/cleaner.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from text import chinese, japanese, english, cleaned_text_to_sequence
2
+
3
+
4
+ language_module_map = {"ZH": chinese, "JP": japanese, "EN": english}
5
+
6
+
7
def clean_text(text, language):
    """Normalize ``text`` and run g2p with the module mapped to ``language``.

    Returns:
      Tuple ``(norm_text, phones, tones, word2ph)``.
    """
    handler = language_module_map[language]
    norm_text = handler.text_normalize(text)
    phones, tones, word2ph = handler.g2p(norm_text)
    return norm_text, phones, tones, word2ph
12
+
13
+
14
def clean_text_bert(text, language):
    """Normalize ``text``, run g2p, and compute BERT features for it.

    Returns:
      Tuple ``(phones, tones, bert)`` produced by the module mapped to
      ``language``.
    """
    handler = language_module_map[language]
    norm_text = handler.text_normalize(text)
    phones, tones, word2ph = handler.g2p(norm_text)
    return phones, tones, handler.get_bert_feature(norm_text, word2ph)
20
+
21
+
22
def text_to_sequence(text, language):
    """Clean ``text`` and convert the resulting phones and tones to ID sequences."""
    _norm_text, phones, tones, _word2ph = clean_text(text, language)
    sequence = cleaned_text_to_sequence(phones, tones, language)
    return sequence
25
+
26
+
27
+ if __name__ == "__main__":
28
+ pass
text/cmudict.rep ADDED
The diff for this file is too large to render. See raw diff
 
text/cmudict_cache.pickle ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b9b21b20325471934ba92f2e4a5976989e7d920caa32e7a286eacb027d197949
3
+ size 6212655
text/english.py ADDED
@@ -0,0 +1,453 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pickle
2
+ import os
3
+ import re
4
+ from g2p_en import G2p
5
+ from transformers import DebertaV2Tokenizer
6
+
7
+ from text import symbols
8
+
9
+ current_file_path = os.path.dirname(__file__)
10
+ CMU_DICT_PATH = os.path.join(current_file_path, "cmudict.rep")
11
+ CACHE_PATH = os.path.join(current_file_path, "cmudict_cache.pickle")
12
+ _g2p = G2p()
13
+ LOCAL_PATH = "./bert/deberta-v3-large"
14
+ tokenizer = DebertaV2Tokenizer.from_pretrained(LOCAL_PATH)
15
+
16
+ arpa = {
17
+ "AH0",
18
+ "S",
19
+ "AH1",
20
+ "EY2",
21
+ "AE2",
22
+ "EH0",
23
+ "OW2",
24
+ "UH0",
25
+ "NG",
26
+ "B",
27
+ "G",
28
+ "AY0",
29
+ "M",
30
+ "AA0",
31
+ "F",
32
+ "AO0",
33
+ "ER2",
34
+ "UH1",
35
+ "IY1",
36
+ "AH2",
37
+ "DH",
38
+ "IY0",
39
+ "EY1",
40
+ "IH0",
41
+ "K",
42
+ "N",
43
+ "W",
44
+ "IY2",
45
+ "T",
46
+ "AA1",
47
+ "ER1",
48
+ "EH2",
49
+ "OY0",
50
+ "UH2",
51
+ "UW1",
52
+ "Z",
53
+ "AW2",
54
+ "AW1",
55
+ "V",
56
+ "UW2",
57
+ "AA2",
58
+ "ER",
59
+ "AW0",
60
+ "UW0",
61
+ "R",
62
+ "OW1",
63
+ "EH1",
64
+ "ZH",
65
+ "AE0",
66
+ "IH2",
67
+ "IH",
68
+ "Y",
69
+ "JH",
70
+ "P",
71
+ "AY1",
72
+ "EY0",
73
+ "OY2",
74
+ "TH",
75
+ "HH",
76
+ "D",
77
+ "ER0",
78
+ "CH",
79
+ "AO1",
80
+ "AE1",
81
+ "AO2",
82
+ "OY1",
83
+ "AY2",
84
+ "IH1",
85
+ "OW0",
86
+ "L",
87
+ "SH",
88
+ }
89
+
90
+
91
def post_replace_ph(ph):
    """Map a phone through the local replacement table and validate it.

    Returns:
      The (possibly replaced) phone when it is in ``symbols``, else ``"UNK"``.
    """
    rep_map = {
        ":": ",",
        ";": ",",
        ",": ",",
        "。": ".",
        "!": "!",
        "?": "?",
        "\n": ".",
        "·": ",",
        "、": ",",
        "…": "...",
        "···": "...",
        "・・・": "...",
        "v": "V",
    }
    ph = rep_map.get(ph, ph)
    return ph if ph in symbols else "UNK"
114
+
115
+
116
+ rep_map = {
117
+ ":": ",",
118
+ ";": ",",
119
+ ",": ",",
120
+ "。": ".",
121
+ "!": "!",
122
+ "?": "?",
123
+ "\n": ".",
124
+ ".": ".",
125
+ "…": "...",
126
+ "···": "...",
127
+ "・・・": "...",
128
+ "·": ",",
129
+ "・": ",",
130
+ "、": ",",
131
+ "$": ".",
132
+ "“": "'",
133
+ "”": "'",
134
+ '"': "'",
135
+ "‘": "'",
136
+ "’": "'",
137
+ "(": "'",
138
+ ")": "'",
139
+ "(": "'",
140
+ ")": "'",
141
+ "《": "'",
142
+ "》": "'",
143
+ "【": "'",
144
+ "】": "'",
145
+ "[": "'",
146
+ "]": "'",
147
+ "—": "-",
148
+ "−": "-",
149
+ "~": "-",
150
+ "~": "-",
151
+ "「": "'",
152
+ "」": "'",
153
+ }
154
+
155
+
156
def replace_punctuation(text):
    """Replace every key of ``rep_map`` found in ``text`` with its value."""
    alternation = "|".join(map(re.escape, rep_map.keys()))

    def _lookup(match):
        return rep_map[match.group()]

    return re.compile(alternation).sub(_lookup, text)
170
+
171
+
172
def read_dict():
    """Parse the pronunciation dictionary at ``CMU_DICT_PATH``.

    Lines before line 49 (1-based) are skipped; presumably these are the
    file's header, which should be confirmed against the data file. Each
    remaining line is read as ``WORD <whitespace> PRONUNCIATION``, where the
    pronunciation holds syllables separated by " - " and phones separated by
    whitespace.

    The word is split off with ``split(maxsplit=1)`` so the result does not
    depend on how many spaces separate the two fields; a fixed single-space
    split breaks on the double-space separator. Blank lines and lines without
    a pronunciation are skipped instead of raising ``IndexError``.

    Returns:
      Dict mapping each word to a list of syllables, each a list of phones.
    """
    g2p_dict = {}
    start_line = 49
    with open(CMU_DICT_PATH) as f:
        for line_index, line in enumerate(f, start=1):
            if line_index < start_line:
                continue
            fields = line.strip().split(maxsplit=1)
            if len(fields) < 2:
                continue
            word, pronunciation = fields
            g2p_dict[word] = [
                syllable.split() for syllable in pronunciation.split(" - ")
            ]

    return g2p_dict
194
+
195
+
196
def cache_dict(g2p_dict, file_path):
    """Pickle ``g2p_dict`` into the file at ``file_path``, overwriting it."""
    with open(file_path, "wb") as sink:
        pickle.Pickler(sink).dump(g2p_dict)
199
+
200
+
201
def get_dict():
    """Return the pronunciation dictionary, using the pickle cache if present.

    When ``CACHE_PATH`` does not exist the dictionary is parsed with
    ``read_dict`` and written to ``CACHE_PATH`` for later calls.
    """
    if not os.path.exists(CACHE_PATH):
        g2p_dict = read_dict()
        cache_dict(g2p_dict, CACHE_PATH)
        return g2p_dict

    # NOTE(review): pickle.load can execute code embedded in the file, so the
    # cache file must come from a trusted source.
    with open(CACHE_PATH, "rb") as pickle_file:
        return pickle.load(pickle_file)
210
+
211
+
212
+ eng_dict = get_dict()
213
+
214
+
215
def refine_ph(phn):
    """Split a phone into its lower-cased name and a tone number.

    A phone that ends in a digit yields that digit plus one as the tone and
    loses its last character; any other phone gets tone 0.

    Returns:
      Tuple ``(phone, tone)``.
    """
    if re.search(r"\d$", phn) is None:
        return phn.lower(), 0
    stress = int(phn[-1])
    return phn[:-1].lower(), stress + 1
221
+
222
+
223
def refine_syllables(syllables):
    """Flatten syllables into parallel phone and tone lists via ``refine_ph``.

    Returns:
      Tuple ``(phonemes, tones)`` with one entry per phone, in input order.
    """
    phonemes, tones = [], []
    for syllable in syllables:
        for raw_phone in syllable:
            name, tone = refine_ph(raw_phone)
            phonemes.append(name)
            tones.append(tone)
    return phonemes, tones
233
+
234
+
235
+ import re
236
+ import inflect
237
+
238
+ _inflect = inflect.engine()
239
+ _comma_number_re = re.compile(r"([0-9][0-9\,]+[0-9])")
240
+ _decimal_number_re = re.compile(r"([0-9]+\.[0-9]+)")
241
+ _pounds_re = re.compile(r"£([0-9\,]*[0-9]+)")
242
+ _dollars_re = re.compile(r"\$([0-9\.\,]*[0-9]+)")
243
+ _ordinal_re = re.compile(r"[0-9]+(st|nd|rd|th)")
244
+ _number_re = re.compile(r"[0-9]+")
245
+
246
+ # List of (regular expression, replacement) pairs for abbreviations:
247
+ _abbreviations = [
248
+ (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
249
+ for x in [
250
+ ("mrs", "misess"),
251
+ ("mr", "mister"),
252
+ ("dr", "doctor"),
253
+ ("st", "saint"),
254
+ ("co", "company"),
255
+ ("jr", "junior"),
256
+ ("maj", "major"),
257
+ ("gen", "general"),
258
+ ("drs", "doctors"),
259
+ ("rev", "reverend"),
260
+ ("lt", "lieutenant"),
261
+ ("hon", "honorable"),
262
+ ("sgt", "sergeant"),
263
+ ("capt", "captain"),
264
+ ("esq", "esquire"),
265
+ ("ltd", "limited"),
266
+ ("col", "colonel"),
267
+ ("ft", "fort"),
268
+ ]
269
+ ]
270
+
271
+
272
+ # List of (ipa, lazy ipa) pairs:
273
+ _lazy_ipa = [
274
+ (re.compile("%s" % x[0]), x[1])
275
+ for x in [
276
+ ("r", "ɹ"),
277
+ ("æ", "e"),
278
+ ("ɑ", "a"),
279
+ ("ɔ", "o"),
280
+ ("ð", "z"),
281
+ ("θ", "s"),
282
+ ("ɛ", "e"),
283
+ ("ɪ", "i"),
284
+ ("ʊ", "u"),
285
+ ("ʒ", "ʥ"),
286
+ ("ʤ", "ʥ"),
287
+ ("ˈ", "↓"),
288
+ ]
289
+ ]
290
+
291
+ # List of (ipa, lazy ipa2) pairs:
292
+ _lazy_ipa2 = [
293
+ (re.compile("%s" % x[0]), x[1])
294
+ for x in [
295
+ ("r", "ɹ"),
296
+ ("ð", "z"),
297
+ ("θ", "s"),
298
+ ("ʒ", "ʑ"),
299
+ ("ʤ", "dʑ"),
300
+ ("ˈ", "↓"),
301
+ ]
302
+ ]
303
+
304
+ # List of (ipa, ipa2) pairs
305
+ _ipa_to_ipa2 = [
306
+ (re.compile("%s" % x[0]), x[1]) for x in [("r", "ɹ"), ("ʤ", "dʒ"), ("ʧ", "tʃ")]
307
+ ]
308
+
309
+
310
+ def _expand_dollars(m):
311
+ match = m.group(1)
312
+ parts = match.split(".")
313
+ if len(parts) > 2:
314
+ return match + " dollars" # Unexpected format
315
+ dollars = int(parts[0]) if parts[0] else 0
316
+ cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0
317
+ if dollars and cents:
318
+ dollar_unit = "dollar" if dollars == 1 else "dollars"
319
+ cent_unit = "cent" if cents == 1 else "cents"
320
+ return "%s %s, %s %s" % (dollars, dollar_unit, cents, cent_unit)
321
+ elif dollars:
322
+ dollar_unit = "dollar" if dollars == 1 else "dollars"
323
+ return "%s %s" % (dollars, dollar_unit)
324
+ elif cents:
325
+ cent_unit = "cent" if cents == 1 else "cents"
326
+ return "%s %s" % (cents, cent_unit)
327
+ else:
328
+ return "zero dollars"
329
+
330
+
331
+ def _remove_commas(m):
332
+ return m.group(1).replace(",", "")
333
+
334
+
335
def _expand_ordinal(m):
    """Return the whole match ``m`` spelled out by the inflect engine."""
    ordinal = m.group(0)
    return _inflect.number_to_words(ordinal)
337
+
338
+
339
def _expand_number(m):
    """Spell out the integer matched by ``m``.

    Values strictly between 1000 and 3000 get special forms: 2000 becomes
    "two thousand", 2001-2009 become "two thousand <n>", multiples of 100
    become "<n> hundred", and the rest are read in two-digit groups with
    "oh" for zero. All other values use the plain inflect spelling.
    """
    num = int(m.group(0))
    if not (num > 1000 and num < 3000):
        return _inflect.number_to_words(num, andword="")
    if num == 2000:
        return "two thousand"
    if num > 2000 and num < 2010:
        return "two thousand " + _inflect.number_to_words(num % 100)
    if num % 100 == 0:
        return _inflect.number_to_words(num // 100) + " hundred"
    grouped = _inflect.number_to_words(num, andword="", zero="oh", group=2)
    return grouped.replace(", ", " ")
354
+
355
+
356
+ def _expand_decimal_point(m):
357
+ return m.group(1).replace(".", " point ")
358
+
359
+
360
def normalize_numbers(text):
    """Rewrite numeric expressions in ``text`` as words.

    The substitutions run in a fixed order: comma removal, pounds, dollars,
    decimals, ordinals, then plain integers.
    """
    steps = (
        (_comma_number_re, _remove_commas),
        (_pounds_re, r"\1 pounds"),
        (_dollars_re, _expand_dollars),
        (_decimal_number_re, _expand_decimal_point),
        (_ordinal_re, _expand_ordinal),
        (_number_re, _expand_number),
    )
    for pattern, replacement in steps:
        text = re.sub(pattern, replacement, text)
    return text
368
+
369
+
370
def text_normalize(text):
    """Spell out numbers, normalize punctuation, and space out punctuation.

    The last step inserts a space between one of ``, ; . ? !`` and a word
    character that follows it directly.
    """
    normalized = replace_punctuation(normalize_numbers(text))
    return re.sub(r"([,;.\?\!])([\w])", r"\1 \2", normalized)
375
+
376
+
377
def distribute_phone(n_phone, n_word):
    """Spread ``n_phone`` phones over ``n_word`` slots as evenly as possible.

    Every slot receives ``n_phone // n_word`` phones and the first
    ``n_phone % n_word`` slots receive one more. This is the result of handing
    phones out one at a time to the first slot with the fewest phones, computed
    directly instead of rescanning all slots for every phone.

    Args:
      n_phone: number of phones to distribute
      n_word: number of slots
    Returns:
      List of ``n_word`` counts that sum to ``n_phone``.
    Raises:
      ValueError: if there are phones to distribute but no slot.
    """
    if n_phone <= 0:
        # Nothing to hand out: every slot stays at zero.
        return [0] * n_word
    if n_word <= 0:
        raise ValueError("cannot distribute phones over zero words")
    base, extra = divmod(n_phone, n_word)
    return [base + 1] * extra + [base] * (n_word - extra)
384
+
385
+
386
def sep_text(text):
    """Split ``text`` into words and separators, dropping blank pieces.

    The separators are kept in the output. Because "+" sits inside the
    character class it is treated as a separator too.
    """
    separator = re.compile(r"([,;.\?\!\s+])")
    return [piece for piece in separator.split(text) if piece.strip()]
390
+
391
+
392
def g2p(text):
    """Convert normalized English text into phones, tones and word2ph.

    Each word from ``sep_text`` is looked up (upper-cased) in ``eng_dict``;
    words that are missing go through the ``_g2p`` model. The phones of a word
    are spread over its tokenizer tokens with ``distribute_phone``. A "_"
    phone with tone 0 and a word2ph entry of 1 is added at both ends.

    Returns:
      Tuple ``(phones, tones, word2ph)``.
    Raises:
      AssertionError: if the phone and tone counts differ, or the phone count
        does not equal ``sum(word2ph)``.
    """
    words = sep_text(text)
    tokens = [tokenizer.tokenize(word) for word in words]

    phones = []
    tones = []
    for word in words:
        entry = word.upper()
        if entry in eng_dict:
            phns, tns = refine_syllables(eng_dict[entry])
        else:
            phns = []
            tns = []
            for ph in _g2p(word):
                if ph == " ":
                    continue
                if ph in arpa:
                    ph, tn = refine_ph(ph)
                else:
                    # Not an ARPAbet phone: keep it as-is with tone 0.
                    tn = 0
                phns.append(ph)
                tns.append(tn)
        phones.append([post_replace_ph(ph) for ph in phns])
        tones.append(tns)

    word2ph = []
    for token, phoneme in zip(tokens, phones):
        word2ph += distribute_phone(len(phoneme), len(token))

    phones = ["_"] + [ph for group in phones for ph in group] + ["_"]
    tones = [0] + [tn for group in tones for tn in group] + [0]
    word2ph = [1] + word2ph + [1]
    assert len(phones) == len(tones), text
    assert len(phones) == sum(word2ph), text

    return phones, tones, word2ph
436
+
437
+
438
def get_bert_feature(text, word2ph):
    """Forward to ``text.english_bert_mock.get_bert_feature`` (imported lazily)."""
    from text.english_bert_mock import get_bert_feature as _english_bert_feature

    return _english_bert_feature(text, word2ph)
442
+
443
+
444
+ if __name__ == "__main__":
445
+ # print(get_dict())
446
+ # print(eng_word_to_phoneme("hello"))
447
+ print(g2p("In this paper, we propose 1 DSPGAN, a GAN-based universal vocoder."))
448
+ # all_phones = set()
449
+ # for k, syllables in eng_dict.items():
450
+ # for group in syllables:
451
+ # for ph in group:
452
+ # all_phones.add(ph)
453
+ # print(all_phones)
text/english_bert_mock.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+
3
+ import torch
4
+ from transformers import DebertaV2Model, DebertaV2Tokenizer
5
+
6
+ from config import config
7
+
8
+
9
+ LOCAL_PATH = "./bert/deberta-v3-large"
10
+
11
+ tokenizer = DebertaV2Tokenizer.from_pretrained(LOCAL_PATH)
12
+
13
+ models = dict()
14
+
15
+
16
def get_bert_feature(text, word2ph, device=config.bert_gen_config.device):
    """Return phone-level features for ``text`` from the DeBERTa model.

    One model per device is loaded on first use and cached in ``models``.
    The third-from-last hidden state of the first batch item is taken, row
    ``i`` is repeated ``word2ph[i]`` times, and the result is transposed.

    Args:
      text: input string passed to the tokenizer
      word2ph: repeat count per feature row; its length must equal the number
        of feature rows
      device: torch device; "cpu" is swapped for "mps" on macOS when MPS is
        available, and a falsy value falls back to "cuda"
    Raises:
      AssertionError: if ``len(word2ph)`` differs from the number of rows.
    """
    if (
        sys.platform == "darwin"
        and torch.backends.mps.is_available()
        and device == "cpu"
    ):
        device = "mps"
    if not device:
        device = "cuda"
    if device not in models:
        models[device] = DebertaV2Model.from_pretrained(LOCAL_PATH).to(device)
    model = models[device]
    with torch.no_grad():
        inputs = tokenizer(text, return_tensors="pt")
        for key in inputs:
            inputs[key] = inputs[key].to(device)
        outputs = model(**inputs, output_hidden_states=True)
        res = torch.cat(outputs["hidden_states"][-3:-2], -1)[0].cpu()
    assert len(word2ph) == res.shape[0], (text, res.shape[0], len(word2ph))
    phone_level_feature = [
        res[index].repeat(count, 1) for index, count in enumerate(word2ph)
    ]
    return torch.cat(phone_level_feature, dim=0).T
text/japanese.py ADDED
@@ -0,0 +1,432 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Convert Japanese text to phonemes which is
2
+ # compatible with Julius https://github.com/julius-speech/segmentation-kit
3
+ import re
4
+ import unicodedata
5
+
6
+ from transformers import AutoTokenizer
7
+
8
+ from text import punctuation, symbols
9
+
10
+ from num2words import num2words
11
+
12
+ import pyopenjtalk
13
+ import jaconv
14
+
15
+
16
def kata2phoneme(text: str) -> list:
    """Convert katakana text to a list of phoneme strings.

    A leading "ー" is returned as its own item. Leading characters that match
    ``_MARKS`` are emitted one character at a time, and the first run that is
    not a mark is handed to ``pyopenjtalk.g2p`` (lower-cased, "cl" rewritten
    to "q", split on spaces), which ends the conversion.

    Returns:
      List of phoneme / mark strings (the function returns a list, not a str).
    """
    text = text.strip()
    if text == "ー":
        return ["ー"]
    elif text.startswith("ー"):
        return ["ー"] + kata2phoneme(text[1:])
    res = []
    # NOTE(review): ``prev`` is never assigned a non-None value here, so the
    # long-vowel branch below only drops the "ー" and never appends anything.
    prev = None
    while text:
        if re.match(_MARKS, text):
            # Emit only the matched leading character; appending the whole
            # remaining text would repeat the tail of the string in every item.
            res.append(text[0])
            text = text[1:]
            continue
        if text.startswith("ー"):
            if prev:
                res.append(prev[-1])
            text = text[1:]
            continue
        res += pyopenjtalk.g2p(text).lower().replace("cl", "q").split(" ")
        break
    return res
39
+
40
+
41
def hira2kata(text: str) -> str:
    """Return ``text`` converted with ``jaconv.hira2kata``."""
    converted = jaconv.hira2kata(text)
    return converted
43
+
44
+
45
+ _SYMBOL_TOKENS = set(list("・、。?!"))
46
+ _NO_YOMI_TOKENS = set(list("「」『』―()[][]"))
47
+ _MARKS = re.compile(
48
+ r"[^A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]"
49
+ )
50
+
51
+
52
def text2kata(text: str) -> str:
    """Convert Japanese text to one katakana string via the pyopenjtalk frontend.

    For each parsed part, the pronunciation is used when present; when the
    pronunciation starts with a character matched by ``_MARKS`` the
    (punctuation-normalized) surface string is used instead, or "," if it is
    neither a key nor a value of ``rep_map``. Parts without a pronunciation
    are kept as-is, except that tokens in ``_NO_YOMI_TOKENS`` are dropped and
    a small tsu is normalized to "ッ".
    """
    parsed = pyopenjtalk.run_frontend(text)

    res = []
    for parts in parsed:
        word, yomi = replace_punctuation(parts["string"]), parts["pron"].replace(
            "’", ""
        )
        if yomi:
            if re.match(_MARKS, yomi):
                if len(word) > 1:
                    # Multi-character mark: normalize and emit each character.
                    # (This function keeps no ``sep`` list, so nothing else is
                    # recorded here; referencing one raised NameError.)
                    res += [replace_punctuation(i) for i in list(word)]
                    continue
                elif word not in rep_map.keys() and word not in rep_map.values():
                    word = ","
                yomi = word
            res.append(yomi)
        else:
            if word in _SYMBOL_TOKENS:
                res.append(word)
            elif word in ("っ", "ッ"):
                res.append("ッ")
            elif word in _NO_YOMI_TOKENS:
                pass
            else:
                res.append(word)
    return hira2kata("".join(res))
82
+
83
+
84
def text2sep_kata(text: str) -> tuple:
    """Convert Japanese text to per-part surface strings, katakana and accents.

    Uses the same per-part rules as ``text2kata`` but keeps the parts
    separate and also records the surface strings.

    Returns:
      A 3-tuple ``(sep, kata, accents)``: the surface strings, the katakana
      reading of each emitted item, and the result of ``get_accent`` for the
      parsed text. (The annotation is ``tuple`` because three values are
      returned.)
    """
    parsed = pyopenjtalk.run_frontend(text)

    res = []
    sep = []
    for parts in parsed:
        word, yomi = replace_punctuation(parts["string"]), parts["pron"].replace(
            "’", ""
        )
        if yomi:
            if re.match(_MARKS, yomi):
                if len(word) > 1:
                    # Multi-character mark: normalize and emit each character
                    # as both reading and surface string.
                    word = [replace_punctuation(i) for i in list(word)]
                    yomi = word
                    res += yomi
                    sep += word
                    continue
                elif word not in rep_map.keys() and word not in rep_map.values():
                    word = ","
                yomi = word
            res.append(yomi)
        else:
            if word in _SYMBOL_TOKENS:
                res.append(word)
            elif word in ("っ", "ッ"):
                res.append("ッ")
            elif word in _NO_YOMI_TOKENS:
                pass
            else:
                res.append(word)
        sep.append(word)
    return sep, [hira2kata(i) for i in res], get_accent(parsed)
116
+
117
+
118
def get_accent(parsed):
    """Return ``(phoneme, accent)`` pairs from the labels of ``parsed``.

    Labels whose phoneme is "sil" or "pau" are skipped. For the others the
    accent is -1 when the label's A1 value is 0 and the next label's A2 value
    is this label's A2 plus one, 1 when A2 goes from 1 to 2, and 0 otherwise.
    A following "sil"/"pau" label counts as A2 = -1.
    """
    pauses = ["sil", "pau"]
    labels = pyopenjtalk.make_label(parsed)

    phonemes = []
    accents = []
    for n, label in enumerate(labels):
        phoneme = re.search(r"\-([^\+]*)\+", label).group(1)
        if phoneme in pauses:
            continue
        phonemes.append(phoneme.replace("cl", "q").lower())

        a1 = int(re.search(r"/A:(\-?[0-9]+)\+", label).group(1))
        a2 = int(re.search(r"\+(\d+)\+", label).group(1))
        following = labels[n + 1]
        if re.search(r"\-([^\+]*)\+", following).group(1) in pauses:
            a2_next = -1
        else:
            a2_next = int(re.search(r"\+(\d+)\+", following).group(1))

        if a1 == 0 and a2_next == a2 + 1:
            accent = -1  # Falling
        elif a2 == 1 and a2_next == 2:
            accent = 1  # Rising
        else:
            accent = 0
        accents.append(accent)
    return list(zip(phonemes, accents))
144
+
145
+
146
+ _ALPHASYMBOL_YOMI = {
147
+ "#": "シャープ",
148
+ "%": "パーセント",
149
+ "&": "アンド",
150
+ "+": "プラス",
151
+ "-": "マイナス",
152
+ ":": "コロン",
153
+ ";": "セミコロン",
154
+ "<": "小なり",
155
+ "=": "イコール",
156
+ ">": "大なり",
157
+ "@": "アット",
158
+ "a": "エー",
159
+ "b": "ビー",
160
+ "c": "シー",
161
+ "d": "ディー",
162
+ "e": "イー",
163
+ "f": "エフ",
164
+ "g": "ジー",
165
+ "h": "エイチ",
166
+ "i": "アイ",
167
+ "j": "ジェー",
168
+ "k": "ケー",
169
+ "l": "エル",
170
+ "m": "エム",
171
+ "n": "エヌ",
172
+ "o": "オー",
173
+ "p": "ピー",
174
+ "q": "キュー",
175
+ "r": "アール",
176
+ "s": "エス",
177
+ "t": "ティー",
178
+ "u": "ユー",
179
+ "v": "ブイ",
180
+ "w": "ダブリュー",
181
+ "x": "エックス",
182
+ "y": "ワイ",
183
+ "z": "ゼット",
184
+ "α": "アルファ",
185
+ "β": "ベータ",
186
+ "γ": "ガンマ",
187
+ "δ": "デルタ",
188
+ "ε": "イプシロン",
189
+ "ζ": "ゼータ",
190
+ "η": "イータ",
191
+ "θ": "シータ",
192
+ "ι": "イオタ",
193
+ "κ": "カッパ",
194
+ "λ": "ラムダ",
195
+ "μ": "ミュー",
196
+ "ν": "ニュー",
197
+ "ξ": "クサイ",
198
+ "ο": "オミクロン",
199
+ "π": "パイ",
200
+ "ρ": "ロー",
201
+ "σ": "シグマ",
202
+ "τ": "タウ",
203
+ "υ": "ウプシロン",
204
+ "φ": "ファイ",
205
+ "χ": "カイ",
206
+ "ψ": "プサイ",
207
+ "ω": "オメガ",
208
+ }
209
+
210
+
211
+ _NUMBER_WITH_SEPARATOR_RX = re.compile("[0-9]{1,3}(,[0-9]{3})+")
212
+ _CURRENCY_MAP = {"$": "ドル", "¥": "円", "£": "ポンド", "€": "ユーロ"}
213
+ _CURRENCY_RX = re.compile(r"([$¥£€])([0-9.]*[0-9])")
214
+ _NUMBER_RX = re.compile(r"[0-9]+(\.[0-9]+)?")
215
+
216
+
217
def japanese_convert_numbers_to_words(text: str) -> str:
    """Spell out the numbers found in ``text`` as Japanese words.

    Three passes are applied in order: thousands separators are removed,
    a leading currency sign is moved behind its amount and replaced by the
    mapped currency word, and every remaining number is converted with
    ``num2words(..., lang="ja")``.
    """

    def drop_separators(match):
        return match[0].replace(",", "")

    def currency_to_suffix(match):
        sign, amount = match[1], match[2]
        return amount + _CURRENCY_MAP.get(sign, sign)

    def spell_out(match):
        return num2words(match[0], lang="ja")

    converted = _NUMBER_WITH_SEPARATOR_RX.sub(drop_separators, text)
    converted = _CURRENCY_RX.sub(currency_to_suffix, converted)
    return _NUMBER_RX.sub(spell_out, converted)
222
+
223
+
224
def japanese_convert_alpha_symbols_to_words(text: str) -> str:
    """Replace each character of lower-cased ``text`` found in ``_ALPHASYMBOL_YOMI``.

    Characters without an entry in the table are kept as they are.
    """
    pieces = []
    for ch in text.lower():
        pieces.append(_ALPHASYMBOL_YOMI.get(ch, ch))
    return "".join(pieces)
226
+
227
+
228
def japanese_text_to_phonemes(text: str) -> str:
    """Convert Japanese text to phonemes.

    The text is NFKC-normalized, its numbers are spelled out, and the result
    is passed through ``text2kata`` and then ``kata2phoneme``.
    """
    normalized = unicodedata.normalize("NFKC", text)
    spelled = japanese_convert_numbers_to_words(normalized)
    # japanese_convert_alpha_symbols_to_words is currently not applied here.
    return kata2phoneme(text2kata(spelled))
236
+
237
+
238
def is_japanese_character(char):
    """Return True when ``char`` falls inside one of the listed Unicode ranges.

    Args:
        char: A single character (it is passed to ``ord``).

    Returns:
        True for hiragana, katakana and the CJK ideograph ranges below,
        False otherwise.
    """
    code_point = ord(char)
    return (
        0x3040 <= code_point <= 0x309F  # hiragana
        or 0x30A0 <= code_point <= 0x30FF  # katakana
        or 0x4E00 <= code_point <= 0x9FFF  # CJK Unified Ideographs
        or 0x3400 <= code_point <= 0x4DBF  # CJK extension A
        or 0x20000 <= code_point <= 0x2A6DF  # CJK extension B
        # further ideograph extension ranges can be added here if needed
    )
258
+
259
+
260
+ rep_map = {
261
+ ":": ",",
262
+ ";": ",",
263
+ ",": ",",
264
+ "。": ".",
265
+ "!": "!",
266
+ "?": "?",
267
+ "\n": ".",
268
+ ".": ".",
269
+ "…": "...",
270
+ "···": "...",
271
+ "・・・": "...",
272
+ "·": ",",
273
+ "・": ",",
274
+ "、": ",",
275
+ "$": ".",
276
+ "“": "'",
277
+ "”": "'",
278
+ '"': "'",
279
+ "‘": "'",
280
+ "’": "'",
281
+ "(": "'",
282
+ ")": "'",
283
+ "(": "'",
284
+ ")": "'",
285
+ "《": "'",
286
+ "》": "'",
287
+ "【": "'",
288
+ "】": "'",
289
+ "[": "'",
290
+ "]": "'",
291
+ "—": "-",
292
+ "−": "-",
293
+ "~": "-",
294
+ "~": "-",
295
+ "「": "'",
296
+ "」": "'",
297
+ }
298
+
299
+
300
def replace_punctuation(text):
    """Normalize punctuation and drop characters outside the allowed set.

    Every key of ``rep_map`` found in ``text`` is replaced by its mapped
    value. Afterwards every run of characters that is neither in the listed
    kana/ideograph ranges nor in ``punctuation`` is removed.

    Args:
        text: Text to clean.

    Returns:
        The cleaned text.
    """
    # Longest keys first so a multi-character key always wins over a key
    # that is its prefix, independent of the insertion order of rep_map.
    keys = sorted(rep_map, key=len, reverse=True)
    pattern = re.compile("|".join(re.escape(p) for p in keys))

    replaced_text = pattern.sub(lambda x: rep_map[x.group()], text)

    # Escape the punctuation so characters such as "-" stay literal inside
    # the character class regardless of their position in the list.
    allowed_punctuation = "".join(re.escape(p) for p in punctuation)
    replaced_text = re.sub(
        r"[^\u3040-\u309F\u30A0-\u30FF\u4E00-\u9FFF\u3400-\u4DBF\u3005"
        + allowed_punctuation
        + r"]+",
        "",
        replaced_text,
    )

    return replaced_text
314
+
315
+
316
def text_normalize(text):
    """Normalize raw text before grapheme-to-phoneme conversion.

    Applies NFKC normalization, spells out numbers, normalizes punctuation
    via ``replace_punctuation`` and finally removes every "゙" character.
    """
    normalized = unicodedata.normalize("NFKC", text)
    normalized = japanese_convert_numbers_to_words(normalized)
    # The is_japanese_character filter is currently not applied here.
    normalized = replace_punctuation(normalized)
    return normalized.replace("゙", "")
323
+
324
+
325
def distribute_phone(n_phone, n_word):
    """Distribute ``n_phone`` phonemes over ``n_word`` tokens as evenly as possible.

    Every token receives ``n_phone // n_word`` phonemes and the remainder is
    given, one each, to the leading tokens. This is the closed form of
    repeatedly adding one phoneme to the first least-loaded token.

    Args:
        n_phone: Number of phonemes to distribute.
        n_word: Number of tokens receiving phonemes.

    Returns:
        A list of length ``n_word`` whose entries sum to ``n_phone``.

    Raises:
        ValueError: If phonemes must be placed but there is no token.
    """
    if n_word <= 0:
        if n_phone > 0:
            raise ValueError(
                f"cannot distribute {n_phone} phonemes over {n_word} tokens"
            )
        return []
    if n_phone <= 0:
        return [0] * n_word
    base, extra = divmod(n_phone, n_word)
    return [base + 1] * extra + [base] * (n_word - extra)
332
+
333
+
334
def handle_long(sep_phonemes):
    """Resolve long-vowel marks ("ー") inside per-word phoneme lists, in place.

    A mark at the start of a group is replaced by the last phoneme of the
    group at index ``i - 1``; a mark elsewhere is replaced by the last
    character of the phoneme just before it in the same group.

    Args:
        sep_phonemes: List of phoneme lists, one list per word. It is
            modified in place.

    Returns:
        The same ``sep_phonemes`` list.
    """
    for i, group in enumerate(sep_phonemes):
        if not group:
            # An empty group has nothing to resolve; indexing it would raise.
            continue
        if group[0] == "ー":
            # NOTE(review): for i == 0 the index i - 1 wraps around to the
            # last group. That behavior is kept unchanged; confirm whether
            # it is intended.
            previous = sep_phonemes[i - 1]
            if previous:
                group[0] = previous[-1]
        if "ー" in group:
            for j, phoneme in enumerate(group):
                if phoneme == "ー":
                    group[j] = group[j - 1][-1]
    return sep_phonemes
343
+
344
+
345
+ tokenizer = AutoTokenizer.from_pretrained("./bert/deberta-v2-large-japanese-char-wwm")
346
+
347
+
348
def align_tones(phones, tones):
    """Align ``(phoneme, accent)`` pairs with per-word phoneme lists.

    ``tones`` is consumed front to back: a pair is used when its phoneme
    equals the current phoneme of ``phones``. Inside a word the accent
    values are accumulated, shifted right by one position, and raised by one
    if the word contains ``-1``.

    Args:
        phones: List of phoneme lists, one per word.
        tones: Sequence of ``(phoneme, accent)`` pairs. It is not modified.

    Returns:
        A flat list with one value (0 or 1) per phoneme.

    Raises:
        AssertionError: If a resulting value lies outside ``0..1``.
    """
    res = []
    cursor = 0  # index of the next unconsumed pair in ``tones``
    total = len(tones)
    for pho in phones:
        temp = [0] * len(pho)
        for idx, p in enumerate(pho):
            if cursor >= total:
                break
            pair = tones[cursor]
            if p == pair[0]:
                temp[idx] = pair[1]
                if idx > 0:
                    temp[idx] += temp[idx - 1]
                cursor += 1
        # Shift right by one: each phoneme takes the value accumulated up to
        # the phoneme before it.
        temp = ([0] + temp)[:-1]
        if -1 in temp:
            temp = [value + 1 for value in temp]
        res.append(temp)
    res = [value for word in res for value in word]
    assert not any(i < 0 for i in res) and not any(i > 1 for i in res)
    return res
368
+
369
+
370
def rearrange_tones(tones, phones):
    """Re-code a tone sequence relative to its predecessor.

    Codes written to the result: ``1`` same level as before (or first
    non-punctuation element), ``0`` punctuation, ``2`` higher than the
    previous tone, and ``3`` on the element before a drop.

    Args:
        tones: Sequence of comparable tone values.
        phones: Sequence of phonemes parallel to ``tones``.

    Returns:
        A list of codes with the same length as ``tones``.
    """
    res = [0] * len(tones)
    prev = None
    for i, tone in enumerate(tones):
        if i == 0:
            # Punctuation is detected on the phoneme, as in the equal-tone
            # branch below; the tone value itself is never punctuation.
            if phones[i] not in punctuation:
                res[i] = 1
        elif tone == prev:
            res[i] = 0 if phones[i] in punctuation else 1
        elif tone > prev:
            res[i] = 2
        elif tone < prev:
            res[i - 1] = 3
            res[i] = 1
        prev = tone
    return res
388
+
389
+
390
def g2p(norm_text):
    """Convert normalized text into phonemes, tones and a word-to-phoneme map.

    Args:
        norm_text: Text that has already been normalized.

    Returns:
        ``(phones, tones, word2ph)``: the phoneme list padded with ``"_"`` on
        both sides, the tone list padded with ``0`` on both sides, and the
        number of phonemes per token padded with ``1`` on both sides.

    Raises:
        AssertionError: If a phoneme is not in ``symbols`` or if phonemes
            and tones end up with different lengths.
    """
    sep_text, sep_kata, acc = text2sep_kata(norm_text)

    # Punctuation is kept as a single token; everything else is tokenized.
    sep_tokenized = [
        [piece] if piece in punctuation else tokenizer.tokenize(piece)
        for piece in sep_text
    ]

    sep_phonemes = handle_long([kata2phoneme(kata) for kata in sep_kata])
    # Error handling: words MeCab does not recognize are carried all the way
    # to this point and fail here. So far only extremely rare words do this.
    for group in sep_phonemes:
        for phoneme in group:
            assert phoneme in symbols, (sep_text, sep_kata, sep_phonemes)
    tones = align_tones(sep_phonemes, acc)

    word2ph = []
    for token, phoneme in zip(sep_tokenized, sep_phonemes):
        word2ph += distribute_phone(len(phoneme), len(token))

    flat_phonemes = [p for group in sep_phonemes for p in group]
    phones = ["_"] + flat_phonemes + ["_"]
    # rearrange_tones is currently not applied to the tones.
    tones = [0] + tones + [0]
    word2ph = [1] + word2ph + [1]
    assert len(phones) == len(tones)
    return phones, tones, word2ph
419
+
420
+
421
if __name__ == "__main__":
    # Small demo: normalize a sample sentence, run g2p and fetch the BERT
    # features for it.
    #
    # The module-level tokenizer is used unchanged. get_bert_feature loads
    # its tokenizer from the same "char-wwm" checkpoint and asserts
    # len(word2ph) == len(text) + 2, so g2p must build word2ph with that
    # tokenizer rather than one rebound to a different checkpoint.
    from text.japanese_bert import get_bert_feature

    text = "hello,こんにちは、世界ー!……"
    text = text_normalize(text)
    print(text)

    phones, tones, word2ph = g2p(text)
    bert = get_bert_feature(text, word2ph)

    print(phones, tones, word2ph, bert.shape)
text/japanese_bert.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+
3
+ import torch
4
+ from transformers import AutoModelForMaskedLM, AutoTokenizer
5
+
6
+ from config import config
7
+ from text.japanese import text2sep_kata
8
+
9
+ LOCAL_PATH = "./bert/deberta-v2-large-japanese-char-wwm"
10
+
11
+ tokenizer = AutoTokenizer.from_pretrained(LOCAL_PATH)
12
+
13
+ models = dict()
14
+
15
+
16
def get_bert_feature(text, word2ph, device=config.bert_gen_config.device):
    """Compute phoneme-level BERT features for ``text``.

    The text is rebuilt from the words returned by ``text2sep_kata``, run
    through the masked-LM model cached per device in ``models``, and the
    third-from-last hidden state is taken. Each token vector is repeated as
    many times as ``word2ph`` says.

    Args:
        text: Input text.
        word2ph: Number of phonemes per token; its length must equal
            ``len(text) + 2`` for the rebuilt text.
        device: Device name. ``"cpu"`` is switched to ``"mps"`` on macOS
            when MPS is available; a falsy value becomes ``"cuda"``.

    Returns:
        The transposed concatenation of the repeated token features.

    Raises:
        AssertionError: If ``word2ph`` does not match the text length.
    """
    text = "".join(text2sep_kata(text)[0])

    if (
        sys.platform == "darwin"
        and torch.backends.mps.is_available()
        and device == "cpu"
    ):
        device = "mps"
    if not device:
        device = "cuda"

    # Load the model once per device and reuse it afterwards.
    if device not in models:
        models[device] = AutoModelForMaskedLM.from_pretrained(LOCAL_PATH).to(device)

    with torch.no_grad():
        inputs = tokenizer(text, return_tensors="pt")
        for key in inputs:
            inputs[key] = inputs[key].to(device)
        outputs = models[device](**inputs, output_hidden_states=True)
        hidden = torch.cat(outputs["hidden_states"][-3:-2], -1)[0].cpu()

    assert len(word2ph) == len(text) + 2
    repeated = [
        hidden[index].repeat(count, 1) for index, count in enumerate(word2ph)
    ]
    phone_level_feature = torch.cat(repeated, dim=0)

    return phone_level_feature.T
text/opencpop-strict.txt ADDED
@@ -0,0 +1,429 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ a AA a
2
+ ai AA ai
3
+ an AA an
4
+ ang AA ang
5
+ ao AA ao
6
+ ba b a
7
+ bai b ai
8
+ ban b an
9
+ bang b ang
10
+ bao b ao
11
+ bei b ei
12
+ ben b en
13
+ beng b eng
14
+ bi b i
15
+ bian b ian
16
+ biao b iao
17
+ bie b ie
18
+ bin b in
19
+ bing b ing
20
+ bo b o
21
+ bu b u
22
+ ca c a
23
+ cai c ai
24
+ can c an
25
+ cang c ang
26
+ cao c ao
27
+ ce c e
28
+ cei c ei
29
+ cen c en
30
+ ceng c eng
31
+ cha ch a
32
+ chai ch ai
33
+ chan ch an
34
+ chang ch ang
35
+ chao ch ao
36
+ che ch e
37
+ chen ch en
38
+ cheng ch eng
39
+ chi ch ir
40
+ chong ch ong
41
+ chou ch ou
42
+ chu ch u
43
+ chua ch ua
44
+ chuai ch uai
45
+ chuan ch uan
46
+ chuang ch uang
47
+ chui ch ui
48
+ chun ch un
49
+ chuo ch uo
50
+ ci c i0
51
+ cong c ong
52
+ cou c ou
53
+ cu c u
54
+ cuan c uan
55
+ cui c ui
56
+ cun c un
57
+ cuo c uo
58
+ da d a
59
+ dai d ai
60
+ dan d an
61
+ dang d ang
62
+ dao d ao
63
+ de d e
64
+ dei d ei
65
+ den d en
66
+ deng d eng
67
+ di d i
68
+ dia d ia
69
+ dian d ian
70
+ diao d iao
71
+ die d ie
72
+ ding d ing
73
+ diu d iu
74
+ dong d ong
75
+ dou d ou
76
+ du d u
77
+ duan d uan
78
+ dui d ui
79
+ dun d un
80
+ duo d uo
81
+ e EE e
82
+ ei EE ei
83
+ en EE en
84
+ eng EE eng
85
+ er EE er
86
+ fa f a
87
+ fan f an
88
+ fang f ang
89
+ fei f ei
90
+ fen f en
91
+ feng f eng
92
+ fo f o
93
+ fou f ou
94
+ fu f u
95
+ ga g a
96
+ gai g ai
97
+ gan g an
98
+ gang g ang
99
+ gao g ao
100
+ ge g e
101
+ gei g ei
102
+ gen g en
103
+ geng g eng
104
+ gong g ong
105
+ gou g ou
106
+ gu g u
107
+ gua g ua
108
+ guai g uai
109
+ guan g uan
110
+ guang g uang
111
+ gui g ui
112
+ gun g un
113
+ guo g uo
114
+ ha h a
115
+ hai h ai
116
+ han h an
117
+ hang h ang
118
+ hao h ao
119
+ he h e
120
+ hei h ei
121
+ hen h en
122
+ heng h eng
123
+ hong h ong
124
+ hou h ou
125
+ hu h u
126
+ hua h ua
127
+ huai h uai
128
+ huan h uan
129
+ huang h uang
130
+ hui h ui
131
+ hun h un
132
+ huo h uo
133
+ ji j i
134
+ jia j ia
135
+ jian j ian
136
+ jiang j iang
137
+ jiao j iao
138
+ jie j ie
139
+ jin j in
140
+ jing j ing
141
+ jiong j iong
142
+ jiu j iu
143
+ ju j v
144
+ jv j v
145
+ juan j van
146
+ jvan j van
147
+ jue j ve
148
+ jve j ve
149
+ jun j vn
150
+ jvn j vn
151
+ ka k a
152
+ kai k ai
153
+ kan k an
154
+ kang k ang
155
+ kao k ao
156
+ ke k e
157
+ kei k ei
158
+ ken k en
159
+ keng k eng
160
+ kong k ong
161
+ kou k ou
162
+ ku k u
163
+ kua k ua
164
+ kuai k uai
165
+ kuan k uan
166
+ kuang k uang
167
+ kui k ui
168
+ kun k un
169
+ kuo k uo
170
+ la l a
171
+ lai l ai
172
+ lan l an
173
+ lang l ang
174
+ lao l ao
175
+ le l e
176
+ lei l ei
177
+ leng l eng
178
+ li l i
179
+ lia l ia
180
+ lian l ian
181
+ liang l iang
182
+ liao l iao
183
+ lie l ie
184
+ lin l in
185
+ ling l ing
186
+ liu l iu
187
+ lo l o
188
+ long l ong
189
+ lou l ou
190
+ lu l u
191
+ luan l uan
192
+ lun l un
193
+ luo l uo
194
+ lv l v
195
+ lve l ve
196
+ ma m a
197
+ mai m ai
198
+ man m an
199
+ mang m ang
200
+ mao m ao
201
+ me m e
202
+ mei m ei
203
+ men m en
204
+ meng m eng
205
+ mi m i
206
+ mian m ian
207
+ miao m iao
208
+ mie m ie
209
+ min m in
210
+ ming m ing
211
+ miu m iu
212
+ mo m o
213
+ mou m ou
214
+ mu m u
215
+ na n a
216
+ nai n ai
217
+ nan n an
218
+ nang n ang
219
+ nao n ao
220
+ ne n e
221
+ nei n ei
222
+ nen n en
223
+ neng n eng
224
+ ni n i
225
+ nian n ian
226
+ niang n iang
227
+ niao n iao
228
+ nie n ie
229
+ nin n in
230
+ ning n ing
231
+ niu n iu
232
+ nong n ong
233
+ nou n ou
234
+ nu n u
235
+ nuan n uan
236
+ nun n un
237
+ nuo n uo
238
+ nv n v
239
+ nve n ve
240
+ o OO o
241
+ ou OO ou
242
+ pa p a
243
+ pai p ai
244
+ pan p an
245
+ pang p ang
246
+ pao p ao
247
+ pei p ei
248
+ pen p en
249
+ peng p eng
250
+ pi p i
251
+ pian p ian
252
+ piao p iao
253
+ pie p ie
254
+ pin p in
255
+ ping p ing
256
+ po p o
257
+ pou p ou
258
+ pu p u
259
+ qi q i
260
+ qia q ia
261
+ qian q ian
262
+ qiang q iang
263
+ qiao q iao
264
+ qie q ie
265
+ qin q in
266
+ qing q ing
267
+ qiong q iong
268
+ qiu q iu
269
+ qu q v
270
+ qv q v
271
+ quan q van
272
+ qvan q van
273
+ que q ve
274
+ qve q ve
275
+ qun q vn
276
+ qvn q vn
277
+ ran r an
278
+ rang r ang
279
+ rao r ao
280
+ re r e
281
+ ren r en
282
+ reng r eng
283
+ ri r ir
284
+ rong r ong
285
+ rou r ou
286
+ ru r u
287
+ rua r ua
288
+ ruan r uan
289
+ rui r ui
290
+ run r un
291
+ ruo r uo
292
+ sa s a
293
+ sai s ai
294
+ san s an
295
+ sang s ang
296
+ sao s ao
297
+ se s e
298
+ sen s en
299
+ seng s eng
300
+ sha sh a
301
+ shai sh ai
302
+ shan sh an
303
+ shang sh ang
304
+ shao sh ao
305
+ she sh e
306
+ shei sh ei
307
+ shen sh en
308
+ sheng sh eng
309
+ shi sh ir
310
+ shou sh ou
311
+ shu sh u
312
+ shua sh ua
313
+ shuai sh uai
314
+ shuan sh uan
315
+ shuang sh uang
316
+ shui sh ui
317
+ shun sh un
318
+ shuo sh uo
319
+ si s i0
320
+ song s ong
321
+ sou s ou
322
+ su s u
323
+ suan s uan
324
+ sui s ui
325
+ sun s un
326
+ suo s uo
327
+ ta t a
328
+ tai t ai
329
+ tan t an
330
+ tang t ang
331
+ tao t ao
332
+ te t e
333
+ tei t ei
334
+ teng t eng
335
+ ti t i
336
+ tian t ian
337
+ tiao t iao
338
+ tie t ie
339
+ ting t ing
340
+ tong t ong
341
+ tou t ou
342
+ tu t u
343
+ tuan t uan
344
+ tui t ui
345
+ tun t un
346
+ tuo t uo
347
+ wa w a
348
+ wai w ai
349
+ wan w an
350
+ wang w ang
351
+ wei w ei
352
+ wen w en
353
+ weng w eng
354
+ wo w o
355
+ wu w u
356
+ xi x i
357
+ xia x ia
358
+ xian x ian
359
+ xiang x iang
360
+ xiao x iao
361
+ xie x ie
362
+ xin x in
363
+ xing x ing
364
+ xiong x iong
365
+ xiu x iu
366
+ xu x v
367
+ xv x v
368
+ xuan x van
369
+ xvan x van
370
+ xue x ve
371
+ xve x ve
372
+ xun x vn
373
+ xvn x vn
374
+ ya y a
375
+ yan y En
376
+ yang y ang
377
+ yao y ao
378
+ ye y E
379
+ yi y i
380
+ yin y in
381
+ ying y ing
382
+ yo y o
383
+ yong y ong
384
+ you y ou
385
+ yu y v
386
+ yv y v
387
+ yuan y van
388
+ yvan y van
389
+ yue y ve
390
+ yve y ve
391
+ yun y vn
392
+ yvn y vn
393
+ za z a
394
+ zai z ai
395
+ zan z an
396
+ zang z ang
397
+ zao z ao
398
+ ze z e
399
+ zei z ei
400
+ zen z en
401
+ zeng z eng
402
+ zha zh a
403
+ zhai zh ai
404
+ zhan zh an
405
+ zhang zh ang
406
+ zhao zh ao
407
+ zhe zh e
408
+ zhei zh ei
409
+ zhen zh en
410
+ zheng zh eng
411
+ zhi zh ir
412
+ zhong zh ong
413
+ zhou zh ou
414
+ zhu zh u
415
+ zhua zh ua
416
+ zhuai zh uai
417
+ zhuan zh uan
418
+ zhuang zh uang
419
+ zhui zh ui
420
+ zhun zh un
421
+ zhuo zh uo
422
+ zi z i0
423
+ zong z ong
424
+ zou z ou
425
+ zu z u
426
+ zuan z uan
427
+ zui z ui
428
+ zun z un
429
+ zuo z uo
text/symbols.py ADDED
@@ -0,0 +1,187 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ punctuation = ["!", "?", "…", ",", ".", "'", "-"]
2
+ pu_symbols = punctuation + ["SP", "UNK"]
3
+ pad = "_"
4
+
5
+ # chinese
6
+ zh_symbols = [
7
+ "E",
8
+ "En",
9
+ "a",
10
+ "ai",
11
+ "an",
12
+ "ang",
13
+ "ao",
14
+ "b",
15
+ "c",
16
+ "ch",
17
+ "d",
18
+ "e",
19
+ "ei",
20
+ "en",
21
+ "eng",
22
+ "er",
23
+ "f",
24
+ "g",
25
+ "h",
26
+ "i",
27
+ "i0",
28
+ "ia",
29
+ "ian",
30
+ "iang",
31
+ "iao",
32
+ "ie",
33
+ "in",
34
+ "ing",
35
+ "iong",
36
+ "ir",
37
+ "iu",
38
+ "j",
39
+ "k",
40
+ "l",
41
+ "m",
42
+ "n",
43
+ "o",
44
+ "ong",
45
+ "ou",
46
+ "p",
47
+ "q",
48
+ "r",
49
+ "s",
50
+ "sh",
51
+ "t",
52
+ "u",
53
+ "ua",
54
+ "uai",
55
+ "uan",
56
+ "uang",
57
+ "ui",
58
+ "un",
59
+ "uo",
60
+ "v",
61
+ "van",
62
+ "ve",
63
+ "vn",
64
+ "w",
65
+ "x",
66
+ "y",
67
+ "z",
68
+ "zh",
69
+ "AA",
70
+ "EE",
71
+ "OO",
72
+ ]
73
+ num_zh_tones = 6
74
+
75
+ # japanese
76
+ ja_symbols = [
77
+ "N",
78
+ "a",
79
+ "a:",
80
+ "b",
81
+ "by",
82
+ "ch",
83
+ "d",
84
+ "dy",
85
+ "e",
86
+ "e:",
87
+ "f",
88
+ "g",
89
+ "gy",
90
+ "h",
91
+ "hy",
92
+ "i",
93
+ "i:",
94
+ "j",
95
+ "k",
96
+ "ky",
97
+ "m",
98
+ "my",
99
+ "n",
100
+ "ny",
101
+ "o",
102
+ "o:",
103
+ "p",
104
+ "py",
105
+ "q",
106
+ "r",
107
+ "ry",
108
+ "s",
109
+ "sh",
110
+ "t",
111
+ "ts",
112
+ "ty",
113
+ "u",
114
+ "u:",
115
+ "w",
116
+ "y",
117
+ "z",
118
+ "zy",
119
+ ]
120
+ num_ja_tones = 2
121
+
122
+ # English
123
+ en_symbols = [
124
+ "aa",
125
+ "ae",
126
+ "ah",
127
+ "ao",
128
+ "aw",
129
+ "ay",
130
+ "b",
131
+ "ch",
132
+ "d",
133
+ "dh",
134
+ "eh",
135
+ "er",
136
+ "ey",
137
+ "f",
138
+ "g",
139
+ "hh",
140
+ "ih",
141
+ "iy",
142
+ "jh",
143
+ "k",
144
+ "l",
145
+ "m",
146
+ "n",
147
+ "ng",
148
+ "ow",
149
+ "oy",
150
+ "p",
151
+ "r",
152
+ "s",
153
+ "sh",
154
+ "t",
155
+ "th",
156
+ "uh",
157
+ "uw",
158
+ "V",
159
+ "w",
160
+ "y",
161
+ "z",
162
+ "zh",
163
+ ]
164
+ num_en_tones = 4
165
+
166
# combine all symbols: pad first, then the sorted union of the per-language
# symbols, then punctuation and special symbols
normal_symbols = sorted({*zh_symbols, *ja_symbols, *en_symbols})
symbols = [pad, *normal_symbols, *pu_symbols]
sil_phonemes_ids = [symbols.index(symbol) for symbol in pu_symbols]

# combine all tones
num_tones = sum((num_zh_tones, num_ja_tones, num_en_tones))

# language maps
language_id_map = {"ZH": 0, "JP": 1, "EN": 2}
num_languages = len(language_id_map)

# first tone id of each language inside the combined tone range
language_tone_start_map = {
    "ZH": 0,
    "JP": num_zh_tones,
    "EN": num_zh_tones + num_ja_tones,
}

if __name__ == "__main__":
    # Print the symbols shared by the Chinese and English sets.
    shared = set(zh_symbols).intersection(en_symbols)
    print(sorted(shared))
text/tone_sandhi.py ADDED
@@ -0,0 +1,769 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ from typing import List
15
+ from typing import Tuple
16
+
17
+ import jieba
18
+ from pypinyin import lazy_pinyin
19
+ from pypinyin import Style
20
+
21
+
22
+ class ToneSandhi:
23
+ def __init__(self):
24
+ self.must_neural_tone_words = {
25
+ "麻烦",
26
+ "麻利",
27
+ "鸳鸯",
28
+ "高粱",
29
+ "骨头",
30
+ "骆驼",
31
+ "马虎",
32
+ "首饰",
33
+ "馒头",
34
+ "馄饨",
35
+ "风筝",
36
+ "难为",
37
+ "队伍",
38
+ "阔气",
39
+ "闺女",
40
+ "门道",
41
+ "锄头",
42
+ "铺盖",
43
+ "铃铛",
44
+ "铁匠",
45
+ "钥匙",
46
+ "里脊",
47
+ "里头",
48
+ "部分",
49
+ "那么",
50
+ "道士",
51
+ "造化",
52
+ "迷糊",
53
+ "连累",
54
+ "这么",
55
+ "这个",
56
+ "运气",
57
+ "过去",
58
+ "软和",
59
+ "转悠",
60
+ "踏实",
61
+ "跳蚤",
62
+ "跟头",
63
+ "趔趄",
64
+ "财主",
65
+ "豆腐",
66
+ "讲究",
67
+ "记性",
68
+ "记号",
69
+ "认识",
70
+ "规矩",
71
+ "见识",
72
+ "裁缝",
73
+ "补丁",
74
+ "衣裳",
75
+ "衣服",
76
+ "衙门",
77
+ "街坊",
78
+ "行李",
79
+ "行当",
80
+ "蛤蟆",
81
+ "蘑菇",
82
+ "薄荷",
83
+ "葫芦",
84
+ "葡萄",
85
+ "萝卜",
86
+ "荸荠",
87
+ "苗条",
88
+ "苗头",
89
+ "苍蝇",
90
+ "芝麻",
91
+ "舒服",
92
+ "舒坦",
93
+ "舌头",
94
+ "自在",
95
+ "膏药",
96
+ "脾气",
97
+ "脑袋",
98
+ "脊梁",
99
+ "能耐",
100
+ "胳膊",
101
+ "胭脂",
102
+ "胡萝",
103
+ "胡琴",
104
+ "胡同",
105
+ "聪明",
106
+ "耽误",
107
+ "耽搁",
108
+ "耷拉",
109
+ "耳朵",
110
+ "老爷",
111
+ "老实",
112
+ "老婆",
113
+ "老头",
114
+ "老太",
115
+ "翻腾",
116
+ "罗嗦",
117
+ "罐头",
118
+ "编辑",
119
+ "结实",
120
+ "红火",
121
+ "累赘",
122
+ "糨糊",
123
+ "糊涂",
124
+ "精神",
125
+ "粮食",
126
+ "簸箕",
127
+ "篱笆",
128
+ "算计",
129
+ "算盘",
130
+ "答应",
131
+ "笤帚",
132
+ "笑语",
133
+ "笑话",
134
+ "窟窿",
135
+ "窝囊",
136
+ "窗户",
137
+ "稳当",
138
+ "稀罕",
139
+ "称呼",
140
+ "秧歌",
141
+ "秀气",
142
+ "秀才",
143
+ "福气",
144
+ "祖宗",
145
+ "砚台",
146
+ "码头",
147
+ "石榴",
148
+ "石头",
149
+ "石匠",
150
+ "知识",
151
+ "眼睛",
152
+ "眯缝",
153
+ "眨巴",
154
+ "眉毛",
155
+ "相声",
156
+ "盘算",
157
+ "白净",
158
+ "痢疾",
159
+ "痛快",
160
+ "疟疾",
161
+ "疙瘩",
162
+ "疏忽",
163
+ "畜生",
164
+ "生意",
165
+ "甘蔗",
166
+ "琵琶",
167
+ "琢磨",
168
+ "琉璃",
169
+ "玻璃",
170
+ "玫瑰",
171
+ "玄乎",
172
+ "狐狸",
173
+ "状元",
174
+ "特务",
175
+ "牲口",
176
+ "牙碜",
177
+ "牌楼",
178
+ "爽快",
179
+ "爱人",
180
+ "热闹",
181
+ "烧饼",
182
+ "烟筒",
183
+ "烂糊",
184
+ "点心",
185
+ "炊帚",
186
+ "灯笼",
187
+ "火候",
188
+ "漂亮",
189
+ "滑溜",
190
+ "溜达",
191
+ "温和",
192
+ "清楚",
193
+ "消息",
194
+ "浪头",
195
+ "活泼",
196
+ "比方",
197
+ "正经",
198
+ "欺负",
199
+ "模糊",
200
+ "槟榔",
201
+ "棺材",
202
+ "棒槌",
203
+ "棉花",
204
+ "核桃",
205
+ "栅栏",
206
+ "柴火",
207
+ "架势",
208
+ "枕头",
209
+ "枇杷",
210
+ "机灵",
211
+ "本事",
212
+ "木头",
213
+ "木匠",
214
+ "朋友",
215
+ "月饼",
216
+ "月亮",
217
+ "暖和",
218
+ "明白",
219
+ "时候",
220
+ "新鲜",
221
+ "故事",
222
+ "收拾",
223
+ "收成",
224
+ "提防",
225
+ "挖苦",
226
+ "挑剔",
227
+ "指甲",
228
+ "指头",
229
+ "拾掇",
230
+ "拳头",
231
+ "拨弄",
232
+ "招牌",
233
+ "招呼",
234
+ "抬举",
235
+ "护士",
236
+ "折腾",
237
+ "扫帚",
238
+ "打量",
239
+ "打算",
240
+ "打点",
241
+ "打扮",
242
+ "打听",
243
+ "打发",
244
+ "扎实",
245
+ "扁担",
246
+ "戒指",
247
+ "懒得",
248
+ "意识",
249
+ "意思",
250
+ "情形",
251
+ "悟性",
252
+ "怪物",
253
+ "思量",
254
+ "怎么",
255
+ "念头",
256
+ "念叨",
257
+ "快活",
258
+ "忙活",
259
+ "志气",
260
+ "心思",
261
+ "得罪",
262
+ "张罗",
263
+ "弟兄",
264
+ "开通",
265
+ "应酬",
266
+ "庄稼",
267
+ "干事",
268
+ "帮手",
269
+ "帐篷",
270
+ "希罕",
271
+ "师父",
272
+ "师傅",
273
+ "巴结",
274
+ "巴掌",
275
+ "差事",
276
+ "工夫",
277
+ "岁数",
278
+ "屁股",
279
+ "尾巴",
280
+ "少爷",
281
+ "小气",
282
+ "小伙",
283
+ "将就",
284
+ "对头",
285
+ "对付",
286
+ "寡妇",
287
+ "家伙",
288
+ "客气",
289
+ "实在",
290
+ "官司",
291
+ "学问",
292
+ "学生",
293
+ "字号",
294
+ "嫁妆",
295
+ "媳妇",
296
+ "媒人",
297
+ "婆家",
298
+ "娘家",
299
+ "委屈",
300
+ "姑娘",
301
+ "姐夫",
302
+ "妯娌",
303
+ "妥当",
304
+ "妖精",
305
+ "奴才",
306
+ "女婿",
307
+ "头发",
308
+ "太阳",
309
+ "大爷",
310
+ "大方",
311
+ "大意",
312
+ "大夫",
313
+ "多少",
314
+ "多么",
315
+ "外甥",
316
+ "壮实",
317
+ "地道",
318
+ "地方",
319
+ "在乎",
320
+ "困难",
321
+ "嘴巴",
322
+ "嘱咐",
323
+ "嘟囔",
324
+ "嘀咕",
325
+ "喜欢",
326
+ "喇嘛",
327
+ "喇叭",
328
+ "商量",
329
+ "唾沫",
330
+ "哑巴",
331
+ "哈欠",
332
+ "哆嗦",
333
+ "咳嗽",
334
+ "和尚",
335
+ "告诉",
336
+ "告示",
337
+ "含糊",
338
+ "吓唬",
339
+ "后头",
340
+ "名字",
341
+ "名堂",
342
+ "合同",
343
+ "吆喝",
344
+ "叫唤",
345
+ "口袋",
346
+ "厚道",
347
+ "厉害",
348
+ "千斤",
349
+ "包袱",
350
+ "包涵",
351
+ "匀称",
352
+ "勤快",
353
+ "动静",
354
+ "动弹",
355
+ "功夫",
356
+ "力气",
357
+ "前头",
358
+ "刺猬",
359
+ "刺激",
360
+ "别扭",
361
+ "利落",
362
+ "利索",
363
+ "利害",
364
+ "分析",
365
+ "出息",
366
+ "凑合",
367
+ "凉快",
368
+ "冷战",
369
+ "冤枉",
370
+ "冒失",
371
+ "养活",
372
+ "关系",
373
+ "先生",
374
+ "兄弟",
375
+ "便宜",
376
+ "使唤",
377
+ "佩服",
378
+ "作坊",
379
+ "体面",
380
+ "位置",
381
+ "似的",
382
+ "伙计",
383
+ "休息",
384
+ "什么",
385
+ "人家",
386
+ "亲戚",
387
+ "亲家",
388
+ "交情",
389
+ "云彩",
390
+ "事情",
391
+ "买卖",
392
+ "主意",
393
+ "丫头",
394
+ "丧气",
395
+ "两口",
396
+ "东西",
397
+ "东家",
398
+ "世故",
399
+ "不由",
400
+ "不在",
401
+ "下水",
402
+ "下巴",
403
+ "上头",
404
+ "上司",
405
+ "丈夫",
406
+ "丈人",
407
+ "一辈",
408
+ "那个",
409
+ "菩萨",
410
+ "父亲",
411
+ "母亲",
412
+ "咕噜",
413
+ "邋遢",
414
+ "费用",
415
+ "冤家",
416
+ "甜头",
417
+ "介绍",
418
+ "荒唐",
419
+ "大人",
420
+ "泥鳅",
421
+ "幸福",
422
+ "熟悉",
423
+ "计划",
424
+ "扑腾",
425
+ "蜡烛",
426
+ "姥爷",
427
+ "照顾",
428
+ "喉咙",
429
+ "吉他",
430
+ "弄堂",
431
+ "蚂蚱",
432
+ "凤凰",
433
+ "拖沓",
434
+ "寒碜",
435
+ "糟蹋",
436
+ "倒腾",
437
+ "报复",
438
+ "逻辑",
439
+ "盘缠",
440
+ "喽啰",
441
+ "牢骚",
442
+ "咖喱",
443
+ "扫把",
444
+ "惦记",
445
+ }
446
+ self.must_not_neural_tone_words = {
447
+ "男子",
448
+ "女子",
449
+ "分子",
450
+ "原子",
451
+ "量子",
452
+ "莲子",
453
+ "石子",
454
+ "瓜子",
455
+ "电子",
456
+ "人人",
457
+ "虎虎",
458
+ }
459
+ self.punc = ":,;。?!“”‘’':,;.?!"
460
+
461
+ # the meaning of jieba pos tag: https://blog.csdn.net/weixin_44174352/article/details/113731041
462
+ # e.g.
463
+ # word: "家里"
464
+ # pos: "s"
465
+ # finals: ['ia1', 'i3']
466
def _neural_sandhi(self, word: str, pos: str, finals: List[str]) -> List[str]:
    """Mark syllables of ``word`` that take tone "5" by rewriting their finals.

    Args:
        word: The word being processed.
        pos: jieba part-of-speech tag of the word.
        finals: One final per character, each ending in a tone digit. The
            list is modified in place.

    Returns:
        The finals after all rules were applied, as a new list.
    """

    def to_neutral(final: str) -> str:
        return final[:-1] + "5"

    # reduplication words for n. and v. e.g. 奶奶, 试试, 旺旺
    for j, item in enumerate(word):
        if (
            j - 1 >= 0
            and item == word[j - 1]
            and pos[0] in {"n", "v", "a"}
            and word not in self.must_not_neural_tone_words
        ):
            finals[j] = to_neutral(finals[j])

    sentence_particles = "吧呢啊呐噻嘛吖嗨呐哦哒额滴哩哟喽啰耶喔诶"
    structural_particles = "的地得"
    ge_idx = word.find("个")

    if len(word) >= 1 and word[-1] in sentence_particles + structural_particles:
        finals[-1] = to_neutral(finals[-1])
    elif len(word) > 1 and (
        # plural / noun suffix on pronouns and nouns
        (
            word[-1] in "们子"
            and pos in {"r", "n"}
            and word not in self.must_not_neural_tone_words
        )
        # e.g. 桌上, 地下, 家里
        or (word[-1] in "上下里" and pos in {"s", "l", "f"})
        # e.g. 上来, 下去
        or (word[-1] in "来去" and word[-2] in "上下进出回过起开")
    ):
        finals[-1] = to_neutral(finals[-1])
    # "个" used as a measure word
    elif (
        ge_idx >= 1
        and (word[ge_idx - 1].isnumeric() or word[ge_idx - 1] in "几有两半多各整每做是")
    ) or word == "个":
        finals[ge_idx] = to_neutral(finals[ge_idx])
    elif (
        word in self.must_neural_tone_words
        or word[-2:] in self.must_neural_tone_words
    ):
        finals[-1] = to_neutral(finals[-1])

    # Apply the listed-word rule once more to each of the two sub-words.
    word_list = self._split_word(word)
    head_len = len(word_list[0])
    finals_list = [finals[:head_len], finals[head_len:]]
    for i, sub_word in enumerate(word_list):
        # conventional neural in Chinese
        if (
            sub_word in self.must_neural_tone_words
            or sub_word[-2:] in self.must_neural_tone_words
        ):
            finals_list[i][-1] = to_neutral(finals_list[i][-1])
    return sum(finals_list, [])
521
+
522
+ def _bu_sandhi(self, word: str, finals: List[str]) -> List[str]:
523
+ # e.g. 看不懂
524
+ if len(word) == 3 and word[1] == "不":
525
+ finals[1] = finals[1][:-1] + "5"
526
+ else:
527
+ for i, char in enumerate(word):
528
+ # "不" before tone4 should be bu2, e.g. 不怕
529
+ if char == "不" and i + 1 < len(word) and finals[i + 1][-1] == "4":
530
+ finals[i] = finals[i][:-1] + "2"
531
+ return finals
532
+
533
+ def _yi_sandhi(self, word: str, finals: List[str]) -> List[str]:
534
+ # "一" in number sequences, e.g. 一零零, 二一零
535
+ if word.find("一") != -1 and all(
536
+ [item.isnumeric() for item in word if item != "一"]
537
+ ):
538
+ return finals
539
+ # "一" between reduplication words should be yi5, e.g. 看一看
540
+ elif len(word) == 3 and word[1] == "一" and word[0] == word[-1]:
541
+ finals[1] = finals[1][:-1] + "5"
542
+ # when "一" is ordinal word, it should be yi1
543
+ elif word.startswith("第一"):
544
+ finals[1] = finals[1][:-1] + "1"
545
+ else:
546
+ for i, char in enumerate(word):
547
+ if char == "一" and i + 1 < len(word):
548
+ # "一" before tone4 should be yi2, e.g. 一段
549
+ if finals[i + 1][-1] == "4":
550
+ finals[i] = finals[i][:-1] + "2"
551
+ # "一" before non-tone4 should be yi4, e.g. 一天
552
+ else:
553
+ # "一" 后面如果是标点,还读一声
554
+ if word[i + 1] not in self.punc:
555
+ finals[i] = finals[i][:-1] + "4"
556
+ return finals
557
+
558
def _split_word(self, word: str) -> List[str]:
    """Split ``word`` into two parts around its shortest jieba sub-word.

    The shortest piece produced by ``jieba.cut_for_search`` is located; if
    the word starts with it, the remainder becomes the second part,
    otherwise the piece is taken as the tail and the text before it becomes
    the first part.

    Returns:
        A two-element list ``[first_part, second_part]``.
    """
    pieces = sorted(jieba.cut_for_search(word), key=len)
    shortest = pieces[0]
    if word.startswith(shortest):
        return [shortest, word[len(shortest):]]
    return [word[: -len(shortest)], shortest]
570
+
571
def _three_sandhi(self, word: str, finals: List[str]) -> List[str]:
    """Apply third-tone sandhi to the finals of a 2-, 3- or 4-character word.

    A final is a string whose last character is its tone digit; a tone is
    changed by replacing that last character with ``"2"``.

    Args:
        word: the word being processed.
        finals: one tone-suffixed final per character of ``word``.

    Returns:
        The adjusted finals. For 2-character words and for 3-character
        words whose finals are all tone 3 the input list is modified in
        place and returned; the other 3-character case and the 4-character
        case return a newly built list. Words of any other length are
        returned untouched.
    """
    size = len(word)

    if size == 2:
        if self._all_tone_three(finals):
            finals[0] = finals[0][:-1] + "2"
        return finals

    if size == 3:
        head_len = len(self._split_word(word)[0])
        if self._all_tone_three(finals):
            if head_len == 2:
                # disyllabic + monosyllabic, e.g. 蒙古/包
                finals[0] = finals[0][:-1] + "2"
                finals[1] = finals[1][:-1] + "2"
            elif head_len == 1:
                # monosyllabic + disyllabic, e.g. 纸/老虎
                finals[1] = finals[1][:-1] + "2"
            return finals

        # Not every syllable is tone 3: treat the two sub-words separately.
        head = finals[:head_len]
        tail = finals[head_len:]
        # e.g. 所有/人 (a two-syllable head that is entirely tone 3)
        if self._all_tone_three(head) and len(head) == 2:
            head[0] = head[0][:-1] + "2"
        if self._all_tone_three(tail) and len(tail) == 2:
            tail[0] = tail[0][:-1] + "2"
        # e.g. 好/喜欢 (tone 3 meets tone 3 across the sub-word boundary)
        elif (
            not self._all_tone_three(tail)
            and tail[0][-1] == "3"
            and head[-1][-1] == "3"
        ):
            head[-1] = head[-1][:-1] + "2"
        return head + tail

    if size == 4:
        # split idiom into two words whose length is 2
        rebuilt = []
        for pair in (finals[:2], finals[2:]):
            if self._all_tone_three(pair):
                pair[0] = pair[0][:-1] + "2"
            rebuilt += pair
        return rebuilt

    return finals
610
+
611
+ def _all_tone_three(self, finals: List[str]) -> bool:
612
+ return all(x[-1] == "3" for x in finals)
613
+
614
+ # merge "不" and the word behind it
615
+ # without this merge, jieba sometimes emits "不" as a standalone token, which can cause tone sandhi errors
616
+ def _merge_bu(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
617
+ new_seg = []
618
+ last_word = ""
619
+ for word, pos in seg:
620
+ if last_word == "不":
621
+ word = last_word + word
622
+ if word != "不":
623
+ new_seg.append((word, pos))
624
+ last_word = word[:]
625
+ if last_word == "不":
626
+ new_seg.append((last_word, "d"))
627
+ last_word = ""
628
+ return new_seg
629
+
630
+ # function 1: merge "一" with the reduplicated words on its left and right, e.g. "听","一","听" -> "听一听"
631
+ # function 2: merge single "一" and the word behind it
632
+ # without this merge, jieba sometimes emits "一" as a standalone token, which can cause tone sandhi errors
633
+ # e.g.
634
+ # input seg: [('听', 'v'), ('一', 'm'), ('听', 'v')]
635
+ # output seg: [['听一听', 'v']]
636
+ def _merge_yi(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
637
+ new_seg = []
638
+ # function 1
639
+ for i, (word, pos) in enumerate(seg):
640
+ if (
641
+ i - 1 >= 0
642
+ and word == "一"
643
+ and i + 1 < len(seg)
644
+ and seg[i - 1][0] == seg[i + 1][0]
645
+ and seg[i - 1][1] == "v"
646
+ ):
647
+ new_seg[i - 1][0] = new_seg[i - 1][0] + "一" + new_seg[i - 1][0]
648
+ else:
649
+ if (
650
+ i - 2 >= 0
651
+ and seg[i - 1][0] == "一"
652
+ and seg[i - 2][0] == word
653
+ and pos == "v"
654
+ ):
655
+ continue
656
+ else:
657
+ new_seg.append([word, pos])
658
+ seg = new_seg
659
+ new_seg = []
660
+ # function 2
661
+ for i, (word, pos) in enumerate(seg):
662
+ if new_seg and new_seg[-1][0] == "一":
663
+ new_seg[-1][0] = new_seg[-1][0] + word
664
+ else:
665
+ new_seg.append([word, pos])
666
+ return new_seg
667
+
668
+ # the first and the second words are all_tone_three
669
def _merge_continuous_three_tones(
    self, seg: List[Tuple[str, str]]
) -> List[Tuple[str, str]]:
    """Join adjacent words when both consist solely of tone-3 syllables.

    A word is appended to the previously emitted entry when all of the
    following hold: the previous word and this word are both all tone 3,
    the previous word was not itself merged, the previous word is not a
    reduplication (those are left for ``_neural_sandhi``), and the two
    words together are at most three characters long.

    Args:
        seg: ``(word, pos)`` pairs.

    Returns:
        A new list of ``[word, pos]`` lists.
    """
    finals_per_word = [
        lazy_pinyin(token, neutral_tone_with_five=True, style=Style.FINALS_TONE3)
        for token, _ in seg
    ]
    was_merged = [False] * len(seg)
    result = []
    for idx, (word, pos) in enumerate(seg):
        join_with_previous = (
            idx >= 1
            and self._all_tone_three(finals_per_word[idx - 1])
            and self._all_tone_three(finals_per_word[idx])
            and not was_merged[idx - 1]
            and not self._is_reduplication(seg[idx - 1][0])
            and len(seg[idx - 1][0]) + len(word) <= 3
        )
        if join_with_previous:
            result[-1][0] += word
            was_merged[idx] = True
        else:
            result.append([word, pos])
    return result
699
+
700
+ def _is_reduplication(self, word: str) -> bool:
701
+ return len(word) == 2 and word[0] == word[1]
702
+
703
+ # the last char of first word and the first char of second word is tone_three
704
def _merge_continuous_three_tones_2(
    self, seg: List[Tuple[str, str]]
) -> List[Tuple[str, str]]:
    """Join adjacent words when tone 3 meets tone 3 across their boundary.

    A word is appended to the previously emitted entry when the last
    final of the previous word and the first final of this word both end
    in ``"3"``, the previous word was not itself merged, the previous
    word is not a reduplication (those are left for ``_neural_sandhi``),
    and the two words together are at most three characters long.

    Words for which the pinyin lookup yields no finals, or an empty final
    string, are treated as "not tone 3" rather than raising
    ``IndexError``.

    Args:
        seg: ``(word, pos)`` pairs.

    Returns:
        A new list of ``[word, pos]`` lists.
    """
    new_seg = []
    sub_finals_list = [
        lazy_pinyin(word, neutral_tone_with_five=True, style=Style.FINALS_TONE3)
        for (word, pos) in seg
    ]
    merge_last = [False] * len(seg)
    for i, (word, pos) in enumerate(seg):
        prev_finals = sub_finals_list[i - 1] if i >= 1 else []
        cur_finals = sub_finals_list[i]
        boundary_is_tone_three = (
            # truthiness guards protect against an empty finals list;
            # endswith protects against an empty final string
            bool(prev_finals)
            and bool(cur_finals)
            and prev_finals[-1].endswith("3")
            and cur_finals[0].endswith("3")
        )
        if (
            i >= 1
            and boundary_is_tone_three
            and not merge_last[i - 1]
            # if the last word is reduplication, not merge, because
            # reduplication need to be _neural_sandhi
            and not self._is_reduplication(seg[i - 1][0])
            and len(seg[i - 1][0]) + len(seg[i][0]) <= 3
        ):
            new_seg[-1][0] = new_seg[-1][0] + seg[i][0]
            merge_last[i] = True
        else:
            new_seg.append([word, pos])
    return new_seg
733
+
734
+ def _merge_er(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
735
+ new_seg = []
736
+ for i, (word, pos) in enumerate(seg):
737
+ if i - 1 >= 0 and word == "儿" and seg[i - 1][0] != "#":
738
+ new_seg[-1][0] = new_seg[-1][0] + seg[i][0]
739
+ else:
740
+ new_seg.append([word, pos])
741
+ return new_seg
742
+
743
+ def _merge_reduplication(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
744
+ new_seg = []
745
+ for i, (word, pos) in enumerate(seg):
746
+ if new_seg and word == new_seg[-1][0]:
747
+ new_seg[-1][0] = new_seg[-1][0] + seg[i][0]
748
+ else:
749
+ new_seg.append([word, pos])
750
+ return new_seg
751
+
752
def pre_merge_for_modify(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
    """Run every segment-merging pass that must precede tone modification.

    The passes run in this order: ``_merge_bu``, ``_merge_yi``,
    ``_merge_reduplication``, ``_merge_continuous_three_tones``,
    ``_merge_continuous_three_tones_2`` and ``_merge_er``.

    Args:
        seg: ``(word, pos)`` pairs.

    Returns:
        The merged segmentation produced by the last pass.
    """
    seg = self._merge_bu(seg)
    try:
        seg = self._merge_yi(seg)
    except Exception:
        # Best effort: keep the un-merged segmentation if this pass fails.
        # Catching Exception (not a bare except) lets KeyboardInterrupt
        # and SystemExit propagate.
        print("_merge_yi failed")
    seg = self._merge_reduplication(seg)
    seg = self._merge_continuous_three_tones(seg)
    seg = self._merge_continuous_three_tones_2(seg)
    seg = self._merge_er(seg)
    return seg
763
+
764
def modified_tone(self, word: str, pos: str, finals: List[str]) -> List[str]:
    """Run all tone sandhi rules over the finals of one word.

    The rules are applied in a fixed order, each receiving the output of
    the previous one: ``_bu_sandhi``, ``_yi_sandhi``, ``_neural_sandhi``
    (the only one that also takes ``pos``) and ``_three_sandhi``.

    Args:
        word: the word being processed.
        pos: part-of-speech tag of ``word``.
        finals: tone-suffixed finals of ``word``.

    Returns:
        The finals returned by the last rule.
    """
    after_bu = self._bu_sandhi(word, finals)
    after_yi = self._yi_sandhi(word, after_bu)
    after_neutral = self._neural_sandhi(word, pos, after_yi)
    return self._three_sandhi(word, after_neutral)