AzumaSeren100 committed on
Commit 554fcfd
1 Parent(s): d827b45

Upload 22 files

text/__init__.py ADDED
@@ -0,0 +1,28 @@
+ from text.symbols import *
+
+
+ _symbol_to_id = {s: i for i, s in enumerate(symbols)}
+
+ def cleaned_text_to_sequence(cleaned_text, tones, language):
+     '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
+     Args:
+       text: string to convert to a sequence
+     Returns:
+       List of integers corresponding to the symbols in the text
+     '''
+     phones = [_symbol_to_id[symbol] for symbol in cleaned_text]
+     tone_start = language_tone_start_map[language]
+     tones = [i + tone_start for i in tones]
+     lang_id = language_id_map[language]
+     lang_ids = [lang_id for i in phones]
+     return phones, tones, lang_ids
+
+ def get_bert(norm_text, word2ph, language):
+     from .chinese_bert import get_bert_feature as zh_bert
+     from .english_bert_mock import get_bert_feature as en_bert
+     lang_bert_func_map = {
+         'ZH': zh_bert,
+         'EN': en_bert
+     }
+     bert = lang_bert_func_map[language](norm_text, word2ph)
+     return bert
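A minimal usage sketch for the package-level helper above (the phone/tone lists here are hypothetical and assume this repo's `text` package is importable):

from text import cleaned_text_to_sequence

# a padded single-syllable Mandarin utterance: pad, "n", "i" (tone 3), pad
phones = ['_', 'n', 'i', '_']
tones = [0, 3, 3, 0]
phone_ids, tone_ids, lang_ids = cleaned_text_to_sequence(phones, tones, 'ZH')
print(phone_ids, tone_ids, lang_ids)  # three equal-length lists of integers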
text/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (1.5 kB).
text/__pycache__/chinese.cpython-39.pyc ADDED
Binary file (4.52 kB).
text/__pycache__/chinese.cpython-39.pyc.baiduyun.downloading ADDED
File without changes
text/__pycache__/chinese_bert.cpython-39.pyc ADDED
Binary file (1.62 kB).
text/__pycache__/chinese_bert.cpython-39.pyc.baiduyun.downloading ADDED
File without changes
text/__pycache__/cleaner.cpython-39.pyc ADDED
Binary file (919 Bytes).
text/__pycache__/cleaner.cpython-39.pyc.baiduyun.downloading ADDED
File without changes
text/__pycache__/english_bert_mock.cpython-39.pyc ADDED
Binary file (310 Bytes).
text/__pycache__/symbols.cpython-39.pyc ADDED
Binary file (1.46 kB).
text/__pycache__/tone_sandhi.cpython-39.pyc ADDED
Binary file (13.5 kB).
text/chinese.py ADDED
@@ -0,0 +1,193 @@
+ import os
+ import re
+
+ import cn2an
+ from pypinyin import lazy_pinyin, Style
+
+ from text import symbols
+ from text.symbols import punctuation
+ from text.tone_sandhi import ToneSandhi
+
+ current_file_path = os.path.dirname(__file__)
+ pinyin_to_symbol_map = {line.split("\t")[0]: line.strip().split("\t")[1] for line in
+                         open(os.path.join(current_file_path, 'opencpop-strict.txt')).readlines()}
+
+ import jieba.posseg as psg
+
+
+ rep_map = {
+     ':': ',',
+     ';': ',',
+     ',': ',',
+     '。': '.',
+     '!': '!',
+     '?': '?',
+     '\n': '.',
+     "·": ",",
+     '、': ",",
+     '...': '…',
+     '$': '.',
+     '“': "'",
+     '”': "'",
+     '‘': "'",
+     '’': "'",
+     '(': "'",
+     ')': "'",
+     '(': "'",
+     ')': "'",
+     '《': "'",
+     '》': "'",
+     '【': "'",
+     '】': "'",
+     '[': "'",
+     ']': "'",
+     '—': "-",
+     '~': "-",
+     '~': "-",
+     '「': "'",
+     '」': "'",
+
+ }
+
+ tone_modifier = ToneSandhi()
+
+ def replace_punctuation(text):
+     text = text.replace("嗯", "恩").replace("呣", "母")
+     pattern = re.compile('|'.join(re.escape(p) for p in rep_map.keys()))
+
+     replaced_text = pattern.sub(lambda x: rep_map[x.group()], text)
+
+     replaced_text = re.sub(r'[^\u4e00-\u9fa5'+"".join(punctuation)+r']+', '', replaced_text)
+
+     return replaced_text
+
+ def g2p(text):
+     pattern = r'(?<=[{0}])\s*'.format(''.join(punctuation))
+     sentences = [i for i in re.split(pattern, text) if i.strip() != '']
+     phones, tones, word2ph = _g2p(sentences)
+     assert sum(word2ph) == len(phones)
+     assert len(word2ph) == len(text)  # This assertion can occasionally fail; wrap it in a try/except if needed.
+     phones = ['_'] + phones + ["_"]
+     tones = [0] + tones + [0]
+     word2ph = [1] + word2ph + [1]
+     return phones, tones, word2ph
+
+
+ def _get_initials_finals(word):
+     initials = []
+     finals = []
+     orig_initials = lazy_pinyin(
+         word, neutral_tone_with_five=True, style=Style.INITIALS)
+     orig_finals = lazy_pinyin(
+         word, neutral_tone_with_five=True, style=Style.FINALS_TONE3)
+     for c, v in zip(orig_initials, orig_finals):
+         initials.append(c)
+         finals.append(v)
+     return initials, finals
+
+
+ def _g2p(segments):
+     phones_list = []
+     tones_list = []
+     word2ph = []
+     for seg in segments:
+         pinyins = []
+         # Strip all English words from the sentence
+         seg = re.sub('[a-zA-Z]+', '', seg)
+         seg_cut = psg.lcut(seg)
+         initials = []
+         finals = []
+         seg_cut = tone_modifier.pre_merge_for_modify(seg_cut)
+         for word, pos in seg_cut:
+             if pos == 'eng':
+                 continue
+             sub_initials, sub_finals = _get_initials_finals(word)
+             sub_finals = tone_modifier.modified_tone(word, pos,
+                                                      sub_finals)
+             initials.append(sub_initials)
+             finals.append(sub_finals)
+
+             # assert len(sub_initials) == len(sub_finals) == len(word)
+         initials = sum(initials, [])
+         finals = sum(finals, [])
+         #
+         for c, v in zip(initials, finals):
+             raw_pinyin = c+v
+             # NOTE: post process for pypinyin outputs
+             # we discriminate i, ii and iii
+             if c == v:
+                 assert c in punctuation
+                 phone = [c]
+                 tone = '0'
+                 word2ph.append(1)
+             else:
+                 v_without_tone = v[:-1]
+                 tone = v[-1]
+
+                 pinyin = c+v_without_tone
+                 assert tone in '12345'
+
+                 if c:
+                     # syllable with an initial consonant
+                     v_rep_map = {
+                         "uei": 'ui',
+                         'iou': 'iu',
+                         'uen': 'un',
+                     }
+                     if v_without_tone in v_rep_map.keys():
+                         pinyin = c+v_rep_map[v_without_tone]
+                 else:
+                     # syllable without an initial (bare final)
+                     pinyin_rep_map = {
+                         'ing': 'ying',
+                         'i': 'yi',
+                         'in': 'yin',
+                         'u': 'wu',
+                     }
+                     if pinyin in pinyin_rep_map.keys():
+                         pinyin = pinyin_rep_map[pinyin]
+                     else:
+                         single_rep_map = {
+                             'v': 'yu',
+                             'e': 'e',
+                             'i': 'y',
+                             'u': 'w',
+                         }
+                         if pinyin[0] in single_rep_map.keys():
+                             pinyin = single_rep_map[pinyin[0]]+pinyin[1:]
+
+                 assert pinyin in pinyin_to_symbol_map.keys(), (pinyin, seg, raw_pinyin)
+                 phone = pinyin_to_symbol_map[pinyin].split(' ')
+                 word2ph.append(len(phone))
+
+             phones_list += phone
+             tones_list += [int(tone)] * len(phone)
+     return phones_list, tones_list, word2ph
+
+
+
+ def text_normalize(text):
+     numbers = re.findall(r'\d+(?:\.?\d+)?', text)
+     for number in numbers:
+         text = text.replace(number, cn2an.an2cn(number), 1)
+     text = replace_punctuation(text)
+     return text
+
+ def get_bert_feature(text, word2ph):
+     from text import chinese_bert
+     return chinese_bert.get_bert_feature(text, word2ph)
+
+ if __name__ == '__main__':
+     from text.chinese_bert import get_bert_feature
+     text = "啊!但是《原神》是由,米哈\游自主, [研发]的一款全.新开放世界.冒险游戏"
+     text = text_normalize(text)
+     print(text)
+     phones, tones, word2ph = g2p(text)
+     bert = get_bert_feature(text, word2ph)
+
+     print(phones, tones, word2ph, bert.shape)
+
+
+ # # Example usage
+ # text = "这是一个示例文本:,你好!这是一个测试...."
+ # print(g2p_paddle(text))  # Output: 这是一个示例文本你好这是一个测试
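A rough sketch of the Chinese front-end path without the BERT step (assumes cn2an, pypinyin, and jieba are installed and that opencpop-strict.txt sits next to this module; the input string is illustrative):

from text.chinese import text_normalize, g2p

norm = text_normalize("早上好123")          # digits are converted to Chinese numerals first
phones, tones, word2ph = g2p(norm)          # phones/tones are padded with '_' / tone 0 at both ends
assert sum(word2ph) == len(phones)          # this invariant is asserted inside g2p as well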
text/chinese_bert.py ADDED
@@ -0,0 +1,59 @@
+ import torch
+ import sys
+ from transformers import AutoTokenizer, AutoModelForMaskedLM
+
+ device = torch.device(
+     "cuda"
+     if torch.cuda.is_available()
+     else (
+         "mps"
+         if sys.platform == "darwin" and torch.backends.mps.is_available()
+         else "cpu"
+     )
+ )
+
+ tokenizer = AutoTokenizer.from_pretrained("./bert/chinese-roberta-wwm-ext-large")
+ model = AutoModelForMaskedLM.from_pretrained("./bert/chinese-roberta-wwm-ext-large").to(device)
+
+ def get_bert_feature(text, word2ph):
+     with torch.no_grad():
+         inputs = tokenizer(text, return_tensors='pt')
+         for i in inputs:
+             inputs[i] = inputs[i].to(device)
+         res = model(**inputs, output_hidden_states=True)
+         res = torch.cat(res['hidden_states'][-3:-2], -1)[0].cpu()
+
+     assert len(word2ph) == len(text)+2
+     word2phone = word2ph
+     phone_level_feature = []
+     for i in range(len(word2phone)):
+         repeat_feature = res[i].repeat(word2phone[i], 1)
+         phone_level_feature.append(repeat_feature)
+
+     phone_level_feature = torch.cat(phone_level_feature, dim=0)
+
+
+     return phone_level_feature.T
+
+ if __name__ == '__main__':
+     # feature = get_bert_feature('你好,我是说的道理。')
+     import torch
+
+     word_level_feature = torch.rand(38, 1024)  # random word-level features: 38 words, 1024 dims each
+     word2phone = [1, 2, 1, 2, 2, 1, 2, 2, 1, 2, 2, 1, 2, 2, 2, 2, 2, 1, 1, 2, 2, 1, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 1]
+
+     # total number of phone-level frames
+     total_frames = sum(word2phone)
+     print(word_level_feature.shape)
+     print(word2phone)
+     phone_level_feature = []
+     for i in range(len(word2phone)):
+         print(word_level_feature[i].shape)
+
+         # repeat each word's feature word2phone[i] times
+         repeat_feature = word_level_feature[i].repeat(word2phone[i], 1)
+         phone_level_feature.append(repeat_feature)
+
+     phone_level_feature = torch.cat(phone_level_feature, dim=0)
+     print(phone_level_feature.shape)  # torch.Size([sum(word2phone), 1024])
+
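A sketch of how these BERT features line up with the phoneme sequence from text/chinese.py (assumes the local ./bert/chinese-roberta-wwm-ext-large checkpoint referenced above is present; the sentence is illustrative):

from text.chinese import text_normalize, g2p
from text.chinese_bert import get_bert_feature

norm = text_normalize("今天天气不错")
phones, tones, word2ph = g2p(norm)
bert = get_bert_feature(norm, word2ph)
print(bert.shape)  # (1024, sum(word2ph)): one 1024-dim column per phone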
text/cleaner.py ADDED
@@ -0,0 +1,27 @@
+ from text import chinese, cleaned_text_to_sequence
+
+
+ language_module_map = {
+     'ZH': chinese
+ }
+
+
+ def clean_text(text, language):
+     language_module = language_module_map[language]
+     norm_text = language_module.text_normalize(text)
+     phones, tones, word2ph = language_module.g2p(norm_text)
+     return norm_text, phones, tones, word2ph
+
+ def clean_text_bert(text, language):
+     language_module = language_module_map[language]
+     norm_text = language_module.text_normalize(text)
+     phones, tones, word2ph = language_module.g2p(norm_text)
+     bert = language_module.get_bert_feature(norm_text, word2ph)
+     return phones, tones, bert
+
+ def text_to_sequence(text, language):
+     norm_text, phones, tones, word2ph = clean_text(text, language)
+     return cleaned_text_to_sequence(phones, tones, language)
+
+ if __name__ == '__main__':
+     pass
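End-to-end sketch of the cleaner path, raw text to integer IDs (assumes the Chinese front-end dependencies above are installed; the input string is illustrative):

from text.cleaner import clean_text
from text import cleaned_text_to_sequence

norm_text, phones, tones, word2ph = clean_text("你好,世界!", 'ZH')
phone_ids, tone_ids, lang_ids = cleaned_text_to_sequence(phones, tones, 'ZH')
print(norm_text, phone_ids[:5], tone_ids[:5], lang_ids[:5])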
text/cmudict.rep ADDED
The diff for this file is too large to render. See raw diff
 
text/cmudict_cache.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b9b21b20325471934ba92f2e4a5976989e7d920caa32e7a286eacb027d197949
+ size 6212655
text/english.py ADDED
@@ -0,0 +1,138 @@
+ import pickle
+ import os
+ import re
+ from g2p_en import G2p
+ from string import punctuation
+
+ from text import symbols
+
+ current_file_path = os.path.dirname(__file__)
+ CMU_DICT_PATH = os.path.join(current_file_path, 'cmudict.rep')
+ CACHE_PATH = os.path.join(current_file_path, 'cmudict_cache.pickle')
+ _g2p = G2p()
+
+ arpa = {'AH0', 'S', 'AH1', 'EY2', 'AE2', 'EH0', 'OW2', 'UH0', 'NG', 'B', 'G', 'AY0', 'M', 'AA0', 'F', 'AO0', 'ER2', 'UH1', 'IY1', 'AH2', 'DH', 'IY0', 'EY1', 'IH0', 'K', 'N', 'W', 'IY2', 'T', 'AA1', 'ER1', 'EH2', 'OY0', 'UH2', 'UW1', 'Z', 'AW2', 'AW1', 'V', 'UW2', 'AA2', 'ER', 'AW0', 'UW0', 'R', 'OW1', 'EH1', 'ZH', 'AE0', 'IH2', 'IH', 'Y', 'JH', 'P', 'AY1', 'EY0', 'OY2', 'TH', 'HH', 'D', 'ER0', 'CH', 'AO1', 'AE1', 'AO2', 'OY1', 'AY2', 'IH1', 'OW0', 'L', 'SH'}
+
+
+ def post_replace_ph(ph):
+     rep_map = {
+         ':': ',',
+         ';': ',',
+         ',': ',',
+         '。': '.',
+         '!': '!',
+         '?': '?',
+         '\n': '.',
+         "·": ",",
+         '、': ",",
+         '...': '…',
+         'v': "V"
+     }
+     if ph in rep_map.keys():
+         ph = rep_map[ph]
+     if ph in symbols:
+         return ph
+     if ph not in symbols:
+         ph = 'UNK'
+     return ph
+
+ def read_dict():
+     g2p_dict = {}
+     start_line = 49
+     with open(CMU_DICT_PATH) as f:
+         line = f.readline()
+         line_index = 1
+         while line:
+             if line_index >= start_line:
+                 line = line.strip()
+                 word_split = line.split(' ')
+                 word = word_split[0]
+
+                 syllable_split = word_split[1].split(' - ')
+                 g2p_dict[word] = []
+                 for syllable in syllable_split:
+                     phone_split = syllable.split(' ')
+                     g2p_dict[word].append(phone_split)
+
+             line_index = line_index + 1
+             line = f.readline()
+
+     return g2p_dict
+
+
+ def cache_dict(g2p_dict, file_path):
+     with open(file_path, 'wb') as pickle_file:
+         pickle.dump(g2p_dict, pickle_file)
+
+
+ def get_dict():
+     if os.path.exists(CACHE_PATH):
+         with open(CACHE_PATH, 'rb') as pickle_file:
+             g2p_dict = pickle.load(pickle_file)
+     else:
+         g2p_dict = read_dict()
+         cache_dict(g2p_dict, CACHE_PATH)
+
+     return g2p_dict
+
+ eng_dict = get_dict()
+
+ def refine_ph(phn):
+     tone = 0
+     if re.search(r'\d$', phn):
+         tone = int(phn[-1]) + 1
+         phn = phn[:-1]
+     return phn.lower(), tone
+
+ def refine_syllables(syllables):
+     tones = []
+     phonemes = []
+     for phn_list in syllables:
+         for i in range(len(phn_list)):
+             phn = phn_list[i]
+             phn, tone = refine_ph(phn)
+             phonemes.append(phn)
+             tones.append(tone)
+     return phonemes, tones
+
+
+ def text_normalize(text):
+     # todo: eng text normalize
+     return text
+
+ def g2p(text):
+
+     phones = []
+     tones = []
+     words = re.split(r"([,;.\-\?\!\s+])", text)
+     for w in words:
+         if w.upper() in eng_dict:
+             phns, tns = refine_syllables(eng_dict[w.upper()])
+             phones += phns
+             tones += tns
+         else:
+             phone_list = list(filter(lambda p: p != " ", _g2p(w)))
+             for ph in phone_list:
+                 if ph in arpa:
+                     ph, tn = refine_ph(ph)
+                     phones.append(ph)
+                     tones.append(tn)
+                 else:
+                     phones.append(ph)
+                     tones.append(0)
+     # todo: implement word2ph
+     word2ph = [1 for i in phones]
+
+     phones = [post_replace_ph(i) for i in phones]
+     return phones, tones, word2ph
+
+ if __name__ == "__main__":
+     # print(get_dict())
+     # print(eng_word_to_phoneme("hello"))
+     print(g2p("In this paper, we propose 1 DSPGAN, a GAN-based universal vocoder."))
+     # all_phones = set()
+     # for k, syllables in eng_dict.items():
+     #     for group in syllables:
+     #         for ph in group:
+     #             all_phones.add(ph)
+     # print(all_phones)
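A small sketch of the stress-to-tone convention used above: the trailing ARPAbet stress digit (0/1/2) becomes tone 1/2/3, and symbols without a digit stay at tone 0 (assumes the cmudict files from this commit are present so the module imports cleanly):

from text.english import refine_ph

print(refine_ph('AH0'))  # ('ah', 1)
print(refine_ph('EY1'))  # ('ey', 2)
print(refine_ph('HH'))   # ('hh', 0)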
text/english_bert_mock.py ADDED
@@ -0,0 +1,5 @@
+ import torch
+
+
+ def get_bert_feature(norm_text, word2ph):
+     return torch.zeros(1024, sum(word2ph))
text/japanese.py ADDED
@@ -0,0 +1,104 @@
+ # modified from https://github.com/CjangCjengh/vits/blob/main/text/japanese.py
+ import re
+ import sys
+
+ import pyopenjtalk
+
+ from text import symbols
+
+ # Regular expression matching Japanese without punctuation marks:
+ _japanese_characters = re.compile(
+     r'[A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]')
+
+ # Regular expression matching non-Japanese characters or punctuation marks:
+ _japanese_marks = re.compile(
+     r'[^A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]')
+
+ # List of (symbol, Japanese) pairs for marks:
+ _symbols_to_japanese = [(re.compile('%s' % x[0]), x[1]) for x in [
+     ('%', 'パーセント')
+ ]]
+
+
+ # List of (consonant, sokuon) pairs:
+ _real_sokuon = [(re.compile('%s' % x[0]), x[1]) for x in [
+     (r'Q([↑↓]*[kg])', r'k#\1'),
+     (r'Q([↑↓]*[tdjʧ])', r't#\1'),
+     (r'Q([↑↓]*[sʃ])', r's\1'),
+     (r'Q([↑↓]*[pb])', r'p#\1')
+ ]]
+
+ # List of (consonant, hatsuon) pairs:
+ _real_hatsuon = [(re.compile('%s' % x[0]), x[1]) for x in [
+     (r'N([↑↓]*[pbm])', r'm\1'),
+     (r'N([↑↓]*[ʧʥj])', r'n^\1'),
+     (r'N([↑↓]*[tdn])', r'n\1'),
+     (r'N([↑↓]*[kg])', r'ŋ\1')
+ ]]
+
+
+
+ def post_replace_ph(ph):
+     rep_map = {
+         ':': ',',
+         ';': ',',
+         ',': ',',
+         '。': '.',
+         '!': '!',
+         '?': '?',
+         '\n': '.',
+         "·": ",",
+         '、': ",",
+         '...': '…',
+         'v': "V"
+     }
+     if ph in rep_map.keys():
+         ph = rep_map[ph]
+     if ph in symbols:
+         return ph
+     if ph not in symbols:
+         ph = 'UNK'
+     return ph
+
+ def symbols_to_japanese(text):
+     for regex, replacement in _symbols_to_japanese:
+         text = re.sub(regex, replacement, text)
+     return text
+
+
+ def preprocess_jap(text):
+     '''Reference https://r9y9.github.io/ttslearn/latest/notebooks/ch10_Recipe-Tacotron.html'''
+     text = symbols_to_japanese(text)
+     sentences = re.split(_japanese_marks, text)
+     marks = re.findall(_japanese_marks, text)
+     text = []
+     for i, sentence in enumerate(sentences):
+         if re.match(_japanese_characters, sentence):
+             p = pyopenjtalk.g2p(sentence)
+             text += p.split(" ")
+
+         if i < len(marks):
+             text += [marks[i].replace(' ', '')]
+     return text
+
+ def text_normalize(text):
+     # todo: jap text normalize
+     return text
+
+ def g2p(norm_text):
+     phones = preprocess_jap(norm_text)
+     phones = [post_replace_ph(i) for i in phones]
+     # todo: implement tones and word2ph
+     tones = [0 for i in phones]
+     word2ph = [1 for i in phones]
+     return phones, tones, word2ph
+
+
+ if __name__ == '__main__':
+     for line in open("../../../Downloads/transcript_utf8.txt").readlines():
+         text = line.split(":")[1]
+         phones, tones, word2ph = g2p(text)
+         for p in phones:
+             if p == "z":
+                 print(text, phones)
+                 sys.exit(0)
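A minimal sketch of the Japanese front end (assumes pyopenjtalk is installed; tones and word2ph are flat placeholders for now):

from text.japanese import g2p

phones, tones, word2ph = g2p("こんにちは")
print(phones)  # pyopenjtalk phonemes, e.g. something like ['k', 'o', 'N', 'n', 'i', 'ch', 'i', 'w', 'a']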
text/opencpop-strict.txt ADDED
@@ -0,0 +1,429 @@
+ a AA a
+ ai AA ai
+ an AA an
+ ang AA ang
+ ao AA ao
+ ba b a
+ bai b ai
+ ban b an
+ bang b ang
+ bao b ao
+ bei b ei
+ ben b en
+ beng b eng
+ bi b i
+ bian b ian
+ biao b iao
+ bie b ie
+ bin b in
+ bing b ing
+ bo b o
+ bu b u
+ ca c a
+ cai c ai
+ can c an
+ cang c ang
+ cao c ao
+ ce c e
+ cei c ei
+ cen c en
+ ceng c eng
+ cha ch a
+ chai ch ai
+ chan ch an
+ chang ch ang
+ chao ch ao
+ che ch e
+ chen ch en
+ cheng ch eng
+ chi ch ir
+ chong ch ong
+ chou ch ou
+ chu ch u
+ chua ch ua
+ chuai ch uai
+ chuan ch uan
+ chuang ch uang
+ chui ch ui
+ chun ch un
+ chuo ch uo
+ ci c i0
+ cong c ong
+ cou c ou
+ cu c u
+ cuan c uan
+ cui c ui
+ cun c un
+ cuo c uo
+ da d a
+ dai d ai
+ dan d an
+ dang d ang
+ dao d ao
+ de d e
+ dei d ei
+ den d en
+ deng d eng
+ di d i
+ dia d ia
+ dian d ian
+ diao d iao
+ die d ie
+ ding d ing
+ diu d iu
+ dong d ong
+ dou d ou
+ du d u
+ duan d uan
+ dui d ui
+ dun d un
+ duo d uo
+ e EE e
+ ei EE ei
+ en EE en
+ eng EE eng
+ er EE er
+ fa f a
+ fan f an
+ fang f ang
+ fei f ei
+ fen f en
+ feng f eng
+ fo f o
+ fou f ou
+ fu f u
+ ga g a
+ gai g ai
+ gan g an
+ gang g ang
+ gao g ao
+ ge g e
+ gei g ei
+ gen g en
+ geng g eng
+ gong g ong
+ gou g ou
+ gu g u
+ gua g ua
+ guai g uai
+ guan g uan
+ guang g uang
+ gui g ui
+ gun g un
+ guo g uo
+ ha h a
+ hai h ai
+ han h an
+ hang h ang
+ hao h ao
+ he h e
+ hei h ei
+ hen h en
+ heng h eng
+ hong h ong
+ hou h ou
+ hu h u
+ hua h ua
+ huai h uai
+ huan h uan
+ huang h uang
+ hui h ui
+ hun h un
+ huo h uo
+ ji j i
+ jia j ia
+ jian j ian
+ jiang j iang
+ jiao j iao
+ jie j ie
+ jin j in
+ jing j ing
+ jiong j iong
+ jiu j iu
+ ju j v
+ jv j v
+ juan j van
+ jvan j van
+ jue j ve
+ jve j ve
+ jun j vn
+ jvn j vn
+ ka k a
+ kai k ai
+ kan k an
+ kang k ang
+ kao k ao
+ ke k e
+ kei k ei
+ ken k en
+ keng k eng
+ kong k ong
+ kou k ou
+ ku k u
+ kua k ua
+ kuai k uai
+ kuan k uan
+ kuang k uang
+ kui k ui
+ kun k un
+ kuo k uo
+ la l a
+ lai l ai
+ lan l an
+ lang l ang
+ lao l ao
+ le l e
+ lei l ei
+ leng l eng
+ li l i
+ lia l ia
+ lian l ian
+ liang l iang
+ liao l iao
+ lie l ie
+ lin l in
+ ling l ing
+ liu l iu
+ lo l o
+ long l ong
+ lou l ou
+ lu l u
+ luan l uan
+ lun l un
+ luo l uo
+ lv l v
+ lve l ve
+ ma m a
+ mai m ai
+ man m an
+ mang m ang
+ mao m ao
+ me m e
+ mei m ei
+ men m en
+ meng m eng
+ mi m i
+ mian m ian
+ miao m iao
+ mie m ie
+ min m in
+ ming m ing
+ miu m iu
+ mo m o
+ mou m ou
+ mu m u
+ na n a
+ nai n ai
+ nan n an
+ nang n ang
+ nao n ao
+ ne n e
+ nei n ei
+ nen n en
+ neng n eng
+ ni n i
+ nian n ian
+ niang n iang
+ niao n iao
+ nie n ie
+ nin n in
+ ning n ing
+ niu n iu
+ nong n ong
+ nou n ou
+ nu n u
+ nuan n uan
+ nun n un
+ nuo n uo
+ nv n v
+ nve n ve
+ o OO o
+ ou OO ou
+ pa p a
+ pai p ai
+ pan p an
+ pang p ang
+ pao p ao
+ pei p ei
+ pen p en
+ peng p eng
+ pi p i
+ pian p ian
+ piao p iao
+ pie p ie
+ pin p in
+ ping p ing
+ po p o
+ pou p ou
+ pu p u
+ qi q i
+ qia q ia
+ qian q ian
+ qiang q iang
+ qiao q iao
+ qie q ie
+ qin q in
+ qing q ing
+ qiong q iong
+ qiu q iu
+ qu q v
+ qv q v
+ quan q van
+ qvan q van
+ que q ve
+ qve q ve
+ qun q vn
+ qvn q vn
+ ran r an
+ rang r ang
+ rao r ao
+ re r e
+ ren r en
+ reng r eng
+ ri r ir
+ rong r ong
+ rou r ou
+ ru r u
+ rua r ua
+ ruan r uan
+ rui r ui
+ run r un
+ ruo r uo
+ sa s a
+ sai s ai
+ san s an
+ sang s ang
+ sao s ao
+ se s e
+ sen s en
+ seng s eng
+ sha sh a
+ shai sh ai
+ shan sh an
+ shang sh ang
+ shao sh ao
+ she sh e
+ shei sh ei
+ shen sh en
+ sheng sh eng
+ shi sh ir
+ shou sh ou
+ shu sh u
+ shua sh ua
+ shuai sh uai
+ shuan sh uan
+ shuang sh uang
+ shui sh ui
+ shun sh un
+ shuo sh uo
+ si s i0
+ song s ong
+ sou s ou
+ su s u
+ suan s uan
+ sui s ui
+ sun s un
+ suo s uo
+ ta t a
+ tai t ai
+ tan t an
+ tang t ang
+ tao t ao
+ te t e
+ tei t ei
+ teng t eng
+ ti t i
+ tian t ian
+ tiao t iao
+ tie t ie
+ ting t ing
+ tong t ong
+ tou t ou
+ tu t u
+ tuan t uan
+ tui t ui
+ tun t un
+ tuo t uo
+ wa w a
+ wai w ai
+ wan w an
+ wang w ang
+ wei w ei
+ wen w en
+ weng w eng
+ wo w o
+ wu w u
+ xi x i
+ xia x ia
+ xian x ian
+ xiang x iang
+ xiao x iao
+ xie x ie
+ xin x in
+ xing x ing
+ xiong x iong
+ xiu x iu
+ xu x v
+ xv x v
+ xuan x van
+ xvan x van
+ xue x ve
+ xve x ve
+ xun x vn
+ xvn x vn
+ ya y a
+ yan y En
+ yang y ang
+ yao y ao
+ ye y E
+ yi y i
+ yin y in
+ ying y ing
+ yo y o
+ yong y ong
+ you y ou
+ yu y v
+ yv y v
+ yuan y van
+ yvan y van
+ yue y ve
+ yve y ve
+ yun y vn
+ yvn y vn
+ za z a
+ zai z ai
+ zan z an
+ zang z ang
+ zao z ao
+ ze z e
+ zei z ei
+ zen z en
+ zeng z eng
+ zha zh a
+ zhai zh ai
+ zhan zh an
+ zhang zh ang
+ zhao zh ao
+ zhe zh e
+ zhei zh ei
+ zhen zh en
+ zheng zh eng
+ zhi zh ir
+ zhong zh ong
+ zhou zh ou
+ zhu zh u
+ zhua zh ua
+ zhuai zh uai
+ zhuan zh uan
+ zhuang zh uang
+ zhui zh ui
+ zhun zh un
+ zhuo zh uo
+ zi z i0
+ zong z ong
+ zou z ou
+ zu z u
+ zuan z uan
+ zui z ui
+ zun z un
+ zuo z uo
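This table is consumed by text/chinese.py, which splits each line on a tab between the pinyin key and the phone string, then splits the phones on spaces. A minimal parsing sketch mirroring that code (assumes it is run from the repo root so the relative path resolves):

pinyin_to_symbol_map = {
    line.split("\t")[0]: line.strip().split("\t")[1]
    for line in open("text/opencpop-strict.txt").readlines()
}
print(pinyin_to_symbol_map["zhuang"].split(" "))  # ['zh', 'uang']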
text/symbols.py ADDED
@@ -0,0 +1,51 @@
+ punctuation = ['!', '?', '…', ",", ".", "'", '-']
+ pu_symbols = punctuation + ["SP", "UNK"]
+ pad = '_'
+
+ # chinese
+ zh_symbols = ['E', 'En', 'a', 'ai', 'an', 'ang', 'ao', 'b', 'c', 'ch', 'd', 'e', 'ei', 'en', 'eng', 'er', 'f', 'g', 'h',
+               'i', 'i0', 'ia', 'ian', 'iang', 'iao', 'ie', 'in', 'ing', 'iong', 'ir', 'iu', 'j', 'k', 'l', 'm', 'n', 'o',
+               'ong',
+               'ou', 'p', 'q', 'r', 's', 'sh', 't', 'u', 'ua', 'uai', 'uan', 'uang', 'ui', 'un', 'uo', 'v', 'van', 've', 'vn',
+               'w', 'x', 'y', 'z', 'zh',
+               "AA", "EE", "OO"]
+ num_zh_tones = 6
+
+ # japanese
+ ja_symbols = ['I', 'N', 'U', 'a', 'b', 'by', 'ch', 'cl', 'd', 'dy', 'e', 'f', 'g', 'gy', 'h', 'hy', 'i', 'j', 'k', 'ky',
+               'm', 'my', 'n', 'ny', 'o', 'p', 'py', 'r', 'ry', 's', 'sh', 't', 'ts', 'u', 'V', 'w', 'y', 'z']
+ num_ja_tones = 1
+
+ # English
+ en_symbols = ['aa', 'ae', 'ah', 'ao', 'aw', 'ay', 'b', 'ch', 'd', 'dh', 'eh', 'er', 'ey', 'f', 'g', 'hh', 'ih', 'iy',
+               'jh', 'k', 'l', 'm', 'n', 'ng', 'ow', 'oy', 'p', 'r', 's',
+               'sh', 't', 'th', 'uh', 'uw', 'V', 'w', 'y', 'z', 'zh']
+ num_en_tones = 4
+
+ # combine all symbols
+ normal_symbols = sorted(set(zh_symbols + ja_symbols + en_symbols))
+ symbols = [pad] + normal_symbols + pu_symbols
+ sil_phonemes_ids = [symbols.index(i) for i in pu_symbols]
+
+ # combine all tones
+ num_tones = num_zh_tones + num_ja_tones + num_en_tones
+
+ # language maps
+ language_id_map = {
+     'ZH': 0,
+     "JA": 1,
+     "EN": 2
+ }
+ num_languages = len(language_id_map.keys())
+
+ language_tone_start_map = {
+     'ZH': 0,
+     "JA": num_zh_tones,
+     "EN": num_zh_tones + num_ja_tones
+ }
+
+ if __name__ == '__main__':
+     a = set(zh_symbols)
+     b = set(en_symbols)
+     print(sorted(a & b))
+
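A quick sketch of how the shared tone axis defined above is laid out across languages: Chinese occupies tones 0-5, Japanese tone 6, and English tones 7-10, which is what cleaned_text_to_sequence adds as a per-language offset.

from text.symbols import num_tones, language_tone_start_map

print(num_tones)               # 11
print(language_tone_start_map) # {'ZH': 0, 'JA': 6, 'EN': 7}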
text/tone_sandhi.py ADDED
@@ -0,0 +1,351 @@
+ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ from typing import List
+ from typing import Tuple
+
+ import jieba
+ from pypinyin import lazy_pinyin
+ from pypinyin import Style
+
+
+ class ToneSandhi():
+     def __init__(self):
+         self.must_neural_tone_words = {
+             '麻烦', '麻利', '鸳鸯', '高粱', '骨头', '骆驼', '马虎', '首饰', '馒头', '馄饨', '风筝',
+             '难为', '队伍', '阔气', '闺女', '门道', '锄头', '铺盖', '铃铛', '铁匠', '钥匙', '里脊',
+             '里头', '部分', '那么', '道士', '造化', '迷糊', '连累', '这么', '这个', '运气', '过去',
+             '软和', '转悠', '踏实', '跳蚤', '跟头', '趔趄', '财主', '豆腐', '讲究', '记性', '记号',
+             '认识', '规矩', '见识', '裁缝', '补丁', '衣裳', '衣服', '衙门', '街坊', '行李', '行当',
+             '蛤蟆', '蘑菇', '薄荷', '葫芦', '葡萄', '萝卜', '荸荠', '苗条', '苗头', '苍蝇', '芝麻',
+             '舒服', '舒坦', '舌头', '自在', '膏药', '脾气', '脑袋', '脊梁', '能耐', '胳膊', '胭脂',
+             '胡萝', '胡琴', '胡同', '聪明', '耽误', '耽搁', '耷拉', '耳朵', '老爷', '老实', '老婆',
+             '老头', '老太', '翻腾', '罗嗦', '罐头', '编辑', '结实', '红火', '累赘', '糨糊', '糊涂',
+             '精神', '粮食', '簸箕', '篱笆', '算计', '算盘', '答应', '笤帚', '笑语', '笑话', '窟窿',
+             '窝囊', '窗户', '稳当', '稀罕', '称呼', '秧歌', '秀气', '秀才', '福气', '祖宗', '砚台',
+             '码头', '石榴', '石头', '石匠', '知识', '眼睛', '眯缝', '眨巴', '眉毛', '相声', '盘算',
+             '白净', '痢疾', '痛快', '疟疾', '疙瘩', '疏忽', '畜生', '生意', '甘蔗', '琵琶', '琢磨',
+             '琉璃', '玻璃', '玫瑰', '玄乎', '狐狸', '状元', '特务', '牲口', '牙碜', '牌楼', '爽快',
+             '爱人', '热闹', '烧饼', '烟筒', '烂糊', '点心', '炊帚', '灯笼', '火候', '漂亮', '滑溜',
+             '溜达', '温和', '清楚', '消息', '浪头', '活泼', '比方', '正经', '欺负', '模糊', '槟榔',
+             '棺材', '棒槌', '棉花', '核桃', '栅栏', '柴火', '架势', '枕头', '枇杷', '机灵', '本事',
+             '木头', '木匠', '朋友', '月饼', '月亮', '暖和', '明白', '时候', '新鲜', '故事', '收拾',
+             '收成', '提防', '挖苦', '挑剔', '指甲', '指头', '拾掇', '拳头', '拨弄', '招牌', '招呼',
+             '抬举', '护士', '折腾', '扫帚', '打量', '打算', '打点', '打扮', '打听', '打发', '扎实',
+             '扁担', '戒指', '懒得', '意识', '意思', '情形', '悟性', '怪物', '思量', '怎么', '念头',
+             '念叨', '快活', '忙活', '志气', '心思', '得罪', '张罗', '弟兄', '开通', '应酬', '庄稼',
+             '干事', '帮手', '帐篷', '希罕', '师父', '师傅', '巴结', '巴掌', '差事', '工夫', '岁数',
+             '屁股', '尾巴', '少爷', '小气', '小伙', '将就', '对头', '对付', '寡妇', '家伙', '客气',
+             '实在', '官司', '学问', '学生', '字号', '嫁妆', '媳妇', '媒人', '婆家', '娘家', '委屈',
+             '姑娘', '姐夫', '妯娌', '妥当', '妖精', '奴才', '女婿', '头发', '太阳', '大爷', '大方',
+             '大意', '大夫', '多少', '多么', '外甥', '壮实', '地道', '地方', '在乎', '困难', '嘴巴',
+             '嘱咐', '嘟囔', '嘀咕', '喜欢', '喇嘛', '喇叭', '商量', '唾沫', '哑巴', '哈欠', '哆嗦',
+             '咳嗽', '和尚', '告诉', '告示', '含糊', '吓唬', '后头', '名字', '名堂', '合同', '吆喝',
+             '叫唤', '口袋', '厚道', '厉害', '千斤', '包袱', '包涵', '匀称', '勤快', '动静', '动弹',
+             '功夫', '力气', '前头', '刺猬', '刺激', '别扭', '利落', '利索', '利害', '分析', '出息',
+             '凑合', '凉快', '冷战', '冤枉', '冒失', '养活', '关系', '先生', '兄弟', '便宜', '使唤',
+             '佩服', '作坊', '体面', '位置', '似的', '伙计', '休息', '什么', '人家', '亲戚', '亲家',
+             '交情', '云彩', '事情', '买卖', '主意', '丫头', '丧气', '两口', '东西', '东家', '世故',
+             '不由', '不在', '下水', '下巴', '上头', '上司', '丈夫', '丈人', '一辈', '那个', '菩萨',
+             '父亲', '母亲', '咕噜', '邋遢', '费用', '冤家', '甜头', '介绍', '荒唐', '大人', '泥鳅',
+             '幸福', '熟悉', '计划', '扑腾', '蜡烛', '姥爷', '照顾', '喉咙', '吉他', '弄堂', '蚂蚱',
+             '凤凰', '拖沓', '寒碜', '糟蹋', '倒腾', '报复', '逻辑', '盘缠', '喽啰', '牢骚', '咖喱',
+             '扫把', '惦记'
+         }
+         self.must_not_neural_tone_words = {
+             "男子", "女子", "分子", "原子", "量子", "莲子", "石子", "瓜子", "电子", "人人", "虎虎"
+         }
+         self.punc = ":,;。?!“”‘’':,;.?!"
+
+     # the meaning of jieba pos tags: https://blog.csdn.net/weixin_44174352/article/details/113731041
+     # e.g.
+     # word: "家里"
+     # pos: "s"
+     # finals: ['ia1', 'i3']
+     def _neural_sandhi(self, word: str, pos: str,
+                        finals: List[str]) -> List[str]:
+
+         # reduplication words for n. and v., e.g. 奶奶, 试试, 旺旺
+         for j, item in enumerate(word):
+             if j - 1 >= 0 and item == word[j - 1] and pos[0] in {
+                     "n", "v", "a"
+             } and word not in self.must_not_neural_tone_words:
+                 finals[j] = finals[j][:-1] + "5"
+         ge_idx = word.find("个")
+         if len(word) >= 1 and word[-1] in "吧呢啊呐噻嘛吖嗨呐哦哒额滴哩哟喽啰耶喔诶":
+             finals[-1] = finals[-1][:-1] + "5"
+         elif len(word) >= 1 and word[-1] in "的地得":
+             finals[-1] = finals[-1][:-1] + "5"
+         # e.g. 走了, 看着, 去过
+         # elif len(word) == 1 and word in "了着过" and pos in {"ul", "uz", "ug"}:
+         #     finals[-1] = finals[-1][:-1] + "5"
+         elif len(word) > 1 and word[-1] in "们子" and pos in {
+                 "r", "n"
+         } and word not in self.must_not_neural_tone_words:
+             finals[-1] = finals[-1][:-1] + "5"
+         # e.g. 桌上, 地下, 家里
+         elif len(word) > 1 and word[-1] in "上下里" and pos in {"s", "l", "f"}:
+             finals[-1] = finals[-1][:-1] + "5"
+         # e.g. 上来, 下去
+         elif len(word) > 1 and word[-1] in "来去" and word[-2] in "上下进出回过起开":
+             finals[-1] = finals[-1][:-1] + "5"
+         # "个" used as a measure word
+         elif (ge_idx >= 1 and
+               (word[ge_idx - 1].isnumeric() or
+                word[ge_idx - 1] in "几有两半多各整每做是")) or word == '个':
+             finals[ge_idx] = finals[ge_idx][:-1] + "5"
+         else:
+             if word in self.must_neural_tone_words or word[
+                     -2:] in self.must_neural_tone_words:
+                 finals[-1] = finals[-1][:-1] + "5"
+
+         word_list = self._split_word(word)
+         finals_list = [finals[:len(word_list[0])], finals[len(word_list[0]):]]
+         for i, word in enumerate(word_list):
+             # conventional neutral-tone words in Chinese
+             if word in self.must_neural_tone_words or word[
+                     -2:] in self.must_neural_tone_words:
+                 finals_list[i][-1] = finals_list[i][-1][:-1] + "5"
+         finals = sum(finals_list, [])
+         return finals
+
+     def _bu_sandhi(self, word: str, finals: List[str]) -> List[str]:
+         # e.g. 看不懂
+         if len(word) == 3 and word[1] == "不":
+             finals[1] = finals[1][:-1] + "5"
+         else:
+             for i, char in enumerate(word):
+                 # "不" before tone 4 should be bu2, e.g. 不怕
+                 if char == "不" and i + 1 < len(word) and finals[i + 1][-1] == "4":
+                     finals[i] = finals[i][:-1] + "2"
+         return finals
+
+     def _yi_sandhi(self, word: str, finals: List[str]) -> List[str]:
+         # "一" in number sequences, e.g. 一零零, 二一零
+         if word.find("一") != -1 and all(
+                 [item.isnumeric() for item in word if item != "一"]):
+             return finals
+         # "一" between reduplicated words should be yi5, e.g. 看一看
+         elif len(word) == 3 and word[1] == "一" and word[0] == word[-1]:
+             finals[1] = finals[1][:-1] + "5"
+         # when "一" is an ordinal word, it should be yi1
+         elif word.startswith("第一"):
+             finals[1] = finals[1][:-1] + "1"
+         else:
+             for i, char in enumerate(word):
+                 if char == "一" and i + 1 < len(word):
+                     # "一" before tone 4 should be yi2, e.g. 一段
+                     if finals[i + 1][-1] == "4":
+                         finals[i] = finals[i][:-1] + "2"
+                     # "一" before non-tone-4 should be yi4, e.g. 一天
+                     else:
+                         # if "一" is followed by punctuation, it keeps tone 1
+                         if word[i + 1] not in self.punc:
+                             finals[i] = finals[i][:-1] + "4"
+         return finals
+
+     def _split_word(self, word: str) -> List[str]:
+         word_list = jieba.cut_for_search(word)
+         word_list = sorted(word_list, key=lambda i: len(i), reverse=False)
+         first_subword = word_list[0]
+         first_begin_idx = word.find(first_subword)
+         if first_begin_idx == 0:
+             second_subword = word[len(first_subword):]
+             new_word_list = [first_subword, second_subword]
+         else:
+             second_subword = word[:-len(first_subword)]
+             new_word_list = [second_subword, first_subword]
+         return new_word_list
+
+     def _three_sandhi(self, word: str, finals: List[str]) -> List[str]:
+         if len(word) == 2 and self._all_tone_three(finals):
+             finals[0] = finals[0][:-1] + "2"
+         elif len(word) == 3:
+             word_list = self._split_word(word)
+             if self._all_tone_three(finals):
+                 # disyllabic + monosyllabic, e.g. 蒙古/包
+                 if len(word_list[0]) == 2:
+                     finals[0] = finals[0][:-1] + "2"
+                     finals[1] = finals[1][:-1] + "2"
+                 # monosyllabic + disyllabic, e.g. 纸/老虎
+                 elif len(word_list[0]) == 1:
+                     finals[1] = finals[1][:-1] + "2"
+             else:
+                 finals_list = [
+                     finals[:len(word_list[0])], finals[len(word_list[0]):]
+                 ]
+                 if len(finals_list) == 2:
+                     for i, sub in enumerate(finals_list):
+                         # e.g. 所有/人
+                         if self._all_tone_three(sub) and len(sub) == 2:
+                             finals_list[i][0] = finals_list[i][0][:-1] + "2"
+                         # e.g. 好/喜欢
+                         elif i == 1 and not self._all_tone_three(sub) and finals_list[i][0][-1] == "3" and \
+                                 finals_list[0][-1][-1] == "3":
+
+                             finals_list[0][-1] = finals_list[0][-1][:-1] + "2"
+                     finals = sum(finals_list, [])
+         # split the idiom into two words whose length is 2
+         elif len(word) == 4:
+             finals_list = [finals[:2], finals[2:]]
+             finals = []
+             for sub in finals_list:
+                 if self._all_tone_three(sub):
+                     sub[0] = sub[0][:-1] + "2"
+                 finals += sub
+
+         return finals
+
+     def _all_tone_three(self, finals: List[str]) -> bool:
+         return all(x[-1] == "3" for x in finals)
+
+     # merge "不" and the word behind it
+     # if we don't merge, "不" sometimes appears alone according to jieba, which may cause sandhi errors
+     def _merge_bu(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
+         new_seg = []
+         last_word = ""
+         for word, pos in seg:
+             if last_word == "不":
+                 word = last_word + word
+             if word != "不":
+                 new_seg.append((word, pos))
+             last_word = word[:]
+         if last_word == "不":
+             new_seg.append((last_word, 'd'))
+             last_word = ""
+         return new_seg
+
+     # function 1: merge "一" and the reduplicated words on its left and right, e.g. "听","一","听" -> "听一听"
+     # function 2: merge a single "一" and the word behind it
+     # if we don't merge, "一" sometimes appears alone according to jieba, which may cause sandhi errors
+     # e.g.
+     # input seg: [('听', 'v'), ('一', 'm'), ('听', 'v')]
+     # output seg: [['听一听', 'v']]
+     def _merge_yi(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
+         new_seg = []
+         # function 1
+         for i, (word, pos) in enumerate(seg):
+             if i - 1 >= 0 and word == "一" and i + 1 < len(seg) and seg[i - 1][
+                     0] == seg[i + 1][0] and seg[i - 1][1] == "v":
+                 new_seg[i - 1][0] = new_seg[i - 1][0] + "一" + new_seg[i - 1][0]
+             else:
+                 if i - 2 >= 0 and seg[i - 1][0] == "一" and seg[i - 2][
+                         0] == word and pos == "v":
+                     continue
+                 else:
+                     new_seg.append([word, pos])
+         seg = new_seg
+         new_seg = []
+         # function 2
+         for i, (word, pos) in enumerate(seg):
+             if new_seg and new_seg[-1][0] == "一":
+                 new_seg[-1][0] = new_seg[-1][0] + word
+             else:
+                 new_seg.append([word, pos])
+         return new_seg
+
+     # the first and the second words are all_tone_three
+     def _merge_continuous_three_tones(
+             self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
+         new_seg = []
+         sub_finals_list = [
+             lazy_pinyin(
+                 word, neutral_tone_with_five=True, style=Style.FINALS_TONE3)
+             for (word, pos) in seg
+         ]
+         assert len(sub_finals_list) == len(seg)
+         merge_last = [False] * len(seg)
+         for i, (word, pos) in enumerate(seg):
+             if i - 1 >= 0 and self._all_tone_three(
+                     sub_finals_list[i - 1]) and self._all_tone_three(
+                         sub_finals_list[i]) and not merge_last[i - 1]:
+                 # if the last word is a reduplication, don't merge, because a reduplication needs _neural_sandhi
+                 if not self._is_reduplication(seg[i - 1][0]) and len(
+                         seg[i - 1][0]) + len(seg[i][0]) <= 3:
+                     new_seg[-1][0] = new_seg[-1][0] + seg[i][0]
+                     merge_last[i] = True
+                 else:
+                     new_seg.append([word, pos])
+             else:
+                 new_seg.append([word, pos])
+
+         return new_seg
+
+     def _is_reduplication(self, word: str) -> bool:
+         return len(word) == 2 and word[0] == word[1]
+
+     # the last char of the first word and the first char of the second word are tone three
+     def _merge_continuous_three_tones_2(
+             self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
+         new_seg = []
+         sub_finals_list = [
+             lazy_pinyin(
+                 word, neutral_tone_with_five=True, style=Style.FINALS_TONE3)
+             for (word, pos) in seg
+         ]
+         assert len(sub_finals_list) == len(seg)
+         merge_last = [False] * len(seg)
+         for i, (word, pos) in enumerate(seg):
+             if i - 1 >= 0 and sub_finals_list[i - 1][-1][-1] == "3" and sub_finals_list[i][0][-1] == "3" and not \
+                     merge_last[i - 1]:
+                 # if the last word is a reduplication, don't merge, because a reduplication needs _neural_sandhi
+                 if not self._is_reduplication(seg[i - 1][0]) and len(
+                         seg[i - 1][0]) + len(seg[i][0]) <= 3:
+                     new_seg[-1][0] = new_seg[-1][0] + seg[i][0]
+                     merge_last[i] = True
+                 else:
+                     new_seg.append([word, pos])
+             else:
+                 new_seg.append([word, pos])
+         return new_seg
+
+     def _merge_er(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
+         new_seg = []
+         for i, (word, pos) in enumerate(seg):
+             if i - 1 >= 0 and word == "儿" and seg[i-1][0] != "#":
+                 new_seg[-1][0] = new_seg[-1][0] + seg[i][0]
+             else:
+                 new_seg.append([word, pos])
+         return new_seg
+
+     def _merge_reduplication(
+             self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
+         new_seg = []
+         for i, (word, pos) in enumerate(seg):
+             if new_seg and word == new_seg[-1][0]:
+                 new_seg[-1][0] = new_seg[-1][0] + seg[i][0]
+             else:
+                 new_seg.append([word, pos])
+         return new_seg
+
+     def pre_merge_for_modify(
+             self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
+         seg = self._merge_bu(seg)
+         try:
+             seg = self._merge_yi(seg)
+         except:
+             print("_merge_yi failed")
+         seg = self._merge_reduplication(seg)
+         seg = self._merge_continuous_three_tones(seg)
+         seg = self._merge_continuous_three_tones_2(seg)
+         seg = self._merge_er(seg)
+         return seg
+
+     def modified_tone(self, word: str, pos: str,
+                       finals: List[str]) -> List[str]:
+         finals = self._bu_sandhi(word, finals)
+         finals = self._yi_sandhi(word, finals)
+         finals = self._neural_sandhi(word, pos, finals)
+         finals = self._three_sandhi(word, finals)
+         return finals
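A sketch of how text/chinese.py drives this class: segment with jieba.posseg, pre-merge the segments, then adjust each word's finals (assumes jieba and pypinyin are installed; the sentence is illustrative):

import jieba.posseg as psg
from pypinyin import lazy_pinyin, Style
from text.tone_sandhi import ToneSandhi

sandhi = ToneSandhi()
seg = sandhi.pre_merge_for_modify(psg.lcut("我想买一个苹果"))
for word, pos in seg:
    finals = lazy_pinyin(word, neutral_tone_with_five=True, style=Style.FINALS_TONE3)
    print(word, sandhi.modified_tone(word, pos, finals))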