yerfor committed on
Commit
02c15bb
1 Parent(s): 08c2768

add data_gen

Browse files
Files changed (33) hide show
  1. data_gen/tts/__pycache__/base_binarizer.cpython-36.pyc +0 -0
  2. data_gen/tts/__pycache__/base_binarizer.cpython-37.pyc +0 -0
  3. data_gen/tts/__pycache__/base_binarizer.cpython-39.pyc +0 -0
  4. data_gen/tts/__pycache__/base_preprocess.cpython-36.pyc +0 -0
  5. data_gen/tts/__pycache__/base_preprocess.cpython-37.pyc +0 -0
  6. data_gen/tts/base_binarizer.py +324 -0
  7. data_gen/tts/base_preprocess.py +251 -0
  8. data_gen/tts/runs/align_and_binarize.py +12 -0
  9. data_gen/tts/runs/binarize.py +17 -0
  10. data_gen/tts/runs/preprocess.py +17 -0
  11. data_gen/tts/runs/train_mfa_align.py +46 -0
  12. data_gen/tts/txt_processors/__init__.py +1 -0
  13. data_gen/tts/txt_processors/__pycache__/__init__.cpython-36.pyc +0 -0
  14. data_gen/tts/txt_processors/__pycache__/__init__.cpython-37.pyc +0 -0
  15. data_gen/tts/txt_processors/__pycache__/base_text_processor.cpython-36.pyc +0 -0
  16. data_gen/tts/txt_processors/__pycache__/base_text_processor.cpython-37.pyc +0 -0
  17. data_gen/tts/txt_processors/__pycache__/en.cpython-36.pyc +0 -0
  18. data_gen/tts/txt_processors/__pycache__/en.cpython-37.pyc +0 -0
  19. data_gen/tts/txt_processors/__pycache__/syntactic_graph_buider.cpython-36.pyc +0 -0
  20. data_gen/tts/txt_processors/__pycache__/zh.cpython-36.pyc +0 -0
  21. data_gen/tts/txt_processors/__pycache__/zh.cpython-37.pyc +0 -0
  22. data_gen/tts/txt_processors/base_text_processor.py +50 -0
  23. data_gen/tts/txt_processors/en.py +78 -0
  24. data_gen/tts/txt_processors/zh.py +110 -0
  25. data_gen/tts/wav_processors/__init__.py +2 -0
  26. data_gen/tts/wav_processors/__pycache__/__init__.cpython-36.pyc +0 -0
  27. data_gen/tts/wav_processors/__pycache__/__init__.cpython-37.pyc +0 -0
  28. data_gen/tts/wav_processors/__pycache__/base_processor.cpython-36.pyc +0 -0
  29. data_gen/tts/wav_processors/__pycache__/base_processor.cpython-37.pyc +0 -0
  30. data_gen/tts/wav_processors/__pycache__/common_processors.cpython-36.pyc +0 -0
  31. data_gen/tts/wav_processors/__pycache__/common_processors.cpython-37.pyc +0 -0
  32. data_gen/tts/wav_processors/base_processor.py +25 -0
  33. data_gen/tts/wav_processors/common_processors.py +86 -0
data_gen/tts/__pycache__/base_binarizer.cpython-36.pyc ADDED
Binary file (11.3 kB). View file
 
data_gen/tts/__pycache__/base_binarizer.cpython-37.pyc ADDED
Binary file (11.2 kB). View file
 
data_gen/tts/__pycache__/base_binarizer.cpython-39.pyc ADDED
Binary file (11.1 kB). View file
 
data_gen/tts/__pycache__/base_preprocess.cpython-36.pyc ADDED
Binary file (10.8 kB). View file
 
data_gen/tts/__pycache__/base_preprocess.cpython-37.pyc ADDED
Binary file (10.8 kB). View file
 
data_gen/tts/base_binarizer.py ADDED
@@ -0,0 +1,324 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ import random
4
+ from re import L
5
+ import traceback
6
+ from functools import partial
7
+
8
+ import numpy as np
9
+ from resemblyzer import VoiceEncoder
10
+ from tqdm import tqdm
11
+
12
+ import utils.commons.single_thread_env # NOQA
13
+ from utils.audio import librosa_wav2spec
14
+ from utils.audio.align import get_mel2ph, mel2token_to_dur
15
+ from utils.audio.cwt import get_lf0_cwt, get_cont_lf0
16
+ from utils.audio.pitch.utils import f0_to_coarse
17
+ from utils.audio.pitch_extractors import extract_pitch_simple
18
+ from utils.commons.hparams import hparams
19
+ from utils.commons.indexed_datasets import IndexedDatasetBuilder
20
+ from utils.commons.multiprocess_utils import multiprocess_run_tqdm
21
+ from utils.os_utils import remove_file, copy_file
22
+
23
+ np.seterr(divide='ignore', invalid='ignore')
24
+
25
+
26
class BinarizationError(Exception):
    """Raised when an item cannot be binarized (e.g. a missing or mismatched alignment)."""
    pass
28
+
29
# Module-level parser handle; assigned in BaseBinarizer.__init__ and read by the
# classmethod process_item (module-level so worker processes can reach it).
sentence2graph_parser = None
30
+
31
+
32
class BaseBinarizer:
    """Binarize preprocessed TTS metadata into indexed binary datasets.

    Reads ``metadata.json`` produced by the preprocessor, extracts mel
    spectrograms, alignments, pitch and syntactic graphs for every item, and
    writes train/valid/test splits with ``IndexedDatasetBuilder``.
    Configuration comes from the global ``hparams``.
    """

    def __init__(self, processed_data_dir=None):
        if processed_data_dir is None:
            processed_data_dir = hparams['processed_data_dir']
        self.processed_data_dir = processed_data_dir
        self.binarization_args = hparams['binarization_args']
        self.items = {}
        # item_names is kept in (possibly shuffled) order and sliced into splits.
        self.item_names = []

        # The parser is stored in a module-level global so that the classmethod
        # `process_item` can reach it (including from worker processes).
        global sentence2graph_parser
        from modules.tts.syntaspeech.syntactic_graph_buider import Sentence2GraphParser

        if hparams['ds_name'] == 'libritts':
            # Unfortunately, we found when processing libritts with multi-processing will incur pytorch.multiprocessing
            # so we use single thread with cuda graph builder
            # it take about 20 hours in a PC with 24-cores-cpu and a RTX2080Ti to process the whole LibriTTS
            # so run the binarization and take a break!
            sentence2graph_parser = Sentence2GraphParser("en", use_gpu=True)
        elif hparams['ds_name'] == 'ljspeech':
            # use multi-processing, thus gpu is disabled
            # it takes about 30 minutes for binarization
            sentence2graph_parser = Sentence2GraphParser("en", use_gpu=False)
        elif hparams['preprocess_args']['txt_processor'] == 'zh':
            # use multi-processing, thus gpu is disabled
            # it takes about 30 minutes for binarization
            sentence2graph_parser = Sentence2GraphParser("zh", use_gpu=False)
        else:
            raise NotImplementedError

    def load_meta_data(self):
        """Load metadata.json into self.items / self.item_names (optionally shuffled)."""
        processed_data_dir = self.processed_data_dir
        items_list = json.load(open(f"{processed_data_dir}/metadata.json"))
        for r in tqdm(items_list, desc='Loading meta data.'):
            item_name = r['item_name']
            self.items[item_name] = r
            self.item_names.append(item_name)
        if self.binarization_args['shuffle']:
            random.seed(1234)  # fixed seed keeps the split assignment reproducible
            random.shuffle(self.item_names)

    @property
    def train_item_names(self):
        range_ = self._convert_range(self.binarization_args['train_range'])
        return self.item_names[range_[0]:range_[1]]

    @property
    def valid_item_names(self):
        range_ = self._convert_range(self.binarization_args['valid_range'])
        return self.item_names[range_[0]:range_[1]]

    @property
    def test_item_names(self):
        range_ = self._convert_range(self.binarization_args['test_range'])
        return self.item_names[range_[0]:range_[1]]

    def _convert_range(self, range_):
        """Replace an end index of -1 with "until the last item" (mutates range_ in place)."""
        if range_[1] == -1:
            range_[1] = len(self.item_names)
        return range_

    def meta_data(self, prefix):
        """Yield the raw metadata dicts belonging to split `prefix` ('train'/'valid'/'test')."""
        if prefix == 'valid':
            item_names = self.valid_item_names
        elif prefix == 'test':
            item_names = self.test_item_names
        else:
            item_names = self.train_item_names
        for item_name in item_names:
            yield self.items[item_name]

    def process(self):
        """Entry point: load metadata, copy dictionary files and binarize all splits."""
        self.load_meta_data()
        os.makedirs(hparams['binary_data_dir'], exist_ok=True)
        for fn in ['phone_set.json', 'word_set.json', 'spk_map.json']:
            remove_file(f"{hparams['binary_data_dir']}/{fn}")
            copy_file(f"{hparams['processed_data_dir']}/{fn}", f"{hparams['binary_data_dir']}/{fn}")
        if hparams['ds_name'] in ['ljspeech', 'biaobei']:
            self.process_data('valid')
            self.process_data('test')
            self.process_data('train')
        elif hparams['ds_name'] in ['libritts']:
            # libritts uses the CUDA graph parser, which cannot be forked into workers.
            self.process_data_single_processing('valid')
            self.process_data_single_processing('test')
            self.process_data_single_processing('train')
        else:
            raise NotImplementedError

    def process_data(self, prefix):
        """Binarize one split using multiple worker processes."""
        data_dir = hparams['binary_data_dir']
        builder = IndexedDatasetBuilder(f'{data_dir}/{prefix}')
        meta_data = list(self.meta_data(prefix))
        process_item = partial(self.process_item, binarization_args=self.binarization_args)
        items = []
        args = [{'item': item} for item in meta_data]
        for item_id, item in multiprocess_run_tqdm(process_item, args, desc='Processing data'):
            if item is not None:
                items.append(item)
        self._add_spk_embeds(items)
        self._write_items(builder, items, data_dir, prefix)

    def process_data_single_processing(self, prefix):
        """Binarize one split in the main process (required for the GPU graph parser)."""
        data_dir = hparams['binary_data_dir']
        builder = IndexedDatasetBuilder(f'{data_dir}/{prefix}')
        meta_data = list(self.meta_data(prefix))
        items = []
        for raw_item in tqdm(meta_data):
            item = self.process_item(raw_item, self.binarization_args)
            if item is None:
                continue
            # The syntactic graph must contain exactly one node per word.
            if item['dgl_graph'].num_nodes() != np.array(item['ph2word']).max():
                print(f"Skip Item: {item['item_name']} word nodes number incorrect!")
                continue
            items.append(item)
        self._add_spk_embeds(items)
        self._write_items(builder, items, data_dir, prefix)

    def _add_spk_embeds(self, items):
        """Attach resemblyzer speaker embeddings to `items` in place (if enabled)."""
        if self.binarization_args['with_spk_embed']:
            args = [{'wav': item['wav']} for item in items]
            for item_id, spk_embed in multiprocess_run_tqdm(
                    self.get_spk_embed, args,
                    init_ctx_func=lambda wid: {'voice_encoder': VoiceEncoder().cuda()}, num_workers=4,
                    desc='Extracting spk embed'):
                items[item_id]['spk_embed'] = spk_embed

    def _write_items(self, builder, items, data_dir, prefix):
        """Write items into the indexed dataset and save length statistics."""
        ph_lengths = []
        mel_lengths = []
        total_sec = 0
        for item in items:
            if not self.binarization_args['with_wav'] and 'wav' in item:
                del item['wav']  # drop the raw waveform to keep the dataset small
            builder.add_item(item)
            mel_lengths.append(item['len'])
            assert item['len'] > 0, (item['item_name'], item['txt'], item['mel2ph'])
            if 'ph_len' in item:
                ph_lengths.append(item['ph_len'])
            total_sec += item['sec']
        builder.finalize()
        np.save(f'{data_dir}/{prefix}_lengths.npy', mel_lengths)
        if len(ph_lengths) > 0:
            np.save(f'{data_dir}/{prefix}_ph_lengths.npy', ph_lengths)
        print(f"| {prefix} total duration: {total_sec:.3f}s")

    @classmethod
    def process_item(cls, item, binarization_args):
        """Turn one metadata dict into a binarizable item; return None to skip it."""
        # Resolve these before any code that can raise, so the error messages in
        # the except blocks below cannot themselves fail with NameError (the
        # original assigned them inside the try, after the first risky line).
        item_name = item.get('item_name')
        wav_fn = item.get('wav_fn')
        try:
            item['ph_len'] = len(item['ph_token'])
            wav, mel = cls.process_audio(wav_fn, item, binarization_args)
        except Exception as e:
            print(f"| Skip item ({e}) for index error. item_name: {item_name}, wav_fn: {wav_fn}")
            return None
        try:
            n_bos_frames, n_eos_frames = 0, 0
            if binarization_args['with_align']:
                tg_fn = f"{hparams['processed_data_dir']}/mfa_outputs/{item_name}.TextGrid"
                item['tg_fn'] = tg_fn
                cls.process_align(tg_fn, item)
                if binarization_args['trim_eos_bos']:
                    # Trim leading/trailing silence from every frame-aligned field.
                    n_bos_frames = item['dur'][0]
                    n_eos_frames = item['dur'][-1]
                    T = len(mel)
                    item['mel'] = mel[n_bos_frames:T - n_eos_frames]
                    item['mel2ph'] = item['mel2ph'][n_bos_frames:T - n_eos_frames]
                    item['mel2word'] = item['mel2word'][n_bos_frames:T - n_eos_frames]
                    item['dur'] = item['dur'][1:-1]
                    item['dur_word'] = item['dur_word'][1:-1]
                    item['len'] = item['mel'].shape[0]
                    item['wav'] = wav[n_bos_frames * hparams['hop_size']:len(wav) - n_eos_frames * hparams['hop_size']]
            if binarization_args['with_f0']:
                cls.process_pitch(item, n_bos_frames, n_eos_frames)
        except BinarizationError as e:
            print(f"| Skip item ({e}). item_name: {item_name}, wav_fn: {wav_fn}")
            return None
        except Exception as e:
            traceback.print_exc()
            print(f"| Skip item. item_name: {item_name}, wav_fn: {wav_fn}")
            return None

        if item['mel'].shape[0] < 128:
            print(f"Skip Item: {item['item_name']} Mel-spectrogram is shorter than 128!")
            return None
        # fix one bad case of stanza
        if item['txt'].endswith('yn .'):
            item['txt'] = item['txt'][:-4]+'y .'
        try:
            language = sentence2graph_parser.language
            if language == 'en':
                dgl_graph, etypes = sentence2graph_parser.parse(item['txt'])
            elif language == 'zh':
                dgl_graph, etypes = sentence2graph_parser.parse(item['txt'], item['word'].split(" "), item['ph_gb_word'].split(" "))
            else:
                raise NotImplementedError
            item['dgl_graph'] = dgl_graph
            item['edge_types'] = etypes
        except Exception:
            # Was a bare `except:`; narrowed so Ctrl-C still aborts the run.
            # Parsing is best-effort: a failed parse skips the item only.
            print(f"| Dependency Parsing Error! Skip item. item_name: {item_name}, wav_fn: {wav_fn}")
            return None
        return item

    @classmethod
    def process_audio(cls, wav_fn, res, binarization_args):
        """Compute mel (and optionally linear) spectrograms; updates `res` in place."""
        wav2spec_dict = librosa_wav2spec(
            wav_fn,
            fft_size=hparams['fft_size'],
            hop_size=hparams['hop_size'],
            win_length=hparams['win_size'],
            num_mels=hparams['audio_num_mel_bins'],
            fmin=hparams['fmin'],
            fmax=hparams['fmax'],
            sample_rate=hparams['audio_sample_rate'],
            loud_norm=hparams['loud_norm'])
        mel = wav2spec_dict['mel']
        wav = wav2spec_dict['wav'].astype(np.float16)  # float16 halves waveform storage
        if binarization_args['with_linear']:
            res['linear'] = wav2spec_dict['linear']
        res.update({'mel': mel, 'wav': wav, 'sec': len(wav) / hparams['audio_sample_rate'], 'len': mel.shape[0]})
        return wav, mel

    @staticmethod
    def process_align(tg_fn, item):
        """Load the MFA TextGrid and derive phone/word-level durations into `item`.

        Raises BinarizationError when the TextGrid is missing or inconsistent
        with the phone sequence.
        """
        ph = item['ph']
        mel = item['mel']
        ph_token = item['ph_token']
        if tg_fn is not None and os.path.exists(tg_fn):
            mel2ph, dur = get_mel2ph(tg_fn, ph, mel, hparams['hop_size'], hparams['audio_sample_rate'],
                                     hparams['binarization_args']['min_sil_duration'])
        else:
            raise BinarizationError(f"Align not found")
        if np.array(mel2ph).max() - 1 >= len(ph_token):
            raise BinarizationError(
                f"Align does not match: mel2ph.max() - 1: {np.array(mel2ph).max() - 1}, len(phone_encoded): {len(ph_token)}")
        item['mel2ph'] = mel2ph
        item['dur'] = dur

        ph2word = item['ph2word']
        # mel2ph is 1-based, hence the p - 1 when indexing ph2word.
        mel2word = [ph2word[p - 1] for p in item['mel2ph']]
        item['mel2word'] = mel2word  # [T_mel]
        dur_word = mel2token_to_dur(mel2word, len(item['word_token']))
        item['dur_word'] = dur_word.tolist()  # [T_word]

    @staticmethod
    def process_pitch(item, n_bos_frames, n_eos_frames):
        """Extract f0 / coarse pitch (and optionally CWT features) into `item`."""
        wav, mel = item['wav'], item['mel']
        f0 = extract_pitch_simple(wav)
        if sum(f0) == 0:
            raise BinarizationError("Empty f0")
        assert len(mel) == len(f0), (len(mel), len(f0))
        pitch_coarse = f0_to_coarse(f0)
        item['f0'] = f0
        item['pitch'] = pitch_coarse
        if hparams['binarization_args']['with_f0cwt']:
            uv, cont_lf0_lpf = get_cont_lf0(f0)
            logf0s_mean_org, logf0s_std_org = np.mean(cont_lf0_lpf), np.std(cont_lf0_lpf)
            cont_lf0_lpf_norm = (cont_lf0_lpf - logf0s_mean_org) / logf0s_std_org
            cwt_spec, scales = get_lf0_cwt(cont_lf0_lpf_norm)
            item['cwt_spec'] = cwt_spec
            item['cwt_mean'] = logf0s_mean_org
            item['cwt_std'] = logf0s_std_org

    @staticmethod
    def get_spk_embed(wav, ctx):
        """Embed an utterance with the voice encoder supplied via worker context."""
        return ctx['voice_encoder'].embed_utterance(wav.astype(float))

    @property
    def num_workers(self):
        # Priority: N_PROC env var > hparams['N_PROC'] > machine CPU count.
        return int(os.getenv('N_PROC', hparams.get('N_PROC', os.cpu_count())))
data_gen/tts/base_preprocess.py ADDED
@@ -0,0 +1,251 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ import random
4
+ import re
5
+ import traceback
6
+ from collections import Counter
7
+ from functools import partial
8
+
9
+ import librosa
10
+ from tqdm import tqdm
11
+ from data_gen.tts.txt_processors.base_text_processor import get_txt_processor_cls
12
+ from data_gen.tts.wav_processors.base_processor import get_wav_processor_cls
13
+ from utils.commons.hparams import hparams
14
+ from utils.commons.multiprocess_utils import multiprocess_run_tqdm
15
+ from utils.os_utils import link_file, move_file, remove_file
16
+ from utils.text.text_encoder import is_sil_phoneme, build_token_encoder
17
+
18
+
19
class BasePreprocessor:
    """Base text/wav preprocessing pipeline.

    Subclasses implement `meta_data`; `process` then normalizes text into
    phonemes, runs the configured wav processors, builds phone/word/speaker
    dictionaries and (optionally) the grouped MFA alignment inputs. All output
    goes under ``hparams['processed_data_dir']``.
    """

    def __init__(self):
        self.preprocess_args = hparams['preprocess_args']
        txt_processor = self.preprocess_args['txt_processor']
        self.txt_processor = get_txt_processor_cls(txt_processor)
        self.raw_data_dir = hparams['raw_data_dir']
        self.processed_dir = hparams['processed_data_dir']
        self.spk_map_fn = f"{self.processed_dir}/spk_map.json"

    def meta_data(self):
        """Yield raw dataset entries.

        :return: {'item_name': Str, 'wav_fn': Str, 'txt': Str, 'spk_name': Str, 'txt_loader': None or Func}
        """
        raise NotImplementedError

    def process(self):
        """Run the full preprocessing pipeline and write `metadata.json`."""
        processed_dir = self.processed_dir
        wav_processed_tmp_dir = f'{processed_dir}/processed_tmp'
        remove_file(wav_processed_tmp_dir)
        os.makedirs(wav_processed_tmp_dir, exist_ok=True)
        wav_processed_dir = f'{processed_dir}/{self.wav_processed_dirname}'
        remove_file(wav_processed_dir)
        os.makedirs(wav_processed_dir, exist_ok=True)

        meta_data = list(tqdm(self.meta_data(), desc='Load meta data'))
        item_names = [d['item_name'] for d in meta_data]
        assert len(item_names) == len(set(item_names)), 'Key `item_name` should be Unique.'

        # preprocess data
        phone_list = []
        word_list = []
        spk_names = set()
        process_item = partial(self.preprocess_first_pass,
                               txt_processor=self.txt_processor,
                               wav_processed_dir=wav_processed_dir,
                               wav_processed_tmp=wav_processed_tmp_dir,
                               preprocess_args=self.preprocess_args)
        items = []
        args = [{
            'item_name': item_raw['item_name'],
            'txt_raw': item_raw['txt'],
            'wav_fn': item_raw['wav_fn'],
            'txt_loader': item_raw.get('txt_loader'),
            'others': item_raw.get('others', None)
        } for item_raw in meta_data]
        for item_, (item_id, item) in zip(meta_data, multiprocess_run_tqdm(process_item, args, desc='Preprocess')):
            if item is not None:
                # Merge first-pass output back into the raw metadata entry.
                item_.update(item)
                item = item_
                if 'txt_loader' in item:
                    del item['txt_loader']  # callables are not JSON-serializable
                item['id'] = item_id
                item['spk_name'] = item.get('spk_name', '<SINGLE_SPK>')
                item['others'] = item.get('others', None)
                phone_list += item['ph'].split(" ")
                word_list += item['word'].split(" ")
                spk_names.add(item['spk_name'])
                items.append(item)

        # add encoded tokens
        ph_encoder, word_encoder = self._phone_encoder(phone_list), self._word_encoder(word_list)
        spk_map = self.build_spk_map(spk_names)
        args = [{
            'ph': item['ph'], 'word': item['word'], 'spk_name': item['spk_name'],
            'word_encoder': word_encoder, 'ph_encoder': ph_encoder, 'spk_map': spk_map
        } for item in items]
        for idx, item_new_kv in multiprocess_run_tqdm(self.preprocess_second_pass, args, desc='Add encoded tokens'):
            items[idx].update(item_new_kv)

        # build mfa data
        if self.preprocess_args['use_mfa']:
            mfa_dict = set()
            mfa_input_dir = f'{processed_dir}/mfa_inputs'
            remove_file(mfa_input_dir)
            # group MFA inputs for better parallelism
            mfa_groups = [i // self.preprocess_args['nsample_per_mfa_group'] for i in range(len(items))]
            if self.preprocess_args['mfa_group_shuffle']:
                random.seed(hparams['seed'])
                random.shuffle(mfa_groups)
            args = [{
                'item': item, 'mfa_input_dir': mfa_input_dir,
                'mfa_group': mfa_group, 'wav_processed_tmp': wav_processed_tmp_dir,
                'preprocess_args': self.preprocess_args
            } for item, mfa_group in zip(items, mfa_groups)]
            for i, (ph_gb_word_nosil, new_wav_align_fn) in multiprocess_run_tqdm(
                    self.build_mfa_inputs, args, desc='Build MFA data'):
                items[i]['wav_align_fn'] = new_wav_align_fn
                for w in ph_gb_word_nosil.split(" "):
                    mfa_dict.add(f"{w} {w.replace('_', ' ')}")
            mfa_dict = sorted(mfa_dict)
            with open(f'{processed_dir}/mfa_dict.txt', 'w') as f:
                f.writelines([f'{l}\n' for l in mfa_dict])
        with open(f"{processed_dir}/{self.meta_csv_filename}.json", 'w') as f:
            # Compact the indented JSON so short list entries stay on one line.
            f.write(re.sub(r'\n\s+([\d+\]])', r'\1', json.dumps(items, ensure_ascii=False, sort_keys=False, indent=1)))
        remove_file(wav_processed_tmp_dir)

    @classmethod
    def preprocess_first_pass(cls, item_name, txt_raw, txt_processor,
                              wav_fn, wav_processed_dir, wav_processed_tmp,
                              preprocess_args, txt_loader=None, others=None):
        """Text normalization + wav processing for one item; returns None on failure."""
        try:
            if txt_loader is not None:
                txt_raw = txt_loader(txt_raw)
            ph, txt, word, ph2word, ph_gb_word = cls.txt_to_ph(txt_processor, txt_raw, preprocess_args)
            wav_fn, wav_align_fn = cls.process_wav(
                item_name, wav_fn,
                hparams['processed_data_dir'],
                wav_processed_tmp, preprocess_args)

            # wav for binarization
            ext = os.path.splitext(wav_fn)[1]
            os.makedirs(wav_processed_dir, exist_ok=True)
            new_wav_fn = f"{wav_processed_dir}/{item_name}{ext}"
            # Move only files produced in the tmp dir; otherwise hard-link the source.
            move_link_func = move_file if os.path.dirname(wav_fn) == wav_processed_tmp else link_file
            move_link_func(wav_fn, new_wav_fn)
            return {
                'txt': txt, 'txt_raw': txt_raw, 'ph': ph,
                'word': word, 'ph2word': ph2word, 'ph_gb_word': ph_gb_word,
                'wav_fn': new_wav_fn, 'wav_align_fn': wav_align_fn,
                'others': others
            }
        except Exception:
            # Was a bare `except:`; narrowed so KeyboardInterrupt still aborts.
            traceback.print_exc()
            print(f"| Error is caught. item_name: {item_name}.")
            return None

    @staticmethod
    def txt_to_ph(txt_processor, txt_raw, preprocess_args):
        """Convert raw text into (phones, normalized txt, words, ph2word, ph grouped by word)."""
        txt_struct, txt = txt_processor.process(txt_raw, preprocess_args)
        ph = [p for w in txt_struct for p in w[1]]
        ph_gb_word = ["_".join(w[1]) for w in txt_struct]
        words = [w[0] for w in txt_struct]
        # word_id=0 is reserved for padding
        ph2word = [w_id + 1 for w_id, w in enumerate(txt_struct) for _ in range(len(w[1]))]
        return " ".join(ph), txt, " ".join(words), ph2word, " ".join(ph_gb_word)

    @staticmethod
    def process_wav(item_name, wav_fn, processed_dir, wav_processed_tmp, preprocess_args):
        """Run the configured wav processor chain; returns (wav_fn, wav_fn_for_align)."""
        processors = [get_wav_processor_cls(v) for v in preprocess_args['wav_processors']]
        processors = [k() for k in processors if k is not None]
        if len(processors) >= 1:
            sr_file = librosa.core.get_samplerate(wav_fn)
            output_fn_for_align = None
            ext = os.path.splitext(wav_fn)[1]
            input_fn = f"{wav_processed_tmp}/{item_name}{ext}"
            link_file(wav_fn, input_fn)
            for p in processors:
                outputs = p.process(input_fn, sr_file, wav_processed_tmp, processed_dir, item_name, preprocess_args)
                if len(outputs) == 3:
                    input_fn, sr, output_fn_for_align = outputs
                else:
                    input_fn, sr = outputs
            return input_fn, output_fn_for_align
        else:
            return wav_fn, wav_fn

    def _phone_encoder(self, ph_set):
        """Build (or reload) the phone dictionary and return a token encoder for it."""
        ph_set_fn = f"{self.processed_dir}/phone_set.json"
        if self.preprocess_args['reset_phone_dict'] or not os.path.exists(ph_set_fn):
            ph_set = sorted(set(ph_set))
            json.dump(ph_set, open(ph_set_fn, 'w'), ensure_ascii=False)
            print("| Build phone set: ", ph_set)
        else:
            ph_set = json.load(open(ph_set_fn, 'r'))
            print("| Load phone set: ", ph_set)
        return build_token_encoder(ph_set_fn)

    def _word_encoder(self, word_set):
        """Build (or reload) the word dictionary (top-k words) and return its encoder."""
        word_set_fn = f"{self.processed_dir}/word_set.json"
        if self.preprocess_args['reset_word_dict']:
            word_set = Counter(word_set)
            total_words = sum(word_set.values())
            word_set = word_set.most_common(hparams['word_dict_size'])
            num_unk_words = total_words - sum([x[1] for x in word_set])
            word_set = ['<BOS>', '<EOS>'] + [x[0] for x in word_set]
            word_set = sorted(set(word_set))
            json.dump(word_set, open(word_set_fn, 'w'), ensure_ascii=False)
            print(f"| Build word set. Size: {len(word_set)}, #total words: {total_words},"
                  f" #unk_words: {num_unk_words}, word_set[:10]:, {word_set[:10]}.")
        else:
            word_set = json.load(open(word_set_fn, 'r'))
            print("| Load word set. Size: ", len(word_set), word_set[:10])
        return build_token_encoder(word_set_fn)

    @classmethod
    def preprocess_second_pass(cls, word, ph, spk_name, word_encoder, ph_encoder, spk_map):
        """Encode words/phones to token ids and map the speaker name to its id."""
        word_token = word_encoder.encode(word)
        ph_token = ph_encoder.encode(ph)
        spk_id = spk_map[spk_name]
        return {'word_token': word_token, 'ph_token': ph_token, 'spk_id': spk_id}

    def build_spk_map(self, spk_names):
        """Assign a dense integer id to each speaker name and persist the mapping."""
        spk_map = {x: i for i, x in enumerate(sorted(list(spk_names)))}
        assert len(spk_map) == 0 or len(spk_map) <= hparams['num_spk'], len(spk_map)
        print(f"| Number of spks: {len(spk_map)}, spk_map: {spk_map}")
        json.dump(spk_map, open(self.spk_map_fn, 'w'), ensure_ascii=False)
        return spk_map

    @classmethod
    def build_mfa_inputs(cls, item, mfa_input_dir, mfa_group, wav_processed_tmp, preprocess_args):
        """Place one item's wav + .lab transcript into its MFA group directory."""
        item_name = item['item_name']
        wav_align_fn = item['wav_align_fn']
        ph_gb_word = item['ph_gb_word']
        ext = os.path.splitext(wav_align_fn)[1]
        mfa_input_group_dir = f'{mfa_input_dir}/{mfa_group}'
        os.makedirs(mfa_input_group_dir, exist_ok=True)
        new_wav_align_fn = f"{mfa_input_group_dir}/{item_name}{ext}"
        move_link_func = move_file if os.path.dirname(wav_align_fn) == wav_processed_tmp else link_file
        move_link_func(wav_align_fn, new_wav_align_fn)
        # MFA must not see silence phonemes; strip them from words and drop
        # all-silence words entirely.
        ph_gb_word_nosil = " ".join(["_".join([p for p in w.split("_") if not is_sil_phoneme(p)])
                                     for w in ph_gb_word.split(" ") if not is_sil_phoneme(w)])
        with open(f'{mfa_input_group_dir}/{item_name}.lab', 'w') as f_txt:
            f_txt.write(ph_gb_word_nosil)
        return ph_gb_word_nosil, new_wav_align_fn

    def load_spk_map(self, base_dir):
        """Load a previously built speaker map from `base_dir`."""
        spk_map_fn = f"{base_dir}/spk_map.json"
        spk_map = json.load(open(spk_map_fn, 'r'))
        return spk_map

    def load_dict(self, base_dir):
        """Load the phone and word token encoders from `base_dir`."""
        ph_encoder = build_token_encoder(f'{base_dir}/phone_set.json')
        word_encoder = build_token_encoder(f'{base_dir}/word_set.json')
        return ph_encoder, word_encoder

    @property
    def meta_csv_filename(self):
        # Basename (without extension) of the output metadata file.
        return 'metadata'

    @property
    def wav_processed_dirname(self):
        # Subdirectory receiving the wavs used for binarization.
        return 'wav_processed'
data_gen/tts/runs/align_and_binarize.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import utils.commons.single_thread_env # NOQA
2
+ from utils.commons.hparams import set_hparams, hparams
3
+ from data_gen.tts.runs.binarize import binarize
4
+ from data_gen.tts.runs.preprocess import preprocess
5
+ from data_gen.tts.runs.train_mfa_align import train_mfa_align
6
+
7
if __name__ == '__main__':
    # Full data pipeline: text/wav preprocessing -> (optional) MFA forced
    # alignment -> binarization into indexed datasets.
    set_hparams()
    preprocess()
    if hparams['preprocess_args']['use_mfa']:
        train_mfa_align()
    binarize()
data_gen/tts/runs/binarize.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import utils.commons.single_thread_env # NOQA
2
+ from utils.commons.hparams import hparams, set_hparams
3
+ import importlib
4
+
5
+
6
def binarize():
    """Resolve the binarizer class configured in hparams and run its `process`.

    The dotted class path defaults to
    ``data_gen.tts.base_binarizer.BaseBinarizer`` and can be overridden via the
    ``binarizer_cls`` hparam.
    """
    cls_path = hparams.get("binarizer_cls", 'data_gen.tts.base_binarizer.BaseBinarizer')
    pkg, _, cls_name = cls_path.rpartition(".")
    binarizer_cls = getattr(importlib.import_module(pkg), cls_name)
    print("| Binarizer: ", binarizer_cls)
    binarizer_cls().process()
13
+
14
+
15
if __name__ == '__main__':
    # Standalone entry point: load hparams from CLI/config, then binarize.
    set_hparams()
    binarize()
data_gen/tts/runs/preprocess.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import utils.commons.single_thread_env # NOQA
2
+ from utils.commons.hparams import hparams, set_hparams
3
+ import importlib
4
+
5
+
6
def preprocess():
    """Resolve the preprocessor class named by hparams['preprocess_cls'] and run it."""
    assert hparams['preprocess_cls'] != ''

    pkg, _, cls_name = hparams["preprocess_cls"].rpartition(".")
    process_cls = getattr(importlib.import_module(pkg), cls_name)
    process_cls().process()
13
+
14
+
15
if __name__ == '__main__':
    # Standalone entry point: load hparams from CLI/config, then preprocess.
    set_hparams()
    preprocess()
data_gen/tts/runs/train_mfa_align.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import utils.commons.single_thread_env # NOQA
2
+ import glob
3
+ import subprocess
4
+ from textgrid import TextGrid
5
+ import os
6
+ from utils.commons.hparams import hparams, set_hparams
7
+
8
+
9
def train_mfa_align(mfa_outputs="mfa_outputs",
                    mfa_inputs="mfa_inputs",
                    model_name=None, pretrain_model_name=None,
                    mfa_cmd='train'):
    """Run the MFA train/align shell script, then optionally shift TextGrid times.

    Parameters are exported to the shell script as environment variables:
    :param mfa_outputs: subdir (under processed_data_dir) receiving TextGrids.
    :param mfa_inputs: subdir containing the grouped MFA wav/.lab inputs.
    :param model_name: optional acoustic model name.
    :param pretrain_model_name: optional pretrained model for adaptation.
    :param mfa_cmd: MFA sub-command to run (e.g. 'train').
    """
    CORPUS = hparams['processed_data_dir'].split("/")[-1]
    NUM_JOB = int(os.getenv('N_PROC', os.cpu_count()))
    env_vars = [f'CORPUS={CORPUS}', f'NUM_JOB={NUM_JOB}']
    if mfa_outputs is not None:
        env_vars.append(f'MFA_OUTPUTS={mfa_outputs}')
    if mfa_inputs is not None:
        env_vars.append(f'MFA_INPUTS={mfa_inputs}')
    if model_name is not None:
        env_vars.append(f'MODEL_NAME={model_name}')
    if pretrain_model_name is not None:
        env_vars.append(f'PRETRAIN_MODEL_NAME={pretrain_model_name}')
    if mfa_cmd is not None:
        env_vars.append(f'MFA_CMD={mfa_cmd}')
    env_str = ' '.join(env_vars)
    print(f"| Run MFA for {CORPUS}. Env vars: {env_str}")
    # NOTE(review): shell=True with interpolated variables; values come from local
    # hparams/args, but keep untrusted strings out of processed_data_dir.
    subprocess.check_call(f'{env_str} bash mfa_usr/run_mfa_train_align.sh', shell=True)
    mfa_offset = hparams['preprocess_args']['mfa_offset']
    if mfa_offset > 0:
        for tg_fn in glob.glob(f'{hparams["processed_data_dir"]}/{mfa_outputs}/*.TextGrid'):
            tg = TextGrid.fromFile(tg_fn)
            max_time = tg.maxTime
            for tier in tg.tiers:
                # Shift every interval right by mfa_offset, clamped to file end.
                for interval in tier.intervals:
                    interval.maxTime = min(interval.maxTime + mfa_offset, max_time)
                    interval.minTime = min(interval.minTime + mfa_offset, max_time)
                # The first interval must still start at 0 after shifting.
                tier.intervals[0].minTime = 0
                tier.maxTime = min(tier.maxTime + mfa_offset, max_time)
            tg.write(tg_fn)
            # Re-parse to verify the rewritten TextGrid is still well-formed.
            TextGrid.fromFile(tg_fn)
42
+
43
+
44
if __name__ == '__main__':
    # Standalone entry point; suppress the hparams dump to keep logs readable.
    set_hparams(print_hparams=False)
    train_mfa_align()
data_gen/tts/txt_processors/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ from . import en, zh
data_gen/tts/txt_processors/__pycache__/__init__.cpython-36.pyc ADDED
Binary file (200 Bytes). View file
 
data_gen/tts/txt_processors/__pycache__/__init__.cpython-37.pyc ADDED
Binary file (206 Bytes). View file
 
data_gen/tts/txt_processors/__pycache__/base_text_processor.cpython-36.pyc ADDED
Binary file (1.82 kB). View file
 
data_gen/tts/txt_processors/__pycache__/base_text_processor.cpython-37.pyc ADDED
Binary file (1.82 kB). View file
 
data_gen/tts/txt_processors/__pycache__/en.cpython-36.pyc ADDED
Binary file (2.66 kB). View file
 
data_gen/tts/txt_processors/__pycache__/en.cpython-37.pyc ADDED
Binary file (2.66 kB). View file
 
data_gen/tts/txt_processors/__pycache__/syntactic_graph_buider.cpython-36.pyc ADDED
Binary file (8.23 kB). View file
 
data_gen/tts/txt_processors/__pycache__/zh.cpython-36.pyc ADDED
Binary file (3.96 kB). View file
 
data_gen/tts/txt_processors/__pycache__/zh.cpython-37.pyc ADDED
Binary file (3.96 kB). View file
 
data_gen/tts/txt_processors/base_text_processor.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from utils.text.text_encoder import is_sil_phoneme
2
+
3
# Global registry mapping a language key (e.g. 'en', 'zh') to its processor class.
REGISTERED_TEXT_PROCESSORS = {}


def register_txt_processors(name):
    """Class decorator that registers a text processor under *name*."""
    def _register(processor_cls):
        REGISTERED_TEXT_PROCESSORS[name] = processor_cls
        return processor_cls

    return _register


def get_txt_processor_cls(name):
    """Return the processor class registered under *name*, or None if unknown."""
    return REGISTERED_TEXT_PROCESSORS.get(name)
16
+
17
+
18
class BaseTxtProcessor:
    """Base class for language-specific text processors.

    Subclasses implement ``process`` and return a ``txt_struct``: a list of
    ``[word, [phoneme, ...]]`` pairs plus the normalized text.
    """

    @staticmethod
    def sp_phonemes():
        # Special separator phonemes for this language ('|' = word separator).
        return ['|']

    @classmethod
    def process(cls, txt, preprocess_args):
        """Convert raw text into ``(txt_struct, normalized_txt)``; must be
        implemented by each language subclass."""
        raise NotImplementedError

    @classmethod
    def postprocess(cls, txt_struct, preprocess_args):
        """Strip edge silences, then optionally add separators and BOS/EOS."""
        # remove sil phoneme in head and tail
        while len(txt_struct) > 0 and is_sil_phoneme(txt_struct[0][0]):
            txt_struct = txt_struct[1:]
        while len(txt_struct) > 0 and is_sil_phoneme(txt_struct[-1][0]):
            txt_struct = txt_struct[:-1]
        if preprocess_args['with_phsep']:
            txt_struct = cls.add_bdr(txt_struct)
        if preprocess_args['add_eos_bos']:
            txt_struct = [["<BOS>", ["<BOS>"]]] + txt_struct + [["<EOS>", ["<EOS>"]]]
        return txt_struct

    @classmethod
    def add_bdr(cls, txt_struct):
        """Formerly inserted '|' boundary tokens between adjacent non-silence
        words; insertion is currently disabled (see comment below), so this
        effectively returns a shallow copy of ``txt_struct``."""
        txt_struct_ = []
        for i, ts in enumerate(txt_struct):
            txt_struct_.append(ts)
            if i != len(txt_struct) - 1 and \
                    not is_sil_phoneme(txt_struct[i][0]) and not is_sil_phoneme(txt_struct[i + 1][0]):
                # txt_struct_.append(['|', ['|']])
                # The sep token is disabled because it is incompatible with the
                # syntactic graph.
                pass
        return txt_struct_
data_gen/tts/txt_processors/en.py ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import unicodedata
3
+
4
+ from g2p_en import G2p
5
+ from g2p_en.expand import normalize_numbers
6
+ from nltk import pos_tag
7
+ from nltk.tokenize import TweetTokenizer
8
+
9
+ from data_gen.tts.txt_processors.base_text_processor import BaseTxtProcessor, register_txt_processors
10
+ from utils.text.text_encoder import PUNCS, is_sil_phoneme
11
+
12
+
13
class EnG2p(G2p):
    """G2p variant that tokenizes with NLTK's TweetTokenizer instead of the
    default word tokenizer."""
    word_tokenize = TweetTokenizer().tokenize

    def __call__(self, text):
        """Return the phoneme sequence for *text*, with a single ' ' token
        between the pronunciations of consecutive words."""
        tagged = pos_tag(EnG2p.word_tokenize(text))  # (word, POS-tag) pairs

        out = []
        for token, tag in tagged:
            if re.search("[a-z]", token) is None:
                # Punctuation / non-alphabetic token: pass through unchanged.
                phones = [token]
            elif token in self.homograph2features:
                # Homograph: pick the pronunciation matching the POS tag.
                default_pron, alt_pron, default_tag = self.homograph2features[token]
                phones = default_pron if tag.startswith(default_tag) else alt_pron
            elif token in self.cmu:
                # In-vocabulary: first CMUdict pronunciation.
                phones = self.cmu[token][0]
            else:
                # Out-of-vocabulary: neural G2P prediction.
                phones = self.predict(token)

            out.extend(phones)
            out.append(" ")

        # Drop the trailing word separator (no-op for empty input).
        return out[:-1]
42
+
43
+
44
@register_txt_processors('en')
class TxtProcessor(BaseTxtProcessor):
    """English text processor: normalizes raw text, then converts it to
    phonemes with g2p_en."""
    g2p = EnG2p()

    @staticmethod
    def preprocess_text(text):
        """Normalize raw English text: expand numbers, strip accents,
        lowercase, collapse punctuation, and pad punctuation with spaces."""
        text = normalize_numbers(text)
        # Strip accents: NFD-decompose, then drop combining marks.
        text = ''.join(char for char in unicodedata.normalize('NFD', text)
                       if unicodedata.category(char) != 'Mn')
        text = text.lower()
        text = re.sub("[\'\"()]+", "", text)
        text = re.sub("[-]+", " ", text)
        text = re.sub(f"[^ a-z{PUNCS}]", "", text)
        text = re.sub(f" ?([{PUNCS}]) ?", r"\1", text)  # "a , b" -> "a,b"
        text = re.sub(f"([{PUNCS}])+", r"\1", text)  # !! -> !
        # Expand common abbreviations.
        # (Fix: a duplicated "i.e." replacement line was removed.)
        text = text.replace("i.e.", "that is")
        text = text.replace("etc.", "etc")
        text = re.sub(f"([{PUNCS}])", r" \1 ", text)  # pad punctuation with spaces
        text = re.sub(r"\s+", " ", text)
        return text

    @classmethod
    def process(cls, txt, preprocess_args):
        """Return ``(txt_struct, normalized_txt)``: the normalized text split
        into words, each paired with its g2p phoneme list."""
        txt = cls.preprocess_text(txt).strip()
        phs = cls.g2p(txt)
        txt_struct = [[w, []] for w in txt.split(" ")]
        # g2p emits a ' ' token between words; use it to advance the word index.
        i_word = 0
        for p in phs:
            if p == ' ':
                i_word += 1
            else:
                txt_struct[i_word][1].append(p)
        txt_struct = cls.postprocess(txt_struct, preprocess_args)
        return txt_struct, txt
data_gen/tts/txt_processors/zh.py ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import jieba
3
+ from pypinyin import pinyin, Style
4
+ from utils.text.text_norm import NSWNormalizer
5
+ from data_gen.tts.txt_processors.base_text_processor import BaseTxtProcessor, register_txt_processors
6
+ from utils.text.text_encoder import PUNCS, is_sil_phoneme
7
+
8
+ ALL_SHENMU = ['zh', 'ch', 'sh', 'b', 'p', 'm', 'f', 'd', 't', 'n', 'l', 'g', 'k', 'h', 'j',
9
+ 'q', 'x', 'r', 'z', 'c', 's', 'y', 'w']
10
+
11
+
12
@register_txt_processors('zh')
class TxtProcessor(BaseTxtProcessor):
    """Chinese text processor: normalizes text, converts characters to pinyin
    initials/finals, and inserts jieba word boundaries."""
    # Full-width -> half-width punctuation/digit translation table.
    table = {ord(f): ord(t) for f, t in zip(
        u':,。!?【】()%#@&1234567890',
        u':,.!?[]()%#@&1234567890')}

    @staticmethod
    def sp_phonemes():
        # '|' separates characters inside a word, '#' marks jieba word boundaries.
        return ['|', '#']

    @staticmethod
    def preprocess_text(text):
        """Normalize raw Chinese text; latin-letter runs collapse to '$'
        (later mapped to the 'ENG' phoneme)."""
        text = text.translate(TxtProcessor.table)
        text = NSWNormalizer(text).normalize(remove_punc=False).lower()
        text = re.sub("[\'\"()]+", "", text)
        text = re.sub("[-]+", " ", text)
        text = re.sub(f"[^ A-Za-z\u4e00-\u9fff{PUNCS}]", "", text)
        text = re.sub(f"([{PUNCS}])+", r"\1", text)  # !! -> !
        text = re.sub(f"([{PUNCS}])", r" \1 ", text)
        text = re.sub(r"\s+", "", text)
        # Collapse every latin-letter run into a single '$' placeholder.
        text = re.sub(r"[A-Za-z]+", "$", text)
        return text

    @classmethod
    def pinyin_with_en(cls, txt, style):
        """Run pypinyin on *txt*, expanding '$' placeholders into 'ENG'."""
        x = pinyin(txt, style)
        x = [t[0] for t in x]
        x_ = []
        for t in x:
            if '$' not in t:
                x_.append(t)
            else:
                x_ += list(t)
        x_ = [t if t != '$' else 'ENG' for t in x_]
        return x_

    @classmethod
    def process(cls, txt, pre_align_args):
        """Return ``(txt_struct, normalized_txt)`` with per-character
        initial/final phonemes."""
        txt = cls.preprocess_text(txt)
        # https://blog.csdn.net/zhoulei124/article/details/89055403
        # NOTE(review): tones are force-enabled here, mutating the caller's
        # dict and overriding any configured value — confirm this is intended.
        pre_align_args['use_tone'] = True
        shengmu = cls.pinyin_with_en(txt, style=Style.INITIALS)
        yunmu = cls.pinyin_with_en(
            txt, style=Style.FINALS_TONE3 if pre_align_args['use_tone'] else Style.FINALS)
        assert len(shengmu) == len(yunmu)
        # Join initial and final with '%' so they can be split apart later; a
        # character whose initial equals its final (e.g. 'ENG', punctuation)
        # stays as a single token.
        ph_list = []
        for a, b in zip(shengmu, yunmu):
            if a == b:
                ph_list += [a]
            else:
                ph_list += [a + "%" + b]
        seg_list = '#'.join(jieba.cut(txt))
        assert len(ph_list) == len([s for s in seg_list if s != '#']), (ph_list, seg_list)

        # Insert word boundary '#' (from jieba) and character boundary '|'.
        ph_list_ = []
        seg_idx = 0
        for p in ph_list:
            if seg_list[seg_idx] == '#':
                ph_list_.append('#')
                seg_idx += 1
            elif len(ph_list_) > 0:
                ph_list_.append("|")
            seg_idx += 1
            # (Fix: removed the dead `finished = False; if not finished:` flag —
            # the phonemes were unconditionally appended.)
            ph_list_ += [x for x in p.split("%") if x != '']

        ph_list = ph_list_

        # Drop word-boundary markers adjacent to silence phonemes,
        # e.g. [..., '#', ',', '#', ...] -> [..., ',', ...].
        sil_phonemes = list(PUNCS) + TxtProcessor.sp_phonemes()
        ph_list_ = []
        for i, ph in enumerate(ph_list):
            if ph != '#':
                ph_list_.append(ph)
                continue
            # Guard the list ends explicitly instead of relying on Python's
            # negative-index wraparound at i == 0.
            prev_ph = ph_list[i - 1] if i > 0 else None
            next_ph = ph_list[i + 1] if i + 1 < len(ph_list) else None
            if prev_ph not in sil_phonemes and next_ph not in sil_phonemes:
                ph_list_.append(ph)
        ph_list = ph_list_

        # Map the flat phoneme list back onto per-character slots.
        txt_struct = [[w, []] for w in txt]
        i = 0
        for ph in ph_list:
            if ph == '|' or ph == '#':
                i += 1
                continue
            elif ph in [',', '.']:
                # Punctuation occupies its own character slot.
                i += 1
                txt_struct[i][1].append(ph)
                i += 1
                continue
            txt_struct[i][1].append(ph)
        txt_struct.insert(0, ['<BOS>', ['<BOS>']])
        txt_struct.append(['<EOS>', ['<EOS>']])
        return txt_struct, txt
105
+
106
+
107
if __name__ == '__main__':
    # Quick manual smoke test mixing Chinese text with English words.
    sample = 'simon演唱过后,simon还进行了simon精彩的文艺演出simon.'
    txt_struct, normalized = TxtProcessor.process(sample, {'use_tone': True})
    print(txt_struct, normalized)
data_gen/tts/wav_processors/__init__.py ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ from . import base_processor
2
+ from . import common_processors
data_gen/tts/wav_processors/__pycache__/__init__.cpython-36.pyc ADDED
Binary file (239 Bytes). View file
 
data_gen/tts/wav_processors/__pycache__/__init__.cpython-37.pyc ADDED
Binary file (245 Bytes). View file
 
data_gen/tts/wav_processors/__pycache__/base_processor.cpython-36.pyc ADDED
Binary file (1.26 kB). View file
 
data_gen/tts/wav_processors/__pycache__/base_processor.cpython-37.pyc ADDED
Binary file (1.26 kB). View file
 
data_gen/tts/wav_processors/__pycache__/common_processors.cpython-36.pyc ADDED
Binary file (3.77 kB). View file
 
data_gen/tts/wav_processors/__pycache__/common_processors.cpython-37.pyc ADDED
Binary file (3.77 kB). View file
 
data_gen/tts/wav_processors/base_processor.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Global registry mapping a processor key (as used in preprocess configs)
# to its wav-processor class.
REGISTERED_WAV_PROCESSORS = {}


def register_wav_processors(name):
    """Class decorator that registers a wav processor under *name*."""
    def _register(processor_cls):
        REGISTERED_WAV_PROCESSORS[name] = processor_cls
        return processor_cls

    return _register


def get_wav_processor_cls(name):
    """Return the wav processor class registered under *name*, or None."""
    return REGISTERED_WAV_PROCESSORS.get(name)
14
+
15
+
16
class BaseWavProcessor:
    """Interface for waveform pre-processing steps (convert, resample,
    trim, denoise, ...)."""

    @property
    def name(self):
        """Short tag appended to output filenames; subclasses must override."""
        raise NotImplementedError

    def output_fn(self, input_fn):
        """Derive the output path: drop the 4-char extension of *input_fn*
        and append '_<name>.wav'."""
        return input_fn[:-4] + '_' + self.name + '.wav'

    def process(self, input_fn, sr, tmp_dir, processed_dir, item_name, preprocess_args):
        """Process one file; by convention implementations return
        ``(output_path, sample_rate)``. Must be overridden."""
        raise NotImplementedError
data_gen/tts/wav_processors/common_processors.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import subprocess
3
+ import librosa
4
+ import numpy as np
5
+ from data_gen.tts.wav_processors.base_processor import BaseWavProcessor, register_wav_processors
6
+ from utils.audio import trim_long_silences
7
+ from utils.audio.io import save_wav
8
+ from utils.audio.rnnoise import rnnoise
9
+ from utils.commons.hparams import hparams
10
+
11
+
12
@register_wav_processors(name='sox_to_wav')
class ConvertToWavProcessor(BaseWavProcessor):
    """Converts any sox-readable audio file to wav; wav inputs pass through
    untouched."""

    @property
    def name(self):
        return 'ToWav'

    def process(self, input_fn, sr, tmp_dir, processed_dir, item_name, preprocess_args):
        # Already a wav file: nothing to do.
        if input_fn[-4:] == '.wav':
            return input_fn, sr
        output_fn = self.output_fn(input_fn)
        # -v 0.95 scales the volume slightly down during conversion.
        subprocess.check_call(f'sox -v 0.95 "{input_fn}" -t wav "{output_fn}"', shell=True)
        return output_fn, sr
25
+
26
+
27
@register_wav_processors(name='sox_resample')
class ResampleProcessor(BaseWavProcessor):
    """Resamples the input to the target sample rate `sr` and trims edge
    silence; files already at `sr` pass through untouched."""

    @property
    def name(self):
        return 'Resample'

    def process(self, input_fn, sr, tmp_dir, processed_dir, item_name, preprocess_args):
        output_fn = self.output_fn(input_fn)
        sr_file = librosa.core.get_samplerate(input_fn)
        if sr != sr_file:
            subprocess.check_call(f'sox -v 0.95 "{input_fn}" -r{sr} "{output_fn}"', shell=True)
            # NOTE(review): the sox output above is immediately overwritten —
            # librosa loads input_fn (resampling in-process) and save_wav then
            # rewrites output_fn. Either the sox call is redundant or this was
            # meant to load output_fn; confirm intent before changing.
            y, _ = librosa.core.load(input_fn, sr=sr)
            y, _ = librosa.effects.trim(y)
            save_wav(y, output_fn, sr)
            return output_fn, sr
        else:
            return input_fn, sr
44
+
45
+
46
@register_wav_processors(name='trim_sil')
class TrimSILProcessor(BaseWavProcessor):
    """Trims leading/trailing silence with ``librosa.effects.trim``."""

    @property
    def name(self):
        return 'TrimSIL'

    def process(self, input_fn, sr, tmp_dir, processed_dir, item_name, preprocess_args):
        """Load the wav at `sr`, trim edge silence, save, and return
        ``(output_path, sr)``."""
        output_fn = self.output_fn(input_fn)
        y, _ = librosa.core.load(input_fn, sr=sr)
        y, _ = librosa.effects.trim(y)
        save_wav(y, output_fn, sr)
        # Fix: return (path, sr) like every other processor, so callers that
        # unpack `fn, sr = processor.process(...)` do not break.
        return output_fn, sr
58
+
59
+
60
@register_wav_processors(name='trim_all_sil')
class TrimAllSILProcessor(BaseWavProcessor):
    """Removes long silences throughout the file via VAD
    (``trim_long_silences``), optionally saving the silence mask."""

    @property
    def name(self):
        # Fix: was 'TrimSIL', colliding with TrimSILProcessor and producing
        # identical output filenames when both processors run on one item.
        return 'TrimAllSIL'

    def process(self, input_fn, sr, tmp_dir, processed_dir, item_name, preprocess_args):
        """Trim silences and return ``(output_path, sr)``; when
        ``preprocess_args['save_sil_mask']`` is set, also save the per-sample
        voice-activity mask under ``<processed_dir>/sil_mask/``."""
        output_fn = self.output_fn(input_fn)
        y, audio_mask, _ = trim_long_silences(
            input_fn, vad_max_silence_length=preprocess_args.get('vad_max_silence_length', 12))
        save_wav(y, output_fn, sr)
        if preprocess_args['save_sil_mask']:
            os.makedirs(f'{processed_dir}/sil_mask', exist_ok=True)
            np.save(f'{processed_dir}/sil_mask/{item_name}.npy', audio_mask)
        return output_fn, sr
75
+
76
+
77
@register_wav_processors(name='denoise')
class DenoiseProcessor(BaseWavProcessor):
    """Denoises the waveform with RNNoise."""

    @property
    def name(self):
        return 'Denoise'

    def process(self, input_fn, sr, tmp_dir, processed_dir, item_name, preprocess_args):
        denoised_fn = self.output_fn(input_fn)
        rnnoise(input_fn, denoised_fn, out_sample_rate=sr)
        return denoised_fn, sr
+ return output_fn, sr