keithhon committed on
Commit 13663d5
1 Parent(s): dc50595

Upload synthesizer/preprocess.py with huggingface_hub

Files changed (1)
  1. synthesizer/preprocess.py +259 -0
synthesizer/preprocess.py ADDED
@@ -0,0 +1,259 @@
+from multiprocessing.pool import Pool
+from synthesizer import audio
+from functools import partial
+from itertools import chain
+from encoder import inference as encoder
+from pathlib import Path
+from utils import logmmse
+from tqdm import tqdm
+import numpy as np
+import librosa
+
+
+def preprocess_dataset(datasets_root: Path, out_dir: Path, n_processes: int,
+                       skip_existing: bool, hparams, no_alignments: bool,
+                       datasets_name: str, subfolders: str):
+    # Gather the input directories
+    dataset_root = datasets_root.joinpath(datasets_name)
+    input_dirs = [dataset_root.joinpath(subfolder.strip()) for subfolder in subfolders.split(",")]
+    print("\n ".join(map(str, ["Using data from:"] + input_dirs)))
+    assert all(input_dir.exists() for input_dir in input_dirs)
+
+    # Create the output directories for each output file type
+    out_dir.joinpath("mels").mkdir(exist_ok=True)
+    out_dir.joinpath("audio").mkdir(exist_ok=True)
+
+    # Create a metadata file
+    metadata_fpath = out_dir.joinpath("train.txt")
+    metadata_file = metadata_fpath.open("a" if skip_existing else "w", encoding="utf-8")
+
+    # Preprocess the dataset
+    speaker_dirs = list(chain.from_iterable(input_dir.glob("*") for input_dir in input_dirs))
+    func = partial(preprocess_speaker, out_dir=out_dir, skip_existing=skip_existing,
+                   hparams=hparams, no_alignments=no_alignments)
+    job = Pool(n_processes).imap(func, speaker_dirs)
+    for speaker_metadata in tqdm(job, datasets_name, len(speaker_dirs), unit="speakers"):
+        for metadatum in speaker_metadata:
+            metadata_file.write("|".join(str(x) for x in metadatum) + "\n")
+    metadata_file.close()
+
+    # Verify the contents of the metadata file
+    with metadata_fpath.open("r", encoding="utf-8") as metadata_file:
+        metadata = [line.split("|") for line in metadata_file]
+        mel_frames = sum([int(m[4]) for m in metadata])
+        timesteps = sum([int(m[3]) for m in metadata])
+        sample_rate = hparams.sample_rate
+        hours = (timesteps / sample_rate) / 3600
+        print("The dataset consists of %d utterances, %d mel frames, %d audio timesteps (%.2f hours)." %
+              (len(metadata), mel_frames, timesteps, hours))
+        print("Max input length (text chars): %d" % max(len(m[5]) for m in metadata))
+        print("Max mel frames length: %d" % max(int(m[4]) for m in metadata))
+        print("Max audio timesteps length: %d" % max(int(m[3]) for m in metadata))
+
+
+def preprocess_speaker(speaker_dir, out_dir: Path, skip_existing: bool, hparams, no_alignments: bool):
+    metadata = []
+    for book_dir in speaker_dir.glob("*"):
+        if no_alignments:
+            # Gather the utterance audios and texts
+            # LibriTTS uses .wav but we will include extensions for compatibility with other datasets
+            extensions = ["*.wav", "*.flac", "*.mp3"]
+            for extension in extensions:
+                wav_fpaths = book_dir.glob(extension)
+
+                for wav_fpath in wav_fpaths:
+                    # Load the audio waveform
+                    wav, _ = librosa.load(str(wav_fpath), sr=hparams.sample_rate)
+                    if hparams.rescale:
+                        wav = wav / np.abs(wav).max() * hparams.rescaling_max
+
+                    # Get the corresponding text
+                    # Check for .txt (for compatibility with other datasets)
+                    text_fpath = wav_fpath.with_suffix(".txt")
+                    if not text_fpath.exists():
+                        # Check for .normalized.txt (LibriTTS)
+                        text_fpath = wav_fpath.with_suffix(".normalized.txt")
+                        assert text_fpath.exists()
+                    with text_fpath.open("r") as text_file:
+                        text = "".join([line for line in text_file])
+                        text = text.replace("\"", "")
+                        text = text.strip()
+
+                    # Process the utterance
+                    metadata.append(process_utterance(wav, text, out_dir, str(wav_fpath.with_suffix("").name),
+                                                      skip_existing, hparams))
+        else:
+            # Process alignment file (LibriSpeech support)
+            # Gather the utterance audios and texts
+            try:
+                alignments_fpath = next(book_dir.glob("*.alignment.txt"))
+                with alignments_fpath.open("r") as alignments_file:
+                    alignments = [line.rstrip().split(" ") for line in alignments_file]
+            except StopIteration:
+                # A few alignment files will be missing
+                continue
+
+            # Iterate over each entry in the alignments file
+            for wav_fname, words, end_times in alignments:
+                wav_fpath = book_dir.joinpath(wav_fname + ".flac")
+                assert wav_fpath.exists()
+                words = words.replace("\"", "").split(",")
+                end_times = list(map(float, end_times.replace("\"", "").split(",")))
+
+                # Process each sub-utterance
+                wavs, texts = split_on_silences(wav_fpath, words, end_times, hparams)
+                for i, (wav, text) in enumerate(zip(wavs, texts)):
+                    sub_basename = "%s_%02d" % (wav_fname, i)
+                    metadata.append(process_utterance(wav, text, out_dir, sub_basename,
+                                                      skip_existing, hparams))
+
+    return [m for m in metadata if m is not None]
+
+
+def split_on_silences(wav_fpath, words, end_times, hparams):
+    # Load the audio waveform
+    wav, _ = librosa.load(str(wav_fpath), sr=hparams.sample_rate)
+    if hparams.rescale:
+        wav = wav / np.abs(wav).max() * hparams.rescaling_max
+
+    words = np.array(words)
+    start_times = np.array([0.0] + end_times[:-1])
+    end_times = np.array(end_times)
+    assert len(words) == len(end_times) == len(start_times)
+    assert words[0] == "" and words[-1] == ""
+
+    # Find pauses that are too long
+    mask = (words == "") & (end_times - start_times >= hparams.silence_min_duration_split)
+    mask[0] = mask[-1] = True
+    breaks = np.where(mask)[0]
+
+    # Profile the noise from the silences and perform noise reduction on the waveform
+    silence_times = [[start_times[i], end_times[i]] for i in breaks]
+    silence_times = (np.array(silence_times) * hparams.sample_rate).astype(int)
+    noisy_wav = np.concatenate([wav[stime[0]:stime[1]] for stime in silence_times])
+    if len(noisy_wav) > hparams.sample_rate * 0.02:
+        profile = logmmse.profile_noise(noisy_wav, hparams.sample_rate)
+        wav = logmmse.denoise(wav, profile, eta=0)
+
+    # Re-attach segments that are too short
+    segments = list(zip(breaks[:-1], breaks[1:]))
+    segment_durations = [start_times[end] - end_times[start] for start, end in segments]
+    i = 0
+    while i < len(segments) and len(segments) > 1:
+        if segment_durations[i] < hparams.utterance_min_duration:
+            # See if the segment can be re-attached with the right or the left segment
+            left_duration = float("inf") if i == 0 else segment_durations[i - 1]
+            right_duration = float("inf") if i == len(segments) - 1 else segment_durations[i + 1]
+            joined_duration = segment_durations[i] + min(left_duration, right_duration)
+
+            # Do not re-attach if it causes the joined utterance to be too long
+            if joined_duration > hparams.hop_size * hparams.max_mel_frames / hparams.sample_rate:
+                i += 1
+                continue
+
+            # Re-attach the segment with the neighbour of shortest duration
+            j = i - 1 if left_duration <= right_duration else i
+            segments[j] = (segments[j][0], segments[j + 1][1])
+            segment_durations[j] = joined_duration
+            del segments[j + 1], segment_durations[j + 1]
+        else:
+            i += 1
+
+    # Split the utterance
+    segment_times = [[end_times[start], start_times[end]] for start, end in segments]
+    segment_times = (np.array(segment_times) * hparams.sample_rate).astype(int)
+    wavs = [wav[segment_time[0]:segment_time[1]] for segment_time in segment_times]
+    texts = [" ".join(words[start + 1:end]).replace("  ", " ") for start, end in segments]
+
+    # # DEBUG: play the audio segments (run with -n=1)
+    # import sounddevice as sd
+    # if len(wavs) > 1:
+    #     print("This sentence was split in %d segments:" % len(wavs))
+    # else:
+    #     print("There are no silences long enough for this sentence to be split:")
+    # for wav, text in zip(wavs, texts):
+    #     # Pad the waveform with 1 second of silence because sounddevice tends to cut them early
+    #     # when playing them. You shouldn't need to do that in your parsers.
+    #     wav = np.concatenate((wav, [0] * 16000))
+    #     print("\t%s" % text)
+    #     sd.play(wav, 16000, blocking=True)
+    # print("")
+
+    return wavs, texts
+
+
+def process_utterance(wav: np.ndarray, text: str, out_dir: Path, basename: str,
+                      skip_existing: bool, hparams):
+    ## FOR REFERENCE:
+    # For you not to lose your head if you ever wish to change things here or implement your own
+    # synthesizer.
+    # - Both the audios and the mel spectrograms are saved as numpy arrays
+    # - There is no processing done to the audios that will be saved to disk beyond volume
+    #   normalization (in split_on_silences)
+    # - However, pre-emphasis is applied to the audios before computing the mel spectrogram. This
+    #   is why we re-apply it on the audio on the side of the vocoder.
+    # - Librosa pads the waveform before computing the mel spectrogram. Here, the waveform is saved
+    #   without extra padding. This means that you won't have an exact relation between the length
+    #   of the wav and of the mel spectrogram. See the vocoder data loader.
+
+
+    # Skip existing utterances if needed
+    mel_fpath = out_dir.joinpath("mels", "mel-%s.npy" % basename)
+    wav_fpath = out_dir.joinpath("audio", "audio-%s.npy" % basename)
+    if skip_existing and mel_fpath.exists() and wav_fpath.exists():
+        return None
+
+    # Trim silence
+    if hparams.trim_silence:
+        wav = encoder.preprocess_wav(wav, normalize=False, trim_silence=True)
+
+    # Skip utterances that are too short
+    if len(wav) < hparams.utterance_min_duration * hparams.sample_rate:
+        return None
+
+    # Compute the mel spectrogram
+    mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32)
+    mel_frames = mel_spectrogram.shape[1]
+
+    # Skip utterances that are too long
+    if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length:
+        return None
+
+    # Write the spectrogram, embed and audio to disk
+    np.save(mel_fpath, mel_spectrogram.T, allow_pickle=False)
+    np.save(wav_fpath, wav, allow_pickle=False)
+
+    # Return a tuple describing this training example
+    return wav_fpath.name, mel_fpath.name, "embed-%s.npy" % basename, len(wav), mel_frames, text
+
+
+def embed_utterance(fpaths, encoder_model_fpath):
+    if not encoder.is_loaded():
+        encoder.load_model(encoder_model_fpath)
+
+    # Compute the speaker embedding of the utterance
+    wav_fpath, embed_fpath = fpaths
+    wav = np.load(wav_fpath)
+    wav = encoder.preprocess_wav(wav)
+    embed = encoder.embed_utterance(wav)
+    np.save(embed_fpath, embed, allow_pickle=False)
+
+
+def create_embeddings(synthesizer_root: Path, encoder_model_fpath: Path, n_processes: int):
+    wav_dir = synthesizer_root.joinpath("audio")
+    metadata_fpath = synthesizer_root.joinpath("train.txt")
+    assert wav_dir.exists() and metadata_fpath.exists()
+    embed_dir = synthesizer_root.joinpath("embeds")
+    embed_dir.mkdir(exist_ok=True)
+
+    # Gather the input wave filepath and the target output embed filepath
+    with metadata_fpath.open("r") as metadata_file:
+        metadata = [line.split("|") for line in metadata_file]
+        fpaths = [(wav_dir.joinpath(m[0]), embed_dir.joinpath(m[2])) for m in metadata]
+
+    # TODO: improve on the multiprocessing, it's terrible. Disk I/O is the bottleneck here.
+    # Embed the utterances in separate threads
+    func = partial(embed_utterance, encoder_model_fpath=encoder_model_fpath)
+    job = Pool(n_processes).imap(func, fpaths)
+    list(tqdm(job, "Embedding", len(fpaths), unit="utterances"))
+
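
For context, here is a minimal usage sketch (not part of the uploaded file) of how the two entry points above are typically driven. It assumes the repository layout implied by the imports, in particular that synthesizer.hparams exposes an hparams object and that a trained encoder checkpoint is available; the paths, dataset name, and process counts below are illustrative only.

# Hypothetical driver script; synthesizer.hparams and the checkpoint path are assumptions.
from pathlib import Path
from synthesizer.hparams import hparams
from synthesizer.preprocess import preprocess_dataset, create_embeddings

datasets_root = Path("datasets")                        # e.g. contains LibriSpeech/train-clean-100
synthesizer_root = Path("datasets/SV2TTS/synthesizer")  # output directory for mels, audio and embeds
synthesizer_root.mkdir(parents=True, exist_ok=True)

# Step 1: split/denoise the utterances, save wavs and mel spectrograms, write train.txt
preprocess_dataset(datasets_root=datasets_root,
                   out_dir=synthesizer_root,
                   n_processes=4,
                   skip_existing=True,
                   hparams=hparams,
                   no_alignments=False,                 # False: expect LibriSpeech *.alignment.txt files
                   datasets_name="LibriSpeech",
                   subfolders="train-clean-100, train-clean-360")

# Step 2: compute one speaker embedding per saved utterance
create_embeddings(synthesizer_root=synthesizer_root,
                  encoder_model_fpath=Path("encoder/saved_models/pretrained.pt"),
                  n_processes=4)

In this flow, preprocess_dataset produces the mels/ and audio/ folders plus train.txt, which create_embeddings then reads to fill the embeds/ folder referenced by the metadata.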