Upload synthesizer/preprocess.py with huggingface_hub
synthesizer/preprocess.py
ADDED
@@ -0,0 +1,259 @@
from multiprocessing.pool import Pool
from synthesizer import audio
from functools import partial
from itertools import chain
from encoder import inference as encoder
from pathlib import Path
from utils import logmmse
from tqdm import tqdm
import numpy as np
import librosa


def preprocess_dataset(datasets_root: Path, out_dir: Path, n_processes: int,
                       skip_existing: bool, hparams, no_alignments: bool,
                       datasets_name: str, subfolders: str):
    # Gather the input directories
    dataset_root = datasets_root.joinpath(datasets_name)
    input_dirs = [dataset_root.joinpath(subfolder.strip()) for subfolder in subfolders.split(",")]
    print("\n    ".join(map(str, ["Using data from:"] + input_dirs)))
    assert all(input_dir.exists() for input_dir in input_dirs)

    # Create the output directories for each output file type
    out_dir.joinpath("mels").mkdir(exist_ok=True)
    out_dir.joinpath("audio").mkdir(exist_ok=True)

    # Create a metadata file
    metadata_fpath = out_dir.joinpath("train.txt")
    metadata_file = metadata_fpath.open("a" if skip_existing else "w", encoding="utf-8")

    # Preprocess the dataset
    speaker_dirs = list(chain.from_iterable(input_dir.glob("*") for input_dir in input_dirs))
    func = partial(preprocess_speaker, out_dir=out_dir, skip_existing=skip_existing,
                   hparams=hparams, no_alignments=no_alignments)
    job = Pool(n_processes).imap(func, speaker_dirs)
    for speaker_metadata in tqdm(job, datasets_name, len(speaker_dirs), unit="speakers"):
        for metadatum in speaker_metadata:
            metadata_file.write("|".join(str(x) for x in metadatum) + "\n")
    metadata_file.close()

    # Verify the contents of the metadata file
    with metadata_fpath.open("r", encoding="utf-8") as metadata_file:
        metadata = [line.split("|") for line in metadata_file]
        mel_frames = sum([int(m[4]) for m in metadata])
        timesteps = sum([int(m[3]) for m in metadata])
        sample_rate = hparams.sample_rate
        hours = (timesteps / sample_rate) / 3600
        print("The dataset consists of %d utterances, %d mel frames, %d audio timesteps (%.2f hours)." %
              (len(metadata), mel_frames, timesteps, hours))
    print("Max input length (text chars): %d" % max(len(m[5]) for m in metadata))
    print("Max mel frames length: %d" % max(int(m[4]) for m in metadata))
    print("Max audio timesteps length: %d" % max(int(m[3]) for m in metadata))


def preprocess_speaker(speaker_dir, out_dir: Path, skip_existing: bool, hparams, no_alignments: bool):
    metadata = []
    for book_dir in speaker_dir.glob("*"):
        if no_alignments:
            # Gather the utterance audios and texts
            # LibriTTS uses .wav but we will include extensions for compatibility with other datasets
            extensions = ["*.wav", "*.flac", "*.mp3"]
            for extension in extensions:
                wav_fpaths = book_dir.glob(extension)

                for wav_fpath in wav_fpaths:
                    # Load the audio waveform (sr is keyword-only in recent librosa versions)
                    wav, _ = librosa.load(str(wav_fpath), sr=hparams.sample_rate)
                    if hparams.rescale:
                        wav = wav / np.abs(wav).max() * hparams.rescaling_max

                    # Get the corresponding text
                    # Check for .txt (for compatibility with other datasets)
                    text_fpath = wav_fpath.with_suffix(".txt")
                    if not text_fpath.exists():
                        # Check for .normalized.txt (LibriTTS)
                        text_fpath = wav_fpath.with_suffix(".normalized.txt")
                        assert text_fpath.exists()
                    with text_fpath.open("r") as text_file:
                        text = "".join([line for line in text_file])
                        text = text.replace("\"", "")
                        text = text.strip()

                    # Process the utterance
                    metadata.append(process_utterance(wav, text, out_dir, str(wav_fpath.with_suffix("").name),
                                                      skip_existing, hparams))
        else:
            # Process alignment file (LibriSpeech support)
            # Gather the utterance audios and texts
            try:
                alignments_fpath = next(book_dir.glob("*.alignment.txt"))
                with alignments_fpath.open("r") as alignments_file:
                    alignments = [line.rstrip().split(" ") for line in alignments_file]
            except StopIteration:
                # A few alignment files will be missing
                continue

            # Iterate over each entry in the alignments file
            for wav_fname, words, end_times in alignments:
                wav_fpath = book_dir.joinpath(wav_fname + ".flac")
                assert wav_fpath.exists()
                words = words.replace("\"", "").split(",")
                end_times = list(map(float, end_times.replace("\"", "").split(",")))

                # Process each sub-utterance
                wavs, texts = split_on_silences(wav_fpath, words, end_times, hparams)
                for i, (wav, text) in enumerate(zip(wavs, texts)):
                    sub_basename = "%s_%02d" % (wav_fname, i)
                    metadata.append(process_utterance(wav, text, out_dir, sub_basename,
                                                      skip_existing, hparams))

    return [m for m in metadata if m is not None]


def split_on_silences(wav_fpath, words, end_times, hparams):
    # Load the audio waveform (sr is keyword-only in recent librosa versions)
    wav, _ = librosa.load(str(wav_fpath), sr=hparams.sample_rate)
    if hparams.rescale:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    words = np.array(words)
    start_times = np.array([0.0] + end_times[:-1])
    end_times = np.array(end_times)
    assert len(words) == len(end_times) == len(start_times)
    assert words[0] == "" and words[-1] == ""

    # Find pauses that are too long
    mask = (words == "") & (end_times - start_times >= hparams.silence_min_duration_split)
    mask[0] = mask[-1] = True
    breaks = np.where(mask)[0]

    # Profile the noise from the silences and perform noise reduction on the waveform
    silence_times = [[start_times[i], end_times[i]] for i in breaks]
    silence_times = (np.array(silence_times) * hparams.sample_rate).astype(np.int64)
    noisy_wav = np.concatenate([wav[stime[0]:stime[1]] for stime in silence_times])
    if len(noisy_wav) > hparams.sample_rate * 0.02:
        profile = logmmse.profile_noise(noisy_wav, hparams.sample_rate)
        wav = logmmse.denoise(wav, profile, eta=0)

    # Re-attach segments that are too short
    segments = list(zip(breaks[:-1], breaks[1:]))
    segment_durations = [start_times[end] - end_times[start] for start, end in segments]
    i = 0
    while i < len(segments) and len(segments) > 1:
        if segment_durations[i] < hparams.utterance_min_duration:
            # See if the segment can be re-attached with the right or the left segment
            left_duration = float("inf") if i == 0 else segment_durations[i - 1]
            right_duration = float("inf") if i == len(segments) - 1 else segment_durations[i + 1]
            joined_duration = segment_durations[i] + min(left_duration, right_duration)

            # Do not re-attach if it causes the joined utterance to be too long
            if joined_duration > hparams.hop_size * hparams.max_mel_frames / hparams.sample_rate:
                i += 1
                continue

            # Re-attach the segment with the neighbour of shortest duration
            j = i - 1 if left_duration <= right_duration else i
            segments[j] = (segments[j][0], segments[j + 1][1])
            segment_durations[j] = joined_duration
            del segments[j + 1], segment_durations[j + 1]
        else:
            i += 1

    # Split the utterance
    segment_times = [[end_times[start], start_times[end]] for start, end in segments]
    segment_times = (np.array(segment_times) * hparams.sample_rate).astype(np.int64)
    wavs = [wav[segment_time[0]:segment_time[1]] for segment_time in segment_times]
    texts = [" ".join(words[start + 1:end]).replace("  ", " ") for start, end in segments]

    # # DEBUG: play the audio segments (run with -n=1)
    # import sounddevice as sd
    # if len(wavs) > 1:
    #     print("This sentence was split in %d segments:" % len(wavs))
    # else:
    #     print("There are no silences long enough for this sentence to be split:")
    # for wav, text in zip(wavs, texts):
    #     # Pad the waveform with 1 second of silence because sounddevice tends to cut them early
    #     # when playing them. You shouldn't need to do that in your parsers.
    #     wav = np.concatenate((wav, [0] * 16000))
    #     print("\t%s" % text)
    #     sd.play(wav, 16000, blocking=True)
    # print("")

    return wavs, texts


def process_utterance(wav: np.ndarray, text: str, out_dir: Path, basename: str,
                      skip_existing: bool, hparams):
    ## FOR REFERENCE:
    # For you not to lose your head if you ever wish to change things here or implement your own
    # synthesizer.
    # - Both the audios and the mel spectrograms are saved as numpy arrays
    # - There is no processing done to the audios that will be saved to disk beyond volume
    #   normalization (in split_on_silences)
    # - However, pre-emphasis is applied to the audios before computing the mel spectrogram. This
    #   is why we re-apply it on the audio on the side of the vocoder.
    # - Librosa pads the waveform before computing the mel spectrogram. Here, the waveform is saved
    #   without extra padding. This means that you won't have an exact relation between the length
    #   of the wav and of the mel spectrogram. See the vocoder data loader.

    # Skip existing utterances if needed
    mel_fpath = out_dir.joinpath("mels", "mel-%s.npy" % basename)
    wav_fpath = out_dir.joinpath("audio", "audio-%s.npy" % basename)
    if skip_existing and mel_fpath.exists() and wav_fpath.exists():
        return None

    # Trim silence
    if hparams.trim_silence:
        wav = encoder.preprocess_wav(wav, normalize=False, trim_silence=True)

    # Skip utterances that are too short
    if len(wav) < hparams.utterance_min_duration * hparams.sample_rate:
        return None

    # Compute the mel spectrogram
    mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32)
    mel_frames = mel_spectrogram.shape[1]

    # Skip utterances that are too long
    if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length:
        return None

    # Write the spectrogram, embed and audio to disk
    np.save(mel_fpath, mel_spectrogram.T, allow_pickle=False)
    np.save(wav_fpath, wav, allow_pickle=False)

    # Return a tuple describing this training example
    return wav_fpath.name, mel_fpath.name, "embed-%s.npy" % basename, len(wav), mel_frames, text


def embed_utterance(fpaths, encoder_model_fpath):
    if not encoder.is_loaded():
        encoder.load_model(encoder_model_fpath)

    # Compute the speaker embedding of the utterance
    wav_fpath, embed_fpath = fpaths
    wav = np.load(wav_fpath)
    wav = encoder.preprocess_wav(wav)
    embed = encoder.embed_utterance(wav)
    np.save(embed_fpath, embed, allow_pickle=False)


def create_embeddings(synthesizer_root: Path, encoder_model_fpath: Path, n_processes: int):
    wav_dir = synthesizer_root.joinpath("audio")
    metadata_fpath = synthesizer_root.joinpath("train.txt")
    assert wav_dir.exists() and metadata_fpath.exists()
    embed_dir = synthesizer_root.joinpath("embeds")
    embed_dir.mkdir(exist_ok=True)

    # Gather the input wave filepath and the target output embed filepath
    with metadata_fpath.open("r") as metadata_file:
        metadata = [line.split("|") for line in metadata_file]
        fpaths = [(wav_dir.joinpath(m[0]), embed_dir.joinpath(m[2])) for m in metadata]

    # TODO: improve on the multiprocessing, it's terrible. Disk I/O is the bottleneck here.
    # Embed the utterances in separate processes
    func = partial(embed_utterance, encoder_model_fpath=encoder_model_fpath)
    job = Pool(n_processes).imap(func, fpaths)
    list(tqdm(job, "Embedding", len(fpaths), unit="utterances"))
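
For orientation, the sketch below shows how the two entry points in this file might be driven end to end. It is a minimal, hypothetical example: the dataset name, folder layout, process count, encoder checkpoint path and the synthesizer.hparams import are assumptions in the style of a typical Real-Time-Voice-Cloning setup, not something defined by this commit.

from pathlib import Path

from synthesizer.hparams import hparams               # assumed location of the hparams object
from synthesizer.preprocess import preprocess_dataset, create_embeddings

if __name__ == "__main__":  # multiprocessing.Pool needs a main guard on spawn-based platforms
    datasets_root = Path("datasets")                   # assumed: contains LibriSpeech/train-clean-100, ...
    synthesizer_root = Path("datasets/SV2TTS/synthesizer")
    synthesizer_root.mkdir(parents=True, exist_ok=True)

    # Step 1: write audio-*.npy / mel-*.npy files plus the train.txt metadata file.
    preprocess_dataset(datasets_root=datasets_root,
                       out_dir=synthesizer_root,
                       n_processes=4,
                       skip_existing=True,
                       hparams=hparams,
                       no_alignments=False,            # LibriSpeech-style *.alignment.txt files are present
                       datasets_name="LibriSpeech",
                       subfolders="train-clean-100, train-clean-360")

    # Step 2: compute one speaker embedding per saved utterance into the embeds/ folder.
    create_embeddings(synthesizer_root=synthesizer_root,
                      encoder_model_fpath=Path("encoder/saved_models/pretrained.pt"),  # assumed checkpoint path
                      n_processes=4)

Each line that preprocess_dataset appends to train.txt is the pipe-separated tuple returned by process_utterance (audio-<basename>.npy|mel-<basename>.npy|embed-<basename>.npy|<num samples>|<num mel frames>|<text>); create_embeddings then reads columns 0 and 2 of that file to pair every saved waveform with its target embedding path.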