Support generating long clips
Generate short clips at paragraph level and then combine them.
app.py
CHANGED
@@ -134,7 +134,7 @@ def text_to_phone_idx(text):
     return tokens


-def text_to_speech(text):
+def text_to_speech(duration_net, generator, text):
     # prevent too long text
     if len(text) > 500:
         text = text[:500]
@@ -146,9 +146,6 @@ def text_to_speech(text):
     }

     # predict phoneme duration
-    duration_net = DurationNet(hps.data.vocab_size, 64, 4).to(device)
-    duration_net.load_state_dict(torch.load(duration_model_path, map_location=device))
-    duration_net = duration_net.eval()
     phone_length = torch.from_numpy(batch["phone_length"].copy()).long().to(device)
     phone_idx = torch.from_numpy(batch["phone_idx"].copy()).long().to(device)
     with torch.inference_mode():
@@ -158,24 +155,7 @@ def text_to_speech(text):
     )
     phone_duration = torch.where(phone_idx == 0, 0, phone_duration)

-    generator = SynthesizerTrn(
-        hps.data.vocab_size,
-        hps.data.filter_length // 2 + 1,
-        hps.train.segment_size // hps.data.hop_length,
-        **vars(hps.model),
-    ).to(device)
-    del generator.enc_q
-    ckpt = torch.load(lightspeed_model_path, map_location=device)
-    params = {}
-    for k, v in ckpt["net_g"].items():
-        k = k[7:] if k.startswith("module.") else k
-        params[k] = v
-    generator.load_state_dict(params, strict=False)
-    del ckpt, params
-    generator = generator.eval()
-    # mininum 1 frame for each phone
-    # phone_duration = torch.clamp_min(phone_duration, hps.data.hop_length * 1000 / hps.data.sampling_rate)
-    # phone_duration = torch.where(phone_idx == 0, 0, phone_duration)
+    # generate waveform
     end_time = torch.cumsum(phone_duration, dim=-1)
     start_time = end_time - phone_duration
     start_frame = start_time / 1000 * hps.data.sampling_rate / hps.data.hop_length
@@ -194,8 +174,40 @@ def text_to_speech(text):
     return (wave * (2**15)).astype(np.int16)


+def load_models():
+    duration_net = DurationNet(hps.data.vocab_size, 64, 4).to(device)
+    duration_net.load_state_dict(torch.load(duration_model_path, map_location=device))
+    duration_net = duration_net.eval()
+    generator = SynthesizerTrn(
+        hps.data.vocab_size,
+        hps.data.filter_length // 2 + 1,
+        hps.train.segment_size // hps.data.hop_length,
+        **vars(hps.model),
+    ).to(device)
+    del generator.enc_q
+    ckpt = torch.load(lightspeed_model_path, map_location=device)
+    params = {}
+    for k, v in ckpt["net_g"].items():
+        k = k[7:] if k.startswith("module.") else k
+        params[k] = v
+    generator.load_state_dict(params, strict=False)
+    del ckpt, params
+    generator = generator.eval()
+    return duration_net, generator
+
+
 def speak(text):
-    y = text_to_speech(text)
+    duration_net, generator = load_models()
+    paragraphs = text.split("\n")
+    clips = []  # list of audio clips
+    # silence = np.zeros(hps.data.sampling_rate // 4)
+    for paragraph in paragraphs:
+        paragraph = paragraph.strip()
+        if paragraph == "":
+            continue
+        clips.append(text_to_speech(duration_net, generator, paragraph))
+        # clips.append(silence)
+    y = np.concatenate(clips)
     return hps.data.sampling_rate, y
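For context, here is a minimal driver sketch (not part of the committed code) showing how the refactored pieces fit together: checkpoints are loaded once via load_models(), each paragraph is synthesized separately, and the clips are concatenated. It assumes hps, load_models, and text_to_speech from app.py are in scope; the synthesize_long_text name, the 0.25-second gap, and the soundfile dependency are illustrative assumptions that mirror the commented-out silence lines in the diff.

import numpy as np
import soundfile as sf  # assumed dependency, only used to write the result to disk

def synthesize_long_text(text, gap_seconds=0.25):
    # Load both checkpoints once per request instead of inside text_to_speech().
    duration_net, generator = load_models()
    # Optional pause between paragraphs; the committed code keeps this commented out.
    gap = np.zeros(int(hps.data.sampling_rate * gap_seconds), dtype=np.int16)
    clips = []
    for paragraph in text.split("\n"):
        paragraph = paragraph.strip()
        if not paragraph:
            continue
        clips.append(text_to_speech(duration_net, generator, paragraph))
        clips.append(gap)
    return hps.data.sampling_rate, np.concatenate(clips)

sr, wave = synthesize_long_text("First paragraph.\nSecond paragraph.")
sf.write("long_clip.wav", wave, sr)

The hoisting matters because text_to_speech() now runs once per paragraph: loading the networks in load_models() avoids reinstantiating DurationNet and SynthesizerTrn inside that loop. One could go further and call load_models() once at module import instead of per speak() call, but that is not what this commit does.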