Spaces:
Runtime error
Runtime error
Synced repo using 'sync_with_huggingface' GitHub Action
Browse files
app.py
CHANGED
@@ -161,51 +161,6 @@ def predict(
|
|
161 |
)
|
162 |
return model.target_sample, audio
|
163 |
|
164 |
-
|
165 |
-
def predict_song_from_yt(
|
166 |
-
ytid_or_url,
|
167 |
-
start,
|
168 |
-
end,
|
169 |
-
speaker=speakers[0],
|
170 |
-
transpose: int = 0,
|
171 |
-
auto_predict_f0: bool = False,
|
172 |
-
cluster_infer_ratio: float = 0,
|
173 |
-
noise_scale: float = 0.4,
|
174 |
-
f0_method: str = "dio",
|
175 |
-
db_thresh: int = -40,
|
176 |
-
pad_seconds: float = 0.5,
|
177 |
-
chunk_seconds: float = 0.5,
|
178 |
-
absolute_thresh: bool = False,
|
179 |
-
):
|
180 |
-
end = min(start + duration_limit, end)
|
181 |
-
original_track_filepath = download_youtube_clip(
|
182 |
-
ytid_or_url,
|
183 |
-
start,
|
184 |
-
end,
|
185 |
-
"track.wav",
|
186 |
-
force=True,
|
187 |
-
url_base="" if ytid_or_url.startswith("http") else "https://www.youtube.com/watch?v=",
|
188 |
-
)
|
189 |
-
vox_wav, inst_wav = extract_vocal_demucs(demucs_model, original_track_filepath)
|
190 |
-
if transpose != 0:
|
191 |
-
inst_wav = librosa.effects.pitch_shift(inst_wav.T, sr=model.target_sample, n_steps=transpose).T
|
192 |
-
cloned_vox = model.infer_silence(
|
193 |
-
vox_wav.astype(np.float32),
|
194 |
-
speaker=speaker,
|
195 |
-
transpose=transpose,
|
196 |
-
auto_predict_f0=auto_predict_f0,
|
197 |
-
cluster_infer_ratio=cluster_infer_ratio,
|
198 |
-
noise_scale=noise_scale,
|
199 |
-
f0_method=f0_method,
|
200 |
-
db_thresh=db_thresh,
|
201 |
-
pad_seconds=pad_seconds,
|
202 |
-
chunk_seconds=chunk_seconds,
|
203 |
-
absolute_thresh=absolute_thresh,
|
204 |
-
)
|
205 |
-
full_song = inst_wav + np.expand_dims(cloned_vox, 1)
|
206 |
-
return (model.target_sample, full_song), (model.target_sample, cloned_vox)
|
207 |
-
|
208 |
-
|
209 |
SPACE_ID = "nateraw/voice-cloning"
|
210 |
description = f"""
|
211 |
# Attention - This Space may be slow in the shared UI if there is a long queue. To speed it up, you can duplicate and use it with a paid private T4 GPU.
|
@@ -264,38 +219,9 @@ interface_file = gr.Interface(
|
|
264 |
description=description,
|
265 |
article=article,
|
266 |
)
|
267 |
-
interface_yt = gr.Interface(
|
268 |
-
predict_song_from_yt,
|
269 |
-
inputs=[
|
270 |
-
gr.Textbox(
|
271 |
-
label="YouTube URL or ID", info="A YouTube URL (or ID) to a song on YouTube you want to clone from"
|
272 |
-
),
|
273 |
-
gr.Number(value=0, label="Start Time (seconds)"),
|
274 |
-
gr.Number(value=15, label="End Time (seconds)"),
|
275 |
-
gr.Dropdown(speakers, value=speakers[0], label="Target Speaker"),
|
276 |
-
gr.Slider(-12, 12, value=0, step=1, label="Transpose (Semitones)"),
|
277 |
-
gr.Checkbox(False, label="Auto Predict F0"),
|
278 |
-
gr.Slider(0.0, 1.0, value=default_cluster_infer_ratio, step=0.1, label="cluster infer ratio"),
|
279 |
-
gr.Slider(0.0, 1.0, value=0.4, step=0.1, label="noise scale"),
|
280 |
-
gr.Dropdown(
|
281 |
-
choices=["crepe", "crepe-tiny", "parselmouth", "dio", "harvest"],
|
282 |
-
value=default_f0_method,
|
283 |
-
label="f0 method",
|
284 |
-
),
|
285 |
-
],
|
286 |
-
outputs=["audio", "audio"],
|
287 |
-
title="Voice Cloning",
|
288 |
-
description=description,
|
289 |
-
article=article,
|
290 |
-
examples=[
|
291 |
-
["COz9lDCFHjw", 75, 90, speakers[0], 0, False, default_cluster_infer_ratio, 0.4, default_f0_method],
|
292 |
-
["dQw4w9WgXcQ", 21, 35, speakers[0], 0, False, default_cluster_infer_ratio, 0.4, default_f0_method],
|
293 |
-
["Wvm5GuDfAas", 15, 30, speakers[0], 0, False, default_cluster_infer_ratio, 0.4, default_f0_method],
|
294 |
-
],
|
295 |
-
)
|
296 |
interface = gr.TabbedInterface(
|
297 |
-
[interface_mic, interface_file, interface_yt],
|
298 |
-
["Clone From Mic", "Clone From File"
|
299 |
)
|
300 |
|
301 |
|
|
|
161 |
)
|
162 |
return model.target_sample, audio
|
163 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
164 |
SPACE_ID = "nateraw/voice-cloning"
|
165 |
description = f"""
|
166 |
# Attention - This Space may be slow in the shared UI if there is a long queue. To speed it up, you can duplicate and use it with a paid private T4 GPU.
|
|
|
219 |
description=description,
|
220 |
article=article,
|
221 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
222 |
interface = gr.TabbedInterface(
|
223 |
+
[interface_mic, interface_file],
|
224 |
+
["Clone From Mic", "Clone From File"],
|
225 |
)
|
226 |
|
227 |
|