nateraw committed on
Commit
ee65e1a
1 Parent(s): 3c6e8a5

Synced repo using 'sync_with_huggingface' Github Action

Browse files
Files changed (1) hide show
  1. app.py +2 -76
app.py CHANGED
@@ -161,51 +161,6 @@ def predict(
161
  )
162
  return model.target_sample, audio
163
 
164
-
165
- def predict_song_from_yt(
166
- ytid_or_url,
167
- start,
168
- end,
169
- speaker=speakers[0],
170
- transpose: int = 0,
171
- auto_predict_f0: bool = False,
172
- cluster_infer_ratio: float = 0,
173
- noise_scale: float = 0.4,
174
- f0_method: str = "dio",
175
- db_thresh: int = -40,
176
- pad_seconds: float = 0.5,
177
- chunk_seconds: float = 0.5,
178
- absolute_thresh: bool = False,
179
- ):
180
- end = min(start + duration_limit, end)
181
- original_track_filepath = download_youtube_clip(
182
- ytid_or_url,
183
- start,
184
- end,
185
- "track.wav",
186
- force=True,
187
- url_base="" if ytid_or_url.startswith("http") else "https://www.youtube.com/watch?v=",
188
- )
189
- vox_wav, inst_wav = extract_vocal_demucs(demucs_model, original_track_filepath)
190
- if transpose != 0:
191
- inst_wav = librosa.effects.pitch_shift(inst_wav.T, sr=model.target_sample, n_steps=transpose).T
192
- cloned_vox = model.infer_silence(
193
- vox_wav.astype(np.float32),
194
- speaker=speaker,
195
- transpose=transpose,
196
- auto_predict_f0=auto_predict_f0,
197
- cluster_infer_ratio=cluster_infer_ratio,
198
- noise_scale=noise_scale,
199
- f0_method=f0_method,
200
- db_thresh=db_thresh,
201
- pad_seconds=pad_seconds,
202
- chunk_seconds=chunk_seconds,
203
- absolute_thresh=absolute_thresh,
204
- )
205
- full_song = inst_wav + np.expand_dims(cloned_vox, 1)
206
- return (model.target_sample, full_song), (model.target_sample, cloned_vox)
207
-
208
-
209
  SPACE_ID = "nateraw/voice-cloning"
210
  description = f"""
211
  # Attention - This Space may be slow in the shared UI if there is a long queue. To speed it up, you can duplicate and use it with a paid private T4 GPU.
@@ -264,38 +219,9 @@ interface_file = gr.Interface(
264
  description=description,
265
  article=article,
266
  )
267
- interface_yt = gr.Interface(
268
- predict_song_from_yt,
269
- inputs=[
270
- gr.Textbox(
271
- label="YouTube URL or ID", info="A YouTube URL (or ID) to a song on YouTube you want to clone from"
272
- ),
273
- gr.Number(value=0, label="Start Time (seconds)"),
274
- gr.Number(value=15, label="End Time (seconds)"),
275
- gr.Dropdown(speakers, value=speakers[0], label="Target Speaker"),
276
- gr.Slider(-12, 12, value=0, step=1, label="Transpose (Semitones)"),
277
- gr.Checkbox(False, label="Auto Predict F0"),
278
- gr.Slider(0.0, 1.0, value=default_cluster_infer_ratio, step=0.1, label="cluster infer ratio"),
279
- gr.Slider(0.0, 1.0, value=0.4, step=0.1, label="noise scale"),
280
- gr.Dropdown(
281
- choices=["crepe", "crepe-tiny", "parselmouth", "dio", "harvest"],
282
- value=default_f0_method,
283
- label="f0 method",
284
- ),
285
- ],
286
- outputs=["audio", "audio"],
287
- title="Voice Cloning",
288
- description=description,
289
- article=article,
290
- examples=[
291
- ["COz9lDCFHjw", 75, 90, speakers[0], 0, False, default_cluster_infer_ratio, 0.4, default_f0_method],
292
- ["dQw4w9WgXcQ", 21, 35, speakers[0], 0, False, default_cluster_infer_ratio, 0.4, default_f0_method],
293
- ["Wvm5GuDfAas", 15, 30, speakers[0], 0, False, default_cluster_infer_ratio, 0.4, default_f0_method],
294
- ],
295
- )
296
  interface = gr.TabbedInterface(
297
- [interface_mic, interface_file, interface_yt],
298
- ["Clone From Mic", "Clone From File", "Clone Song From YouTube"],
299
  )
300
 
301
 
 
161
  )
162
  return model.target_sample, audio
163
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
164
  SPACE_ID = "nateraw/voice-cloning"
165
  description = f"""
166
  # Attention - This Space may be slow in the shared UI if there is a long queue. To speed it up, you can duplicate and use it with a paid private T4 GPU.
 
219
  description=description,
220
  article=article,
221
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
222
  interface = gr.TabbedInterface(
223
+ [interface_mic, interface_file],
224
+ ["Clone From Mic", "Clone From File"],
225
  )
226
 
227