nateraw committed on
Commit ae3239e
1 parent: f3cc5dc

Update app.py

Files changed (1)
app.py +137 -13
app.py CHANGED
@@ -1,19 +1,24 @@
 import json
+import subprocess
 from pathlib import Path

 import gradio as gr
 import librosa
 import numpy as np
 import torch
+from demucs.apply import apply_model
+from demucs.pretrained import DEFAULT_MODEL, get_model
 from huggingface_hub import hf_hub_download, list_repo_files
+
 from so_vits_svc_fork.hparams import HParams
 from so_vits_svc_fork.inference.core import Svc

+
 ##########################################################
 # REPLACE THESE VALUES TO CHANGE THE MODEL REPO/CKPT NAME
 ##########################################################
-repo_id = "dog/arianagrande"
-ckpt_name = None # None will pick latest
+repo_id = "dog/kanye"
+ckpt_name = None
 ##########################################################

 # Figure out the latest generator by taking highest value one.
@@ -34,6 +39,67 @@ hparams = HParams(**json.loads(Path(config_path).read_text()))
 speakers = list(hparams.spk.keys())
 device = "cuda" if torch.cuda.is_available() else "cpu"
 model = Svc(net_g_path=generator_path, config_path=config_path, device=device, cluster_model_path=None)
+demucs_model = get_model(DEFAULT_MODEL)
+
+
+def extract_vocal_demucs(model, filename, sr=44100, device=None, shifts=1, split=True, overlap=0.25, jobs=0):
+    wav, sr = librosa.load(filename, mono=False, sr=sr)
+    wav = torch.tensor(wav)
+    ref = wav.mean(0)
+    wav = (wav - ref.mean()) / ref.std()
+    sources = apply_model(
+        model, wav[None], device=device, shifts=shifts, split=split, overlap=overlap, progress=True, num_workers=jobs
+    )[0]
+    sources = sources * ref.std() + ref.mean()
+    # We take just the vocals stem. I know the vocals for this model are at index -1
+    # If using a different model, check model.sources.index('vocals')
+    vocal_wav = sources[-1]
+    # I did this because it's the same normalization the so-vits model requires
+    vocal_wav = vocal_wav / max(1.01 * vocal_wav.abs().max(), 1)
+    vocal_wav = vocal_wav.numpy()
+    vocal_wav = librosa.to_mono(vocal_wav)
+    vocal_wav = vocal_wav.T
+    instrumental_wav = sources[:-1].sum(0).numpy().T
+    return vocal_wav, instrumental_wav
+
+
+def download_youtube_clip(
+    video_identifier,
+    start_time,
+    end_time,
+    output_filename,
+    num_attempts=5,
+    url_base="https://www.youtube.com/watch?v=",
+    quiet=False,
+    force=False,
+):
+    output_path = Path(output_filename)
+    if output_path.exists():
+        if not force:
+            return output_path
+        else:
+            output_path.unlink()
+
+    quiet = "--quiet --no-warnings" if quiet else ""
+    command = f"""
+        yt-dlp {quiet} -x --audio-format wav -f bestaudio -o "{output_filename}" --download-sections "*{start_time}-{end_time}" "{url_base}{video_identifier}"  # noqa: E501
+    """.strip()
+
+    attempts = 0
+    while True:
+        try:
+            _ = subprocess.check_output(command, shell=True, stderr=subprocess.STDOUT)
+        except subprocess.CalledProcessError:
+            attempts += 1
+            if attempts == num_attempts:
+                return None
+        else:
+            break
+
+    if output_path.exists():
+        return output_path
+    else:
+        return None


 def predict(
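Aside on `extract_vocal_demucs` above: it hard-codes the vocals stem at index -1, which matches the standard demucs 4-stem ordering (drums, bass, other, vocals). A minimal sketch of the name-based lookup its own comment suggests, assuming the same `sources` tensor as inside the function (hypothetical, not part of the commit):

vocals_idx = demucs_model.sources.index("vocals")  # stem position looked up by name
vocal_wav = sources[vocals_idx]  # (channels, samples)
instrumental_wav = torch.cat([sources[:vocals_idx], sources[vocals_idx + 1:]]).sum(0)  # sum of all other stems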
@@ -66,18 +132,54 @@ def predict(
     return model.target_sample, audio


-description=f"""
-This app uses models trained with so-vits-svc-fork to clone your voice. Model currently being used is https://hf.co/{repo_id}.
+def predict_song_from_yt(
+    ytid,
+    start,
+    end,
+    speaker=speakers[0],
+    transpose: int = 0,
+    auto_predict_f0: bool = False,
+    cluster_infer_ratio: float = 0,
+    noise_scale: float = 0.4,
+    f0_method: str = "dio",
+    db_thresh: int = -40,
+    pad_seconds: float = 0.5,
+    chunk_seconds: float = 0.5,
+    absolute_thresh: bool = False,
+):
+    original_track_filepath = download_youtube_clip(ytid, start, end, "track.wav", force=True)
+    vox_wav, inst_wav = extract_vocal_demucs(demucs_model, original_track_filepath)
+    if transpose != 0:
+        inst_wav = librosa.effects.pitch_shift(inst_wav.T, sr=model.target_sample, n_steps=transpose).T
+    cloned_vox = model.infer_silence(
+        vox_wav.astype(np.float32),
+        speaker=speaker,
+        transpose=transpose,
+        auto_predict_f0=auto_predict_f0,
+        cluster_infer_ratio=cluster_infer_ratio,
+        noise_scale=noise_scale,
+        f0_method=f0_method,
+        db_thresh=db_thresh,
+        pad_seconds=pad_seconds,
+        chunk_seconds=chunk_seconds,
+        absolute_thresh=absolute_thresh,
+    )
+    full_song = inst_wav + np.expand_dims(cloned_vox, 1)
+    return (model.target_sample, full_song), (model.target_sample, cloned_vox)
+

+description = f"""
+This app uses models trained with so-vits-svc-fork to clone your voice. Model currently being used is https://hf.co/{repo_id}.
 To change the model being served, duplicate the space and update the `repo_id` in `app.py`.
 """.strip()

-article="""
+article = """
 <p style='text-align: center'>
 <a href='https://github.com/voicepaw/so-vits-svc-fork' target='_blank'>Github Repo</a>
 </p>
 """.strip()

+
 interface_mic = gr.Interface(
     predict,
     inputs=[
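For a quick local sanity check of the new `predict_song_from_yt` path outside Gradio, something like the following should work (a sketch, not part of the commit; it assumes yt-dlp and ffmpeg are installed, uses `soundfile` as an extra dependency to write the result, and takes the clip ID and window from the examples added below):

import soundfile as sf  # assumed extra dependency, not imported by app.py

(sr, full_song), (_, cloned_vox) = predict_song_from_yt("COz9lDCFHjw", 75, 90, speaker=speakers[0])
sf.write("full_song.wav", full_song, sr)  # instrumental plus cloned vocals
sf.write("cloned_vox.wav", cloned_vox, sr)  # cloned vocals only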
@@ -85,9 +187,9 @@ interface_mic = gr.Interface(
         gr.Audio(type="filepath", source="microphone", label="Source Audio"),
         gr.Slider(-12, 12, value=0, step=1, label="Transpose (Semitones)"),
         gr.Checkbox(False, label="Auto Predict F0"),
-        gr.Slider(0.0, 1.0, value=0.0, step=0.1, label='cluster infer ratio'),
+        gr.Slider(0.0, 1.0, value=0.0, step=0.1, label="cluster infer ratio"),
         gr.Slider(0.0, 1.0, value=0.4, step=0.1, label="noise scale"),
-        gr.Dropdown(choices=["crepe", "crepe-tiny", "parselmouth", "dio", "harvest"], value='crepe', label="f0 method"),
+        gr.Dropdown(choices=["crepe", "crepe-tiny", "parselmouth", "dio", "harvest"], value="dio", label="f0 method"),
     ],
     outputs="audio",
     title="Voice Cloning",
@@ -101,20 +203,42 @@ interface_file = gr.Interface(
         gr.Audio(type="filepath", source="upload", label="Source Audio"),
         gr.Slider(-12, 12, value=0, step=1, label="Transpose (Semitones)"),
         gr.Checkbox(False, label="Auto Predict F0"),
-        gr.Slider(0.0, 1.0, value=0.0, step=0.1, label='cluster infer ratio'),
+        gr.Slider(0.0, 1.0, value=0.0, step=0.1, label="cluster infer ratio"),
         gr.Slider(0.0, 1.0, value=0.4, step=0.1, label="noise scale"),
-        gr.Dropdown(choices=["crepe", "crepe-tiny", "parselmouth", "dio", "harvest"], value='crepe', label="f0 method"),
+        gr.Dropdown(choices=["crepe", "crepe-tiny", "parselmouth", "dio", "harvest"], value="dio", label="f0 method"),
     ],
     outputs="audio",
     title="Voice Cloning",
     description=description,
     article=article,
 )
+interface_yt = gr.Interface(
+    predict_song_from_yt,
+    inputs=[
+        "text",
+        gr.Number(value=0, label="Start Time (seconds)"),
+        gr.Number(value=15, label="End Time (seconds)"),
+        gr.Dropdown(speakers, value=speakers[0], label="Target Speaker"),
+        gr.Slider(-12, 12, value=0, step=1, label="Transpose (Semitones)"),
+        gr.Checkbox(False, label="Auto Predict F0"),
+        gr.Slider(0.0, 1.0, value=0.0, step=0.1, label="cluster infer ratio"),
+        gr.Slider(0.0, 1.0, value=0.4, step=0.1, label="noise scale"),
+        gr.Dropdown(choices=["crepe", "crepe-tiny", "parselmouth", "dio", "harvest"], value="dio", label="f0 method"),
+    ],
+    outputs=["audio", "audio"],
+    title="Voice Cloning",
+    description=description,
+    article=article,
+    examples=[
+        ["COz9lDCFHjw", 75, 90, speakers[0], 0, False, 0.0, 0.4, "dio"],
+        ["Wvm5GuDfAas", 15, 30, speakers[0], 0, False, 0.0, 0.4, "crepe"],
+    ],
+)
 interface = gr.TabbedInterface(
-    [interface_mic, interface_file],
-    ["Clone From Mic", "Clone From File"],
+    [interface_mic, interface_file, interface_yt],
+    ["Clone From Mic", "Clone From File", "Clone Song From YouTube"],
 )


-if __name__ == '__main__':
-    interface.launch()
+if __name__ == "__main__":
+    interface.launch()
 
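For reference, with the first example row the shell command assembled by `download_youtube_clip` expands to roughly the following (the `# noqa: E501` inside the f-string rides along harmlessly as a shell comment, and `--quiet --no-warnings` is substituted in when quiet=True):

yt-dlp -x --audio-format wav -f bestaudio -o "track.wav" --download-sections "*75-90" "https://www.youtube.com/watch?v=COz9lDCFHjw"

Note also that the new imports imply the Space needs demucs and yt-dlp available at runtime, presumably via a matching requirements.txt change not shown in this commit.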