smjain committed on
Commit e38a7f3
1 Parent(s): fc1c86d

Upload myinfer_latest.py

Files changed (1):
  myinfer_latest.py +393 -0
myinfer_latest.py ADDED
@@ -0,0 +1,393 @@
+ import torch, os, traceback, sys, warnings, shutil, numpy as np
+ import gradio as gr
+ import librosa
+ import asyncio
+ import rarfile
+ import edge_tts
+ import yt_dlp
+ import ffmpeg
+ import gdown
+ import subprocess
+ import wave
+ import soundfile as sf
+ from scipy.io import wavfile
+ from datetime import datetime
+ from urllib.parse import urlparse
+ from mega import Mega
+ from flask import Flask, request, jsonify, send_file
+ import base64
+ import tempfile
+ import werkzeug
+ from pydub import AudioSegment
+ import uuid
+
+
+ app = Flask(__name__)
+
+ now_dir = os.getcwd()
+ tmp = os.path.join(now_dir, "TEMP")
+ shutil.rmtree(tmp, ignore_errors=True)
+ os.makedirs(tmp, exist_ok=True)
+ os.environ["TEMP"] = tmp
+ split_model = "htdemucs"
+
+ from lib.infer_pack.models import (
+     SynthesizerTrnMs256NSFsid,
+     SynthesizerTrnMs256NSFsid_nono,
+     SynthesizerTrnMs768NSFsid,
+     SynthesizerTrnMs768NSFsid_nono,
+ )
+ from fairseq import checkpoint_utils
+ from vc_infer_pipeline import VC
+ from config import Config
+
+ config = Config()
+
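+ # Edge TTS voice list, carried over from the Gradio app; it is not used by the Flask API below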
+ tts_voice_list = asyncio.get_event_loop().run_until_complete(edge_tts.list_voices())
+ voices = [f"{v['ShortName']}-{v['Gender']}" for v in tts_voice_list]
+
+ hubert_model = None
+
+ f0method_mode = ["pm", "harvest", "crepe"]
+ f0method_info = "PM is fast; Harvest is good but extremely slow; Crepe is good but requires a GPU (default: PM)"
+
+ if os.path.isfile("rmvpe.pt"):
+     f0method_mode.insert(2, "rmvpe")
+     f0method_info = "PM is fast; Harvest is good but extremely slow; RMVPE is an alternative to Harvest (and may be better); Crepe is good but requires a GPU (default: PM)"
+
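+ # Load the HuBERT feature extractor shared by all conversions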
+ def load_hubert():
+     global hubert_model
+     models, _, _ = checkpoint_utils.load_model_ensemble_and_task(
+         ["hubert_base.pt"],
+         suffix="",
+     )
+     hubert_model = models[0].to(config.device)
+     if config.is_half:
+         hubert_model = hubert_model.half()
+     else:
+         hubert_model = hubert_model.float()
+     hubert_model.eval()
+
+ load_hubert()
+
+ weight_root = "weights"
+ index_root = "weights/index"
+ weights_model = []
+ weights_index = []
+ for _, _, model_files in os.walk(weight_root):
+     for file in model_files:
+         if file.endswith(".pth"):
+             weights_model.append(file)
+ for _, _, index_files in os.walk(index_root):
+     for file in index_files:
+         if file.endswith(".index") and "trained" not in file:
+             weights_index.append(os.path.join(index_root, file))
+
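+ # Rescan weights/ and weights/index and return Gradio dropdown updates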
+ def check_models():
+     weights_model = []
+     weights_index = []
+     for _, _, model_files in os.walk(weight_root):
+         for file in model_files:
+             if file.endswith(".pth"):
+                 weights_model.append(file)
+     for _, _, index_files in os.walk(index_root):
+         for file in index_files:
+             if file.endswith(".index") and "trained" not in file:
+                 weights_index.append(os.path.join(index_root, file))
+     return (
+         gr.Dropdown.update(choices=sorted(weights_model), value=weights_model[0] if weights_model else ""),
+         gr.Dropdown.update(choices=sorted(weights_index)),
+     )
+
+ def clean():
+     return (
+         gr.Dropdown.update(value=""),
+         gr.Slider.update(visible=False),
+     )
+
+
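+ # POST /convert_voice
+ # Form fields: spk_id (voice model .pth filename under weights/), voice_transform
+ # (pitch transpose passed to f0_up_key), and "file" (the source audio upload).
+ # Returns the converted vocal mixed back over the instrumental as an attachment.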
+ @app.route('/convert_voice', methods=['POST'])
+ def api_convert_voice():
+     spk_id = request.form['spk_id']
+     voice_transform = request.form['voice_transform']
+
+     # The file part
+     if 'file' not in request.files:
+         return jsonify({"error": "No file part"}), 400
+     file = request.files['file']
+     if file.filename == '':
+         return jsonify({"error": "No selected file"}), 400
+
+     # Save the upload to a unique temporary path
+     unique_id = uuid.uuid4()
+     filename = werkzeug.utils.secure_filename(file.filename)
+     input_audio_path = os.path.join(tmp, f"{spk_id}_input_audio_{unique_id}.{filename.split('.')[-1]}")
+     file.save(input_audio_path)
+
+     # Split the audio; demucs names its output folder after the input file's stem,
+     # so derive the stem rather than hard-coding it
+     cut_vocal_and_inst(input_audio_path, spk_id)
+     print("audio splitting performed")
+     song_name = os.path.splitext(os.path.basename(input_audio_path))[0]
+     vocal_path = f"output/{split_model}/{song_name}/vocals.wav"
+     inst = f"output/{split_model}/{song_name}/no_vocals.wav"
+
+     output_path = convert_voice(spk_id, vocal_path, voice_transform)
+     output_path1 = combine_vocal_and_inst(output_path, inst)
+     print(output_path1)
+
+     if os.path.exists(output_path1):
+         return send_file(output_path1, as_attachment=True)
+     else:
+         return jsonify({"error": "File not found."}), 404
+
+
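+ # Example request (hypothetical model and file names; server assumed on localhost:5000):
+ #   curl -X POST http://localhost:5000/convert_voice \
+ #        -F spk_id=mymodel.pth -F voice_transform=0 -F file=@song.mp3 -o converted.mp3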
+ def convert_voice(spk_id, input_audio_path, voice_transform):
+     get_vc(spk_id, 0.5)
+     output_audio_path = vc_single(
+         sid=0,
+         input_audio_path=input_audio_path,
+         f0_up_key=voice_transform,  # voice_transform corresponds to f0_up_key
+         f0_file=None,
+         f0_method="rmvpe",
+         file_index=spk_id,  # file_index_path corresponds to file_index
+         index_rate=0.75,
+         filter_radius=3,
+         resample_sr=0,
+         rms_mix_rate=0.25,
+         protect=0.33,
+     )
+     print(output_audio_path)
+     return output_audio_path
+
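+ # Split the input into vocals and instrumental with demucs (two-stem mode)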
+ def cut_vocal_and_inst(audio_path, spk_id):
+     os.makedirs("output/result", exist_ok=True)
+     command = f"demucs --two-stems=vocals -n {split_model} {audio_path} -o output"
+     result = subprocess.run(command.split(), stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
+     if result.returncode != 0:
+         print("Demucs process failed:", result.stderr)
+     else:
+         print("Demucs process completed successfully.")
+     print(result.stdout)
+
+
+ def combine_vocal_and_inst(vocal_path, inst_path):
+     os.makedirs("output/result", exist_ok=True)
+     output_path = "output/result/combine.mp3"
+     # Load the audio files
+     vocal = AudioSegment.from_file(vocal_path)
+     instrumental = AudioSegment.from_file(inst_path)
+     # Overlay the vocal track on top of the instrumental track
+     combined = vocal.overlay(instrumental)
+     # Export the result
+     combined.export(output_path, format="mp3")
+     return output_path
+
+
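+ # Core RVC inference: convert a single audio file with the currently loaded model and index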
+ def vc_single(
+     sid,
+     input_audio_path,
+     f0_up_key,
+     f0_file,
+     f0_method,
+     file_index,
+     index_rate,
+     filter_radius,
+     resample_sr,
+     rms_mix_rate,
+     protect,
+ ):  # spk_item, input_audio0, vc_transform0, f0_file, f0method0
+     global tgt_sr, net_g, vc, hubert_model, version, cpt
+
+     try:
+         print("Converting...")
+         audio, sr = librosa.load(input_audio_path, sr=16000, mono=True)
+         f0_up_key = int(f0_up_key)
+         times = [0, 0, 0]
+         if hubert_model is None:
+             load_hubert()
+             print("loaded hubert")
+         if_f0 = 1
+         audio_opt = vc.pipeline(
+             hubert_model,
+             net_g,
+             0,
+             audio,
+             input_audio_path,
+             times,
+             f0_up_key,
+             f0_method,
+             file_index,
+             index_rate,
+             if_f0,
+             filter_radius,
+             tgt_sr,
+             resample_sr,
+             rms_mix_rate,
+             version,
+             protect,
+             f0_file=f0_file,
+         )
+         if resample_sr >= 16000 and tgt_sr != resample_sr:
+             tgt_sr = resample_sr
+         index_info = (
+             "Using index: %s." % file_index
+             if os.path.exists(file_index)
+             else "Index not used."
+         )
+         print(index_info)
+         # Save the converted audio at the target sampling rate
+         output_file_path = os.path.join("output", f"converted_audio_{sid}.wav")  # Adjust path as needed
+         os.makedirs(os.path.dirname(output_file_path), exist_ok=True)
+         sf.write(output_file_path, audio_opt, tgt_sr)
+         # Return the path to the saved file
+         return output_file_path
+     except Exception:
+         info = traceback.format_exc()
+         print(info)
+         return None
+
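+ # Load a voice model (.pth) plus its matching feature index, or unload everything when sid is empty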
+ def get_vc(sid, to_return_protect0):
+     global n_spk, tgt_sr, net_g, vc, cpt, version, weights_index
+     if sid == "" or sid == []:
+         global hubert_model
+         if hubert_model is not None:
+             # Polling may switch sid from a selected model to none, so check for that
+             print("clean_empty_cache")
+             del net_g, n_spk, vc, hubert_model, tgt_sr  # , cpt
+             hubert_model = net_g = n_spk = vc = tgt_sr = None
+             if torch.cuda.is_available():
+                 torch.cuda.empty_cache()
+             # Without the steps below, the memory is not fully released
+             if_f0 = cpt.get("f0", 1)
+             version = cpt.get("version", "v1")
+             if version == "v1":
+                 if if_f0 == 1:
+                     net_g = SynthesizerTrnMs256NSFsid(
+                         *cpt["config"], is_half=config.is_half
+                     )
+                 else:
+                     net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"])
+             elif version == "v2":
+                 if if_f0 == 1:
+                     net_g = SynthesizerTrnMs768NSFsid(
+                         *cpt["config"], is_half=config.is_half
+                     )
+                 else:
+                     net_g = SynthesizerTrnMs768NSFsid_nono(*cpt["config"])
+             del net_g, cpt
+             if torch.cuda.is_available():
+                 torch.cuda.empty_cache()
+             cpt = None
+         return (
+             gr.Slider.update(maximum=2333, visible=False),
+             gr.Slider.update(visible=True),
+             gr.Dropdown.update(choices=sorted(weights_index), value=""),
+             gr.Markdown.update(value="# <center> No model selected"),
+         )
+     print(f"Loading {sid} model...")
+     selected_model = sid[:-4]
+     cpt = torch.load(os.path.join(weight_root, sid), map_location="cpu")
+     tgt_sr = cpt["config"][-1]
+     cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0]
+     if_f0 = cpt.get("f0", 1)
+     if if_f0 == 0:
+         to_return_protect0 = {
+             "visible": False,
+             "value": 0.5,
+             "__type__": "update",
+         }
+     else:
+         to_return_protect0 = {
+             "visible": True,
+             "value": to_return_protect0,
+             "__type__": "update",
+         }
+     version = cpt.get("version", "v1")
+     if version == "v1":
+         if if_f0 == 1:
+             net_g = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=config.is_half)
+         else:
+             net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"])
+     elif version == "v2":
+         if if_f0 == 1:
+             net_g = SynthesizerTrnMs768NSFsid(*cpt["config"], is_half=config.is_half)
+         else:
+             net_g = SynthesizerTrnMs768NSFsid_nono(*cpt["config"])
+     del net_g.enc_q
+     print(net_g.load_state_dict(cpt["weight"], strict=False))
+     net_g.eval().to(config.device)
+     if config.is_half:
+         net_g = net_g.half()
+     else:
+         net_g = net_g.float()
+     vc = VC(tgt_sr, config)
+     n_spk = cpt["config"][-3]
+     weights_index = []
+     for _, _, index_files in os.walk(index_root):
+         for file in index_files:
+             if file.endswith(".index") and "trained" not in file:
+                 weights_index.append(os.path.join(index_root, file))
+     if not weights_index:
+         selected_index = gr.Dropdown.update(value="")
+     else:
+         selected_index = gr.Dropdown.update(value=weights_index[0])
+     # Prefer the index file whose name matches the selected model
+     for index, model_index in enumerate(weights_index):
+         if selected_model in model_index:
+             selected_index = gr.Dropdown.update(value=weights_index[index])
+             break
+     return (
+         gr.Slider.update(maximum=n_spk, visible=True),
+         to_return_protect0,
+         selected_index,
+         gr.Markdown.update(
+             f"## <center> {selected_model}\n"
+             f"### <center> RVC {version} Model"
+         ),
+     )
+
+
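+ # Bind to all interfaces so the API is reachable from outside the host, on port 5000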
+ if __name__ == '__main__':
+     app.run(debug=False, port=5000, host='0.0.0.0')