smjain committed on
Commit e38a7f3
1 Parent(s): fc1c86d

Upload myinfer_latest.py

Files changed (1):
  myinfer_latest.py +393 -0
myinfer_latest.py ADDED
@@ -0,0 +1,393 @@
+ import torch, os, traceback, sys, warnings, shutil, numpy as np
+ import gradio as gr
+ import librosa
+ import asyncio
+ import rarfile
+ import edge_tts
+ import yt_dlp
+ import ffmpeg
+ import gdown
+ import subprocess
+ import wave
+ import soundfile as sf
+ from scipy.io import wavfile
+ from datetime import datetime
+ from urllib.parse import urlparse
+ from mega import Mega
+ from flask import Flask, request, jsonify, send_file
+ import base64
+ import tempfile
+ import werkzeug
+ from pydub import AudioSegment
+ import uuid
+
+
+ app = Flask(__name__)
+
+ now_dir = os.getcwd()
+ tmp = os.path.join(now_dir, "TEMP")
+ shutil.rmtree(tmp, ignore_errors=True)
+ os.makedirs(tmp, exist_ok=True)
+ os.environ["TEMP"] = tmp
+ split_model = "htdemucs"
+
+ from lib.infer_pack.models import (
+     SynthesizerTrnMs256NSFsid,
+     SynthesizerTrnMs256NSFsid_nono,
+     SynthesizerTrnMs768NSFsid,
+     SynthesizerTrnMs768NSFsid_nono,
+ )
+ from fairseq import checkpoint_utils
+ from vc_infer_pipeline import VC
+ from config import Config
+
+ config = Config()
+
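+ # Edge TTS voice list, carried over from the Gradio app; it is not used by the Flask API below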
+ tts_voice_list = asyncio.get_event_loop().run_until_complete(edge_tts.list_voices())
+ voices = [f"{v['ShortName']}-{v['Gender']}" for v in tts_voice_list]
+
+ hubert_model = None
+
+ f0method_mode = ["pm", "harvest", "crepe"]
+ f0method_info = "PM is fast; Harvest is good but extremely slow; Crepe is good but requires a GPU (default: PM)"
+
+ if os.path.isfile("rmvpe.pt"):
+     f0method_mode.insert(2, "rmvpe")
+     f0method_info = "PM is fast; Harvest is good but extremely slow; RMVPE is an alternative to Harvest (and may be better); Crepe is good but requires a GPU (default: PM)"
+
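+ # Load the HuBERT feature extractor shared by all conversions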
+ def load_hubert():
+     global hubert_model
+     models, _, _ = checkpoint_utils.load_model_ensemble_and_task(
+         ["hubert_base.pt"],
+         suffix="",
+     )
+     hubert_model = models[0].to(config.device)
+     if config.is_half:
+         hubert_model = hubert_model.half()
+     else:
+         hubert_model = hubert_model.float()
+     hubert_model.eval()
+
+ load_hubert()
+
+ weight_root = "weights"
+ index_root = "weights/index"
+ weights_model = []
+ weights_index = []
+ for _, _, model_files in os.walk(weight_root):
+     for file in model_files:
+         if file.endswith(".pth"):
+             weights_model.append(file)
+ for _, _, index_files in os.walk(index_root):
+     for file in index_files:
+         if file.endswith(".index") and "trained" not in file:
+             weights_index.append(os.path.join(index_root, file))
+
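+ # Rescan weights/ and weights/index and return Gradio dropdown updates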
+ def check_models():
+     weights_model = []
+     weights_index = []
+     for _, _, model_files in os.walk(weight_root):
+         for file in model_files:
+             if file.endswith(".pth"):
+                 weights_model.append(file)
+     for _, _, index_files in os.walk(index_root):
+         for file in index_files:
+             if file.endswith(".index") and "trained" not in file:
+                 weights_index.append(os.path.join(index_root, file))
+     return (
+         gr.Dropdown.update(choices=sorted(weights_model), value=weights_model[0] if weights_model else ""),
+         gr.Dropdown.update(choices=sorted(weights_index)),
+     )
+
+ def clean():
+     return (
+         gr.Dropdown.update(value=""),
+         gr.Slider.update(visible=False),
+     )
+
+
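+ # POST /convert_voice
+ # Form fields: spk_id (voice model .pth filename under weights/), voice_transform
+ # (pitch transpose passed to f0_up_key), and "file" (the source audio upload).
+ # Returns the converted vocal mixed back over the instrumental as an attachment.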
+ @app.route('/convert_voice', methods=['POST'])
+ def api_convert_voice():
+     spk_id = request.form['spk_id']
+     voice_transform = request.form['voice_transform']
+
+     # The file part
+     if 'file' not in request.files:
+         return jsonify({"error": "No file part"}), 400
+     file = request.files['file']
+     if file.filename == '':
+         return jsonify({"error": "No selected file"}), 400
+
+     # Save the upload to a unique temporary path
+     unique_id = uuid.uuid4()
+     filename = werkzeug.utils.secure_filename(file.filename)
+     input_audio_path = os.path.join(tmp, f"{spk_id}_input_audio_{unique_id}.{filename.split('.')[-1]}")
+     file.save(input_audio_path)
+
+     # Split the audio; demucs names its output folder after the input file's stem,
+     # so derive the stem rather than hard-coding it
+     cut_vocal_and_inst(input_audio_path, spk_id)
+     print("audio splitting performed")
+     song_name = os.path.splitext(os.path.basename(input_audio_path))[0]
+     vocal_path = f"output/{split_model}/{song_name}/vocals.wav"
+     inst = f"output/{split_model}/{song_name}/no_vocals.wav"
+
+     output_path = convert_voice(spk_id, vocal_path, voice_transform)
+     output_path1 = combine_vocal_and_inst(output_path, inst)
+     print(output_path1)
+
+     if os.path.exists(output_path1):
+         return send_file(output_path1, as_attachment=True)
+     else:
+         return jsonify({"error": "File not found."}), 404
+
+
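+ # Example request (hypothetical model and file names; server assumed on localhost:5000):
+ #   curl -X POST http://localhost:5000/convert_voice \
+ #        -F spk_id=mymodel.pth -F voice_transform=0 -F file=@song.mp3 -o converted.mp3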
+ def convert_voice(spk_id, input_audio_path, voice_transform):
+     get_vc(spk_id, 0.5)
+     output_audio_path = vc_single(
+         sid=0,
+         input_audio_path=input_audio_path,
+         f0_up_key=voice_transform,  # voice_transform corresponds to f0_up_key
+         f0_file=None,
+         f0_method="rmvpe",
+         file_index=spk_id,  # file_index_path corresponds to file_index
+         index_rate=0.75,
+         filter_radius=3,
+         resample_sr=0,
+         rms_mix_rate=0.25,
+         protect=0.33,
+     )
+     print(output_audio_path)
+     return output_audio_path
+
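+ # Split the input into vocals and instrumental with demucs (two-stem mode)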
+ def cut_vocal_and_inst(audio_path, spk_id):
+     os.makedirs("output/result", exist_ok=True)
+     command = f"demucs --two-stems=vocals -n {split_model} {audio_path} -o output"
+     result = subprocess.run(command.split(), stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
+     if result.returncode != 0:
+         print("Demucs process failed:", result.stderr)
+     else:
+         print("Demucs process completed successfully.")
+     print(result.stdout)
+
+
+ def combine_vocal_and_inst(vocal_path, inst_path):
+     os.makedirs("output/result", exist_ok=True)
+     output_path = "output/result/combine.mp3"
+     # Load the audio files
+     vocal = AudioSegment.from_file(vocal_path)
+     instrumental = AudioSegment.from_file(inst_path)
+     # Overlay the vocal track on top of the instrumental track
+     combined = vocal.overlay(instrumental)
+     # Export the result
+     combined.export(output_path, format="mp3")
+     return output_path
+
+
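+ # Core RVC inference: convert a single audio file with the currently loaded model and index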
+ def vc_single(
+     sid,
+     input_audio_path,
+     f0_up_key,
+     f0_file,
+     f0_method,
+     file_index,
+     index_rate,
+     filter_radius,
+     resample_sr,
+     rms_mix_rate,
+     protect,
+ ):  # spk_item, input_audio0, vc_transform0, f0_file, f0method0
+     global tgt_sr, net_g, vc, hubert_model, version, cpt
+
+     try:
+         print("Converting...")
+         audio, sr = librosa.load(input_audio_path, sr=16000, mono=True)
+         f0_up_key = int(f0_up_key)
+         times = [0, 0, 0]
+         if hubert_model is None:
+             load_hubert()
+             print("loaded hubert")
+         if_f0 = 1
+         audio_opt = vc.pipeline(
+             hubert_model,
+             net_g,
+             0,
+             audio,
+             input_audio_path,
+             times,
+             f0_up_key,
+             f0_method,
+             file_index,
+             index_rate,
+             if_f0,
+             filter_radius,
+             tgt_sr,
+             resample_sr,
+             rms_mix_rate,
+             version,
+             protect,
+             f0_file=f0_file,
+         )
+         if resample_sr >= 16000 and tgt_sr != resample_sr:
+             tgt_sr = resample_sr
+         index_info = (
+             "Using index: %s." % file_index
+             if os.path.exists(file_index)
+             else "Index not used."
+         )
+         print(index_info)
+         # Save the converted audio at the target sampling rate
+         output_file_path = os.path.join("output", f"converted_audio_{sid}.wav")  # Adjust path as needed
+         os.makedirs(os.path.dirname(output_file_path), exist_ok=True)
+         sf.write(output_file_path, audio_opt, tgt_sr)
+         # Return the path to the saved file
+         return output_file_path
+     except Exception:
+         info = traceback.format_exc()
+         print(info)
+         return None
+
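+ # Load a voice model (.pth) plus its matching feature index, or unload everything when sid is empty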
+ def get_vc(sid, to_return_protect0):
+     global n_spk, tgt_sr, net_g, vc, cpt, version, weights_index
+     if sid == "" or sid == []:
+         global hubert_model
+         if hubert_model is not None:
+             # Polling may switch sid from a selected model to none, so check for that
+             print("clean_empty_cache")
+             del net_g, n_spk, vc, hubert_model, tgt_sr  # , cpt
+             hubert_model = net_g = n_spk = vc = tgt_sr = None
+             if torch.cuda.is_available():
+                 torch.cuda.empty_cache()
+             # Without the steps below, the memory is not fully released
+             if_f0 = cpt.get("f0", 1)
+             version = cpt.get("version", "v1")
+             if version == "v1":
+                 if if_f0 == 1:
+                     net_g = SynthesizerTrnMs256NSFsid(
+                         *cpt["config"], is_half=config.is_half
+                     )
+                 else:
+                     net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"])
+             elif version == "v2":
+                 if if_f0 == 1:
+                     net_g = SynthesizerTrnMs768NSFsid(
+                         *cpt["config"], is_half=config.is_half
+                     )
+                 else:
+                     net_g = SynthesizerTrnMs768NSFsid_nono(*cpt["config"])
+             del net_g, cpt
+             if torch.cuda.is_available():
+                 torch.cuda.empty_cache()
+             cpt = None
+         return (
+             gr.Slider.update(maximum=2333, visible=False),
+             gr.Slider.update(visible=True),
+             gr.Dropdown.update(choices=sorted(weights_index), value=""),
+             gr.Markdown.update(value="# <center> No model selected"),
+         )
+     print(f"Loading {sid} model...")
+     selected_model = sid[:-4]
+     cpt = torch.load(os.path.join(weight_root, sid), map_location="cpu")
+     tgt_sr = cpt["config"][-1]
+     cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0]
+     if_f0 = cpt.get("f0", 1)
+     if if_f0 == 0:
+         to_return_protect0 = {
+             "visible": False,
+             "value": 0.5,
+             "__type__": "update",
+         }
+     else:
+         to_return_protect0 = {
+             "visible": True,
+             "value": to_return_protect0,
+             "__type__": "update",
+         }
+     version = cpt.get("version", "v1")
+     if version == "v1":
+         if if_f0 == 1:
+             net_g = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=config.is_half)
+         else:
+             net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"])
+     elif version == "v2":
+         if if_f0 == 1:
+             net_g = SynthesizerTrnMs768NSFsid(*cpt["config"], is_half=config.is_half)
+         else:
+             net_g = SynthesizerTrnMs768NSFsid_nono(*cpt["config"])
+     del net_g.enc_q
+     print(net_g.load_state_dict(cpt["weight"], strict=False))
+     net_g.eval().to(config.device)
+     if config.is_half:
+         net_g = net_g.half()
+     else:
+         net_g = net_g.float()
+     vc = VC(tgt_sr, config)
+     n_spk = cpt["config"][-3]
+     weights_index = []
+     for _, _, index_files in os.walk(index_root):
+         for file in index_files:
+             if file.endswith(".index") and "trained" not in file:
+                 weights_index.append(os.path.join(index_root, file))
+     if not weights_index:
+         selected_index = gr.Dropdown.update(value="")
+     else:
+         selected_index = gr.Dropdown.update(value=weights_index[0])
+     # Prefer the index file whose name matches the selected model
+     for index, model_index in enumerate(weights_index):
+         if selected_model in model_index:
+             selected_index = gr.Dropdown.update(value=weights_index[index])
+             break
+     return (
+         gr.Slider.update(maximum=n_spk, visible=True),
+         to_return_protect0,
+         selected_index,
+         gr.Markdown.update(
+             f"## <center> {selected_model}\n"
+             f"### <center> RVC {version} Model"
+         ),
+     )
+
+
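+ # Bind to all interfaces so the API is reachable from outside the host, on port 5000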
+ if __name__ == '__main__':
+     app.run(debug=False, port=5000, host='0.0.0.0')