import os
import sys
import time
import logging

import torch
import numpy as np
import soundfile as sf
import librosa
import noisereduce as nr
from scipy.io import wavfile
from audio_upscaler import upscale

now_dir = os.getcwd()
sys.path.append(now_dir)

from rvc.infer.pipeline import VC
from rvc.lib.utils import load_audio, load_embedding
from rvc.lib.tools.split_audio import process_audio, merge_audio
from rvc.lib.infer_pack.models import (
    SynthesizerTrnMs256NSFsid,
    SynthesizerTrnMs256NSFsid_nono,
    SynthesizerTrnMs768NSFsid,
    SynthesizerTrnMs768NSFsid_nono,
)
from rvc.configs.config import Config

# Silence the noisy HTTP client loggers pulled in by dependencies.
logging.getLogger("httpx").setLevel(logging.WARNING)
logging.getLogger("httpcore").setLevel(logging.WARNING)

config = Config()

# Module-level state shared across calls: the speech embedder, the loaded
# checkpoint, and the synthesizer/pipeline built from it.
hubert_model = None
tgt_sr = None
net_g = None
vc = None
cpt = None
version = None
n_spk = None


def load_hubert(embedder_model, embedder_model_custom):
    """Load the speech embedder and move it to the configured device."""
    global hubert_model
    models, _, _ = load_embedding(embedder_model, embedder_model_custom)
    hubert_model = models[0].to(config.device)
    hubert_model = hubert_model.half() if config.is_half else hubert_model.float()
    hubert_model.eval()


def remove_audio_noise(input_audio_path, reduction_strength=0.7):
    """Denoise a WAV file with noisereduce; returns the cleaned samples or None."""
    try:
        rate, data = wavfile.read(input_audio_path)
        reduced_noise = nr.reduce_noise(
            y=data,
            sr=rate,
            prop_decrease=reduction_strength,
        )
        return reduced_noise
    except Exception as error:
        print(f"Error cleaning audio: {error}")
        return None


def convert_audio_format(input_path, output_path, output_format):
    """Convert audio to the requested format, snapping to the nearest common
    sample rate. Returns the path of the resulting file."""
    try:
        if output_format != "WAV":
            print(f"Converting audio to {output_format} format...")
            audio, sample_rate = librosa.load(input_path, sr=None)
            common_sample_rates = [
                8000,
                11025,
                12000,
                16000,
                22050,
                24000,
                32000,
                44100,
                48000,
            ]
            target_sr = min(common_sample_rates, key=lambda x: abs(x - sample_rate))
            audio = librosa.resample(audio, orig_sr=sample_rate, target_sr=target_sr)
            sf.write(output_path, audio, target_sr, format=output_format.lower())
        # Returning outside the branch keeps WAV exports from yielding None.
        return output_path
    except Exception as error:
        print(f"Failed to convert audio to {output_format} format: {error}")
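

# A minimal standalone sketch of the two helpers above, assuming hypothetical
# file names. The sample rate for re-writing the cleaned audio is read from
# the source file, since remove_audio_noise only returns the samples:
#
#   rate, _ = wavfile.read("vocals.wav")
#   cleaned = remove_audio_noise("vocals.wav", reduction_strength=0.6)
#   if cleaned is not None:
#       sf.write("vocals.wav", cleaned, rate, format="WAV")
#   convert_audio_format("vocals.wav", "vocals.flac", "FLAC")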


def voice_conversion(
    sid=0,
    input_audio_path=None,
    f0_up_key=None,
    f0_file=None,
    f0_method=None,
    file_index=None,
    index_rate=None,
    resample_sr=0,
    rms_mix_rate=None,
    protect=None,
    hop_length=None,
    output_path=None,
    split_audio=False,
    f0autotune=False,
    filter_radius=None,
    embedder_model=None,
    embedder_model_custom=None,
):
    """Run the conversion pipeline on one file (or each of its split segments)."""
    global tgt_sr, net_g, vc, hubert_model, version

    f0_up_key = int(f0_up_key)
    try:
        audio = load_audio(input_audio_path, 16000)
        # Normalize peaks above 0.95 full scale to avoid clipping downstream.
        audio_max = np.abs(audio).max() / 0.95
        if audio_max > 1:
            audio /= audio_max

        if not hubert_model:
            load_hubert(embedder_model, embedder_model_custom)
        if_f0 = cpt.get("f0", 1)

        # Sanitize the index path and point it at the "added" index file.
        file_index = (
            file_index.strip(" ")
            .strip('"')
            .strip("\n")
            .strip('"')
            .strip(" ")
            .replace("trained", "added")
        )
        # Resample only when a valid target rate differs from the model rate.
        if resample_sr >= 16000 and tgt_sr != resample_sr:
            tgt_sr = resample_sr

        if split_audio == "True":
            result, new_dir_path = process_audio(input_audio_path)
            if result == "Error":
                return "Error with Split Audio", None
            dir_path = (
                new_dir_path.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
            )
            if dir_path != "":
                paths = [
                    os.path.join(root, name)
                    for root, _, files in os.walk(dir_path, topdown=False)
                    for name in files
                    if name.endswith(".wav") and root == dir_path
                ]
            try:
                # Convert each segment in place, then merge them back together.
                for path in paths:
                    voice_conversion(
                        sid,
                        path,
                        f0_up_key,
                        None,
                        f0_method,
                        file_index,
                        index_rate,
                        resample_sr,
                        rms_mix_rate,
                        protect,
                        hop_length,
                        path,
                        False,
                        f0autotune,
                        filter_radius,
                        embedder_model,
                        embedder_model_custom,
                    )
            except Exception as error:
                print(error)
                return f"Error {error}"
            print("Finished processing segmented audio, now merging audio...")
            merge_timestamps_file = os.path.join(
                os.path.dirname(new_dir_path),
                f"{os.path.basename(input_audio_path).split('.')[0]}_timestamps.txt",
            )
            tgt_sr, audio_opt = merge_audio(merge_timestamps_file)
            os.remove(merge_timestamps_file)
        else:
            audio_opt = vc.pipeline(
                hubert_model,
                net_g,
                sid,
                audio,
                input_audio_path,
                f0_up_key,
                f0_method,
                file_index,
                index_rate,
                if_f0,
                filter_radius,
                tgt_sr,
                resample_sr,
                rms_mix_rate,
                version,
                protect,
                hop_length,
                f0autotune,
                f0_file=f0_file,
            )

        if output_path is not None:
            sf.write(output_path, audio_opt, tgt_sr, format="WAV")

        return (tgt_sr, audio_opt)
    except Exception as error:
        print(error)


def get_vc(weight_root, sid):
    """Load a synthesizer checkpoint and rebuild the inference pipeline."""
    global n_spk, tgt_sr, net_g, vc, cpt, version

    if sid == "" or sid == []:
        global hubert_model
        if hubert_model is not None:
            # Drop the previous models so their memory can be reclaimed.
            print("clean_empty_cache")
            del net_g, n_spk, vc, hubert_model, tgt_sr
            hubert_model = net_g = n_spk = vc = tgt_sr = None
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
            # Rebuild a throwaway synthesizer from the old checkpoint config
            # and delete it, forcing PyTorch to release the cached graph.
            if_f0 = cpt.get("f0", 1)
            version = cpt.get("version", "v1")
            if version == "v1":
                if if_f0 == 1:
                    net_g = SynthesizerTrnMs256NSFsid(
                        *cpt["config"], is_half=config.is_half
                    )
                else:
                    net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"])
            elif version == "v2":
                if if_f0 == 1:
                    net_g = SynthesizerTrnMs768NSFsid(
                        *cpt["config"], is_half=config.is_half
                    )
                else:
                    net_g = SynthesizerTrnMs768NSFsid_nono(*cpt["config"])
            del net_g, cpt
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
            cpt = None

    person = weight_root
    cpt = torch.load(person, map_location="cpu")
    tgt_sr = cpt["config"][-1]
    # Patch the speaker-embedding size to match the stored weights.
    cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0]
    if_f0 = cpt.get("f0", 1)
    version = cpt.get("version", "v1")
    if version == "v1":
        if if_f0 == 1:
            net_g = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=config.is_half)
        else:
            net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"])
    elif version == "v2":
        if if_f0 == 1:
            net_g = SynthesizerTrnMs768NSFsid(*cpt["config"], is_half=config.is_half)
        else:
            net_g = SynthesizerTrnMs768NSFsid_nono(*cpt["config"])
    # The posterior encoder is only used during training.
    del net_g.enc_q
    print(net_g.load_state_dict(cpt["weight"], strict=False))
    net_g.eval().to(config.device)
    net_g = net_g.half() if config.is_half else net_g.float()
    vc = VC(tgt_sr, config)
    n_spk = cpt["config"][-3]
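

# A minimal sketch of driving the two functions above directly, assuming a
# hypothetical checkpoint/index pair and "rmvpe" as the pitch extractor:
#
#   get_vc("logs/model/model.pth", 0)
#   voice_conversion(
#       sid=0,
#       input_audio_path="input.wav",
#       f0_up_key=0,
#       f0_method="rmvpe",
#       file_index="logs/model/added.index",
#       index_rate=0.75,
#       rms_mix_rate=1.0,
#       protect=0.33,
#       hop_length=128,
#       output_path="output.wav",
#       filter_radius=3,
#       embedder_model="contentvec",  # assumed embedder name
#   )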


def infer_pipeline(
    f0up_key,
    filter_radius,
    index_rate,
    rms_mix_rate,
    protect,
    hop_length,
    f0method,
    audio_input_path,
    audio_output_path,
    model_path,
    index_path,
    split_audio,
    f0autotune,
    clean_audio,
    clean_strength,
    export_format,
    embedder_model,
    embedder_model_custom,
    upscale_audio,
):
    """End-to-end entry point: load the model, convert, then post-process."""
    global tgt_sr, net_g, vc, cpt

    get_vc(model_path, 0)
    try:
        if upscale_audio == "True":
            upscale(audio_input_path, audio_input_path)
        start_time = time.time()
        voice_conversion(
            sid=0,
            input_audio_path=audio_input_path,
            f0_up_key=f0up_key,
            f0_file=None,
            f0_method=f0method,
            file_index=index_path,
            index_rate=float(index_rate),
            rms_mix_rate=float(rms_mix_rate),
            protect=float(protect),
            hop_length=hop_length,
            output_path=audio_output_path,
            split_audio=split_audio,
            f0autotune=f0autotune,
            filter_radius=filter_radius,
            embedder_model=embedder_model,
            embedder_model_custom=embedder_model_custom,
        )

        if clean_audio == "True":
            cleaned_audio = remove_audio_noise(audio_output_path, clean_strength)
            if cleaned_audio is not None:
                sf.write(audio_output_path, cleaned_audio, tgt_sr, format="WAV")

        output_path_format = audio_output_path.replace(
            ".wav", f".{export_format.lower()}"
        )
        audio_output_path = convert_audio_format(
            audio_output_path, output_path_format, export_format
        )

        elapsed_time = time.time() - start_time
        print(
            f"Conversion completed. Output file: '{audio_output_path}' in {elapsed_time:.2f} seconds."
        )
    except Exception as error:
        print(f"Voice conversion failed: {error}")
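

# A minimal usage sketch, not part of the original module: every path, model
# name, and flag value below is a hypothetical placeholder. Several options
# are compared against the literal strings "True"/"False" above, so they are
# passed as strings here rather than booleans.
if __name__ == "__main__":
    infer_pipeline(
        f0up_key=0,
        filter_radius=3,
        index_rate=0.75,
        rms_mix_rate=1.0,
        protect=0.33,
        hop_length=128,
        f0method="rmvpe",                     # assumed pitch-extraction method
        audio_input_path="input.wav",         # hypothetical input file
        audio_output_path="output.wav",       # hypothetical output file
        model_path="logs/model/model.pth",    # hypothetical checkpoint path
        index_path="logs/model/added.index",  # hypothetical index path
        split_audio="False",
        f0autotune="False",
        clean_audio="False",
        clean_strength=0.7,
        export_format="WAV",
        embedder_model="contentvec",          # assumed embedder name
        embedder_model_custom=None,
        upscale_audio="False",
    )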