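"""Batch speaker-embedding extraction built on the rtvc package.

Loads the speaker encoder, the synthesizer and (unless --griffin_lim is set)
the vocoder, then walks every audio file under input_audios/, converts it to
wav, embeds it with the speaker encoder, and saves the embedding to
embeds/<speaker_name>.npy.
"""
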
import argparse
import os
import sys
from pathlib import Path

sys.path.append('rtvc/')

import spacy
import matplotlib.pyplot as plt
import librosa
import numpy as np
import soundfile as sf
import torch
import noisereduce as nr

from rtvc.encoder import inference as encoder
from rtvc.encoder.params_data import *
from rtvc.synthesizer.inference import Synthesizer_infer
from rtvc.utils.argutils import print_args
from rtvc.utils.default_models import ensure_default_models
from rtvc.vocoder import inference as vocoder
from rtvc.speed_changer.fixSpeed import *

if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument("--run_id", type=str, default="default", help=\
        "Name for this model. By default, training outputs will be stored to saved_models/<run_id>/. If a model state "
        "from the same run ID was previously saved, the training will restart from there. Pass -f to overwrite saved "
        "states and restart from scratch.")
    parser.add_argument("-m", "--models_dir", type=Path, default="rtvc/saved_models",
                        help="Directory containing all saved models")
    parser.add_argument("--weight", type=float, default=1,
                        help="Weight of the input audio for the voice filter")
    parser.add_argument("--griffin_lim", action="store_true",
                        help="If set, synthesize waveforms with Griffin-Lim instead of the neural vocoder")
    parser.add_argument("--cpu", action="store_true", help=\
        "If True, processing is done on CPU, even when a GPU is available.")
    parser.add_argument("--no_sound", action="store_true", help=\
        "If True, audio won't be played.")
    parser.add_argument("--seed", type=int, default=None, help=\
        "Optional random number seed value to make toolbox deterministic.")
    args = parser.parse_args()
    arg_dict = vars(args)
    print_args(args, parser)

    # Hide GPUs from PyTorch to force CPU processing
    if arg_dict.pop("cpu"):
        os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

    print("Running a test of your configuration...\n")

    if torch.cuda.is_available():
        device_id = torch.cuda.current_device()
        gpu_properties = torch.cuda.get_device_properties(device_id)
        ## Print some environment information (for debugging purposes)
        print("Found %d GPUs available. Using GPU %d (%s) of compute capability %d.%d with "
              "%.1fGB total memory.\n" %
              (torch.cuda.device_count(),
               device_id,
               gpu_properties.name,
               gpu_properties.major,
               gpu_properties.minor,
               gpu_properties.total_memory / 1e9))
    else:
        print("Using CPU for inference.\n")

    ## Load the models one by one.
    if not args.griffin_lim:
        print("Preparing the encoder, the synthesizer and the vocoder...")
    else:
        print("Preparing the encoder and the synthesizer...")
    ensure_default_models(args.run_id, Path("rtvc/saved_models"))
    encoder.load_model(list(args.models_dir.glob(f"{args.run_id}/encoder.pt"))[0])
    synthesizer = Synthesizer_infer(list(args.models_dir.glob(f"{args.run_id}/synthesizer.pt"))[0])
    if not args.griffin_lim:
        vocoder.load_model(list(args.models_dir.glob(f"{args.run_id}/vocoder.pt"))[0])
    nlp = spacy.load('en_core_web_sm')
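    # Note: 'en_core_web_sm' is not bundled with spacy; install it once with
    #   python -m spacy download en_core_web_sm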
    weight = arg_dict["weight"]  # weight given to the user's own voice in the voice filter ("voice beautification")
    amp = 1

    directory = "input_audios"
    pathlist = Path(directory).rglob('*.*')
    for path in pathlist:
        path = str(path)
        print(path)
        # Computing the embedding.
        # First, we load the wav using the function that the speaker encoder provides. This is
        # important: there is preprocessing that must be applied.
        # The following two methods are equivalent:
        # - Directly load from the filepath:
        #     preprocessed_wav = encoder.preprocess_wav(in_fpath)
        # - If the wav is already loaded, pass the waveform itself (done below).
        in_fpath = Path(path.replace("\"", "").replace("\'", ""))
        fpath_without_ext = os.path.splitext(str(in_fpath))[0]
        speaker_name = os.path.normpath(fpath_without_ext).split(os.sep)[-1]
        # Always re-encode to wav: m4a cannot be processed without conversion, and even
        # original wav files are safer re-encoded, since some of them carry no bitrate
        # attribute and fail in this code; conversion assigns them that attribute.
        is_wav_file, wav, wav_path = TransFormat(in_fpath, 'wav')
        if not is_wav_file:
            os.remove(wav_path)  # remove the intermediate wav file
        preprocessed_wav = encoder.preprocess_wav(wav)
        print("Loaded input audio file successfully")

        # Then we derive the embedding. There are many functions and parameters that the
        # speaker encoder interfaces. These are mostly for in-depth research. You will typically
        # only use this function (with its default parameters):
        embed = encoder.embed_utterance(preprocessed_wav)
        embed[embed < set_zero_thres] = 0  # zero out noise-level components (set_zero_thres comes from params_data)
        if not os.path.exists("embeds"):
            os.mkdir("embeds")
        np.save(f"embeds/{speaker_name}.npy", embed)
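
# Example invocation (the script name is hypothetical; this assumes pretrained
# models under rtvc/saved_models/default/ and audio files under input_audios/,
# matching the defaults above):
#   python embed_audios.py --cpu
# Each saved embedding can later be read back with np.load("embeds/<speaker_name>.npy").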