# NOTE(review): the original file began with paste residue from a Hugging Face
# "Spaces: Runtime error" page header; converted to a comment so the module parses.
import os
import re
import sys
import uuid
from io import BytesIO

import librosa
import numpy as np
from scipy.io.wavfile import write
# import torch
# torch.set_num_threads(1)  # limit torch to a single thread so concurrent
# inference requests do not crash the service; note that flask itself will
# still use multiple threads
from torch import no_grad, LongTensor, inference_mode, FloatTensor

import audonnx
import commons
import utils
from mel_processing import spectrogram_torch
from models import SynthesizerTrn
from text import text_to_sequence, _clean_text
class Voice:
    """Wrapper around a VITS ``SynthesizerTrn`` checkpoint.

    Provides text-to-speech (``generate``) and speaker voice conversion
    (``voice_conversion``), plus helpers for parsing inline control tags
    such as ``[LENGTH=1.2]`` or ``[CLEANED]`` embedded in the input text.
    """

    def __init__(self, model, config, out_path=None):
        """Load hyper-parameters from *config* and weights from *model*.

        :param model: path to the model checkpoint.
        :param config: path to the JSON hparams file.
        :param out_path: optional directory for intermediate files
            (used by the silk output path); created if missing.
        """
        self.out_path = out_path
        # Original code called os.path.exists(None) when out_path was omitted,
        # which raises TypeError. Guard for None and let makedirs be idempotent;
        # a failure here (e.g. read-only filesystem) is deliberately non-fatal.
        if self.out_path is not None:
            try:
                os.makedirs(self.out_path, exist_ok=True)
            except OSError:
                pass
        self.hps_ms = utils.get_hparams_from_file(config)
        # Older configs may omit any of these fields; fall back to safe defaults.
        self.n_speakers = self.hps_ms.data.n_speakers if 'n_speakers' in self.hps_ms.data.keys() else 0
        self.n_symbols = len(self.hps_ms.symbols) if 'symbols' in self.hps_ms.keys() else 0
        self.speakers = self.hps_ms.speakers if 'speakers' in self.hps_ms.keys() else ['0']
        self.use_f0 = self.hps_ms.data.use_f0 if 'use_f0' in self.hps_ms.data.keys() else False
        self.emotion_embedding = self.hps_ms.data.emotion_embedding if 'emotion_embedding' in self.hps_ms.data.keys() else False
        self.net_g_ms = SynthesizerTrn(
            self.n_symbols,
            self.hps_ms.data.filter_length // 2 + 1,
            self.hps_ms.train.segment_size // self.hps_ms.data.hop_length,
            n_speakers=self.n_speakers,
            emotion_embedding=self.emotion_embedding,
            **self.hps_ms.model)
        _ = self.net_g_ms.eval()
        utils.load_checkpoint(model, self.net_g_ms)

    def get_text(self, text, hps, cleaned=False):
        """Convert *text* to a ``LongTensor`` of symbol ids.

        When *cleaned* is True the text is assumed to be pre-cleaned and no
        text cleaners are applied. Blanks are interspersed if the config
        requests it.
        """
        if cleaned:
            text_norm = text_to_sequence(text, hps.symbols, [])
        else:
            text_norm = text_to_sequence(text, hps.symbols, hps.data.text_cleaners)
        if hps.data.add_blank:
            text_norm = commons.intersperse(text_norm, 0)
        return LongTensor(text_norm)

    def get_label_value(self, text, label, default, warning_name='value'):
        """Extract a float ``[LABEL=x]`` tag from *text*.

        :returns: ``(value, text_without_tag)``; *default* when no tag present.
        Exits the process with status 1 if the tag value is not a float
        (preserved behavior from the original CLI-oriented code).
        """
        match = re.search(rf'\[{label}=(.+?)\]', text)
        if match:
            try:
                text = re.sub(rf'\[{label}=(.+?)\]', '', text, count=1)
                value = float(match.group(1))
            except ValueError:  # only float() can raise here
                print(f'Invalid {warning_name}!')
                sys.exit(1)
        else:
            value = default
        return value, text

    def ex_return(self, text, escape=False):
        """Return *text*, unicode-escaped when *escape* is True."""
        return text.encode('unicode_escape').decode() if escape else text

    def return_speakers(self, escape=False):
        # NOTE(review): *escape* is currently ignored; kept in the signature
        # for caller compatibility.
        return self.speakers

    def get_label(self, text, label):
        """Return ``(present, text_without_tag)`` for a boolean ``[LABEL]`` tag."""
        if f'[{label}]' in text:
            return True, text.replace(f'[{label}]', '')
        return False, text

    def generate(self, text=None, speaker_id=None, format=None, speed=1, audio_path=None, target_id=None, escape=False,
                 option=None, w2v2_folder=None):
        """Synthesize speech for *text* with speaker *speaker_id*.

        Inline tags ``[LENGTH=]``, ``[NOISE=]``, ``[NOISEW=]`` and
        ``[CLEANED]`` in *text* override the defaults. *format* selects the
        container: ``'ogg'``, ``'silk'`` or anything else for wav.
        ``format`` shadows the builtin but is kept for caller compatibility.

        :returns: ``(audio_data, mime_type, file_name)`` where *audio_data* is
            a ``BytesIO`` (ogg/wav) or a file path (silk).
        :raises RuntimeError: if the model configuration has no text symbols
            or uses emotion embeddings (paths not implemented here); the
            original code crashed with ``UnboundLocalError`` instead.
        """
        audio = None
        if self.n_symbols != 0 and not self.emotion_embedding:
            length_scale, text = self.get_label_value(text, 'LENGTH', speed, 'length scale')
            noise_scale, text = self.get_label_value(text, 'NOISE', 0.667, 'noise scale')
            noise_scale_w, text = self.get_label_value(text, 'NOISEW', 0.8, 'deviation of noise')
            cleaned, text = self.get_label(text, 'CLEANED')
            stn_tst = self.get_text(text, self.hps_ms, cleaned=cleaned)
            with no_grad():
                x_tst = stn_tst.unsqueeze(0)
                x_tst_lengths = LongTensor([stn_tst.size(0)])
                sid = LongTensor([speaker_id])
                audio = self.net_g_ms.infer(x_tst, x_tst_lengths, sid=sid,
                                            noise_scale=noise_scale,
                                            noise_scale_w=noise_scale_w,
                                            length_scale=length_scale)[0][0, 0].data.cpu().float().numpy()
        if audio is None:
            raise RuntimeError('unsupported model configuration: no symbols or emotion embedding required')
        with BytesIO() as f:
            fname = str(uuid.uuid1())
            if format == 'ogg':
                write(f, self.hps_ms.data.sampling_rate, audio)
                with BytesIO() as o:
                    utils.wav2ogg(f, o)
                    return BytesIO(o.getvalue()), "audio/ogg", fname + ".ogg"
            elif format == 'silk':
                file_path = self.out_path + "/" + fname + ".wav"
                # 24000 Hz is hard-coded here (not the model sampling rate);
                # presumably the silk encoder expects 24 kHz — TODO confirm
                # against utils.convert_to_silk.
                write(file_path, 24000, audio)
                silk_path = utils.convert_to_silk(file_path)
                os.remove(file_path)
                return silk_path, "audio/silk", fname + ".silk"
            else:
                write(f, self.hps_ms.data.sampling_rate, audio)
                return BytesIO(f.getvalue()), "audio/wav", fname + ".wav"

    def voice_conversion(self, audio_path, original_id, target_id):
        """Convert the speaker identity of *audio_path* from *original_id* to *target_id*.

        :returns: a ``BytesIO`` holding the converted wav data.
        """
        audio = utils.load_audio_to_torch(
            audio_path, self.hps_ms.data.sampling_rate)
        y = audio.unsqueeze(0)
        spec = spectrogram_torch(y, self.hps_ms.data.filter_length,
                                 self.hps_ms.data.sampling_rate, self.hps_ms.data.hop_length,
                                 self.hps_ms.data.win_length,
                                 center=False)
        spec_lengths = LongTensor([spec.size(-1)])
        sid_src = LongTensor([original_id])
        with no_grad():
            sid_tgt = LongTensor([target_id])
            audio = self.net_g_ms.voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt)[
                0][0, 0].data.cpu().float().numpy()
        with BytesIO() as f:
            write(f, self.hps_ms.data.sampling_rate, audio)
            return BytesIO(f.getvalue())