import os
import re
import sys
import uuid
from io import BytesIO

import librosa
import numpy as np
from scipy.io.wavfile import write
# import torch
# torch.set_num_threads(1)  # Pin torch to one thread to keep the service from
# crashing during concurrent multi-task inference; Flask itself will still use
# multiple threads.
from torch import no_grad, LongTensor, inference_mode, FloatTensor

import audonnx
import commons
import utils
from mel_processing import spectrogram_torch
from models import SynthesizerTrn
from text import text_to_sequence, _clean_text


class Voice:
    def __init__(self, model, config, out_path=None):
        self.out_path = out_path
        # os.path.exists(None) raises TypeError, so only create the directory
        # when an output path was actually given
        if self.out_path and not os.path.exists(self.out_path):
            try:
                os.mkdir(self.out_path)
            except OSError:
                pass
        self.hps_ms = utils.get_hparams_from_file(config)
        self.n_speakers = self.hps_ms.data.n_speakers if 'n_speakers' in self.hps_ms.data.keys() else 0
        self.n_symbols = len(self.hps_ms.symbols) if 'symbols' in self.hps_ms.keys() else 0
        self.speakers = self.hps_ms.speakers if 'speakers' in self.hps_ms.keys() else ['0']
        self.use_f0 = self.hps_ms.data.use_f0 if 'use_f0' in self.hps_ms.data.keys() else False
        self.emotion_embedding = (self.hps_ms.data.emotion_embedding
                                  if 'emotion_embedding' in self.hps_ms.data.keys() else False)

        self.net_g_ms = SynthesizerTrn(
            self.n_symbols,
            self.hps_ms.data.filter_length // 2 + 1,
            self.hps_ms.train.segment_size // self.hps_ms.data.hop_length,
            n_speakers=self.n_speakers,
            emotion_embedding=self.emotion_embedding,
            **self.hps_ms.model)
        _ = self.net_g_ms.eval()
        utils.load_checkpoint(model, self.net_g_ms)

    def get_text(self, text, hps, cleaned=False):
        # Convert text to a symbol-id sequence; skip the text cleaners when the
        # input is already cleaned
        if cleaned:
            text_norm = text_to_sequence(text, hps.symbols, [])
        else:
            text_norm = text_to_sequence(text, hps.symbols, hps.data.text_cleaners)
        if hps.data.add_blank:
            text_norm = commons.intersperse(text_norm, 0)
        text_norm = LongTensor(text_norm)
        return text_norm

    def get_label_value(self, text, label, default, warning_name='value'):
        # Extract an inline numeric label such as [LENGTH=1.2] from the text
        # and strip it, falling back to the default when the label is absent
        value = re.search(rf'\[{label}=(.+?)\]', text)
        if value:
            try:
                text = re.sub(rf'\[{label}=(.+?)\]', '', text, count=1)
                value = float(value.group(1))
            except ValueError:
                print(f'Invalid {warning_name}!')
                sys.exit(1)
        else:
            value = default
        return value, text

    def ex_return(self, text, escape=False):
        if escape:
            return text.encode('unicode_escape').decode()
        else:
            return text

    def return_speakers(self, escape=False):
        return self.speakers

    def get_label(self, text, label):
        # Detect a boolean flag such as [CLEANED] and strip it from the text
        if f'[{label}]' in text:
            return True, text.replace(f'[{label}]', '')
        else:
            return False, text

    def generate(self, text=None, speaker_id=None, format=None, speed=1, audio_path=None,
                 target_id=None, escape=False, option=None, w2v2_folder=None):
        # Only the plain VITS text-to-speech path is active; the emotion-embedding
        # and hubert-soft branches below remain commented out
        if self.n_symbols != 0:
            if not self.emotion_embedding:
                length_scale, text = self.get_label_value(text, 'LENGTH', speed, 'length scale')
                noise_scale, text = self.get_label_value(text, 'NOISE', 0.667, 'noise scale')
                noise_scale_w, text = self.get_label_value(text, 'NOISEW', 0.8, 'deviation of noise')
                cleaned, text = self.get_label(text, 'CLEANED')

                stn_tst = self.get_text(text, self.hps_ms, cleaned=cleaned)

                with no_grad():
                    x_tst = stn_tst.unsqueeze(0)
                    x_tst_lengths = LongTensor([stn_tst.size(0)])
                    sid = LongTensor([speaker_id])
                    audio = self.net_g_ms.infer(x_tst, x_tst_lengths, sid=sid,
                                                noise_scale=noise_scale,
                                                noise_scale_w=noise_scale_w,
                                                length_scale=length_scale)[0][0, 0].data.cpu().float().numpy()
            # else:
            #     w2v2_model = audonnx.load(os.path.dirname(w2v2_folder))
            #
            #     if option == 'clean':
            #         self.ex_print(_clean_text(text, self.hps_ms.data.text_cleaners), escape)
            #
            #     length_scale, text = self.get_label_value(text, 'LENGTH', 1, 'length scale')
            #     noise_scale, text = self.get_label_value(text, 'NOISE', 0.667, 'noise scale')
            #     noise_scale_w, text = self.get_label_value(text, 'NOISEW', 0.8, 'deviation of noise')
            #     cleaned, text = self.get_label(text, 'CLEANED')
            #
            #     stn_tst = self.get_text(text, self.hps_ms, cleaned=cleaned)
            #
            #     emotion_reference = input('Path of an emotion reference: ')
            #     if emotion_reference.endswith('.npy'):
            #         emotion = np.load(emotion_reference)
            #         emotion = FloatTensor(emotion).unsqueeze(0)
            #     else:
            #         audio16000, sampling_rate = librosa.load(emotion_reference, sr=16000, mono=True)
            #         emotion = w2v2_model(audio16000, sampling_rate)['hidden_states']
            #         emotion_reference = re.sub(r'\..*$', '', emotion_reference)
            #         np.save(emotion_reference, emotion.squeeze(0))
            #         emotion = FloatTensor(emotion)
            #
            #     with no_grad():
            #         x_tst = stn_tst.unsqueeze(0)
            #         x_tst_lengths = LongTensor([stn_tst.size(0)])
            #         sid = LongTensor([speaker_id])
            #         audio = self.net_g_ms.infer(x_tst, x_tst_lengths, sid=sid,
            #                                     noise_scale=noise_scale,
            #                                     noise_scale_w=noise_scale_w,
            #                                     length_scale=length_scale,
            #                                     emotion_embedding=emotion)[0][0, 0].data.cpu().float().numpy()
        # else:
        #     model = input('Path of a hubert-soft Model: ')
        #     from hubert_model import hubert_soft
        #     hubert = hubert_soft(model)
        #     if audio_path != '[VC]':
        #         if self.use_f0:
        #             audio, sampling_rate = librosa.load(audio_path, sr=self.hps_ms.data.sampling_rate, mono=True)
        #             audio16000 = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000)
        #         else:
        #             audio16000, sampling_rate = librosa.load(audio_path, sr=16000, mono=True)
        #
        #         # out_path = "H:/git/MoeGoe-Simple-API/upload/hubert.wav"
        #         length_scale, out_path = self.get_label_value(out_path, 'LENGTH', 1, 'length scale')
        #         noise_scale, out_path = self.get_label_value(out_path, 'NOISE', 0.1, 'noise scale')
        #         noise_scale_w, out_path = self.get_label_value(out_path, 'NOISEW', 0.1, 'deviation of noise')
        #
        #         with inference_mode():
        #             units = hubert.units(FloatTensor(audio16000).unsqueeze(0).unsqueeze(0)).squeeze(0).numpy()
        #             if self.use_f0:
        #                 f0_scale, out_path = self.get_label_value(out_path, 'F0', 1, 'f0 scale')
        #                 f0 = librosa.pyin(audio, sr=sampling_rate,
        #                                   fmin=librosa.note_to_hz('C0'),
        #                                   fmax=librosa.note_to_hz('C7'),
        #                                   frame_length=1780)[0]
        #                 target_length = len(units[:, 0])
        #                 f0 = np.nan_to_num(np.interp(np.arange(0, len(f0) * target_length, len(f0)) / target_length,
        #                                              np.arange(0, len(f0)), f0)) * f0_scale
        #                 units[:, 0] = f0 / 10
        #
        #         stn_tst = FloatTensor(units)
        #         with no_grad():
        #             x_tst = stn_tst.unsqueeze(0)
        #             x_tst_lengths = LongTensor([stn_tst.size(0)])
        #             sid = LongTensor([target_id])
        #             audio = self.net_g_ms.infer(x_tst, x_tst_lengths, sid=sid,
        #                                         noise_scale=noise_scale,
        #                                         noise_scale_w=noise_scale_w,
        #                                         length_scale=length_scale)[0][0, 0].data.float().numpy()

        with BytesIO() as f:
            fname = str(uuid.uuid1())  # unique name for the returned file
            if format == 'ogg':
                write(f, self.hps_ms.data.sampling_rate, audio)
                with BytesIO() as o:
                    utils.wav2ogg(f, o)
                    return BytesIO(o.getvalue()), "audio/ogg", fname + ".ogg"
            elif format == 'silk':
                # silk conversion works on a wav file on disk, written at the
                # fixed 24 kHz rate the silk encoder expects
                file_path = os.path.join(self.out_path, fname + ".wav")
                write(file_path, 24000, audio)
                silk_path = utils.convert_to_silk(file_path)
                os.remove(file_path)
                return silk_path, "audio/silk", fname + ".silk"
            else:
                write(f, self.hps_ms.data.sampling_rate, audio)
                return BytesIO(f.getvalue()), "audio/wav", fname + ".wav"

    def voice_conversion(self, audio_path, original_id, target_id):
        audio = utils.load_audio_to_torch(audio_path, self.hps_ms.data.sampling_rate)

        y = audio.unsqueeze(0)

        spec = spectrogram_torch(y, self.hps_ms.data.filter_length,
                                 self.hps_ms.data.sampling_rate,
                                 self.hps_ms.data.hop_length,
                                 self.hps_ms.data.win_length,
                                 center=False)
        spec_lengths = LongTensor([spec.size(-1)])
        sid_src = LongTensor([original_id])

        with no_grad():
            sid_tgt = LongTensor([target_id])
            audio = self.net_g_ms.voice_conversion(spec, spec_lengths,
                                                   sid_src=sid_src,
                                                   sid_tgt=sid_tgt)[0][0, 0].data.cpu().float().numpy()

        with BytesIO() as f:
            write(f, self.hps_ms.data.sampling_rate, audio)
            return BytesIO(f.getvalue())
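

# A minimal usage sketch (not part of the original module). The checkpoint and
# config paths, speaker ids, and input wav below are hypothetical placeholders;
# point them at a real VITS checkpoint and its matching config before running.
if __name__ == '__main__':
    voice = Voice(model='model/G_latest.pth',  # hypothetical checkpoint path
                  config='model/config.json',  # hypothetical config path
                  out_path='upload')           # scratch directory for silk output

    # Text-to-speech: inline labels tune synthesis, e.g. [LENGTH=1.2] slows
    # speech down and [NOISEW=0.6] changes the duration-noise deviation
    audio_io, mime_type, filename = voice.generate(
        text='[LENGTH=1.2]Hello world', speaker_id=0, format='wav')
    with open(filename, 'wb') as out_file:
        out_file.write(audio_io.getvalue())

    # Voice conversion: re-render sample.wav (hypothetical) from speaker 0's
    # voice into speaker 1's voice
    converted = voice.voice_conversion('sample.wav', original_id=0, target_id=1)
    with open('converted.wav', 'wb') as out_file:
        out_file.write(converted.getvalue())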