trump / inference_nof0.py
lj1995's picture
Duplicate from innnky/trump
1d6aec7
raw
history blame contribute delete
No virus
1.81 kB
import torch,pdb
import numpy as np
import soundfile as sf
from models import SynthesizerTrnNoF0256
from scipy.io import wavfile
from fairseq import checkpoint_utils
import torch.nn.functional as F
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Feature-extractor checkpoint. The loader below expects a checkpoint file,
# and the Box link is a share page rather than the checkpoint itself, so
# download checkpoint_best_legacy_500.pt from the link and place it in the
# working directory (same convention as "trump.pt"):
#   https://ibm.box.com/s/z1wgl1stco8ffooyatzdwsqn2psd9lrr
model_path = "checkpoint_best_legacy_500.pt"
print("load model(s) from {}".format(model_path))
models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task(
    [model_path],
    suffix="",
)
# Only one checkpoint path was passed in, so use the first returned model.
model = models[0]
model = model.to(device)
# Cast to fp16; the "source" input built further down is cast with .half() too.
model = model.half()
model.eval()
# Synthesizer network. The arguments are positional hyperparameters whose
# meaning is defined by models.SynthesizerTrnNoF0256 -- confirm there before
# changing any of them. The state dict is loaded with strict=True, so the
# resulting module must match the checkpoint's keys exactly.
net_g = SynthesizerTrnNoF0256(
    513,
    40,
    192,
    192,
    768,
    2,
    6,
    3,
    0.1,
    "1",
    [3, 7, 11],
    [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
    [10, 4, 2, 2, 2],
    512,
    [16, 16, 4, 4, 4],
    0,
)
# Deserialize onto the CPU first: net_g still lives on the CPU at this point,
# and a checkpoint saved from CUDA tensors could not otherwise be loaded on a
# machine without a GPU. The module is moved to `device` right after.
# NOTE: torch.load unpickles the file -- only load checkpoints you trust.
weights = torch.load("trump.pt", map_location="cpu")
net_g.load_state_dict(weights, strict=True)
net_g.eval().to(device)
net_g.half()
# Input utterance to convert.
wav_path = "/bili-coeus/liujing04/vits_ch/测试素材/trump/wavs16k/云希_特朗普.wav"
wav, sr = sf.read(wav_path)
# Validate with explicit raises: `assert` statements vanish under `python -O`.
if sr != 16000:
    raise ValueError(
        "expected a 16000 Hz wav file, got {} Hz: {}".format(sr, wav_path)
    )
feats = torch.from_numpy(wav).float()
if feats.dim() == 2:  # two channels: average them into one
    feats = feats.mean(-1)
if feats.dim() != 1:
    raise ValueError(
        "expected 1-D audio after channel averaging, got {} dims".format(feats.dim())
    )
# Add a leading batch axis of size 1.
feats = feats.view(1, -1)
# All-False mask with the same shape as the input (nothing is marked as padding).
padding_mask = torch.zeros(feats.shape, dtype=torch.bool)
inputs = {
    "source": feats.half().to(device),
    "padding_mask": padding_mask.to(device),
    "output_layer": 9,  # layer 9
}
with torch.no_grad():
    logits = model.extract_features(**inputs)
    # The first element of the returned value is passed through final_proj.
    feats = model.final_proj(logits[0])
# Double the length of axis 1. F.interpolate resizes the trailing axis, so
# axis 1 is swapped to the end for the call and swapped back afterwards.
feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
# Cap the sequence length along axis 1: anything longer runs out of GPU memory.
frame_cap = 10000
p_len = min(feats.shape[1], frame_cap)
feats = feats[:, :p_len, :]
# The synthesizer takes the length as an int64 tensor on the same device.
p_len = torch.tensor([p_len], dtype=torch.long, device=device)
with torch.no_grad():
    generated = net_g.infer(feats, p_len)
    # Take element [0, 0] of the first returned item and convert it to a
    # float32 NumPy array on the CPU.
    audio = generated[0][0, 0].data.cpu().float().numpy()
# Write the result as a WAV file with a 32000 Hz sample rate.
wavfile.write("trump_co256nof0_63k_test.wav", 32000, audio)