# Source: ChatHaruhi / src / tts_vits / inference_main.py
# Web file-viewer metadata (not part of the program): uploader BlairLeng,
# commit "pushees" (5f735a0), links "raw" / "history blame", size 1.69 kB.
import io
import logging
import time
from pathlib import Path
import librosa
import numpy as np
import soundfile
from inference import infer_tool
from inference import slicer
from inference.infer_tool import Svc
import uuid
# Module-level setup: everything below runs once, when this module is imported.

# Raise numba's logger threshold so only WARNING and above are emitted.
logging.getLogger('numba').setLevel(logging.WARNING)
# Disabled: loading of cached chunk data (kept for reference).
# chunks_dict = infer_tool.read_temp("inference/chunks_temp.json")
# Ask the project helper to prepare ./results (presumably creates the
# directory if missing -- confirm in infer_tool.mkdir). Relative paths here
# are resolved against the current working directory.
infer_tool.mkdir(["./results"])
# Default checkpoint and config used to build the shared model below.
model_path = "vits_models/Haruhi_54000.pth"
config_path = "configs/config.json"
# Build the shared model instance from the defaults above; infer_to() reads
# this module-level object. NOTE(review): this happens at import time, so
# importing the module requires both files to be present -- confirm in Svc.
svc_model = Svc(model_path, config_path)
def set_model_path(path):
    """Rebind the module-level ``model_path`` to ``path``.

    Only the ``model_path`` variable is updated. Nothing else is touched
    here, so a ``svc_model`` that was already constructed from the previous
    path is not reloaded by this call.

    Args:
        path: New value to store in ``model_path``.
    """
    # Equivalent to declaring ``global model_path`` and assigning to it.
    globals()["model_path"] = path
def infer_to(spk, tran, voice, *, slice_db=-40, wav_format='wav'):
    """Convert ``voice`` with the module-level ``svc_model`` and save it.

    The input is cut into chunks with ``slicer.cut``. Each chunk is either
    run through ``svc_model.infer`` or, when the slicer tags it (the code
    treats tagged chunks as empty), replaced by zeros of the equivalent
    duration at ``svc_model.target_sample``. The pieces are joined in order
    and written to a uniquely named file under ``./vits_results``.

    Args:
        spk: Speaker argument forwarded unchanged to ``svc_model.infer``;
            also embedded in the output file name.
        tran: Second argument forwarded to ``svc_model.infer``; also
            embedded in the output file name (presumably a pitch shift in
            keys -- confirm against ``Svc.infer``).
        voice: Audio source handed to ``slicer.cut`` and
            ``slicer.chunks2audio`` (presumably a path or file-like
            object -- confirm against ``slicer``).
        slice_db: Value passed to ``slicer.cut`` as ``db_thresh``.
        wav_format: Format passed to ``soundfile.write`` for the result and
            used as the output file extension.

    Returns:
        The path of the written audio file, as a string.
    """
    chunks = slicer.cut(voice, db_thresh=slice_db)
    audio_data, audio_sr = slicer.chunks2audio(voice, chunks)

    # Collect per-segment arrays and join them once at the end. Extending a
    # Python list sample by sample boxes every value into its own object.
    segments = []
    for slice_tag, data in audio_data:
        print(f'#=====segment start, {round(len(data) / audio_sr, 3)}s======')
        if slice_tag:
            print('jump empty segment')
            # Keep the timeline aligned: emit silence with the duration the
            # segment would have at the model's output sample rate.
            length = int(np.ceil(len(data) / audio_sr * svc_model.target_sample))
            segments.append(np.zeros(length))
            continue

        # The model is handed an in-memory WAV file; encode it only for
        # segments that are actually inferred.
        raw_path = io.BytesIO()
        soundfile.write(raw_path, data, audio_sr, format="wav")
        raw_path.seek(0)
        out_audio, _ = svc_model.infer(spk, tran, raw_path)
        segments.append(out_audio.cpu().numpy())

    # np.concatenate rejects an empty sequence, so fall back to an empty
    # array when the slicer produced no segments.
    audio = np.concatenate(segments) if segments else np.zeros(0)

    infer_tool.mkdir(["./vits_results"])
    # The random UUID keeps repeated calls from overwriting each other.
    res_path = f'./vits_results/{tran}key_{spk}_{uuid.uuid4()}.{wav_format}'
    soundfile.write(res_path, audio, svc_model.target_sample, format=wav_format)
    return res_path