# dreamvoice/openvoice_utils.py
import torch
import librosa
from openvoice.mel_processing import spectrogram_torch
from whisper_timestamped.transcribe import get_audio_tensor, get_vad_segments
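# whisper_timestamped supplies the Silero VAD helpers used below;
# openvoice supplies the spectrogram front-end its reference encoder expects.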


@torch.no_grad()
def se_extractor(audio_path, vc):
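    """Extract an averaged speaker embedding from a reference audio file.

    Args:
        audio_path: path to the reference audio file.
        vc: an OpenVoice ToneColorConverter-style object exposing
            ``hps``, ``device``, and ``model`` (with a ``ref_enc`` module).

    Returns:
        A CPU tensor holding the speaker embedding, averaged over all
        detected speech segments.
    """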
    # vad: find speech regions so silence does not dilute the embedding
    SAMPLE_RATE = 16000
    audio_vad = get_audio_tensor(audio_path)
    segments = get_vad_segments(
        audio_vad,
        output_sample=True,
        min_speech_duration=0.1,
        min_silence_duration=1,
        method="silero",
    )
    # convert sample indices (at the 16 kHz VAD rate) to seconds
    segments = [(seg["start"], seg["end"]) for seg in segments]
    segments = [(float(s) / SAMPLE_RATE, float(e) / SAMPLE_RATE) for s, e in segments]
    # fall back to the whole clip if no speech was detected
    if len(segments) == 0:
        segments = [(0, len(audio_vad) / SAMPLE_RATE)]
    print(segments)

    # spk: encode each speech segment with the reference encoder, then average
    hps = vc.hps
    device = vc.device
    model = vc.model
    gs = []

    # reload the audio at the sampling rate the converter model expects
    audio, _ = librosa.load(audio_path, sr=hps.data.sampling_rate)
    audio = torch.tensor(audio).float().to(device)
    for s, e in segments:
        # slice out one speech segment (seconds back to samples)
        y = audio[int(hps.data.sampling_rate * s):int(hps.data.sampling_rate * e)]
        y = y.unsqueeze(0)
        # linear spectrogram matching the reference encoder's expected input
        y = spectrogram_torch(
            y,
            hps.data.filter_length,
            hps.data.sampling_rate,
            hps.data.hop_length,
            hps.data.win_length,
            center=False,
        ).to(device)
        g = model.ref_enc(y.transpose(1, 2)).unsqueeze(-1)
        gs.append(g.detach())
    # average per-segment embeddings into a single speaker embedding
    gs = torch.stack(gs).mean(0)
    return gs.cpu()
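

if __name__ == "__main__":
    # Minimal usage sketch. The checkpoint and audio paths below are
    # placeholders (not shipped with this file); ToneColorConverter is
    # OpenVoice's converter class, which exposes the ``hps``, ``device``,
    # and ``model`` attributes accessed above.
    from openvoice.api import ToneColorConverter

    vc = ToneColorConverter('checkpoints/converter/config.json', device='cpu')
    vc.load_ckpt('checkpoints/converter/checkpoint.pth')
    se = se_extractor('reference.wav', vc)
    print(se.shape)  # expected: (1, gin_channels, 1)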