import os

import torch
import librosa
from tqdm import tqdm

from openvoice.mel_processing import spectrogram_torch
from whisper_timestamped.transcribe import get_audio_tensor, get_vad_segments

@torch.no_grad()
def se_extractor(audio_path, vc):
    """Extract a speaker (tone color) embedding from an audio file,
    averaged over the voiced segments found by VAD."""
    # whisper_timestamped loads audio at 16 kHz, so VAD offsets are in
    # samples at this rate.
    SAMPLE_RATE = 16000

    # Find voiced regions with the Silero VAD; with output_sample=True the
    # segment boundaries come back in samples rather than seconds.
    audio_vad = get_audio_tensor(audio_path)
    segments = get_vad_segments(
        audio_vad,
        output_sample=True,
        min_speech_duration=0.1,
        min_silence_duration=1,
        method="silero",
    )

    # Convert segment boundaries from samples to seconds.
    segments = [(seg["start"], seg["end"]) for seg in segments]
    segments = [(float(s) / SAMPLE_RATE, float(e) / SAMPLE_RATE) for s, e in segments]

    # Fall back to the whole clip if the VAD found no speech.
    if len(segments) == 0:
        segments = [(0, len(audio_vad) / SAMPLE_RATE)]
    print(segments)
    hps = vc.hps
    device = vc.device
    model = vc.model
    gs = []

    # Reload the audio at the model's own sampling rate for spectrogram extraction.
    audio, sr = librosa.load(audio_path, sr=hps.data.sampling_rate)
    audio = torch.tensor(audio).float().to(device)

    for s, e in segments:
        # Slice out one voiced segment and add a batch dimension.
        y = audio[int(hps.data.sampling_rate * s):int(hps.data.sampling_rate * e)]
        y = y.unsqueeze(0)
        # Linear spectrogram, shape [1, filter_length // 2 + 1, frames].
        y = spectrogram_torch(
            y,
            hps.data.filter_length,
            hps.data.sampling_rate,
            hps.data.hop_length,
            hps.data.win_length,
            center=False,
        ).to(device)
        # The reference encoder expects [batch, frames, bins]; keep one
        # embedding per segment.
        g = model.ref_enc(y.transpose(1, 2)).unsqueeze(-1)
        gs.append(g.detach())

    # Average the per-segment embeddings into a single speaker embedding.
    gs = torch.stack(gs).mean(0)
    return gs.cpu()
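

# Usage sketch (an assumption, not part of the original module): `vc` is
# expected to expose `hps`, `device`, and `model.ref_enc`, which matches
# OpenVoice's ToneColorConverter. The checkpoint paths and reference wav
# below are hypothetical placeholders.
if __name__ == "__main__":
    from openvoice.api import ToneColorConverter

    converter = ToneColorConverter("checkpoints/converter/config.json", device="cpu")
    converter.load_ckpt("checkpoints/converter/checkpoint.pth")
    se = se_extractor("reference.wav", converter)
    print(se.shape)  # e.g. torch.Size([1, gin_channels, 1])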