import torch
import librosa

from openvoice.mel_processing import spectrogram_torch
from whisper_timestamped.transcribe import get_audio_tensor, get_vad_segments


@torch.no_grad()
def se_extractor(audio_path, vc):
    """Extract a speaker (tone color) embedding from a reference audio file.

    Runs Silero VAD to find speech segments, then averages the reference
    encoder's embedding over all detected segments.
    """
    # --- VAD: find speech segments (whisper-timestamped loads audio at 16 kHz) ---
    SAMPLE_RATE = 16000
    audio_vad = get_audio_tensor(audio_path)
    segments = get_vad_segments(
        audio_vad,
        output_sample=True,
        min_speech_duration=0.1,
        min_silence_duration=1,
        method="silero",
    )
    # Convert sample offsets to seconds.
    segments = [(float(seg["start"]) / SAMPLE_RATE, float(seg["end"]) / SAMPLE_RATE)
                for seg in segments]
    # Fall back to the whole clip if VAD found no speech.
    if len(segments) == 0:
        segments = [(0, len(audio_vad) / SAMPLE_RATE)]

    # --- Speaker embedding: run the reference encoder on each segment ---
    hps = vc.hps
    device = vc.device
    model = vc.model
    gs = []

    # Reload at the model's sampling rate (may differ from the 16 kHz used for VAD).
    audio, _ = librosa.load(audio_path, sr=hps.data.sampling_rate)
    audio = torch.tensor(audio).float().to(device)

    for s, e in segments:
        # Slice the segment out of the full waveform and add a batch dim: (1, T).
        y = audio[int(hps.data.sampling_rate * s):int(hps.data.sampling_rate * e)]
        y = y.unsqueeze(0)
        spec = spectrogram_torch(
            y,
            hps.data.filter_length,
            hps.data.sampling_rate,
            hps.data.hop_length,
            hps.data.win_length,
            center=False,
        ).to(device)
        # Per-segment embedding of shape (1, gin_channels, 1).
        g = model.ref_enc(spec.transpose(1, 2)).unsqueeze(-1)
        gs.append(g.detach())

    # Average over segments to get a single speaker embedding.
    gs = torch.stack(gs).mean(0)
    return gs.cpu()
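

if __name__ == "__main__":
    # Minimal usage sketch, not part of the original module: the checkpoint
    # paths and the reference wav below are placeholders. It assumes the `vc`
    # argument is OpenVoice's ToneColorConverter, whose .hps/.device/.model
    # attributes are exactly what se_extractor reads above.
    from openvoice.api import ToneColorConverter

    device = "cuda" if torch.cuda.is_available() else "cpu"
    converter = ToneColorConverter("checkpoints_v2/converter/config.json", device=device)
    converter.load_ckpt("checkpoints_v2/converter/checkpoint.pth")

    se = se_extractor("reference.wav", converter)
    print(se.shape)  # (1, gin_channels, 1) speaker embedding tensor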