import torch
import librosa
from openvoice.mel_processing import spectrogram_torch
from whisper_timestamped.transcribe import get_audio_tensor, get_vad_segments


@torch.no_grad()
def se_extractor(audio_path, vc):
    """Extract a speaker (tone color) embedding from an audio file.

    Runs voice activity detection to find speech regions, computes a linear
    spectrogram for each region, feeds it through the reference encoder of
    the tone color converter ``vc``, and averages the per-segment embeddings.
    """
    # Voice activity detection: whisper-timestamped's silero VAD operates on
    # 16 kHz audio and returns segment boundaries in samples.
    SAMPLE_RATE = 16000
    audio_vad = get_audio_tensor(audio_path)
    segments = get_vad_segments(
        audio_vad,
        output_sample=True,
        min_speech_duration=0.1,
        min_silence_duration=1,
        method="silero",
    )
    # Convert segment boundaries from samples to seconds.
    segments = [(seg["start"], seg["end"]) for seg in segments]
    segments = [(float(s) / SAMPLE_RATE, float(e) / SAMPLE_RATE) for s, e in segments]

    # If the VAD finds no speech, fall back to using the whole recording.
    if len(segments) == 0:
        segments = [(0, len(audio_vad) / SAMPLE_RATE)]
        print(segments)

    # Speaker embedding: run each speech segment through the reference encoder
    # of the tone color converter.
    hps = vc.hps
    device = vc.device
    model = vc.model
    gs = []

    # Reload the audio at the converter's sampling rate for spectrogram extraction.
    audio, _ = librosa.load(audio_path, sr=hps.data.sampling_rate)
    audio = torch.tensor(audio).float().to(device)

    for s, e in segments:
        # Slice out the segment and compute its linear spectrogram.
        y = audio[int(hps.data.sampling_rate * s):int(hps.data.sampling_rate * e)]
        y = y.to(device)
        y = y.unsqueeze(0)
        y = spectrogram_torch(y, hps.data.filter_length,
                              hps.data.sampling_rate, hps.data.hop_length, hps.data.win_length,
                              center=False).to(device)
        # The reference encoder expects (batch, time, freq) and yields one
        # embedding per segment.
        g = model.ref_enc(y.transpose(1, 2)).unsqueeze(-1)
        gs.append(g.detach())

    # Average the per-segment embeddings into a single speaker embedding.
    gs = torch.stack(gs).mean(0)
    return gs.cpu()
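

# A minimal usage sketch, not part of the original file: it assumes the
# OpenVoice ToneColorConverter API (which exposes the .hps, .device, and
# .model attributes used by se_extractor above). The checkpoint and audio
# paths below are placeholders.
if __name__ == "__main__":
    from openvoice.api import ToneColorConverter

    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    converter = ToneColorConverter("checkpoints_v2/converter/config.json", device=device)
    converter.load_ckpt("checkpoints_v2/converter/checkpoint.pth")

    # Extract the tone color embedding of a reference speaker.
    target_se = se_extractor("reference.wav", converter)
    print(target_se.shape)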