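# Spaces: Running
# NOTE(review): the line above appears to be page-header residue captured with
# the file; it is kept as a comment because it is not part of the program.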
import sys, os | |
sys.path.append(os.path.dirname(os.path.abspath(__file__))) | |
import torch | |
import librosa | |
import argparse | |
import numpy as np | |
from scipy.io.wavfile import write | |
from vad.utils import init_jit_model, get_speech_timestamps | |
def load_audio(file: str, sr: int = 16000):
    """Load an audio file with librosa and return only its samples.

    Args:
        file: Path of the audio file, handed to ``librosa.load`` unchanged.
        sr: Sampling rate passed to ``librosa.load`` as its ``sr`` argument.

    Returns:
        The first element of the ``librosa.load`` result (the sample data);
        the sampling rate that librosa reports back is discarded.
    """
    samples, _reported_rate = librosa.load(file, sr=sr)
    return samples
# Rate at which the reference audio is loaded and handed to the VAD model.
REF_SAMPLE_RATE = 16000
# Rate at which the svc audio is loaded and at which the result is written.
SVC_SAMPLE_RATE = 32000
# Threshold forwarded to get_speech_timestamps.
VAD_THRESHOLD = 0.2
# Location of the VAD model, relative to the working or script directory.
VAD_MODEL_PATH = os.path.join('vad/assets', 'silero_vad.jit')


def resolve_vad_model_path(relative_path=VAD_MODEL_PATH):
    """Return a usable path for the VAD model file.

    The path is tried as given first (i.e. relative to the current working
    directory), so existing invocations keep loading exactly the same file.
    If it does not exist there, the directory containing this script is
    tried instead, which lets the script be launched from another directory.

    Args:
        relative_path: Model path to look for.

    Returns:
        The first existing candidate, or ``relative_path`` unchanged when
        neither exists so the loader reports the missing file itself.
    """
    if os.path.exists(relative_path):
        return relative_path
    script_dir = os.path.dirname(os.path.abspath(__file__))
    beside_script = os.path.join(script_dir, relative_path)
    if os.path.exists(beside_script):
        return beside_script
    return relative_path


def build_speech_mask(num_samples, segments, upsample=1):
    """Build a boolean per-sample mask that is True inside the segments.

    Args:
        num_samples: Length of the audio the segments refer to.
        segments: Iterable of mappings whose ``"start"`` and ``"end"``
            entries are used as slice bounds (end exclusive) into that audio.
        upsample: Number of times each mask entry is repeated, used to
            stretch the mask to a higher sampling rate.

    Returns:
        A 1-D boolean array of length ``num_samples * upsample``.
    """
    mask = np.zeros(num_samples, dtype=bool)
    for segment in segments:
        mask[segment["start"]:segment["end"]] = True
    return np.repeat(mask, upsample)


def mute_non_speech(wave, mask):
    """Zero every sample of ``wave`` whose mask entry is False.

    Both inputs are first cut to their common length. The kept part of
    ``wave`` is modified in place and returned.

    Args:
        wave: 1-D array of audio samples.
        mask: 1-D boolean array, True where the audio is to be kept.

    Returns:
        ``wave`` trimmed to ``min(len(wave), len(mask))`` with the samples
        outside the mask set to 0.
    """
    usable = min(len(mask), len(wave))
    trimmed = wave[:usable]
    trimmed[~mask[:usable]] = 0
    return trimmed


def main():
    """Silence the parts of the svc audio where the reference has no speech.

    Reads ``--ref`` and ``--svc``, runs the VAD model on the reference to
    get speech segments, zeroes the svc samples outside those segments and
    writes the result to ``--out``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--ref', type=str, required=True,
                        help="Path of ref audio.")
    parser.add_argument('--svc', type=str, required=True,
                        help="Path of svc audio.")
    parser.add_argument('--out', type=str, required=True,
                        help="Path of out audio.")
    args = parser.parse_args()

    print("svc in wave :", args.ref)
    print("svc out wave :", args.svc)
    print("svc post wave :", args.out)

    model = init_jit_model(resolve_vad_model_path())
    model.eval()

    ref_wave = load_audio(args.ref, sr=REF_SAMPLE_RATE)
    # squeeze(0) is kept from the original call; presumably a no-op for
    # mono (1-D) audio - TODO confirm against load_audio's output shape.
    segments = get_speech_timestamps(
        torch.from_numpy(ref_wave).squeeze(0), model,
        threshold=VAD_THRESHOLD, sampling_rate=REF_SAMPLE_RATE)

    # The segments index the reference audio; repeating each mask entry
    # stretches the mask to the sampling rate of the svc audio.
    mask = build_speech_mask(
        len(ref_wave), segments,
        upsample=SVC_SAMPLE_RATE // REF_SAMPLE_RATE)

    svc_wave = load_audio(args.svc, sr=SVC_SAMPLE_RATE)
    write(args.out, SVC_SAMPLE_RATE, mute_non_speech(svc_wave, mask))


if __name__ == '__main__':
    main()