import os

import torch
import librosa
from tqdm import tqdm

from openvoice.mel_processing import spectrogram_torch
from whisper_timestamped.transcribe import get_audio_tensor, get_vad_segments

@torch.no_grad()
def se_extractor(audio_path, vc):
    """Extract a speaker (tone color) embedding from an audio file,
    averaged over the voiced segments found by VAD."""
    # whisper_timestamped loads audio at 16 kHz, so VAD offsets are in
    # samples at this rate.
    SAMPLE_RATE = 16000

    # Find voiced regions with the Silero VAD; with output_sample=True the
    # segment boundaries come back in samples rather than seconds.
    audio_vad = get_audio_tensor(audio_path)
    segments = get_vad_segments(
        audio_vad,
        output_sample=True,
        min_speech_duration=0.1,
        min_silence_duration=1,
        method="silero",
    )

    # Convert segment boundaries from samples to seconds.
    segments = [(seg["start"], seg["end"]) for seg in segments]
    segments = [(float(s) / SAMPLE_RATE, float(e) / SAMPLE_RATE) for s, e in segments]

    # Fall back to the whole clip if the VAD found no speech.
    if len(segments) == 0:
        segments = [(0, len(audio_vad) / SAMPLE_RATE)]
    print(segments)
    hps = vc.hps
    device = vc.device
    model = vc.model
    gs = []

    # Reload the audio at the model's own sampling rate for spectrogram extraction.
    audio, sr = librosa.load(audio_path, sr=hps.data.sampling_rate)
    audio = torch.tensor(audio).float().to(device)

    for s, e in segments:
        # Slice out one voiced segment and add a batch dimension.
        y = audio[int(hps.data.sampling_rate * s):int(hps.data.sampling_rate * e)]
        y = y.unsqueeze(0)
        # Linear spectrogram, shape [1, filter_length // 2 + 1, frames].
        y = spectrogram_torch(
            y,
            hps.data.filter_length,
            hps.data.sampling_rate,
            hps.data.hop_length,
            hps.data.win_length,
            center=False,
        ).to(device)
        # The reference encoder expects [batch, frames, bins]; keep one
        # embedding per segment.
        g = model.ref_enc(y.transpose(1, 2)).unsqueeze(-1)
        gs.append(g.detach())

    # Average the per-segment embeddings into a single speaker embedding.
    gs = torch.stack(gs).mean(0)
    return gs.cpu()
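

# Usage sketch (an assumption, not part of the original module): `vc` is
# expected to expose `hps`, `device`, and `model.ref_enc`, which matches
# OpenVoice's ToneColorConverter. The checkpoint paths and reference wav
# below are hypothetical placeholders.
if __name__ == "__main__":
    from openvoice.api import ToneColorConverter

    converter = ToneColorConverter("checkpoints/converter/config.json", device="cpu")
    converter.load_ckpt("checkpoints/converter/checkpoint.pth")
    se = se_extractor("reference.wav", converter)
    print(se.shape)  # e.g. torch.Size([1, gin_channels, 1])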