Spaces:

yl12053
/

so-vits-4.1-Rice-Shower

Running

so-vits-4.1-Rice-Shower / vencoder /WhisperPPGLarge.py

bfd2c22 11 months ago

No virus

1.18 kB

	from vencoder.encoder import SpeechEncoder
	import torch

	from vencoder.whisper.model import Whisper, ModelDimensions
	from vencoder.whisper.audio import pad_or_trim, log_mel_spectrogram


	class WhisperPPGLarge(SpeechEncoder):
	    """Speech encoder that uses a Whisper model's audio encoder as a feature extractor.

	    The checkpoint is loaded once at construction time; ``encoder`` then maps
	    a waveform to a ``(1, feature_dim, frames)`` tensor on ``self.dev``.
	    """

	    def __init__(self, vec_path="pretrain/large-v2.pt", device=None):
	        """Load the Whisper checkpoint and move the model to the target device.

	        Args:
	            vec_path: Path to a checkpoint holding ``"dims"`` and
	                ``"model_state_dict"`` entries.
	            device: Anything accepted by ``torch.device``. When ``None``,
	                CUDA is used if available, otherwise CPU.
	        """
	        if device is None:
	            self.dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")
	        else:
	            self.dev = torch.device(device)

	        # Map storages onto the *resolved* device rather than the raw
	        # argument: with device=None, torch.load would try to restore tensors
	        # to the device they were saved from, which fails for a CUDA-saved
	        # checkpoint on a CPU-only host.
	        checkpoint = torch.load(vec_path, map_location=self.dev)
	        dims = ModelDimensions(**checkpoint["dims"])
	        model = Whisper(dims)
	        model.load_state_dict(checkpoint["model_state_dict"])

	        # NOTE(review): hidden_dim stores the whole ModelDimensions object,
	        # not an integer width. Kept unchanged because callers are not
	        # visible from here -- confirm what they expect.
	        self.hidden_dim = dims
	        self.model = model.to(self.dev)

	    def encoder(self, wav):
	        """Encode a waveform into Whisper encoder features.

	        Args:
	            wav: Audio samples indexed along axis 0; one output frame is kept
	                per 320 input samples. Presumably a 1-D 16 kHz signal --
	                TODO confirm against the caller.

	        Returns:
	            A float32 tensor on ``self.dev`` of shape
	            ``(1, feature_dim, frames)``, where ``frames`` is at most
	            ``wav.shape[0] // 320``.
	        """
	        # Number of frames that correspond to real audio rather than to the
	        # padding added below.
	        frame_count = wav.shape[0] // 320

	        mel = log_mel_spectrogram(pad_or_trim(wav)).to(self.dev)
	        with torch.no_grad():
	            # Drop only the batch axis added by unsqueeze(0); a bare squeeze()
	            # would also remove any other axis that happens to have size 1.
	            features = self.model.encoder(mel.unsqueeze(0)).squeeze(0)
	            # Slice and cast on the device instead of round-tripping through
	            # NumPy on the host. copy=True yields an independent tensor so the
	            # full padded encoder output can be freed.
	            ppg = features[:frame_count].to(dtype=torch.float32, copy=True)

	        # (frames, feature_dim) -> (1, feature_dim, frames)
	        return ppg[None, :, :].transpose(1, 2)