import torch
import torchaudio
import numpy as np

from decoder_base import AcousticModel


class InferencePipeline():
    """Voice-conversion pipeline: content encoder -> acoustic model -> vocoder.

    All components are loaded once at construction time. Checkpoints are
    mapped to the CPU and no tensor is moved to another device, so inference
    runs on the CPU.
    """

    # Sample rate (Hz) the source audio is resampled to and the converted
    # audio is saved at.
    SAMPLE_RATE = 16000

    def __init__(self, ckpts_path='model-best.pt', spk_emb_path='merkel.npy'):
        """Load the encoder, acoustic model, vocoder and target speaker.

        Args:
            ckpts_path: Path to the acoustic-model checkpoint. The file must
                contain the state dict under the ``'acoustic-model'`` key.
            spk_emb_path: Path to a ``.npy`` file holding the target speaker
                embedding.
        """
        # Download the HuBERT content encoder.
        self.hubert = torch.hub.load(
            "bshall/hubert:main", "hubert_soft", trust_repo=True
        )

        # Initialize the decoder from its checkpoint.
        # NOTE(security): torch.load unpickles the file; only point
        # `ckpts_path` at checkpoints from a trusted source.
        self.model = AcousticModel()
        cp = torch.load(ckpts_path, map_location=torch.device('cpu'))
        self.model.load_state_dict(cp['acoustic-model'])

        # Download the vocoder.
        self.hifigan = torch.hub.load(
            "bshall/hifigan:main",
            "hifigan_hubert_soft",
            trust_repo=True,
            map_location=torch.device('cpu'),
        )

        # Load the target speaker embedding and add a leading batch dimension.
        spk_emb = torch.from_numpy(np.load(spk_emb_path))
        self.trg_spk_emb = spk_emb.unsqueeze(0)

    def voice_conversion(self, audio_data, output_audio_path="output.wav"):
        """Convert the source recording to the target speaker's voice.

        Args:
            audio_data: Path of the source audio file, or a tuple whose first
                element is that path.
            output_audio_path: Where the converted waveform is written.

        Returns:
            The path of the written audio file (``output_audio_path``).
        """
        # Extract the file path from the tuple.
        audio_path = audio_data[0] if isinstance(audio_data, tuple) else audio_data
        print(f"Loading audio from: {audio_path}")

        # torchaudio.load returns a (channels, frames) tensor.
        source, sr = torchaudio.load(audio_path)

        # Downmix multi-channel recordings to mono. Without this a stereo
        # file would reach the encoder as (1, 2, T) instead of (1, 1, T).
        if source.size(0) > 1:
            source = source.mean(dim=0, keepdim=True)

        source = torchaudio.functional.resample(source, sr, self.SAMPLE_RATE)
        source = source.unsqueeze(0)  # add the batch dimension

        # Run inference.
        self.model.eval()
        with torch.inference_mode():
            # Extract speech units.
            units = self.hubert.units(source)
            # Generate the target spectrogram.
            mel = self.model.generate(units, self.trg_spk_emb).transpose(1, 2)
            # Generate the audio waveform.
            target = self.hifigan(mel)

        # Drop the batch dimension and write the waveform to disk.
        torchaudio.save(output_audio_path, target.squeeze(0), self.SAMPLE_RATE)

        return output_audio_path