| | import torch |
| | import torchaudio |
| | import numpy as np |
| | from decoder_base import AcousticModel |
| |
|
class InferencePipeline:
    """Any-to-one voice conversion pipeline (Soft-VC style).

    Stages: HuBERT-soft content encoder -> acoustic model (units + speaker
    embedding -> mel spectrogram) -> HiFi-GAN vocoder, targeting a single
    fixed speaker embedding loaded at construction time.
    """

    def __init__(
        self,
        ckpt_path='model-best.pt',
        spk_emb_path='content/vctk/spk_emb/p226/p226_322_mic1.npy',
    ):
        """Load all three models and the target speaker embedding.

        Args:
            ckpt_path: local checkpoint holding the acoustic model weights
                under the 'acoustic-model' key.
            spk_emb_path: .npy file with the target speaker's embedding.
        """
        # Content encoder: extracts (presumably speaker-independent) soft
        # speech units from input audio.
        self.hubert = torch.hub.load(
            "bshall/hubert:main", "hubert_soft", trust_repo=True
        )

        # Acoustic model, restored on CPU from a local checkpoint.
        self.model = AcousticModel()
        checkpoint = torch.load(ckpt_path, map_location=torch.device('cpu'))
        self.model.load_state_dict(checkpoint['acoustic-model'])

        # Vocoder: mel spectrogram -> waveform.
        self.hifigan = torch.hub.load(
            "bshall/hifigan:main",
            "hifigan_hubert_soft",
            trust_repo=True,
            map_location=torch.device('cpu'),
        )

        # Target speaker embedding with a leading batch dimension so it can
        # be passed to the acoustic model alongside a batched unit sequence.
        self.trg_spk_emb = torch.from_numpy(np.load(spk_emb_path)).unsqueeze(0)

    def voice_conversion(self, audio_file_path, output_audio_path="output.wav"):
        """Convert the speaker identity of an utterance to the target speaker.

        Args:
            audio_file_path: input audio, passed straight to
                ``self.hubert.units`` (whatever form that hub model accepts).
            output_audio_path: where to write the converted WAV.

        Returns:
            The path of the written WAV file.
        """
        self.model.eval()
        with torch.inference_mode():
            units = self.hubert.units(audio_file_path)
            # generate() output is transposed so the mel axis precedes time,
            # matching the layout the vocoder consumes.
            mel = self.model.generate(units, self.trg_spk_emb).transpose(1, 2)
            target = self.hifigan(mel)

        # NOTE(review): HiFi-GAN generators conventionally emit a 3-D
        # (batch, 1, time) tensor, but torchaudio.save requires a 2-D
        # (channels, time) tensor; the original passed the raw output through,
        # which torchaudio.save rejects. Dropping the batch dimension yields a
        # valid mono (1, time) tensor — confirm against the hub model's
        # actual output shape.
        torchaudio.save(
            output_audio_path, target.squeeze(0).cpu(), sample_rate=16000
        )
        return output_audio_path