# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import torch

from utils.util import pad_mels_to_tensors, pad_f0_to_tensors


def vocoder_inference(cfg, model, mels, f0s=None, device=None, fast_inference=False):
    """Run vocoder inference on a batch of mel-spectrograms.

    Args:
        cfg: Config object; cfg.preprocess.extract_amplitude_phase selects the
            amplitude/phase model path.
        model: The vocoder model.
        mels: A tensor of mel-specs with the shape (batch_size, num_mels, frames).
        f0s: An optional tensor of F0 values aligned with mels.
        device: Device to run inference on.
        fast_inference: Accepted for API compatibility; not used in this function.

    Returns:
        A tensor of audios with the shape (batch_size, seq_len).
    """
    model.eval()

    with torch.no_grad():
        mels = mels.to(device)
        if f0s is not None:
            f0s = f0s.to(device)

        if f0s is None and not cfg.preprocess.extract_amplitude_phase:
            output = model.forward(mels)
        elif cfg.preprocess.extract_amplitude_phase:
            # Amplitude/phase vocoders return several intermediate spectra;
            # the synthesized waveform is the last element of the tuple.
            _, _, _, _, output = model.forward(mels)
        else:
            output = model.forward(mels, f0s)

    # (batch_size, 1, seq_len) -> (batch_size, seq_len), moved to CPU
    return output.squeeze(1).detach().cpu()


def synthesis_audios(cfg, model, mels, f0s=None, batch_size=None, fast_inference=False):
    """Synthesize waveforms from a list of variable-length mel-spectrograms.

    Args:
        cfg: Config object, forwarded to vocoder_inference.
        model: The vocoder model.
        mels: A list of mel-spec tensors, possibly of different lengths.
        f0s: An optional list of F0 tensors aligned with mels.
        batch_size: Number of mels per padded batch.
        fast_inference: Forwarded to vocoder_inference.

    Returns:
        A list of audio tensors, each trimmed to its true length.
    """
    # Infer the device from the model's parameters
    device = next(model.parameters()).device

    audios = []

    # Pad the given list into equal-length batches; mel_frames records the
    # true (unpadded) frame count of every sample so padding can be trimmed
    mel_batches, mel_frames = pad_mels_to_tensors(mels, batch_size)
    if f0s is not None:
        f0_batches = pad_f0_to_tensors(f0s, batch_size)

    if f0s is None:
        for mel_batch, mel_frame in zip(mel_batches, mel_frames):
            for i in range(mel_batch.shape[0]):
                mel = mel_batch[i]
                frame = mel_frame[i]
                audio = vocoder_inference(
                    cfg,
                    model,
                    mel.unsqueeze(0),
                    device=device,
                    fast_inference=fast_inference,
                ).squeeze(0)

                # # Apply fade_out to make the sound more natural
                # fade_out = torch.linspace(
                #     1, 0, steps=20 * model.cfg.preprocess.hop_size
                # ).cpu()

                # Trim the padded tail: each mel frame maps to hop_size samples
                audio_length = frame * model.cfg.preprocess.hop_size
                audio = audio[:audio_length]

                # audio[-20 * model.cfg.preprocess.hop_size :] *= fade_out

                audios.append(audio)
    else:
        for mel_batch, f0_batch, mel_frame in zip(mel_batches, f0_batches, mel_frames):
            for i in range(mel_batch.shape[0]):
                mel = mel_batch[i]
                f0 = f0_batch[i]
                frame = mel_frame[i]
                audio = vocoder_inference(
                    cfg,
                    model,
                    mel.unsqueeze(0),
                    f0s=f0.unsqueeze(0),
                    device=device,
                    fast_inference=fast_inference,
                ).squeeze(0)

                # # Apply fade_out to make the sound more natural
                # fade_out = torch.linspace(
                #     1, 0, steps=20 * model.cfg.preprocess.hop_size
                # ).cpu()

                # Trim the padded tail: each mel frame maps to hop_size samples
                audio_length = frame * model.cfg.preprocess.hop_size
                audio = audio[:audio_length]

                # audio[-20 * model.cfg.preprocess.hop_size :] *= fade_out

                audios.append(audio)

    return audios
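

# ---------------------------------------------------------------------------
# Usage sketch (illustrative only): the dummy vocoder and config below are
# hypothetical stand-ins, not part of Amphion. A real call would load a
# trained vocoder checkpoint and its preprocess config; this sketch only
# demonstrates the tensor shapes vocoder_inference expects and returns.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    from types import SimpleNamespace

    import torch.nn as nn

    num_mels, hop_size = 80, 256

    class _DummyVocoder(nn.Module):
        """Toy mel-to-waveform model: upsamples each frame by hop_size."""

        def __init__(self):
            super().__init__()
            # Mimic the trained model's attached config, which
            # synthesis_audios reads for preprocess.hop_size.
            self.cfg = SimpleNamespace(
                preprocess=SimpleNamespace(hop_size=hop_size)
            )
            self.up = nn.ConvTranspose1d(
                num_mels, 1, kernel_size=hop_size, stride=hop_size
            )

        def forward(self, mels):
            return self.up(mels)  # (batch_size, 1, frames * hop_size)

    cfg = SimpleNamespace(preprocess=SimpleNamespace(extract_amplitude_phase=False))
    model = _DummyVocoder()

    mel = torch.randn(1, num_mels, 100)  # (batch_size, num_mels, frames)
    audio = vocoder_inference(cfg, model, mel, device="cpu")
    print(audio.shape)  # torch.Size([1, 25600]), i.e. frames * hop_size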