Spaces:
Running
Running
| # ================================================================================================== | |
| # DEEPFAKE AUDIO - vocoder/inference.py (Neural Waveform Synthesizer) | |
| # ================================================================================================== | |
| # | |
| # π DESCRIPTION | |
| # This module provides the high-level API for vocoder inference. It encapsulates | |
| # the WaveRNN model, handles hardware acceleration (CUDA), and provides the | |
| # entry point for transforming Mel-Spectrograms into audible speech waveforms. | |
| # | |
| # π€ AUTHORS | |
| # - Amey Thakur (https://github.com/Amey-Thakur) | |
| # - Mega Satish (https://github.com/msatmod) | |
| # | |
| # π€π» CREDITS | |
| # Original Real-Time Voice Cloning methodology by CorentinJ | |
| # Repository: https://github.com/CorentinJ/Real-Time-Voice-Cloning | |
| # | |
| # π PROJECT LINKS | |
| # Repository: https://github.com/Amey-Thakur/DEEPFAKE-AUDIO | |
| # Video Demo: https://youtu.be/i3wnBcbHDbs | |
| # Research: https://github.com/Amey-Thakur/DEEPFAKE-AUDIO/blob/main/DEEPFAKE-AUDIO.ipynb | |
| # | |
| # π LICENSE | |
| # Released under the MIT License | |
| # Release Date: 2021-02-06 | |
| # ================================================================================================== | |
| from vocoder.models.fatchord_version import WaveRNN | |
| from vocoder import hparams as hp | |
| import torch | |
| _model = None # Global singleton for the active WaveRNN instance | |
| def load_model(weights_fpath, verbose=True): | |
| """Neural Wake-up: Initializes the WaveRNN architecture and loads pre-trained weights.""" | |
| global _model, _device | |
| if verbose: | |
| print("Building Wave-RNN Architecture...") | |
| _model = WaveRNN( | |
| rnn_dims=hp.voc_rnn_dims, | |
| fc_dims=hp.voc_fc_dims, | |
| bits=hp.bits, | |
| pad=hp.voc_pad, | |
| upsample_factors=hp.voc_upsample_factors, | |
| feat_dims=hp.num_mels, | |
| compute_dims=hp.voc_compute_dims, | |
| res_out_dims=hp.voc_res_out_dims, | |
| res_blocks=hp.voc_res_blocks, | |
| hop_length=hp.hop_length, | |
| sample_rate=hp.sample_rate, | |
| mode=hp.voc_mode | |
| ) | |
| # Hardware Optimization: Prefer CUDA for high-performance generation | |
| if torch.cuda.is_available(): | |
| _model = _model.cuda() | |
| _device = torch.device('cuda') | |
| else: | |
| _device = torch.device('cpu') | |
| if verbose: | |
| print("Loading model weights from: %s" % weights_fpath) | |
| checkpoint = torch.load(weights_fpath, map_location=_device, weights_only=False) | |
| _model.load_state_dict(checkpoint['model_state']) | |
| _model.eval() | |
| def is_loaded(): | |
| """Status Check: Verifies if the vocoder is initialized in memory.""" | |
| return _model is not None | |
| def infer_waveform(mel, normalize=True, batched=True, target=8000, overlap=800, progress_callback=None): | |
| """ | |
| Waveform Synthesis Phase: | |
| Transforms a Mel-Spectrogram into a time-domain waveform using neural vocoding. | |
| :param mel: Mel-Spectrogram input (numpy array) | |
| :param normalize: Whether to scale the input spectrogram | |
| :param batched: Use parallel generation for speed | |
| :param target: Chunk size for batched synthesis | |
| :param overlap: Samples used for smooth blending between chunks | |
| :return: Synthesized waveform (numpy array) | |
| """ | |
| if _model is None: | |
| raise Exception("Operational Error: Wave-RNN must be loaded before inference.") | |
| if normalize: | |
| mel = mel / hp.mel_max_abs_value | |
| mel = torch.from_numpy(mel[None, ...]) | |
| wav = _model.generate(mel, batched, target, overlap, hp.mu_law, progress_callback) | |
| return wav | |