"""Gradio demo: Soft-VC voice conversion to Widowmaker's voice (CPU-only).

Pipeline: HuBERT-Soft (speech units) -> acoustic model (mel spectrogram)
-> HiFiGAN vocoder (waveform), all running on CPU at 16 kHz.
"""

import time

import gradio as gr
import torch
import torchaudio
from torch.nn.modules.utils import consume_prefix_in_state_dict_if_present

from acoustic import AcousticModel
from hifigan.generator import HifiganGenerator

# --- Model setup (CPU) -------------------------------------------------------

# HuBERT-Soft content encoder, fetched from torch.hub.
hubert = torch.hub.load("bshall/hubert:main", "hubert_soft").cpu()

# Acoustic model: speech units -> mel spectrogram.
acoustic = AcousticModel(False, True)
checkpoint = torch.load("models/acoustic-model-best.pt", map_location=torch.device('cpu'))
# Checkpoints saved from (Distributed)DataParallel training carry a "module." prefix.
consume_prefix_in_state_dict_if_present(checkpoint["acoustic-model"], "module.")
acoustic.load_state_dict(checkpoint["acoustic-model"])
acoustic.eval()

# HiFiGAN vocoder: mel spectrogram -> waveform.
hifigan = HifiganGenerator()
checkpoint = torch.load("models/hifigan-model-best.pt", map_location=torch.device('cpu'))
consume_prefix_in_state_dict_if_present(checkpoint["generator"]["model"], "module.")
hifigan.load_state_dict(checkpoint["generator"]["model"])
hifigan.eval()


def run_conversion(audio_in):
    """Convert an input recording to Widowmaker's voice.

    Parameters
    ----------
    audio_in : tuple[int, numpy.ndarray]
        ``(sample_rate, samples)`` pair as produced by ``gr.Audio``.
        Samples may be mono ``(n,)`` or multi-channel ``(n, channels)``.

    Returns
    -------
    tuple[int, numpy.ndarray]
        ``(16000, waveform)`` where the waveform is int16, suitable for a
        ``gr.Audio`` output component.
    """
    sr, source = audio_in
    source = torch.Tensor(source)
    # NOTE(review): gr.Audio typically yields int16 samples, and they are fed
    # to HuBERT without scaling to [-1, 1] while the output is re-scaled by
    # 32767 below — confirm this matches how the models were trained before
    # changing it.
    if source.dim() == 1:
        # Mono input arrives as (n,); give it an explicit channel axis.
        source = source.unsqueeze(1)
    source = source.T  # (n, channels) -> (channels, n)

    # Resample to the 16 kHz rate the pipeline operates at.
    source = torchaudio.functional.resample(source, sr, 16000)
    # Downmix to mono, then add batch/channel dims: final shape (1, 1, n).
    source = torch.mean(source, dim=0).unsqueeze(0)
    source = source.unsqueeze(0)

    with torch.inference_mode():
        time_start = time.perf_counter()
        # Extract speech units
        units = hubert.units(source)
        # Generate target spectrogram
        mel = acoustic.generate(units).transpose(1, 2)
        # Generate audio waveform
        target = hifigan(mel)
        # Scale the (assumed [-1, 1]) float waveform back to int16 range.
        result = target.squeeze().cpu().multiply(32767).to(torch.int16).numpy()
        time_end = time.perf_counter()
        time_elapsed = time_end - time_start
        print(f"Conversion finished in {time_elapsed} Seconds")
        return (16000, result)


# --- UI ----------------------------------------------------------------------

with gr.Blocks() as demo:
    with gr.Column():
        gr.Markdown(
            """
# Soft-VC | Widowmaker
This is a [Soft-VC model](https://github.com/bshall/soft-vc) trained on Widowmaker from Overwatch, allowing the conversion of any voice to Widowmaker's voice.
While lower quality (16kHz), it captures the character fairly well, imo. For a multi-speaker model, check out my [sovits-overwatch2](https://huggingface.co/spaces/cjayic/sovits-overwatch2) space!
The acoustic model has been trained for around 100k iterations, the HiFiGAN-Model for around 150k iterations. Quality could likely be improved by training the HiFiGAN further.
            """
        )
    with gr.Column():
        with gr.Tab("Upload Audio File"):
            with gr.Column():
                input_audio = gr.Audio(
                    label="Audio to be converted",
                ).style(
                    container=False,
                )
                btn_upload = gr.Button("Widowify", variant="primary").style(full_width=True)
        with gr.Tab("Record Audio"):
            with gr.Column():
                input_audio_record = gr.Audio(
                    label="Audio to be converted",
                    source="microphone"
                ).style(
                    container=False,
                )
                btn_rec = gr.Button("Widowify", variant="primary").style(full_width=True)
        with gr.Row():
            output_audio = gr.Audio(
                label="Converted Audio",
                elem_id="output_audio",
                interactive=False
            ).style(height="auto")

    # Both tabs feed the same conversion function and output component.
    btn_upload.click(run_conversion, [input_audio], output_audio)
    btn_rec.click(run_conversion, [input_audio_record], output_audio)

    gr.Examples(
        ["examples/jermacraft.wav", "examples/Mercy_0000000B0F5.wav", "examples/weartie.wav", "examples/gman_02.wav"],
        inputs=[input_audio],
        outputs=[output_audio],
        fn=run_conversion,
        cache_examples=True,
        run_on_click=True
    )

demo.queue()
demo.launch()