import torch
import torchaudio
import gradio as gr

# Download and load the HuBERT-Soft content encoder (a CUDA GPU is required)
hubert = torch.hub.load("bshall/hubert:main", "hubert_soft", trust_repo=True).cuda()

# Download and load the matching acoustic model (soft-unit variant)
acoustic_model = torch.hub.load("bshall/acoustic-model:main", "hubert_soft", trust_repo=True).cuda()

# Download and load the HiFi-GAN vocoder trained on soft units
vocoder = torch.hub.load("bshall/hifigan:main", "hifigan_hubert_soft", trust_repo=True).cuda()


def voice_conversion(input_audio):
    # Load the input audio from the uploaded file path
    waveform, sample_rate = torchaudio.load(input_audio)

    # Mix down to mono if needed and resample to 16 kHz, the rate the models expect
    if waveform.size(0) > 1:
        waveform = waveform.mean(dim=0, keepdim=True)
    if sample_rate != 16000:
        waveform = torchaudio.functional.resample(waveform, sample_rate, 16000)

    # Add a batch dimension: (1, 1, num_samples)
    source = waveform.unsqueeze(0).cuda()

    # Run the conversion pipeline: soft units -> mel spectrogram -> waveform
    with torch.inference_mode():
        units = hubert.units(source)
        mel_spec = acoustic_model.generate(units).transpose(1, 2)
        audio_out = vocoder(mel_spec)

    # Save the converted audio (the vocoder outputs 16 kHz audio)
    output_path = "output.wav"
    torchaudio.save(output_path, audio_out.squeeze(0).cpu(), 16000)
    return output_path


# Define the Gradio interface (uses the current gr.Audio component API)
iface = gr.Interface(
    fn=voice_conversion,
    inputs=gr.Audio(type="filepath"),
    outputs=gr.Audio(type="filepath"),
    title="Voice Conversion Demo",
    description="Upload an audio file to convert its voice using HuBERT soft units, an acoustic model, and a HiFi-GAN vocoder.",
)

# Launch the interface
iface.launch()
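
# Optional sanity check (a sketch, not part of the original demo): to exercise the
# pipeline without the web UI, comment out `iface.launch()` above and call the
# function directly. The filename "sample.wav" is hypothetical, used only for
# illustration; any WAV file works since the function resamples to 16 kHz itself.
#
#     result = voice_conversion("sample.wav")
#     print("Converted audio written to", result)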