import torch
import torchaudio
import gradio as gr

# Download and load the HuBERT-Soft content encoder (a CUDA GPU is required)
hubert = torch.hub.load("bshall/hubert:main", "hubert_soft", trust_repo=True).cuda()

# Download and load the matching acoustic model (soft-unit variant)
acoustic_model = torch.hub.load("bshall/acoustic-model:main", "hubert_soft", trust_repo=True).cuda()

# Download and load the HiFi-GAN vocoder trained on soft units
vocoder = torch.hub.load("bshall/hifigan:main", "hifigan_hubert_soft", trust_repo=True).cuda()


def voice_conversion(input_audio):
    # Load the input audio from the uploaded file path
    waveform, sample_rate = torchaudio.load(input_audio)

    # Mix down to mono if needed and resample to 16 kHz, the rate the models expect
    if waveform.size(0) > 1:
        waveform = waveform.mean(dim=0, keepdim=True)
    if sample_rate != 16000:
        waveform = torchaudio.functional.resample(waveform, sample_rate, 16000)

    # Add a batch dimension: (1, 1, num_samples)
    source = waveform.unsqueeze(0).cuda()

    # Run the conversion pipeline: soft units -> mel spectrogram -> waveform
    with torch.inference_mode():
        units = hubert.units(source)
        mel_spec = acoustic_model.generate(units).transpose(1, 2)
        audio_out = vocoder(mel_spec)

    # Save the converted audio (the vocoder outputs 16 kHz audio)
    output_path = "output.wav"
    torchaudio.save(output_path, audio_out.squeeze(0).cpu(), 16000)
    return output_path


# Define the Gradio interface (uses the current gr.Audio component API)
iface = gr.Interface(
    fn=voice_conversion,
    inputs=gr.Audio(type="filepath"),
    outputs=gr.Audio(type="filepath"),
    title="Voice Conversion Demo",
    description="Upload an audio file to convert its voice using HuBERT soft units, an acoustic model, and a HiFi-GAN vocoder.",
)

# Launch the interface
iface.launch()
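
# Optional sanity check (a sketch, not part of the original demo): to exercise the
# pipeline without the web UI, comment out `iface.launch()` above and call the
# function directly. The filename "sample.wav" is hypothetical, used only for
# illustration; any WAV file works since the function resamples to 16 kHz itself.
#
#     result = voice_conversion("sample.wav")
#     print("Converted audio written to", result)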