import spaces
import gradio as gr
import os
from pyannote.audio import Pipeline

# instantiate the pipeline
try:
    pipeline = Pipeline.from_pretrained(
        "pyannote/speaker-diarization-3.1",
        use_auth_token=os.environ["api"]
    )
except Exception as e:
    print(f"Error initializing pipeline: {e}")
    pipeline = None


def save_audio(audio):
    if pipeline is None:
        return "Error: Pipeline not initialized"

    # Read the uploaded audio file as bytes
    with open(audio, "rb") as f:
        audio_data = f.read()

    # Save the uploaded audio file to a temporary location
    with open("temp.wav", "wb") as f:
        f.write(audio_data)

    return "temp.wav"

@spaces.GPU
def diarize_audio(temp_file, num_speakers, min_speakers, max_speakers):
    if pipeline is None:
        return "Error: Pipeline not initialized"

    try:
        params = {}
        if num_speakers > 0:
            params["num_speakers"] = num_speakers
        if min_speakers > 0:
            params["min_speakers"] = min_speakers
        if max_speakers > 0:
            params["max_speakers"] = max_speakers

        diarization = pipeline(temp_file, **params)
    except Exception as e:
        return f"Error processing audio: {e}"

    # Remove the temporary file
    os.remove(temp_file)

    # Return the diarization output
    return str(diarization)

with gr.Blocks() as demo:
    audio_input = gr.Audio(type="filepath", label="Upload Audio")  
    num_speakers_input = gr.Number(label="Number of Speakers", value=0)
    min_speakers_input = gr.Number(label="Minimum Number of Speakers", value=0)
    max_speakers_input = gr.Number(label="Maximum Number of Speakers", value=0)
    process_button = gr.Button("Process")
    diarization_output = gr.Textbox(label="Diarization Output")

    process_button.click(
        fn=lambda audio, num_speakers, min_speakers, max_speakers: 
            diarize_audio(save_audio(audio), num_speakers, min_speakers, max_speakers),
        inputs=[audio_input, num_speakers_input, min_speakers_input, max_speakers_input],
        outputs=diarization_output
    )

demo.launch()