from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
import gradio as gr
import torch

# Global variables to hold the model, processor, and pipeline after first load
model = None
processor = None
asr_pipeline = None


def load_model():
    """Lazily load the model, processor, and ASR pipeline on first use."""
    global model, processor, asr_pipeline

    # Set up device and data type for torch based on GPU availability
    device = "cuda" if torch.cuda.is_available() else "cpu"
    torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

    model_id = "distil-whisper/distil-large-v3"

    if model is None:
        model = AutoModelForSpeechSeq2Seq.from_pretrained(
            model_id,
            torch_dtype=torch_dtype,
            low_cpu_mem_usage=True,
            use_safetensors=True,
        )
        model.to(device)

    if processor is None:
        processor = AutoProcessor.from_pretrained(model_id)

    if asr_pipeline is None:
        asr_pipeline = pipeline(
            "automatic-speech-recognition",
            model=model,
            tokenizer=processor.tokenizer,
            feature_extractor=processor.feature_extractor,
            device=device,
            torch_dtype=torch_dtype,
        )


def transcribe_speech(filepath):
    # Ensure the model, processor, and pipeline are loaded
    load_model()
    if filepath is None:
        return "No audio provided."
    # With gr.Audio(type="filepath"), Gradio passes a plain file path string.
    # The pipeline handles audio loading and feature extraction itself,
    # so there is no need to call the processor manually.
    result = asr_pipeline(filepath)
    return result["text"]


# Building the Gradio app
with gr.Blocks() as demo:
    with gr.Tab("Transcribe Audio"):
        with gr.Row():
            audio_input = gr.Audio(type="filepath", label="Upload audio file or record")
        with gr.Row():
            audio_output = gr.Textbox(label="Transcription")
        transcribe_button = gr.Button("Transcribe")
        # Wire the button to the transcription function (Blocks has no
        # add_callback method; event listeners like .click are the standard way)
        transcribe_button.click(transcribe_speech, inputs=[audio_input], outputs=[audio_output])

# Launch the app
demo.launch(share=True)
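
# Optional sanity check: a minimal sketch for testing transcription without the
# UI, assuming a local audio file exists at the hypothetical path "sample.wav"
# (any format librosa/ffmpeg can decode works; the pipeline resamples to 16 kHz).
# Uncomment to run from the command line instead of the Gradio app:
#
# if __name__ == "__main__":
#     print(transcribe_speech("sample.wav"))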