import gradio as gr
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
import torch

# Choose where inference runs: half precision on a CUDA GPU when torch can see
# one, otherwise full precision on the CPU.
if torch.cuda.is_available():
    device, torch_dtype = "cuda", torch.float16
else:
    device, torch_dtype = "cpu", torch.float32

print(f"Using device: {device}, torch_dtype: {torch_dtype}")


# Hugging Face Hub identifier of the checkpoint to load; change this to serve
# a different speech-to-text model.
model_id = "distil-whisper/distil-large-v3"

# Options forwarded to from_pretrained: load the weights in the dtype chosen
# for the device, with low CPU memory usage, from the safetensors files.
_model_load_options = {
    "torch_dtype": torch_dtype,
    "low_cpu_mem_usage": True,
    "use_safetensors": True,
}
model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id, **_model_load_options)
model.to(device)

# The processor bundles the feature extractor and tokenizer for the checkpoint.
processor = AutoProcessor.from_pretrained(model_id)

print(f"Model and processor loaded successfully: {model_id}")


# Shared ASR pipeline, created on first use by _get_asr_pipeline().
_asr_pipeline = None


def _get_asr_pipeline():
    """Return the shared speech-recognition pipeline, building it on first use.

    Constructing a pipeline per request is wasted work, so the instance is
    cached at module level and reused by every call.
    """
    global _asr_pipeline
    if _asr_pipeline is None:
        _asr_pipeline = pipeline(
            "automatic-speech-recognition",
            model=model,
            feature_extractor=processor.feature_extractor,
            tokenizer=processor.tokenizer,
            device=device,
            max_new_tokens=128,
            torch_dtype=torch_dtype,
        )
    return _asr_pipeline


def _decode_audio_file(path, sampling_rate):
    """Decode the audio file at ``path`` into a mono float32 waveform.

    Decoding is delegated to transformers' ``ffmpeg_read`` helper, which needs
    the ``ffmpeg`` executable and resamples to ``sampling_rate``.
    """
    from transformers.pipelines.audio_utils import ffmpeg_read

    with open(path, "rb") as audio_file:
        return ffmpeg_read(audio_file.read(), sampling_rate)


def _to_mono_float32(samples):
    """Convert raw samples to a 1-D float32 waveform.

    Integer PCM is scaled by the dtype's full range (int16 lands in [-1, 1));
    a 2-D ``(samples, channels)`` array is averaged down to one channel.
    """
    import numpy as np

    samples = np.asarray(samples)
    if np.issubdtype(samples.dtype, np.integer):
        full_scale = float(np.iinfo(samples.dtype).max) + 1.0
        samples = samples.astype(np.float32) / full_scale
    else:
        samples = samples.astype(np.float32)
    if samples.ndim > 1:
        samples = samples.mean(axis=1)
    return samples


def transcribe_speech(file_info):
    """Transcribe one audio clip and return the recognised text.

    Args:
        file_info: The audio to transcribe, in any of these forms:
            a path string to an audio file; a ``(sample_rate, samples)`` pair
            of raw audio; a mapping with a ``"path"`` entry; or ``None`` when
            no audio was supplied.

    Returns:
        The transcription text, or ``""`` when there is no audio.

    Raises:
        TypeError: If ``file_info`` is none of the supported forms.
    """
    if isinstance(file_info, dict):
        file_info = file_info.get("path")
    if file_info is None:
        return ""

    # Validate the input before touching the model so bad calls fail cheaply.
    is_path = isinstance(file_info, str)
    is_raw = isinstance(file_info, (tuple, list)) and len(file_info) == 2
    if not (is_path or is_raw):
        raise TypeError(
            "file_info must be a file path, a (sample_rate, samples) pair, "
            f"or a mapping with a 'path' key, not {type(file_info).__name__}"
        )

    asr = _get_asr_pipeline()
    if is_path:
        sampling_rate = asr.feature_extractor.sampling_rate
        waveform = _decode_audio_file(file_info, sampling_rate)
    else:
        sampling_rate = int(file_info[0])
        waveform = _to_mono_float32(file_info[1])

    if waveform.size == 0:
        return ""

    # The duration must come from the waveform itself: the extracted model
    # features are padded to a fixed window and no longer reflect it.
    duration_s = waveform.shape[0] / sampling_rate
    if duration_s > 30:
        chunk_length_s, batch_size = 15, 2
    else:
        chunk_length_s, batch_size = None, 1

    # The dict form tells the pipeline the clip's sampling rate so it can
    # resample if that differs from the rate the model expects.
    result = asr(
        {"raw": waveform, "sampling_rate": sampling_rate},
        chunk_length_s=chunk_length_s,
        batch_size=batch_size,
    )
    return result["text"]


# Single-tab UI: an audio input, a text box for the result, and a button that
# runs the transcription.
with gr.Blocks() as demo:
    with gr.Tab("Transcribe Audio"):
        with gr.Row():
            # type="filepath" makes Gradio hand the handler a path to the
            # uploaded or recorded file instead of decoded samples.
            audio_input = gr.Audio(
                label="Upload audio file or record",
                type="filepath",
            )
        with gr.Row():
            audio_output = gr.Textbox(label="Transcription")
        with gr.Row():
            transcribe_button = gr.Button("Transcribe")
        # Blocks has no `add_callback` method; handlers are attached through a
        # component's event listener, here the button's click event.
        transcribe_button.click(
            fn=transcribe_speech,
            inputs=[audio_input],
            outputs=[audio_output],
        )

# share=True asks Gradio for a public share link in addition to the local URL.
demo.launch(share=True)