import gradio as gr
import librosa
import numpy as np
import spaces
import torch
import transformers
from typing import Tuple


@spaces.GPU(duration=120)
def transcribe_and_respond(audio_input: Tuple[int, np.ndarray]) -> str:
    try:
        # Build the pipeline inside the GPU-decorated function so CUDA is only
        # touched within the ZeroGPU context. Note: this reloads the model on
        # every call; cache it if startup latency becomes an issue.
        pipe = transformers.pipeline(
            model='sarvamai/shuka_v1',
            trust_remote_code=True,
            device=0,
            torch_dtype=torch.bfloat16,
        )

        # Gradio's numpy audio components yield (sample_rate, data), in that order.
        sr, audio = audio_input

        # Down-mix stereo recordings to mono.
        if audio.ndim > 1:
            audio = audio.mean(axis=1)

        # Gradio delivers integer PCM (typically int16); scale to float32 in [-1, 1].
        if np.issubdtype(audio.dtype, np.integer):
            audio = audio.astype(np.float32) / np.iinfo(audio.dtype).max
        elif audio.dtype != np.float32:
            audio = audio.astype(np.float32)

        # Resample to the 16 kHz rate the model expects.
        if sr != 16000:
            audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)

        # Define conversation turns; the Shuka model card uses the <|audio|>
        # placeholder in the user turn, which the pipeline replaces with the
        # audio embedding.
        turns = [
            {'role': 'system', 'content': 'Respond naturally and informatively.'},
            {'role': 'user', 'content': '<|audio|>'},
        ]

        # Run the pipeline with the audio and conversation turns
        output = pipe(
            {'audio': audio, 'turns': turns, 'sampling_rate': 16000},
            max_new_tokens=512,
        )

        # Return the model's response
        return output
    except Exception as e:
        return f"Error processing audio: {str(e)}"


iface = gr.Interface(
    fn=transcribe_and_respond,
    inputs=gr.Audio(sources=["microphone"], type="numpy"),
    outputs="text",
    title="Live Transcription and Response",
    description="Speak into your microphone, and the model will respond naturally and informatively.",
    live=True,  # Re-run on audio changes; costly with a large model, so disable if responses lag
)

if __name__ == "__main__":
    iface.launch()
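
# Optional local check (a sketch, not part of the app): call the handler
# directly with a WAV file instead of going through the Gradio UI. The file
# name "sample.wav" is a hypothetical placeholder; librosa.load already
# returns float32 audio scaled to [-1, 1] at the requested rate.
#
#   audio, sr = librosa.load("sample.wav", sr=16000)
#   print(transcribe_and_respond((sr, audio)))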