import gradio as gr
import librosa
import numpy as np
import spaces
import torch
import transformers
from typing import Tuple


@spaces.GPU(duration=120)
def transcribe_and_respond(audio_input: Tuple[int, np.ndarray]) -> str:
    try:
        # Build the pipeline inside the GPU-decorated function so CUDA is only
        # touched within the ZeroGPU context. Note: this reloads the model on
        # every call; cache it if startup latency becomes an issue.
        pipe = transformers.pipeline(
            model='sarvamai/shuka_v1',
            trust_remote_code=True,
            device=0,
            torch_dtype=torch.bfloat16,
        )

        # Gradio's numpy audio components yield (sample_rate, data), in that order.
        sr, audio = audio_input

        # Down-mix stereo recordings to mono.
        if audio.ndim > 1:
            audio = audio.mean(axis=1)

        # Gradio delivers integer PCM (typically int16); scale to float32 in [-1, 1].
        if np.issubdtype(audio.dtype, np.integer):
            audio = audio.astype(np.float32) / np.iinfo(audio.dtype).max
        elif audio.dtype != np.float32:
            audio = audio.astype(np.float32)

        # Resample to the 16 kHz rate the model expects.
        if sr != 16000:
            audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)

        # Define conversation turns; the Shuka model card uses the <|audio|>
        # placeholder in the user turn, which the pipeline replaces with the
        # audio embedding.
        turns = [
            {'role': 'system', 'content': 'Respond naturally and informatively.'},
            {'role': 'user', 'content': '<|audio|>'},
        ]

        # Run the pipeline with the audio and conversation turns
        output = pipe(
            {'audio': audio, 'turns': turns, 'sampling_rate': 16000},
            max_new_tokens=512,
        )

        # Return the model's response
        return output
    except Exception as e:
        return f"Error processing audio: {str(e)}"


iface = gr.Interface(
    fn=transcribe_and_respond,
    inputs=gr.Audio(sources=["microphone"], type="numpy"),
    outputs="text",
    title="Live Transcription and Response",
    description="Speak into your microphone, and the model will respond naturally and informatively.",
    live=True,  # Re-run on audio changes; costly with a large model, so disable if responses lag
)

if __name__ == "__main__":
    iface.launch()
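
# Optional local check (a sketch, not part of the app): call the handler
# directly with a WAV file instead of going through the Gradio UI. The file
# name "sample.wav" is a hypothetical placeholder; librosa.load already
# returns float32 audio scaled to [-1, 1] at the requested rate.
#
#   audio, sr = librosa.load("sample.wav", sr=16000)
#   print(transcribe_and_respond((sr, audio)))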