"""Voice assistant demo: Urdu speech -> Whisper ASR -> Groq chat model -> gTTS audio.

The script loads a Whisper model for speech recognition, sends the transcript
to a Groq-hosted chat model, converts the model's reply to speech with gTTS,
and exposes the whole flow through a Gradio interface.
"""

import os
from typing import Optional, Tuple

import gradio as gr
import soundfile as sf
import speech_recognition as sr
import torch
from groq import Groq
from gtts import gTTS
from IPython.display import Audio
from pydub import AudioSegment
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline

# Setup device and dtype: half precision only when a GPU is available.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

# Initialize the Groq client.
# SECURITY: the API key must come from the environment, never from source
# control. Any key that was previously committed in this file should be
# revoked and replaced.
client = Groq(
    api_key=os.environ.get("GROQ_API_KEY"),
)

# Load model and processor
model_id = "openai/whisper-medium"
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True,
)
model.to(device)
processor = AutoProcessor.from_pretrained(model_id)

# Load ASR pipeline
asr_pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    torch_dtype=torch_dtype,
    device=device,
)

# Chat model served by Groq and the file the synthesized reply is written to.
CHAT_MODEL = "gemma2-9b-it"
TTS_OUTPUT_PATH = "response.mp3"

SYSTEM_PROMPT = (
    "You are a helpful assistant named SSk BOT that stands for (sehar bot) "
    "who mostly answers in Roman Urdu. Be professional. No emojis; just Urdu "
    "written in English letters, and if you receive a prompt in Urdu font, "
    "answer only in English (Roman Urdu)."
)


# Text-to-Speech function
def text_to_speech(text: str) -> Optional[str]:
    """Convert *text* to speech with gTTS and save it as an MP3 file.

    Args:
        text: The text to synthesize.

    Returns:
        The path of the saved MP3 file, or ``None`` if synthesis failed.
    """
    try:
        # Convert text to speech using gTTS
        tts = gTTS(text, lang='hi')
        tts.save(TTS_OUTPUT_PATH)
    except Exception as e:
        # Best-effort: the text response is still useful without audio.
        print(f"Text-to-speech error: {e}")
        return None
    # Return the MP3 file path for playback in Gradio
    return TTS_OUTPUT_PATH


# Function to process audio, get model response, and return TTS output
def process_audio(audio: Optional[str]) -> Tuple[str, Optional[str]]:
    """Transcribe *audio*, query the chat model, and synthesize its reply.

    Args:
        audio: Path to the recorded/uploaded audio file, as supplied by the
            Gradio ``Audio`` component (``None`` when nothing was submitted).

    Returns:
        A ``(text, audio_path)`` tuple: the model's reply (or an error
        message) and the path of the spoken reply, which is ``None`` when no
        audio could be produced.
    """
    # Gradio passes None when the user submits without providing audio;
    # the ASR pipeline cannot handle that input.
    if audio is None:
        print("Audio-to-text conversion failed or produced no text.")
        return "No audio was provided.", None

    # Convert audio to text
    print("Converting audio to text...")
    result = asr_pipe(audio, generate_kwargs={"language": "urdu"})

    # Check if audio-to-text conversion was successful
    if "text" not in result or not result["text"].strip():
        print("Audio-to-text conversion failed or produced no text.")
        return "Audio-to-text conversion failed or no text was detected.", None

    user_ques = result["text"]
    print("Audio-to-text conversion successful. \nUser Question:", user_ques)

    # Prepare messages for model input
    messages = [
        {
            "role": "system",
            "content": SYSTEM_PROMPT,
        },
        {
            "role": "user",
            "content": user_ques,
        },
    ]

    # Get response from Groq model
    print("Getting response from the model...")
    response = client.chat.completions.create(
        messages=messages,
        model=CHAT_MODEL,
    )

    # Extract model's response. The SDK returns a typed object, not a dict,
    # so the fields are read as attributes (subscripting raises TypeError).
    model_response = response.choices[0].message.content
    print("Model:", model_response)

    # Convert model's response to speech
    audio_path = text_to_speech(model_response)
    return model_response, audio_path


# Gradio interface
interface = gr.Interface(
    fn=process_audio,
    inputs=gr.Audio(type="filepath"),
    outputs=[gr.Textbox(label="Model Response"), gr.Audio(label="Response Audio")],
    title="Real-time ASR to Language Model Response",
    description="Upload an audio file in Urdu, get a text response from the model, and hear the response in English.",
)

# Launch the Gradio Interface only when executed directly (script/notebook),
# so importing this module does not start a web server.
if __name__ == "__main__":
    interface.launch()