|
|
|
|
|
|
|
|
|
|
|
|
|
import speech_recognition as sr |
|
from gtts import gTTS |
|
from pydub import AudioSegment |
|
from IPython.display import Audio |
|
|
|
import torch |
|
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline |
|
import soundfile as sf |
|
|
|
|
|
# Run on the first CUDA GPU when PyTorch can see one; otherwise use the CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
|
|
|
|
|
|
|
|
|
|
|
import os |
|
from groq import Groq |
|
|
|
|
|
# Groq API client. The key is taken from the GROQ_API_KEY environment variable
# so that the secret is never stored in the source file.
# NOTE(review): any key that was previously committed in this file must be
# treated as leaked and revoked in the Groq console.
client = Groq(
    api_key=os.environ.get("GROQ_API_KEY"),
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Device and numeric precision used for the speech model:
# first CUDA GPU with float16 when CUDA is available, CPU with float32 otherwise.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
|
|
|
|
|
# Load the Whisper speech-to-text checkpoint and move it to the selected device.
model_id = "openai/whisper-medium"
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True,
)
model.to(device)

# The processor supplies the tokenizer and feature extractor that the ASR
# pipeline is built from further down in this file.
processor = AutoProcessor.from_pretrained(model_id)
|
|
|
from transformers import pipeline |
|
from gtts import gTTS |
|
import gradio as gr |
|
import torch |
|
|
|
|
|
# Speech-recognition pipeline assembled from the already-loaded Whisper model
# and the tokenizer / feature extractor exposed by its processor.
asr_pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    torch_dtype=torch_dtype,
    device=device,
)
|
|
|
|
|
# Groq API client used by process_audio. The key is taken from the
# GROQ_API_KEY environment variable so that the secret is never stored in the
# source file.
# NOTE(review): any key that was previously committed in this file must be
# treated as leaked and revoked in the Groq console.
client = Groq(
    api_key=os.environ.get("GROQ_API_KEY"),
)
|
|
|
|
|
def text_to_speech(text):
    """Synthesize *text* to speech and save it as an MP3 file.

    The audio is generated with gTTS using the ``'hi'`` language code and
    written to ``response.mp3`` in the current working directory, replacing
    any file left there by a previous call.

    Args:
        text: The text to be spoken.

    Returns:
        The path ``"response.mp3"`` on success, or ``None`` if synthesis or
        saving failed for any reason (the error is printed, not raised).
    """
    output_path = "response.mp3"
    try:
        gTTS(text, lang='hi').save(output_path)
    except Exception as e:
        # Best effort: the caller still shows the text reply when audio
        # generation is unavailable.
        print(f"Text-to-speech error: {e}")
        return None
    return output_path
|
|
|
|
|
def process_audio(audio):
    """Transcribe a spoken question, ask the chat model, and voice the reply.

    Steps: run the Whisper ASR pipeline on the audio (language forced to
    Urdu), send the transcript to the Groq chat model together with the
    SSk BOT system prompt, then convert the model's reply to speech.

    Args:
        audio: The audio input handed over by the Gradio interface, which is
            configured with ``type="filepath"``. It is ``None`` when nothing
            was recorded or uploaded.

    Returns:
        A ``(text, audio_path)`` tuple. On success ``text`` is the model's
        reply and ``audio_path`` is the MP3 path returned by
        ``text_to_speech`` (``None`` if speech synthesis failed). When no
        audio was supplied or no text could be transcribed, ``text`` is an
        explanatory message and ``audio_path`` is ``None``.
    """
    # Gradio passes None when the user submits without any audio; feeding that
    # to the ASR pipeline would raise instead of showing a message.
    if not audio:
        print("No audio input was received.")
        return "No audio input was received.", None

    print("Converting audio to text...")
    result = asr_pipe(audio, generate_kwargs={"language": "urdu"})

    if "text" not in result or not result["text"].strip():
        print("Audio-to-text conversion failed or produced no text.")
        return "Audio-to-text conversion failed or no text was detected.", None

    user_ques = result["text"]
    print("Audio-to-text conversion successful. User Question:", user_ques)

    messages = [
        {
            "role": "system",
            "content": "You are a helpful assistant named SSk BOT that stands for (sehar bot) who mostly answers in Roman Urdu. Be professional. No emojis; just Urdu written in English letters, and if you receive a prompt in Urdu font, answer only in English (Roman Urdu).",
        },
        {
            "role": "user",
            "content": user_ques,
        },
    ]

    print("Getting response from the model...")
    response = client.chat.completions.create(
        messages=messages,
        model="gemma2-9b-it",
    )

    # The Groq SDK returns a response object, not a dict, so the reply is
    # read through attributes rather than subscripting.
    model_response = response.choices[0].message.content
    print("Model:", model_response)

    audio_path = text_to_speech(model_response)
    return model_response, audio_path
|
|
|
|
|
# Gradio UI: one audio input (delivered to process_audio as a file path) and
# two outputs, the model's text reply and the synthesized audio reply.
interface = gr.Interface(
    fn=process_audio,
    inputs=gr.Audio(type="filepath"),
    outputs=[
        gr.Textbox(label="Model Response"),
        gr.Audio(label="Response Audio"),
    ],
    title="Real-time ASR to Language Model Response",
    description="Upload an audio file in Urdu, get a text response from the model, and hear the response in English.",
)

# Start the web app.
interface.launch()
|
|
|
|
|
|
|
|