# AIBOT / app.py
# Repository page header captured together with the file:
#   akjedidtz's picture
#   Create app.py
#   3651420 verified
import speech_recognition as sr
from gtts import gTTS
from pydub import AudioSegment
from IPython.display import Audio
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
import soundfile as sf
# Select the compute device: the first CUDA GPU when torch reports one,
# otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"
import os
from groq import Groq
# Initialize the Groq client.
# The API key is read from the GROQ_API_KEY environment variable instead of
# being committed to source control. NOTE(review): the key that used to be
# hard-coded here has been published with the file and should be revoked.
client = Groq(
    api_key=os.environ.get("GROQ_API_KEY"),
)
# Choose the compute target in one place: half precision on the first CUDA GPU
# when torch reports one, full precision on the CPU otherwise.
if torch.cuda.is_available():
    device, torch_dtype = "cuda:0", torch.float16
else:
    device, torch_dtype = "cpu", torch.float32

# Load the Whisper checkpoint, move it to the chosen device, then load the
# processor that belongs to the same checkpoint.
model_id = "openai/whisper-medium"
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id,
    use_safetensors=True,
    low_cpu_mem_usage=True,
    torch_dtype=torch_dtype,
)
model.to(device)
processor = AutoProcessor.from_pretrained(model_id)
from transformers import pipeline
from gtts import gTTS
import gradio as gr
import torch
# Build the speech-to-text pipeline from the already-loaded model, reusing the
# processor's tokenizer and feature extractor and the device/dtype chosen above.
asr_pipe = pipeline(
    task="automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    torch_dtype=torch_dtype,
    device=device,
)
# Initialize Groq client
# The API key comes from the GROQ_API_KEY environment variable so that no
# secret lives in the source file. NOTE(review): the key previously written
# here was published with the file and should be revoked.
client = Groq(
    api_key=os.environ.get("GROQ_API_KEY")
)
# Text-to-Speech function
def text_to_speech(text, lang='hi', output_path="response.mp3"):
    """Synthesize *text* to an MP3 file with gTTS and return the file path.

    Args:
        text: The text to speak. Empty, whitespace-only, or non-string values
            are rejected without calling gTTS.
        lang: Language code handed to gTTS. Defaults to ``'hi'``, the value
            that used to be hard-coded.
        output_path: Where the MP3 is written. Defaults to ``"response.mp3"``.
            NOTE: every call using the default overwrites the same file; pass
            a distinct path when results must not clobber each other.

    Returns:
        ``output_path`` on success, or ``None`` when there is nothing to speak
        or synthesis fails (the error is printed, not raised).
    """
    # Nothing to synthesize: report it the same way a gTTS failure is reported.
    if not isinstance(text, str) or not text.strip():
        print("Text-to-speech error: No text to speak")
        return None

    try:
        # Convert text to speech using gTTS
        tts = gTTS(text, lang=lang)
        tts.save(output_path)
    except Exception as e:
        # Deliberate best-effort: the caller still gets the text reply when
        # audio synthesis is unavailable.
        print(f"Text-to-speech error: {e}")
        return None
    return output_path  # Return the MP3 file path for playback in Gradio
# Function to process audio, get model response, and return TTS output
def process_audio(audio):
    """Transcribe an audio file, send the text to the Groq model, and voice the reply.

    Args:
        audio: Path of the audio file to transcribe, or ``None``/empty when no
            audio was supplied.

    Returns:
        A ``(text, audio_path)`` tuple. On success ``text`` is the model's
        reply and ``audio_path`` is whatever ``text_to_speech`` returned for
        it. When there is no audio, or transcription yields no text, ``text``
        is a fixed failure message and ``audio_path`` is ``None``.
    """
    failure = ("Audio-to-text conversion failed or no text was detected.", None)

    # Nothing to transcribe (e.g. the form was submitted without a recording);
    # handing None to the ASR pipeline would raise.
    if not audio:
        print("Audio-to-text conversion failed or produced no text.")
        return failure

    # Convert audio to text
    print("Converting audio to text...")
    result = asr_pipe(audio, generate_kwargs={"language": "urdu"})

    # Check if audio-to-text conversion was successful
    user_ques = result.get("text") if isinstance(result, dict) else None
    if not user_ques or not user_ques.strip():
        print("Audio-to-text conversion failed or produced no text.")
        return failure
    print("Audio-to-text conversion successful. User Question:", user_ques)

    # Prepare messages for model input
    messages = [
        {
            "role": "system",
            "content": "You are a helpful assistant named SSk BOT that stands for (sehar bot) who mostly answers in Roman Urdu. Be professional. No emojis; just Urdu written in English letters, and if you receive a prompt in Urdu font, answer only in English (Roman Urdu).",
        },
        {
            "role": "user",
            "content": user_ques,
        },
    ]

    # Get response from Groq model
    print("Getting response from the model...")
    response = client.chat.completions.create(
        messages=messages,
        model="gemma2-9b-it",
    )

    # Extract model's response. The SDK returns a response object rather than
    # a dict, so the reply is read through attributes, not subscripting.
    model_response = response.choices[0].message.content
    print("Model:", model_response)

    # Convert model's response to speech
    audio_path = text_to_speech(model_response)
    return model_response, audio_path
# Gradio interface: one audio-file input wired to process_audio, whose two
# return values fill a text box and an audio player.
# NOTE(review): the description promises English audio, while text_to_speech
# in this file synthesizes with lang='hi' — confirm the intended wording.
interface = gr.Interface(
    process_audio,
    gr.Audio(type="filepath"),
    [
        gr.Textbox(label="Model Response"),
        gr.Audio(label="Response Audio"),
    ],
    title="Real-time ASR to Language Model Response",
    description="Upload an audio file in Urdu, get a text response from the model, and hear the response in English.",
)

# Launch the Gradio Interface
interface.launch()