|
import gradio as gr |
|
import whisper |
|
from gtts import gTTS |
|
from groq import Groq |
|
import os |
|
import numpy as np |
|
import soundfile as sf |
|
import logging |
|
|
|
|
|
GROQ_API_KEY = "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" |
|
|
|
|
|
logging.basicConfig(level=logging.DEBUG) |
|
|
|
|
|
try: |
|
whisper_model = whisper.load_model("base") |
|
logging.info("Whisper model loaded successfully.") |
|
except Exception as e: |
|
raise RuntimeError(f"Error loading Whisper model: {e}") |
|
|
|
|
|
try: |
|
client = Groq( |
|
api_key=GROQ_API_KEY |
|
) |
|
logging.info("Groq client initialized successfully.") |
|
except Exception as e: |
|
raise RuntimeError(f"Error initializing Groq client: {e}") |
|
|
|
|
|
def transcribe_audio(audio): |
|
try: |
|
|
|
logging.debug(f"Loading audio file: {audio}") |
|
audio_data, sample_rate = sf.read(audio, dtype='float32') |
|
logging.debug(f"Audio loaded with sample rate: {sample_rate}, data shape: {audio_data.shape}") |
|
|
|
|
|
if sample_rate != 16000: |
|
logging.debug(f"Resampling audio from {sample_rate} to 16000 Hz") |
|
|
|
num_samples = int(len(audio_data) * (16000 / sample_rate)) |
|
audio_data_resampled = np.interp(np.linspace(0, len(audio_data), num_samples), |
|
np.arange(len(audio_data)), |
|
audio_data) |
|
audio_data = audio_data_resampled.astype(np.float32) |
|
sample_rate = 16000 |
|
|
|
|
|
result = whisper_model.transcribe(audio_data) |
|
logging.debug(f"Transcription result: {result['text']}") |
|
return result['text'] |
|
except Exception as e: |
|
logging.error(f"Error during transcription: {e}") |
|
return f"Error during transcription: {e}" |
|
|
|
|
|
def get_response(text): |
|
try: |
|
logging.debug(f"Sending request to Groq API with text: {text}") |
|
chat_completion = client.chat.completions.create( |
|
messages=[ |
|
{ |
|
"role": "user", |
|
"content": text, |
|
} |
|
], |
|
model="llama3-8b-8192", |
|
) |
|
|
|
|
|
response_text = chat_completion.choices[0].message.content |
|
logging.debug(f"Received response from Groq API: {response_text}") |
|
return response_text |
|
except Exception as e: |
|
logging.error(f"Error during model response generation: {e}") |
|
return f"Error during model response generation: {e}" |
|
|
|
|
|
def text_to_speech(text): |
|
try: |
|
tts = gTTS(text) |
|
tts.save("response.mp3") |
|
logging.debug("Text-to-speech conversion completed successfully.") |
|
return "response.mp3" |
|
except Exception as e: |
|
logging.error(f"Error during text-to-speech conversion: {e}") |
|
return f"Error during text-to-speech conversion: {e}" |
|
|
|
|
|
def chatbot(audio): |
|
try: |
|
|
|
user_input = transcribe_audio(audio) |
|
|
|
|
|
if "Error" in user_input: |
|
return user_input, None |
|
|
|
logging.debug(f"Transcribed text: {user_input}") |
|
|
|
|
|
response_text = get_response(user_input) |
|
|
|
|
|
if "Error" in response_text: |
|
return response_text, None |
|
|
|
logging.debug(f"Response text: {response_text}") |
|
|
|
|
|
response_audio = text_to_speech(response_text) |
|
|
|
|
|
if "Error" in response_audio: |
|
return response_audio, None |
|
|
|
|
|
return response_text, response_audio |
|
|
|
except Exception as e: |
|
logging.error(f"Unexpected error occurred: {e}") |
|
return f"Unexpected error occurred: {e}", None |
|
|
|
|
|
iface = gr.Interface( |
|
fn=chatbot, |
|
inputs=gr.Audio(type="filepath"), |
|
outputs=[gr.Textbox(label="Response Text"), gr.Audio(label="Response Audio")], |
|
live=True, |
|
title="Voice-to-Voice Chatbot", |
|
description="Speak to the bot, and it will respond with voice.", |
|
) |
|
|
|
try: |
|
iface.launch() |
|
except Exception as e: |
|
logging.error(f"Error launching Gradio interface: {e}") |
|
|