# speech-chatbot / app.py
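"""Voice-to-voice chatbot built with Gradio.

Pipeline: microphone audio is transcribed with the Google Web Speech API (via
SpeechRecognition), the transcript is sent to Mixtral-8x7B-Instruct-v0.1 through
the Hugging Face Inference API, and the reply is spoken back with gTTS.
Requires the HF_TOKEN environment variable to be set.
"""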
import gradio as gr
from huggingface_hub import InferenceClient
import speech_recognition as sr
from gtts import gTTS
import tempfile
import os
import sys
# Check for Hugging Face API token
HF_TOKEN = os.getenv("HF_TOKEN")
if not HF_TOKEN:
print("Error: HF_TOKEN environment variable not set. Please set it with your Hugging Face API token.")
sys.exit(1)
# Initialize Hugging Face client
try:
    client = InferenceClient(model="mistralai/Mixtral-8x7B-Instruct-v0.1", token=HF_TOKEN)
    print("Successfully initialized InferenceClient with Mixtral-8x7B-Instruct-v0.1")
except Exception as e:
    print(f"Failed to initialize InferenceClient: {e}")
    sys.exit(1)
# Speech-to-Text Function
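# Note: sr.AudioFile accepts WAV/AIFF/FLAC input; Gradio's microphone component
# records to WAV by default, so the filepath passed in here should be compatible.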
def speech_to_text(audio_path):
    # Guard against a missing recording: Gradio passes None when nothing was recorded
    if not audio_path or not os.path.exists(audio_path):
        return "Error: Audio file not found."
    recognizer = sr.Recognizer()
    with sr.AudioFile(audio_path) as source:
        audio_data = recognizer.record(source)
    try:
        text = recognizer.recognize_google(audio_data)
        print(f"Speech-to-Text Output: {text}")
        return text
    except sr.UnknownValueError:
        return "Could not understand the audio."
    except sr.RequestError as e:
        return f"Speech recognition service error: {e}"
# Text-to-Speech Function
def text_to_speech(text):
    try:
        print(f"Text-to-Speech Input: {text}")
        tts = gTTS(text)
        # Write the synthesized speech to a temporary MP3 file that Gradio can serve
        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
        temp_file.close()
        tts.save(temp_file.name)
        print(f"Audio file generated at: {temp_file.name}")
        return temp_file.name
    except Exception as e:
        print(f"Text-to-Speech Error: {e}")
        return None
# Chatbot Response Function
def respond(message, history, system_message, max_tokens, temperature, top_p):
    messages = [{"role": "system", "content": system_message}]
    # Build the message history in chat-completion format
    for user_msg, bot_msg in history:
        if user_msg:
            messages.append({"role": "user", "content": user_msg})
        if bot_msg:
            messages.append({"role": "assistant", "content": bot_msg})
    messages.append({"role": "user", "content": message})
    response = ""
    try:
        # Stream tokens from the model and accumulate them into a single response.
        # Use a separate loop variable so the user's `message` is not shadowed.
        for chunk in client.chat_completion(
            messages=messages,
            max_tokens=max_tokens,
            stream=True,
            temperature=temperature,
            top_p=top_p,
        ):
            # Skip empty deltas to avoid concatenating None (TypeError)
            token_content = chunk.choices[0].delta.content
            if token_content is not None:
                response += token_content
        if not response:
            return "Error: Empty response from the model."
        print(f"Chatbot Response: {response}")
        return response
    except Exception as e:
        print(f"Chatbot Error: {e}")
        return f"Error generating response from the chatbot: {e}"
# Voice-to-Voice Functionality
def voice_to_voice(audio, history, system_message, max_tokens, temperature, top_p):
    # Convert the user's voice input to text
    user_message = speech_to_text(audio)
    if user_message.startswith("Error") or user_message.startswith("Could not understand"):
        return user_message, history, None
    # Get the chatbot's text response
    response_text = respond(user_message, history, system_message, max_tokens, temperature, top_p)
    if response_text.startswith("Error"):
        return response_text, history, None
    # Update the chat history, keeping only the last 5 exchanges
    history.append((user_message, response_text))
    history = history[-5:]
    # Convert the chatbot response to audio
    audio_file = text_to_speech(response_text)
    if not audio_file:
        return "Failed to generate audio response.", history, None
    return response_text, history, audio_file
# Gradio Interface
def main_interface():
    with gr.Blocks() as demo:
        gr.Markdown("# Voice-to-Voice Chatbot")
        system_message = gr.Textbox(
            value="You are a friendly and helpful chatbot.",
            label="System Message",
            lines=2,
        )
        max_tokens = gr.Slider(
            minimum=1, maximum=2048, value=512, step=1, label="Max New Tokens"
        )
        temperature = gr.Slider(
            minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"
        )
        top_p = gr.Slider(
            minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (Nucleus Sampling)"
        )
        audio_input = gr.Audio(
            sources=["microphone"], type="filepath", label="Speak Your Question"
        )
        response_output = gr.Textbox(label="Chatbot Response")
        audio_output = gr.Audio(label="Response Audio", type="filepath")
        # Session-scoped chat history: a list of (user, assistant) tuples
        history_state = gr.State([])
        gr.Button("Submit").click(
            fn=voice_to_voice,
            inputs=[audio_input, history_state, system_message, max_tokens, temperature, top_p],
            outputs=[response_output, history_state, audio_output],
        )
    return demo
if __name__ == "__main__":
    demo = main_interface()
    demo.launch()
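# To run locally (assuming the standard PyPI packages behind the imports above):
#   pip install gradio huggingface_hub SpeechRecognition gTTS
#   export HF_TOKEN=<your Hugging Face API token>
#   python app.py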