Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| from huggingface_hub import InferenceClient | |
| import speech_recognition as sr | |
| from gtts import gTTS | |
| import tempfile | |
| import os | |
| import sys | |
# --- Startup configuration ------------------------------------------------
# The app is useless without a Hugging Face API token, so fail fast at
# import time rather than erroring on the first request.
if not (HF_TOKEN := os.getenv("HF_TOKEN")):
    print("Error: HF_TOKEN environment variable not set. Please set it with your Hugging Face API token.")
    sys.exit(1)

# Build the inference client once at module load; any failure here (bad
# token, unreachable hub) is fatal for the whole app.
try:
    client = InferenceClient(model="mistralai/Mixtral-8x7B-Instruct-v0.1", token=HF_TOKEN)
except Exception as e:
    print(f"Failed to initialize InferenceClient: {e}")
    sys.exit(1)
else:
    print("Successfully initialized InferenceClient with Mixtral-8x7B-Instruct-v0.1")
# Speech-to-Text Function
def speech_to_text(audio_path):
    """Transcribe an audio file to text via Google's free speech API.

    Args:
        audio_path: Filesystem path to the recorded audio (WAV/AIFF/FLAC,
            as required by ``sr.AudioFile``), or None/"" when the Gradio
            microphone widget produced no recording.

    Returns:
        The transcribed text on success; otherwise a human-readable error
        string beginning with "Error", "Could not understand", or
        "Speech recognition service error".
    """
    # BUG FIX: Gradio passes None when the user submits without recording,
    # and os.path.exists(None) raises TypeError — guard the falsy path too.
    if not audio_path or not os.path.exists(audio_path):
        return "Error: Audio file not found."
    recognizer = sr.Recognizer()
    with sr.AudioFile(audio_path) as source:
        audio_data = recognizer.record(source)
    try:
        text = recognizer.recognize_google(audio_data)
        print(f"Speech-to-Text Output: {text}")
        return text
    except sr.UnknownValueError:
        # Audio was readable but no speech could be recognized.
        return "Could not understand the audio."
    except sr.RequestError as e:
        # Network/service failure talking to the recognition endpoint.
        return f"Speech recognition service error: {e}"
# Text-to-Speech Function
def text_to_speech(text):
    """Synthesize *text* to an MP3 file using gTTS.

    Args:
        text: Non-empty text to speak.

    Returns:
        Path to a temporary .mp3 file on success, or None on failure.

    NOTE(review): temp files are never deleted by this app; long-running
    deployments should add periodic cleanup — TODO confirm hosting limits.
    """
    # ROBUSTNESS: gTTS raises on empty/whitespace-only input, so reject it
    # cleanly instead of falling into the generic except below.
    if not text or not text.strip():
        print("Text-to-Speech Error: empty text")
        return None
    try:
        print(f"Text-to-Speech Input: {text}")
        tts = gTTS(text)
        # delete=False so the file survives close(); Gradio serves the
        # audio from this path after the function returns.
        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
        temp_file.close()
        tts.save(temp_file.name)
        print(f"Audio file generated at: {temp_file.name}")
        return temp_file.name
    except Exception as e:
        # Best-effort: callers treat None as "no audio available".
        print(f"Text-to-Speech Error: {e}")
        return None
# Chatbot Response Function
def respond(message, history, system_message, max_tokens, temperature, top_p):
    """Generate a chatbot reply for *message* given the conversation so far.

    Args:
        message: The new user utterance.
        history: List of (user_msg, bot_msg) tuples of prior turns.
        system_message: System prompt prepended to the conversation.
        max_tokens: Maximum new tokens to generate.
        temperature: Sampling temperature.
        top_p: Nucleus-sampling probability mass.

    Returns:
        The model's reply text, or a string starting with "Error" on
        failure / empty output (callers rely on that prefix).
    """
    messages = [{"role": "system", "content": system_message}]
    # Replay prior turns in order; skip empty slots.
    for user_msg, bot_msg in history:
        if user_msg:
            messages.append({"role": "user", "content": user_msg})
        if bot_msg:
            messages.append({"role": "assistant", "content": bot_msg})
    messages.append({"role": "user", "content": message})
    response = ""
    try:
        # FIX: the original loop variable was named `message`, shadowing the
        # function parameter mid-stream; renamed to `chunk` for clarity.
        for chunk in client.chat_completion(
            messages=messages,
            max_tokens=max_tokens,
            stream=True,
            temperature=temperature,
            top_p=top_p,
        ):
            # Streamed deltas can carry None content (e.g. role-only chunks);
            # skip them to avoid TypeError on concatenation.
            token_content = chunk.choices[0].delta.content
            if token_content is not None:
                response += token_content
        if not response:
            return "Error: Empty response from the model."
        print(f"Chatbot Response: {response}")
        return response
    except Exception as e:
        print(f"Chatbot Error: {e}")
        return f"Error generating response from the chatbot: {e}"
# Voice-to-Voice Functionality
def voice_to_voice(audio, history, system_message, max_tokens, temperature, top_p):
    """Full pipeline: transcribe the user's audio, query the chatbot, and
    synthesize the reply to speech.

    Args:
        audio: Path to the user's recorded audio (from gr.Audio).
        history: List of (user_msg, bot_msg) tuples (gr.State).
        system_message, max_tokens, temperature, top_p: Passed to respond().

    Returns:
        Tuple of (response text or error message, updated history,
        path to the reply audio or None).
    """
    user_message = speech_to_text(audio)
    # BUG FIX: speech_to_text can also return "Speech recognition service
    # error: ...", which matched neither original prefix check — the error
    # text was then forwarded to the model as if the user had said it.
    if user_message.startswith(("Error", "Could not understand", "Speech recognition service error")):
        return user_message, history, None
    # Get chatbot response; respond() signals failure with an "Error" prefix.
    response_text = respond(user_message, history, system_message, max_tokens, temperature, top_p)
    if response_text.startswith("Error"):
        return response_text, history, None
    # Update chat history, keeping only the 5 most recent turns to bound
    # prompt size.
    history.append((user_message, response_text))
    history = history[-5:]
    # Convert chatbot response to audio; None means synthesis failed.
    audio_file = text_to_speech(response_text)
    if not audio_file:
        return "Failed to generate audio response.", history, None
    return response_text, history, audio_file
# Gradio Interface
def main_interface():
    """Build and return the Gradio Blocks UI for the voice chatbot.

    Layout (top to bottom): system prompt, generation sliders, microphone
    input, text/audio outputs, and a Submit button wired to voice_to_voice.
    """
    with gr.Blocks() as demo:
        gr.Markdown("# Voice-to-Voice Chatbot")

        # Generation controls.
        system_prompt = gr.Textbox(
            value="You are a friendly and helpful chatbot.",
            label="System Message",
            lines=2,
        )
        max_new_tokens = gr.Slider(
            minimum=1, maximum=2048, value=512, step=1, label="Max New Tokens"
        )
        temp_slider = gr.Slider(
            minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"
        )
        top_p_slider = gr.Slider(
            minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (Nucleus Sampling)"
        )

        # I/O widgets.
        mic_input = gr.Audio(
            sources=["microphone"], type="filepath", label="Speak Your Question"
        )
        reply_text = gr.Textbox(label="Chatbot Response")
        reply_audio = gr.Audio(label="Response Audio", type="filepath")

        # Conversation history lives in per-session state.
        chat_state = gr.State([])

        submit_btn = gr.Button("Submit")
        submit_btn.click(
            fn=voice_to_voice,
            inputs=[mic_input, chat_state, system_prompt, max_new_tokens, temp_slider, top_p_slider],
            outputs=[reply_text, chat_state, reply_audio],
        )
    return demo
# Launch the app only when run as a script, not when imported as a module.
if __name__ == "__main__":
    demo = main_interface()
    demo.launch()