Spaces:

develops20
/

VoiceSupportAgent

Running

App Files Files Community

develops20 commited on Jun 9, 2025

Commit

e4674b9

verified ·

1 Parent(s): 9a4cedc

Update app.py

Browse files

Files changed (1) hide show

app.py +314 -70

app.py CHANGED Viewed

@@ -1,84 +1,328 @@
 import gradio as gr
-from transformers import pipeline
-from gtts import gTTS
 import os
-import numpy as np
-# Initialize Whisper for speech-to-text
-whisper = pipeline("automatic-speech-recognition", model="openai/whisper-tiny")
-# Hardcoded knowledge base for Q&A
-knowledge_base = {
-    "what cars are available": "We have Toyota Camry, Honda Civic, and Ford Mustang.",
-    "price of camry": "The Toyota Camry starts at $25,000.",
-    "price of tesla": "The Tesla starts at $60,000."
-}
-def transcribe(audio):
-    print(f"Transcribing audio: {type(audio)}")
-    try:
-        # Check if audio is a tuple (numpy array, sample rate)
-        if isinstance(audio, tuple):
-            audio_data, _ = audio  # Extract numpy array, ignore sample rate
         else:
-            audio_data = audio
-        result = whisper(audio_data)["text"]
-        print(f"Transcription result: {result}")
-        return result
-    except Exception as e:
-        print(f"Error in transcribe: {str(e)}")
-        import traceback
-        traceback.print_exc()
-        raise
-def text_to_speech(text):
-    print(f"Generating speech for text: {text}")
-    try:
-        tts = gTTS(text, lang="en")
-        output_path = "/tmp/response.mp3"
-        tts.save(output_path)
-        print(f"Speech saved to {output_path}")
-        return output_path
-    except Exception as e:
-        print(f"Error in text_to_speech: {str(e)}")
-        import traceback
-        traceback.print_exc()
-        raise
-def answer_question(text):
-    print(f"Answering question: {text}")
     try:
-        for key in knowledge_base:
-            if key in text.lower():
-                print(f"Found match for key: {key}")
-                return knowledge_base[key]
-        print("No match found in knowledge base")
-        return "Sorry, I can help with car availability and prices. Try again!"
     except Exception as e:
-        print(f"Error in answer_question: {str(e)}")
-        import traceback
-        traceback.print_exc()
-        raise
-def process_audio(audio):
-    print(f"Processing audio: {type(audio)}")
     try:
-        text = transcribe(audio)
-        response = answer_question(text)
-        audio_response = text_to_speech(response)
-        print(f"Process complete. Response: {response}, Audio: {audio_response}")
-        return response, audio_response
     except Exception as e:
-        print(f"Error in process_audio: {str(e)}")
-        import traceback
-        traceback.print_exc()
-        raise
-# Gradio interface
-with gr.Blocks() as demo:
-    gr.Markdown("# AI Support Agent: Car Dealership")
-    audio_input = gr.Audio(label="Speak to the Agent")
-    text_output = gr.Textbox(label="Agent Response")
-    audio_output = gr.Audio(label="Listen to Response")
-    btn = gr.Button("Submit")
-    btn.click(fn=process_audio, inputs=audio_input, outputs=[text_output, audio_output])

 import gradio as gr
+import speech_recognition as sr
+import requests
+import json
 import os
+from datetime import datetime, timedelta
+import tempfile
+import io
+import base64
+from typing import Optional, Dict, Any
+import asyncio
+import aiohttp
+# Configuration
+ELEVENLABS_API_KEY = os.getenv("ELEVENLABS_API_KEY")
+GOOGLE_CALENDAR_CREDENTIALS = os.getenv("GOOGLE_CALENDAR_CREDENTIALS")
+OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
+# ElevenLabs configuration
+ELEVENLABS_VOICE_ID = "21m00Tcm4TlvDq8ikWAM"  # Default voice, can be changed
+ELEVENLABS_API_URL = "https://api.elevenlabs.io/v1"
+class VoiceAgent:
+    def __init__(self):
+        self.recognizer = sr.Recognizer()
+        self.microphone = sr.Microphone()
+    async def speech_to_text(self, audio_file) -> str:
+        """Convert speech to text using speech_recognition"""
+        try:
+            with sr.AudioFile(audio_file) as source:
+                audio = self.recognizer.record(source)
+            text = self.recognizer.recognize_google(audio)
+            return text
+        except Exception as e:
+            return f"Error in speech recognition: {str(e)}"
+    async def text_to_speech(self, text: str) -> bytes:
+        """Convert text to speech using ElevenLabs"""
+        if not ELEVENLABS_API_KEY:
+            raise ValueError("ElevenLabs API key not found")
+        url = f"{ELEVENLABS_API_URL}/text-to-speech/{ELEVENLABS_VOICE_ID}"
+        headers = {
+            "Accept": "audio/mpeg",
+            "Content-Type": "application/json",
+            "xi-api-key": ELEVENLABS_API_KEY
+        }
+        data = {
+            "text": text,
+            "model_id": "eleven_monolingual_v1",
+            "voice_settings": {
+                "stability": 0.5,
+                "similarity_boost": 0.5
+            }
+        }
+        async with aiohttp.ClientSession() as session:
+            async with session.post(url, json=data, headers=headers) as response:
+                if response.status == 200:
+                    return await response.read()
+                else:
+                    raise Exception(f"ElevenLabs API error: {response.status}")
+    async def process_with_mcp(self, user_input: str) -> Dict[str, Any]:
+        """Process user input using MCP (Model Context Protocol)"""
+        # Detect intent
+        intent = self.detect_intent(user_input)
+        if intent == "calendar":
+            return await self.handle_calendar_request(user_input)
         else:
+            return await self.handle_general_question(user_input)
+    def detect_intent(self, text: str) -> str:
+        """Simple intent detection"""
+        calendar_keywords = ["schedule", "appointment", "meeting", "calendar", "book", "reserve"]
+        if any(keyword in text.lower() for keyword in calendar_keywords):
+            return "calendar"
+        return "general"
+    async def handle_calendar_request(self, text: str) -> Dict[str, Any]:
+        """Handle calendar appointment creation"""
+        try:
+            # Extract appointment details using simple parsing
+            # In a real implementation, you'd use NLP or LLM for better extraction
+            appointment_data = self.extract_appointment_details(text)
+            # Create calendar event (simplified - would use Google Calendar API)
+            event_summary = f"Appointment: {appointment_data.get('title', 'New Meeting')}"
+            event_time = appointment_data.get('time', 'TBD')
+            response_text = f"I've scheduled your {event_summary} for {event_time}. Please note: This is a demo - in production, this would create an actual Google Calendar event."
+            return {
+                "type": "calendar",
+                "response": response_text,
+                "success": True,
+                "event_data": appointment_data
+            }
+        except Exception as e:
+            return {
+                "type": "calendar",
+                "response": f"I encountered an error while scheduling your appointment: {str(e)}",
+                "success": False
+            }
+    def extract_appointment_details(self, text: str) -> Dict[str, str]:
+        """Extract appointment details from text (simplified)"""
+        # This is a basic implementation - in production, use NLP/LLM
+        details = {
+            "title": "Meeting",
+            "time": "Next available slot",
+            "duration": "30 minutes"
+        }
+        # Simple keyword extraction
+        if "doctor" in text.lower():
+            details["title"] = "Doctor Appointment"
+        elif "meeting" in text.lower():
+            details["title"] = "Meeting"
+        elif "call" in text.lower():
+            details["title"] = "Phone Call"
+        # Extract time mentions (basic)
+        words = text.lower().split()
+        for i, word in enumerate(words):
+            if word in ["tomorrow", "today", "monday", "tuesday", "wednesday", "thursday", "friday"]:
+                details["time"] = word.capitalize()
+                break
+            elif "at" in words and i < len(words) - 1:
+                if any(char.isdigit() for char in words[i + 1]):
+                    details["time"] = f"at {words[i + 1]}"
+                    break
+        return details
+    async def handle_general_question(self, text: str) -> Dict[str, Any]:
+        """Handle general questions"""
+        # Simple responses - in production, integrate with LLM
+        responses = {
+            "hello": "Hello! I'm your voice assistant. I can help you schedule appointments or answer questions.",
+            "how are you": "I'm doing well, thank you! How can I help you today?",
+            "weather": "I'm a demo assistant focused on calendar management. For weather, I'd need to integrate with a weather API.",
+            "time": f"The current time is {datetime.now().strftime('%I:%M %p')}",
+            "default": "I understand you're asking about something. As a demo assistant, I can help you schedule appointments or provide basic information. What would you like to do?"
+        }
+        text_lower = text.lower()
+        response_text = responses.get("default")
+        for key, response in responses.items():
+            if key in text_lower:
+                response_text = response
+                break
+        return {
+            "type": "general",
+            "response": response_text,
+            "success": True
+        }
+# Initialize the agent
+agent = VoiceAgent()
+async def process_voice_input(audio_file):
+    """Process voice input and return voice response"""
+    if audio_file is None:
+        return None, "Please record some audio first."
     try:
+        # Convert speech to text
+        text = await agent.speech_to_text(audio_file)
+        if text.startswith("Error"):
+            return None, text
+        # Process with MCP
+        result = await agent.process_with_mcp(text)
+        response_text = result["response"]
+        # Convert response to speech
+        if ELEVENLABS_API_KEY:
+            try:
+                audio_bytes = await agent.text_to_speech(response_text)
+                # Save to temporary file
+                with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
+                    tmp_file.write(audio_bytes)
+                    return tmp_file.name, f"You said: '{text}'\n\nResponse: {response_text}"
+            except Exception as e:
+                return None, f"Text-to-speech error: {str(e)}\n\nYou said: '{text}'\nResponse: {response_text}"
+        else:
+            return None, f"You said: '{text}'\n\nResponse: {response_text}\n\n(Note: Set ELEVENLABS_API_KEY for voice output)"
     except Exception as e:
+        return None, f"Error processing audio: {str(e)}"
+def process_text_input(text_input):
+    """Process text input directly"""
+    if not text_input.strip():
+        return "Please enter some text."
     try:
+        # Process with MCP
+        result = asyncio.run(agent.process_with_mcp(text_input))
+        return result["response"]
     except Exception as e:
+        return f"Error processing text: {str(e)}"
+# Create Gradio interface
+with gr.Blocks(title="Voice Agent - Gradio MCP Hackathon", theme=gr.themes.Soft()) as demo:
+    gr.Markdown("""
+    # 🎤 Voice Agent with MCP
+    **Hackathon Project**: Gradio Agents & MCP Hackathon
+    This lightweight voice agent can:
+    - 🗣️ Process voice input and respond with voice
+    - 📅 Schedule calendar appointments
+    - ❓ Answer general questions
+    - 🔧 Uses MCP (Model Context Protocol) for processing
+    ## Setup Instructions:
+    1. Set `ELEVENLABS_API_KEY` environment variable for voice synthesis
+    2. Set `GOOGLE_CALENDAR_CREDENTIALS` for calendar integration (optional)
+    3. Try voice input or type your questions below!
+    """)
+    with gr.Tab("🎤 Voice Mode"):
+        with gr.Row():
+            with gr.Column():
+                audio_input = gr.Audio(
+                    sources=["microphone"],
+                    type="filepath",
+                    label="Record your voice"
+                )
+                voice_button = gr.Button("Process Voice Input", variant="primary")
+            with gr.Column():
+                audio_output = gr.Audio(label="AI Response (Voice)")
+                text_output = gr.Textbox(
+                    label="Conversation Log",
+                    lines=6,
+                    interactive=False
+                )
+        voice_button.click(
+            fn=process_voice_input,
+            inputs=[audio_input],
+            outputs=[audio_output, text_output]
+        )
+    with gr.Tab("💬 Text Mode"):
+        with gr.Row():
+            with gr.Column():
+                text_input = gr.Textbox(
+                    label="Type your message",
+                    placeholder="Ask me anything or request to schedule an appointment...",
+                    lines=3
+                )
+                text_button = gr.Button("Send Message", variant="primary")
+            with gr.Column():
+                text_response = gr.Textbox(
+                    label="AI Response",
+                    lines=6,
+                    interactive=False
+                )
+        text_button.click(
+            fn=process_text_input,
+            inputs=[text_input],
+            outputs=[text_response]
+        )
+        # Quick action buttons
+        gr.Markdown("### Quick Actions:")
+        with gr.Row():
+            quick_hello = gr.Button("👋 Say Hello")
+            quick_time = gr.Button("🕐 What time is it?")
+            quick_appointment = gr.Button("📅 Schedule appointment tomorrow at 2pm")
+        quick_hello.click(
+            fn=lambda: process_text_input("hello"),
+            outputs=[text_response]
+        )
+        quick_time.click(
+            fn=lambda: process_text_input("what time is it"),
+            outputs=[text_response]
+        )
+        quick_appointment.click(
+            fn=lambda: process_text_input("schedule an appointment tomorrow at 2pm"),
+            outputs=[text_response]
+        )
+    with gr.Tab("ℹ️ About"):
+        gr.Markdown("""
+        ## About This Project
+        This is a hackathon submission for the **Gradio Agents & MCP Hackathon**.
+        ### Features:
+        - **Voice Input/Output**: Uses speech recognition and ElevenLabs TTS
+        - **MCP Integration**: Implements Model Context Protocol for intelligent processing
+        - **Calendar Management**: Can schedule appointments (demo mode)
+        - **Lightweight**: Optimized for Hugging Face Spaces
+        ### Technologies Used:
+        - **Gradio**: For the web interface
+        - **ElevenLabs**: For text-to-speech synthesis
+        - **MCP**: For intelligent request processing
+        - **Speech Recognition**: For voice-to-text conversion
+        ### Environment Variables:
+        - `ELEVENLABS_API_KEY`: Your ElevenLabs API key
+        - `GOOGLE_CALENDAR_CREDENTIALS`: Google Calendar API credentials (optional)
+        ### Example Interactions:
+        - "Hello, how are you?"
+        - "What time is it?"
+        - "Schedule a doctor appointment for tomorrow at 3pm"
+        - "Book a meeting with John next Monday"
+        """)
+if __name__ == "__main__":
+    demo.launch()