develops20 commited on
Commit
e4674b9
Β·
verified Β·
1 Parent(s): 9a4cedc

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +314 -70
app.py CHANGED
@@ -1,84 +1,328 @@
1
  import gradio as gr
2
- from transformers import pipeline
3
- from gtts import gTTS
 
4
  import os
5
- import numpy as np
 
 
 
 
 
 
6
 
7
- # Initialize Whisper for speech-to-text
8
- whisper = pipeline("automatic-speech-recognition", model="openai/whisper-tiny")
 
 
9
 
10
- # Hardcoded knowledge base for Q&A
11
- knowledge_base = {
12
- "what cars are available": "We have Toyota Camry, Honda Civic, and Ford Mustang.",
13
- "price of camry": "The Toyota Camry starts at $25,000.",
14
- "price of tesla": "The Tesla starts at $60,000."
15
- }
16
 
17
- def transcribe(audio):
18
- print(f"Transcribing audio: {type(audio)}")
19
- try:
20
- # Check if audio is a tuple (numpy array, sample rate)
21
- if isinstance(audio, tuple):
22
- audio_data, _ = audio # Extract numpy array, ignore sample rate
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
  else:
24
- audio_data = audio
25
- result = whisper(audio_data)["text"]
26
- print(f"Transcription result: {result}")
27
- return result
28
- except Exception as e:
29
- print(f"Error in transcribe: {str(e)}")
30
- import traceback
31
- traceback.print_exc()
32
- raise
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
 
34
- def text_to_speech(text):
35
- print(f"Generating speech for text: {text}")
36
- try:
37
- tts = gTTS(text, lang="en")
38
- output_path = "/tmp/response.mp3"
39
- tts.save(output_path)
40
- print(f"Speech saved to {output_path}")
41
- return output_path
42
- except Exception as e:
43
- print(f"Error in text_to_speech: {str(e)}")
44
- import traceback
45
- traceback.print_exc()
46
- raise
47
 
48
- def answer_question(text):
49
- print(f"Answering question: {text}")
 
 
 
50
  try:
51
- for key in knowledge_base:
52
- if key in text.lower():
53
- print(f"Found match for key: {key}")
54
- return knowledge_base[key]
55
- print("No match found in knowledge base")
56
- return "Sorry, I can help with car availability and prices. Try again!"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
  except Exception as e:
58
- print(f"Error in answer_question: {str(e)}")
59
- import traceback
60
- traceback.print_exc()
61
- raise
62
 
63
- def process_audio(audio):
64
- print(f"Processing audio: {type(audio)}")
 
 
 
65
  try:
66
- text = transcribe(audio)
67
- response = answer_question(text)
68
- audio_response = text_to_speech(response)
69
- print(f"Process complete. Response: {response}, Audio: {audio_response}")
70
- return response, audio_response
71
  except Exception as e:
72
- print(f"Error in process_audio: {str(e)}")
73
- import traceback
74
- traceback.print_exc()
75
- raise
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
76
 
77
- # Gradio interface
78
- with gr.Blocks() as demo:
79
- gr.Markdown("# AI Support Agent: Car Dealership")
80
- audio_input = gr.Audio(label="Speak to the Agent")
81
- text_output = gr.Textbox(label="Agent Response")
82
- audio_output = gr.Audio(label="Listen to Response")
83
- btn = gr.Button("Submit")
84
- btn.click(fn=process_audio, inputs=audio_input, outputs=[text_output, audio_output])
 
1
  import gradio as gr
2
+ import speech_recognition as sr
3
+ import requests
4
+ import json
5
  import os
6
+ from datetime import datetime, timedelta
7
+ import tempfile
8
+ import io
9
+ import base64
10
+ from typing import Optional, Dict, Any
11
+ import asyncio
12
+ import aiohttp
13
 
14
+ # Configuration
15
+ ELEVENLABS_API_KEY = os.getenv("ELEVENLABS_API_KEY")
16
+ GOOGLE_CALENDAR_CREDENTIALS = os.getenv("GOOGLE_CALENDAR_CREDENTIALS")
17
+ OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
18
 
19
+ # ElevenLabs configuration
20
+ ELEVENLABS_VOICE_ID = "21m00Tcm4TlvDq8ikWAM" # Default voice, can be changed
21
+ ELEVENLABS_API_URL = "https://api.elevenlabs.io/v1"
 
 
 
22
 
23
+ class VoiceAgent:
24
+ def __init__(self):
25
+ self.recognizer = sr.Recognizer()
26
+ self.microphone = sr.Microphone()
27
+
28
+ async def speech_to_text(self, audio_file) -> str:
29
+ """Convert speech to text using speech_recognition"""
30
+ try:
31
+ with sr.AudioFile(audio_file) as source:
32
+ audio = self.recognizer.record(source)
33
+ text = self.recognizer.recognize_google(audio)
34
+ return text
35
+ except Exception as e:
36
+ return f"Error in speech recognition: {str(e)}"
37
+
38
+ async def text_to_speech(self, text: str) -> bytes:
39
+ """Convert text to speech using ElevenLabs"""
40
+ if not ELEVENLABS_API_KEY:
41
+ raise ValueError("ElevenLabs API key not found")
42
+
43
+ url = f"{ELEVENLABS_API_URL}/text-to-speech/{ELEVENLABS_VOICE_ID}"
44
+ headers = {
45
+ "Accept": "audio/mpeg",
46
+ "Content-Type": "application/json",
47
+ "xi-api-key": ELEVENLABS_API_KEY
48
+ }
49
+
50
+ data = {
51
+ "text": text,
52
+ "model_id": "eleven_monolingual_v1",
53
+ "voice_settings": {
54
+ "stability": 0.5,
55
+ "similarity_boost": 0.5
56
+ }
57
+ }
58
+
59
+ async with aiohttp.ClientSession() as session:
60
+ async with session.post(url, json=data, headers=headers) as response:
61
+ if response.status == 200:
62
+ return await response.read()
63
+ else:
64
+ raise Exception(f"ElevenLabs API error: {response.status}")
65
+
66
+ async def process_with_mcp(self, user_input: str) -> Dict[str, Any]:
67
+ """Process user input using MCP (Model Context Protocol)"""
68
+ # Detect intent
69
+ intent = self.detect_intent(user_input)
70
+
71
+ if intent == "calendar":
72
+ return await self.handle_calendar_request(user_input)
73
  else:
74
+ return await self.handle_general_question(user_input)
75
+
76
+ def detect_intent(self, text: str) -> str:
77
+ """Simple intent detection"""
78
+ calendar_keywords = ["schedule", "appointment", "meeting", "calendar", "book", "reserve"]
79
+ if any(keyword in text.lower() for keyword in calendar_keywords):
80
+ return "calendar"
81
+ return "general"
82
+
83
+ async def handle_calendar_request(self, text: str) -> Dict[str, Any]:
84
+ """Handle calendar appointment creation"""
85
+ try:
86
+ # Extract appointment details using simple parsing
87
+ # In a real implementation, you'd use NLP or LLM for better extraction
88
+ appointment_data = self.extract_appointment_details(text)
89
+
90
+ # Create calendar event (simplified - would use Google Calendar API)
91
+ event_summary = f"Appointment: {appointment_data.get('title', 'New Meeting')}"
92
+ event_time = appointment_data.get('time', 'TBD')
93
+
94
+ response_text = f"I've scheduled your {event_summary} for {event_time}. Please note: This is a demo - in production, this would create an actual Google Calendar event."
95
+
96
+ return {
97
+ "type": "calendar",
98
+ "response": response_text,
99
+ "success": True,
100
+ "event_data": appointment_data
101
+ }
102
+ except Exception as e:
103
+ return {
104
+ "type": "calendar",
105
+ "response": f"I encountered an error while scheduling your appointment: {str(e)}",
106
+ "success": False
107
+ }
108
+
109
+ def extract_appointment_details(self, text: str) -> Dict[str, str]:
110
+ """Extract appointment details from text (simplified)"""
111
+ # This is a basic implementation - in production, use NLP/LLM
112
+ details = {
113
+ "title": "Meeting",
114
+ "time": "Next available slot",
115
+ "duration": "30 minutes"
116
+ }
117
+
118
+ # Simple keyword extraction
119
+ if "doctor" in text.lower():
120
+ details["title"] = "Doctor Appointment"
121
+ elif "meeting" in text.lower():
122
+ details["title"] = "Meeting"
123
+ elif "call" in text.lower():
124
+ details["title"] = "Phone Call"
125
+
126
+ # Extract time mentions (basic)
127
+ words = text.lower().split()
128
+ for i, word in enumerate(words):
129
+ if word in ["tomorrow", "today", "monday", "tuesday", "wednesday", "thursday", "friday"]:
130
+ details["time"] = word.capitalize()
131
+ break
132
+ elif "at" in words and i < len(words) - 1:
133
+ if any(char.isdigit() for char in words[i + 1]):
134
+ details["time"] = f"at {words[i + 1]}"
135
+ break
136
+
137
+ return details
138
+
139
+ async def handle_general_question(self, text: str) -> Dict[str, Any]:
140
+ """Handle general questions"""
141
+ # Simple responses - in production, integrate with LLM
142
+ responses = {
143
+ "hello": "Hello! I'm your voice assistant. I can help you schedule appointments or answer questions.",
144
+ "how are you": "I'm doing well, thank you! How can I help you today?",
145
+ "weather": "I'm a demo assistant focused on calendar management. For weather, I'd need to integrate with a weather API.",
146
+ "time": f"The current time is {datetime.now().strftime('%I:%M %p')}",
147
+ "default": "I understand you're asking about something. As a demo assistant, I can help you schedule appointments or provide basic information. What would you like to do?"
148
+ }
149
+
150
+ text_lower = text.lower()
151
+ response_text = responses.get("default")
152
+
153
+ for key, response in responses.items():
154
+ if key in text_lower:
155
+ response_text = response
156
+ break
157
+
158
+ return {
159
+ "type": "general",
160
+ "response": response_text,
161
+ "success": True
162
+ }
163
 
164
+ # Initialize the agent
165
+ agent = VoiceAgent()
 
 
 
 
 
 
 
 
 
 
 
166
 
167
+ async def process_voice_input(audio_file):
168
+ """Process voice input and return voice response"""
169
+ if audio_file is None:
170
+ return None, "Please record some audio first."
171
+
172
  try:
173
+ # Convert speech to text
174
+ text = await agent.speech_to_text(audio_file)
175
+ if text.startswith("Error"):
176
+ return None, text
177
+
178
+ # Process with MCP
179
+ result = await agent.process_with_mcp(text)
180
+ response_text = result["response"]
181
+
182
+ # Convert response to speech
183
+ if ELEVENLABS_API_KEY:
184
+ try:
185
+ audio_bytes = await agent.text_to_speech(response_text)
186
+ # Save to temporary file
187
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
188
+ tmp_file.write(audio_bytes)
189
+ return tmp_file.name, f"You said: '{text}'\n\nResponse: {response_text}"
190
+ except Exception as e:
191
+ return None, f"Text-to-speech error: {str(e)}\n\nYou said: '{text}'\nResponse: {response_text}"
192
+ else:
193
+ return None, f"You said: '{text}'\n\nResponse: {response_text}\n\n(Note: Set ELEVENLABS_API_KEY for voice output)"
194
+
195
  except Exception as e:
196
+ return None, f"Error processing audio: {str(e)}"
 
 
 
197
 
198
+ def process_text_input(text_input):
199
+ """Process text input directly"""
200
+ if not text_input.strip():
201
+ return "Please enter some text."
202
+
203
  try:
204
+ # Process with MCP
205
+ result = asyncio.run(agent.process_with_mcp(text_input))
206
+ return result["response"]
 
 
207
  except Exception as e:
208
+ return f"Error processing text: {str(e)}"
209
+
210
+ # Create Gradio interface
211
+ with gr.Blocks(title="Voice Agent - Gradio MCP Hackathon", theme=gr.themes.Soft()) as demo:
212
+ gr.Markdown("""
213
+ # 🎀 Voice Agent with MCP
214
+
215
+ **Hackathon Project**: Gradio Agents & MCP Hackathon
216
+
217
+ This lightweight voice agent can:
218
+ - πŸ—£οΈ Process voice input and respond with voice
219
+ - πŸ“… Schedule calendar appointments
220
+ - ❓ Answer general questions
221
+ - πŸ”§ Uses MCP (Model Context Protocol) for processing
222
+
223
+ ## Setup Instructions:
224
+ 1. Set `ELEVENLABS_API_KEY` environment variable for voice synthesis
225
+ 2. Set `GOOGLE_CALENDAR_CREDENTIALS` for calendar integration (optional)
226
+ 3. Try voice input or type your questions below!
227
+ """)
228
+
229
+ with gr.Tab("🎀 Voice Mode"):
230
+ with gr.Row():
231
+ with gr.Column():
232
+ audio_input = gr.Audio(
233
+ sources=["microphone"],
234
+ type="filepath",
235
+ label="Record your voice"
236
+ )
237
+ voice_button = gr.Button("Process Voice Input", variant="primary")
238
+
239
+ with gr.Column():
240
+ audio_output = gr.Audio(label="AI Response (Voice)")
241
+ text_output = gr.Textbox(
242
+ label="Conversation Log",
243
+ lines=6,
244
+ interactive=False
245
+ )
246
+
247
+ voice_button.click(
248
+ fn=process_voice_input,
249
+ inputs=[audio_input],
250
+ outputs=[audio_output, text_output]
251
+ )
252
+
253
+ with gr.Tab("πŸ’¬ Text Mode"):
254
+ with gr.Row():
255
+ with gr.Column():
256
+ text_input = gr.Textbox(
257
+ label="Type your message",
258
+ placeholder="Ask me anything or request to schedule an appointment...",
259
+ lines=3
260
+ )
261
+ text_button = gr.Button("Send Message", variant="primary")
262
+
263
+ with gr.Column():
264
+ text_response = gr.Textbox(
265
+ label="AI Response",
266
+ lines=6,
267
+ interactive=False
268
+ )
269
+
270
+ text_button.click(
271
+ fn=process_text_input,
272
+ inputs=[text_input],
273
+ outputs=[text_response]
274
+ )
275
+
276
+ # Quick action buttons
277
+ gr.Markdown("### Quick Actions:")
278
+ with gr.Row():
279
+ quick_hello = gr.Button("πŸ‘‹ Say Hello")
280
+ quick_time = gr.Button("πŸ• What time is it?")
281
+ quick_appointment = gr.Button("πŸ“… Schedule appointment tomorrow at 2pm")
282
+
283
+ quick_hello.click(
284
+ fn=lambda: process_text_input("hello"),
285
+ outputs=[text_response]
286
+ )
287
+
288
+ quick_time.click(
289
+ fn=lambda: process_text_input("what time is it"),
290
+ outputs=[text_response]
291
+ )
292
+
293
+ quick_appointment.click(
294
+ fn=lambda: process_text_input("schedule an appointment tomorrow at 2pm"),
295
+ outputs=[text_response]
296
+ )
297
+
298
+ with gr.Tab("ℹ️ About"):
299
+ gr.Markdown("""
300
+ ## About This Project
301
+
302
+ This is a hackathon submission for the **Gradio Agents & MCP Hackathon**.
303
+
304
+ ### Features:
305
+ - **Voice Input/Output**: Uses speech recognition and ElevenLabs TTS
306
+ - **MCP Integration**: Implements Model Context Protocol for intelligent processing
307
+ - **Calendar Management**: Can schedule appointments (demo mode)
308
+ - **Lightweight**: Optimized for Hugging Face Spaces
309
+
310
+ ### Technologies Used:
311
+ - **Gradio**: For the web interface
312
+ - **ElevenLabs**: For text-to-speech synthesis
313
+ - **MCP**: For intelligent request processing
314
+ - **Speech Recognition**: For voice-to-text conversion
315
+
316
+ ### Environment Variables:
317
+ - `ELEVENLABS_API_KEY`: Your ElevenLabs API key
318
+ - `GOOGLE_CALENDAR_CREDENTIALS`: Google Calendar API credentials (optional)
319
+
320
+ ### Example Interactions:
321
+ - "Hello, how are you?"
322
+ - "What time is it?"
323
+ - "Schedule a doctor appointment for tomorrow at 3pm"
324
+ - "Book a meeting with John next Monday"
325
+ """)
326
 
327
+ if __name__ == "__main__":
328
+ demo.launch()