|
|
""" |
|
|
Voice Agent Gradio Application |
|
|
Web interface for the Voice Agent with microphone support |
|
|
""" |
|
|
|
|
|
import gradio as gr |
|
|
import asyncio |
|
|
import logging |
|
|
import os |
|
|
from .voice_agent import VoiceAgent |
|
|
|
|
|
|
|
|
class VoiceApp:
    """Gradio web application for Voice Agent.

    Builds a Blocks UI (audio record/upload, text chatbot, voice settings
    and a status panel) around a single ``VoiceAgent`` instance.
    """

    def __init__(self):
        # Backing agent that handles transcription, synthesis and chat.
        self.agent = VoiceAgent()
        # NOTE(review): currently unused -- the gr.Chatbot component keeps its
        # own history.  Retained so external readers of this attribute keep
        # working.
        self.conversation_history = []

        logging.basicConfig(level=logging.INFO)

        # Build the UI once; it is served later via launch().
        self.interface = self._create_interface()

    def _create_interface(self):
        """Create and return the Gradio Blocks interface."""
        with gr.Blocks(
            title="π€ Voice Agent - Secure AI Suite",
            theme=gr.themes.Soft(
                primary_hue="orange",
                secondary_hue="gray",
                neutral_hue="slate"
            ),
            css="""
            .container { max-width: 1200px; margin: auto; }
            .chatbot { height: 500px; }
            .status-card { background: linear-gradient(90deg, #fa709a 0%, #fee140 100%); color: white; }
            .tool-card { border: 2px solid #e2e8f0; border-radius: 8px; padding: 12px; margin: 8px 0; }
            .audio-controls { text-align: center; padding: 20px; background: #f8fafc; border-radius: 8px; }
            """
        ) as app:

            # Header banner.
            gr.HTML("""
            <div style='text-align: center; padding: 20px; background: linear-gradient(90deg, #fa709a 0%, #fee140 100%); color: white; border-radius: 10px;'>
                <h1 style='margin: 0; font-size: 2.5em;'>π€ Voice Agent</h1>
                <p style='margin: 10px 0; font-size: 1.2em;'>Speech-to-AI & Text-to-Speech with Multi-modal Processing</p>
                <p style='margin: 0; opacity: 0.8;'>π Secure AI Agents Suite</p>
            </div>
            """)

            with gr.Row():

                # --- Left column: voice + text interaction -------------------
                with gr.Column(scale=2):
                    gr.HTML("<h3>ποΈ Voice Interaction</h3>")

                    with gr.Column():
                        gr.HTML("<div class='audio-controls'>")
                        gr.HTML("<h4>ποΈ Record Your Voice</h4>")
                        audio_input = gr.Audio(
                            label="Click to record or upload audio file",
                            type="filepath",
                            format="mp3",
                            elem_classes=["audio-input"]
                        )

                        gr.HTML("<h4>π£οΈ AI Response (Audio)</h4>")
                        # FIX: was type="numpy", but process_audio() hands this
                        # component a file path, which "numpy" mode rejects.
                        audio_output = gr.Audio(
                            label="AI response will appear here",
                            type="filepath",
                            elem_classes=["audio-output"]
                        )
                        gr.HTML("</div>")

                    gr.HTML("<h3>π¬ Text Chat with Voice Features</h3>")

                    chatbot = gr.Chatbot(
                        label="Voice Assistant Chat",
                        height=300,
                        elem_classes=["chatbot"],
                        avatar_images=(None, "π€")
                    )

                    with gr.Row():
                        msg_input = gr.Textbox(
                            placeholder="Type or use voice input. Try: 'Transcribe this audio' or 'Say hello in a female voice'...",
                            lines=2,
                            max_lines=4,
                            label="Your Message"
                        )
                        with gr.Column(scale=0):
                            send_btn = gr.Button("Send", variant="primary")
                            clear_btn = gr.Button("Clear", variant="secondary")

                # --- Right column: service info, settings, status ------------
                with gr.Column(scale=1):
                    gr.HTML("<h3>π οΈ Voice Services</h3>")

                    tools_info = gr.HTML("""
                    <div class="tool-card">
                        <h4>ποΈ Speech-to-Text</h4>
                        <p>β’ Whisper transcription<br>β’ Multi-language support<br>β’ High accuracy</p>
                    </div>
                    <div class="tool-card">
                        <h4>π£οΈ Text-to-Speech</h4>
                        <p>β’ ElevenLabs synthesis<br>β’ Natural voices<br>β’ Emotional expression</p>
                    </div>
                    <div class="tool-card">
                        <h4>π¬ Voice Conversation</h4>
                        <p>β’ Full-duplex chat<br>β’ Real-time processing<br>β’ Context awareness</p>
                    </div>
                    <div class="tool-card">
                        <h4>π Multilingual</h4>
                        <p>β’ 5+ languages<br>β’ Auto-detection<br>β’ Cultural adaptation</p>
                    </div>
                    """)

                    gr.HTML("<h3>ποΈ Voice Settings</h3>")
                    # NOTE(review): these controls feed text_to_speech(), which
                    # is not yet wired to an event -- cosmetic for now.
                    with gr.Row():
                        voice_select = gr.Dropdown(
                            choices=["Adam (Male)", "Rachel (Female)", "Cloyd (Deep)", "Custom"],
                            value="Adam (Male)",
                            label="Voice Selection"
                        )
                        speed_slider = gr.Slider(0.5, 2.0, value=1.0, step=0.1, label="Speech Speed")

                    gr.HTML("<h3>π System Status</h3>")
                    status_display = gr.HTML()

            # --- Event handlers ----------------------------------------------

            def user(user_message, history):
                """Append the pending user message to the chat, clear the box."""
                if not user_message.strip():
                    return history, ""
                history.append((user_message, None))
                return history, ""

            async def bot_response(history):
                """Fill in the agent's reply for the last pending message.

                FIX: the message is read from *history* rather than the
                textbox -- user() has already cleared the textbox by the time
                this runs in the .then() chain, so the old code always saw an
                empty string and the bot never answered.
                """
                if not history or history[-1][1] is not None:
                    return history
                user_message = history[-1][0]
                try:
                    response = await self.agent.handle_user_input(user_message)
                except Exception as e:  # keep the UI responsive on agent errors
                    response = f"Error: {str(e)}"
                history[-1] = (user_message, response)
                return history

            async def process_audio(audio_file, history):
                """Process uploaded or recorded audio.

                FIX: takes and returns the chat history so the text reply lands
                in the chatbot as a proper (user, bot) pair; the old code sent
                a bare string to the Chatbot component, which is invalid.
                """
                if not audio_file:
                    return None, history
                try:
                    response = await self.agent.handle_user_input("process this audio file")
                    history = history + [("ποΈ [voice message]", response)]
                except Exception as e:
                    history = history + [("ποΈ [voice message]", f"Error processing audio: {str(e)}")]
                return audio_file, history

            async def text_to_speech(text, voice_style, speed):
                """Convert text to speech via the agent.

                FIX: no longer returns a fabricated temp_audio_<hash>.mp3 path
                that was never written to disk (it made the Audio component
                error out).  TODO(review): surface the real synthesized file
                once the agent exposes its output path.
                """
                if not text.strip():
                    return None, "No text provided"
                try:
                    voice_prompt = f"speak: {text} with {voice_style} voice at {speed}x speed"
                    response = await self.agent.handle_user_input(voice_prompt)
                    return None, response
                except Exception as e:
                    return None, f"Error generating speech: {str(e)}"

            def clear_conversation():
                """Reset the chatbot history."""
                return []

            def update_status():
                """Render the status panel from the agent's current state."""
                status = self.agent.get_status()
                voice_settings = self.agent.config.get("voice_settings", {})
                return f"""
                <div class="status-card" style="padding: 15px; border-radius: 8px;">
                    <h4>β Voice System Status</h4>
                    <p><strong>Agent:</strong> {status['name']}</p>
                    <p><strong>Status:</strong> {status['status']}</p>
                    <p><strong>Whisper:</strong> {voice_settings.get('whisper_model', 'whisper-1')}</p>
                    <p><strong>ElevenLabs:</strong> Active</p>
                    <p><strong>Languages:</strong> 5+ supported</p>
                    <p><strong>Security:</strong> {'π‘οΈ Enabled' if status['security_enabled'] else 'β Disabled'}</p>
                </div>
                """

            # --- Event wiring ------------------------------------------------
            # bot_response takes only the chatbot state -- see FIX above.
            send_btn.click(
                user,
                inputs=[msg_input, chatbot],
                outputs=[chatbot, msg_input]
            ).then(
                bot_response,
                inputs=[chatbot],
                outputs=[chatbot]
            )

            msg_input.submit(
                user,
                inputs=[msg_input, chatbot],
                outputs=[chatbot, msg_input]
            ).then(
                bot_response,
                inputs=[chatbot],
                outputs=[chatbot]
            )

            # FIX: pass the chat history through so process_audio can append a
            # well-formed message pair instead of a bare string.
            audio_input.change(
                process_audio,
                inputs=[audio_input, chatbot],
                outputs=[audio_output, chatbot]
            )

            # FIX: removed the dead generate_speech() wrapper -- it returned an
            # un-awaited coroutine and was never attached to any event.

            clear_btn.click(clear_conversation, outputs=chatbot)

            app.load(update_status, outputs=status_display)

        return app

    def launch(self, **kwargs):
        """Launch the Gradio application.

        FIX: defaults are merged with **kwargs instead of being passed
        alongside them, so callers can override e.g. server_port without
        hitting a duplicate-keyword TypeError.
        """
        options = {
            "server_name": "0.0.0.0",  # NOTE(review): exposes the UI on all interfaces
            "server_port": 7863,
            "share": False,
            "show_error": True,
            "quiet": False,
        }
        options.update(kwargs)  # caller-supplied values win
        self.interface.launch(**options)
|
|
|
|
|
|
|
|
|
|
|
# Canned example prompts; printed by main() in the startup banner as usage
# hints for the voice agent.
EXAMPLE_QUERIES = [
    "Transcribe this audio file",
    "Say 'Hello, welcome to our voice AI' in a female voice",
    "Start a voice conversation",
    "Analyze the sentiment of this audio",
    "Search for meeting recordings about project updates",
    "Enable multilingual voice mode"
]
|
|
|
|
|
|
|
|
def main():
    """Print the startup banner, build the app, and serve it (blocking)."""
    separator = "=" * 60

    # Progress messages shown before the (potentially slow) app construction.
    for startup_line in (
        "π€ Starting Voice Agent...",
        "ποΈ Initializing Whisper (Speech-to-Text)...",
        "π£οΈ Loading ElevenLabs (Text-to-Speech)...",
        "π§ Connecting AI models (GPT-4o, Gemini)...",
        "π Setting up multilingual support...",
    ):
        print(startup_line)

    app = VoiceApp()

    print("\n" + separator)
    print("π€ VOICE AGENT - SPEECH PROCESSING SUITE")
    print(separator)

    print("\nπ‘ Example voice requests you can try:")
    for number, example in enumerate(EXAMPLE_QUERIES, 1):
        print(f" {number}. {example}")

    print("\nποΈ Features:")
    for feature_line in (
        " β’ Record your voice or upload audio files",
        " β’ Convert text to natural-sounding speech",
        " β’ Full voice conversations with AI",
        " β’ Multi-language support (English, Spanish, Nepali, etc.)",
    ):
        print(feature_line)

    print("\nπ Starting Gradio server...")
    print("π Open your browser to: http://localhost:7863")
    print("\n" + separator)

    # Blocks until the Gradio server shuts down.
    app.launch()
|
|
|
|
|
|
|
|
if __name__ == "__main__": |
|
|
main() |