"""
Voice Agent Gradio Application
Web interface for the Voice Agent with microphone support
"""
import gradio as gr
import logging

# Support both package use (`from package import app`) and direct execution
# (`python app.py`), where the relative import would fail.
try:
    from .voice_agent import VoiceAgent
except ImportError:
    from voice_agent import VoiceAgent
class VoiceApp:
"""Gradio web application for Voice Agent."""
def __init__(self):
self.agent = VoiceAgent()
self.conversation_history = []
# Set up logging
logging.basicConfig(level=logging.INFO)
# Create the interface
self.interface = self._create_interface()
def _create_interface(self):
"""Create the Gradio interface."""
with gr.Blocks(
title="🎀 Voice Agent - Secure AI Suite",
theme=gr.themes.Soft(
primary_hue="orange",
secondary_hue="gray",
neutral_hue="slate"
),
css="""
.container { max-width: 1200px; margin: auto; }
.chatbot { height: 500px; }
.status-card { background: linear-gradient(90deg, #fa709a 0%, #fee140 100%); color: white; }
.tool-card { border: 2px solid #e2e8f0; border-radius: 8px; padding: 12px; margin: 8px 0; }
.audio-controls { text-align: center; padding: 20px; background: #f8fafc; border-radius: 8px; }
"""
) as app:
# Header
gr.HTML("""
<div style='text-align: center; padding: 20px; background: linear-gradient(90deg, #fa709a 0%, #fee140 100%); color: white; border-radius: 10px;'>
                <h1 style='margin: 0; font-size: 2.5em;'>🎤 Voice Agent</h1>
                <p style='margin: 10px 0; font-size: 1.2em;'>Speech-to-AI & Text-to-Speech with Multi-modal Processing</p>
                <p style='margin: 0; opacity: 0.8;'>🔐 Secure AI Agents Suite</p>
</div>
""")
with gr.Row():
# Left column - Voice interface
with gr.Column(scale=2):
gr.HTML("<h3>πŸŽ™οΈ Voice Interaction</h3>")
# Audio input/output section
                    with gr.Column(elem_classes=["audio-controls"]):
                        # A bare gr.HTML("<div>") cannot wrap sibling components,
                        # so the styling class goes on the Column itself.
                        gr.HTML("<h4>🎙️ Record Your Voice</h4>")
                        audio_input = gr.Audio(
                            label="Click to record or upload audio file",
                            type="filepath",
                            format="mp3",
                            elem_classes=["audio-input"]
                        )
                        gr.HTML("<h4>🗣️ AI Response (Audio)</h4>")
                        audio_output = gr.Audio(
                            label="AI response will appear here",
                            type="filepath",  # handlers return file paths, not numpy arrays
                            elem_classes=["audio-output"]
                        )
gr.HTML("<h3>πŸ’¬ Text Chat with Voice Features</h3>")
                    chatbot = gr.Chatbot(
                        label="Voice Assistant Chat",
                        height=300,
                        elem_classes=["chatbot"]
                        # avatar_images expects image file paths or URLs, not
                        # emoji strings, so the avatar pair is omitted here.
                    )
with gr.Row():
msg_input = gr.Textbox(
placeholder="Type or use voice input. Try: 'Transcribe this audio' or 'Say hello in a female voice'...",
lines=2,
max_lines=4,
label="Your Message"
)
with gr.Column(scale=0):
send_btn = gr.Button("Send", variant="primary")
clear_btn = gr.Button("Clear", variant="secondary")
# Right column - Voice Tools and Settings
with gr.Column(scale=1):
gr.HTML("<h3>πŸ› οΈ Voice Services</h3>")
                    tools_info = gr.HTML("""
                        <div class="tool-card">
                            <h4>🎙️ Speech-to-Text</h4>
                            <p>• Whisper transcription<br>• Multi-language support<br>• High accuracy</p>
                        </div>
                        <div class="tool-card">
                            <h4>🗣️ Text-to-Speech</h4>
                            <p>• ElevenLabs synthesis<br>• Natural voices<br>• Emotional expression</p>
                        </div>
                        <div class="tool-card">
                            <h4>💬 Voice Conversation</h4>
                            <p>• Full-duplex chat<br>• Real-time processing<br>• Context awareness</p>
                        </div>
                        <div class="tool-card">
                            <h4>🌍 Multilingual</h4>
                            <p>• 5+ languages<br>• Auto-detection<br>• Cultural adaptation</p>
                        </div>
                    """)
gr.HTML("<h3>πŸŽ›οΈ Voice Settings</h3>")
with gr.Row():
voice_select = gr.Dropdown(
                            choices=["Adam (Male)", "Rachel (Female)", "Clyde (Deep)", "Custom"],
value="Adam (Male)",
label="Voice Selection"
)
speed_slider = gr.Slider(0.5, 2.0, value=1.0, step=0.1, label="Speech Speed")
gr.HTML("<h3>πŸ“Š System Status</h3>")
status_display = gr.HTML()
# Event handlers
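            # Chat uses Gradio's two-step pattern: `user` echoes the message
            # into the history immediately, then `bot_response` fills in the
            # reply. Gradio runs async handlers on its own event loop, so the
            # coroutines below need no manual asyncio plumbing.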
def user(user_message, history):
"""Handle user input."""
if not user_message.strip():
return history, ""
# Add user message to history
history.append((user_message, None))
return history, ""
            async def bot_response(history):
                """Generate the bot reply for the most recent user message."""
                # Only respond if there is a pending message (empty reply slot);
                # `user` has already cleared the textbox, so the message must be
                # read from the history rather than from the input component.
                if not history or history[-1][1] is not None:
                    return history
                user_message = history[-1][0]
                response = await self.agent.handle_user_input(user_message)
                history[-1] = (user_message, response)
                return history
            async def process_audio(audio_file, history):
                """Process uploaded or recorded audio and log the result in the chat."""
                if not audio_file:
                    return None, history
                try:
                    # NOTE: handle_user_input takes a text prompt here; routing
                    # the raw audio is left to VoiceAgent itself.
                    response = await self.agent.handle_user_input("process this audio file")
                    history = history + [("🎙️ [audio message]", response)]
                    return audio_file, history
                except Exception as e:
                    history = history + [("🎙️ [audio message]", f"Error processing audio: {e}")]
                    return audio_file, history
            async def text_to_speech(text, voice_style, speed):
                """Convert text to speech via the agent."""
                if not text.strip():
                    return None, "No text provided"
                try:
                    # Process with voice synthesis
                    voice_prompt = f"speak: {text} with {voice_style} voice at {speed}x speed"
                    response = await self.agent.handle_user_input(voice_prompt)
                    # No synthesized file is produced locally yet; returning a
                    # made-up path would make gr.Audio error, so return None
                    # until the agent exposes the generated audio file.
                    return None, response
                except Exception as e:
                    return None, f"Error generating speech: {str(e)}"
def clear_conversation():
"""Clear conversation history."""
return []
def update_status():
"""Update status display."""
status = self.agent.get_status()
voice_settings = self.agent.config.get("voice_settings", {})
return f"""
<div class="status-card" style="padding: 15px; border-radius: 8px;">
                    <h4>✅ Voice System Status</h4>
<p><strong>Agent:</strong> {status['name']}</p>
<p><strong>Status:</strong> {status['status']}</p>
<p><strong>Whisper:</strong> {voice_settings.get('whisper_model', 'whisper-1')}</p>
<p><strong>ElevenLabs:</strong> Active</p>
<p><strong>Languages:</strong> 5+ supported</p>
                    <p><strong>Security:</strong> {'🛡️ Enabled' if status['security_enabled'] else '❌ Disabled'}</p>
</div>
"""
# Connect events
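            # .then() chains the second step after the first completes, so the
            # user's message appears in the chat before the agent reply arrives.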
            send_btn.click(
                user,
                inputs=[msg_input, chatbot],
                outputs=[chatbot, msg_input]
            ).then(
                bot_response,
                inputs=[chatbot],
                outputs=[chatbot]
            )
            msg_input.submit(
                user,
                inputs=[msg_input, chatbot],
                outputs=[chatbot, msg_input]
            ).then(
                bot_response,
                inputs=[chatbot],
                outputs=[chatbot]
            )
            # Audio processing: echo the file back and post the reply in the chat
            audio_input.change(
                process_audio,
                inputs=[audio_input, chatbot],
                outputs=[audio_output, chatbot]
            )
            # Text-to-speech: `text_to_speech` above is not wired to a control
            # yet; connect it to a button (with voice_select and speed_slider as
            # inputs) once the agent returns a playable audio file.
clear_btn.click(clear_conversation, outputs=chatbot)
# Initial status update
app.load(update_status, outputs=status_display)
return app
def launch(self, **kwargs):
"""Launch the Gradio application."""
self.interface.launch(
server_name="0.0.0.0",
server_port=7863,
share=False,
show_error=True,
quiet=False,
**kwargs
)
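
# Usage sketch: launch() forwards extra keyword arguments straight to Gradio
# (any option not already fixed above), e.g. simple password protection via
# Gradio's standard `auth` parameter:
#
#   VoiceApp().launch(auth=("demo", "demo-password"))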
# Example usage and quick commands
EXAMPLE_QUERIES = [
"Transcribe this audio file",
"Say 'Hello, welcome to our voice AI' in a female voice",
"Start a voice conversation",
"Analyze the sentiment of this audio",
"Search for meeting recordings about project updates",
"Enable multilingual voice mode"
]
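# Each example is plain text: the UI passes it verbatim to
# VoiceAgent.handle_user_input(), which decides how to fulfil it.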
def main():
"""Main function to run the Voice Agent app."""
print("🎀 Starting Voice Agent...")
print("πŸŽ™οΈ Initializing Whisper (Speech-to-Text)...")
print("πŸ—£οΈ Loading ElevenLabs (Text-to-Speech)...")
print("🧠 Connecting AI models (GPT-4o, Gemini)...")
print("🌍 Setting up multilingual support...")
app = VoiceApp()
print("\n" + "="*60)
print("🎀 VOICE AGENT - SPEECH PROCESSING SUITE")
print("="*60)
print("\nπŸ’‘ Example voice requests you can try:")
for i, query in enumerate(EXAMPLE_QUERIES, 1):
print(f" {i}. {query}")
print("\nπŸŽ™οΈ Features:")
print(" β€’ Record your voice or upload audio files")
print(" β€’ Convert text to natural-sounding speech")
print(" β€’ Full voice conversations with AI")
print(" β€’ Multi-language support (English, Spanish, Nepali, etc.)")
print("\n🌐 Starting Gradio server...")
print("πŸ”— Open your browser to: http://localhost:7863")
print("\n" + "="*60)
app.launch()
if __name__ == "__main__":
main()