"""
Voice Agent Gradio Application
Web interface for the Voice Agent with microphone support
"""
import gradio as gr
import logging

# Support both package use (`from package import app`) and direct execution
# (`python app.py`), where the relative import would fail.
try:
    from .voice_agent import VoiceAgent
except ImportError:
    from voice_agent import VoiceAgent
class VoiceApp:
"""Gradio web application for Voice Agent."""
def __init__(self):
self.agent = VoiceAgent()
self.conversation_history = []
# Set up logging
logging.basicConfig(level=logging.INFO)
# Create the interface
self.interface = self._create_interface()
def _create_interface(self):
"""Create the Gradio interface."""
with gr.Blocks(
title="🎀 Voice Agent - Secure AI Suite",
theme=gr.themes.Soft(
primary_hue="orange",
secondary_hue="gray",
neutral_hue="slate"
),
css="""
.container { max-width: 1200px; margin: auto; }
.chatbot { height: 500px; }
.status-card { background: linear-gradient(90deg, #fa709a 0%, #fee140 100%); color: white; }
.tool-card { border: 2px solid #e2e8f0; border-radius: 8px; padding: 12px; margin: 8px 0; }
.audio-controls { text-align: center; padding: 20px; background: #f8fafc; border-radius: 8px; }
"""
) as app:
# Header
gr.HTML("""
<div style='text-align: center; padding: 20px; background: linear-gradient(90deg, #fa709a 0%, #fee140 100%); color: white; border-radius: 10px;'>
                <h1 style='margin: 0; font-size: 2.5em;'>🎤 Voice Agent</h1>
                <p style='margin: 10px 0; font-size: 1.2em;'>Speech-to-AI & Text-to-Speech with Multi-modal Processing</p>
                <p style='margin: 0; opacity: 0.8;'>🔐 Secure AI Agents Suite</p>
</div>
""")
with gr.Row():
# Left column - Voice interface
with gr.Column(scale=2):
gr.HTML("<h3>πŸŽ™οΈ Voice Interaction</h3>")
# Audio input/output section
                    with gr.Column(elem_classes=["audio-controls"]):
                        # A bare gr.HTML("<div>") cannot wrap sibling components,
                        # so the styling class goes on the Column itself.
                        gr.HTML("<h4>🎙️ Record Your Voice</h4>")
                        audio_input = gr.Audio(
                            label="Click to record or upload audio file",
                            type="filepath",
                            format="mp3",
                            elem_classes=["audio-input"]
                        )
                        gr.HTML("<h4>🗣️ AI Response (Audio)</h4>")
                        audio_output = gr.Audio(
                            label="AI response will appear here",
                            type="filepath",  # handlers return file paths, not numpy arrays
                            elem_classes=["audio-output"]
                        )
gr.HTML("<h3>πŸ’¬ Text Chat with Voice Features</h3>")
                    chatbot = gr.Chatbot(
                        label="Voice Assistant Chat",
                        height=300,
                        elem_classes=["chatbot"]
                        # avatar_images expects image file paths or URLs, not
                        # emoji strings, so the avatar pair is omitted here.
                    )
with gr.Row():
msg_input = gr.Textbox(
placeholder="Type or use voice input. Try: 'Transcribe this audio' or 'Say hello in a female voice'...",
lines=2,
max_lines=4,
label="Your Message"
)
with gr.Column(scale=0):
send_btn = gr.Button("Send", variant="primary")
clear_btn = gr.Button("Clear", variant="secondary")
# Right column - Voice Tools and Settings
with gr.Column(scale=1):
gr.HTML("<h3>πŸ› οΈ Voice Services</h3>")
                    tools_info = gr.HTML("""
                        <div class="tool-card">
                            <h4>🎙️ Speech-to-Text</h4>
                            <p>• Whisper transcription<br>• Multi-language support<br>• High accuracy</p>
                        </div>
                        <div class="tool-card">
                            <h4>🗣️ Text-to-Speech</h4>
                            <p>• ElevenLabs synthesis<br>• Natural voices<br>• Emotional expression</p>
                        </div>
                        <div class="tool-card">
                            <h4>💬 Voice Conversation</h4>
                            <p>• Full-duplex chat<br>• Real-time processing<br>• Context awareness</p>
                        </div>
                        <div class="tool-card">
                            <h4>🌍 Multilingual</h4>
                            <p>• 5+ languages<br>• Auto-detection<br>• Cultural adaptation</p>
                        </div>
                    """)
gr.HTML("<h3>πŸŽ›οΈ Voice Settings</h3>")
with gr.Row():
voice_select = gr.Dropdown(
                            choices=["Adam (Male)", "Rachel (Female)", "Clyde (Deep)", "Custom"],
value="Adam (Male)",
label="Voice Selection"
)
speed_slider = gr.Slider(0.5, 2.0, value=1.0, step=0.1, label="Speech Speed")
gr.HTML("<h3>πŸ“Š System Status</h3>")
status_display = gr.HTML()
# Event handlers
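            # Chat uses Gradio's two-step pattern: `user` echoes the message
            # into the history immediately, then `bot_response` fills in the
            # reply. Gradio runs async handlers on its own event loop, so the
            # coroutines below need no manual asyncio plumbing.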
def user(user_message, history):
"""Handle user input."""
if not user_message.strip():
return history, ""
# Add user message to history
history.append((user_message, None))
return history, ""
            async def bot_response(history):
                """Generate the bot reply for the most recent user message."""
                # Only respond if there is a pending message (empty reply slot);
                # `user` has already cleared the textbox, so the message must be
                # read from the history rather than from the input component.
                if not history or history[-1][1] is not None:
                    return history
                user_message = history[-1][0]
                response = await self.agent.handle_user_input(user_message)
                history[-1] = (user_message, response)
                return history
            async def process_audio(audio_file, history):
                """Process uploaded or recorded audio and log the result in the chat."""
                if not audio_file:
                    return None, history
                try:
                    # NOTE: handle_user_input takes a text prompt here; routing
                    # the raw audio is left to VoiceAgent itself.
                    response = await self.agent.handle_user_input("process this audio file")
                    history = history + [("🎙️ [audio message]", response)]
                    return audio_file, history
                except Exception as e:
                    history = history + [("🎙️ [audio message]", f"Error processing audio: {e}")]
                    return audio_file, history
            async def text_to_speech(text, voice_style, speed):
                """Convert text to speech via the agent."""
                if not text.strip():
                    return None, "No text provided"
                try:
                    # Process with voice synthesis
                    voice_prompt = f"speak: {text} with {voice_style} voice at {speed}x speed"
                    response = await self.agent.handle_user_input(voice_prompt)
                    # No synthesized file is produced locally yet; returning a
                    # made-up path would make gr.Audio error, so return None
                    # until the agent exposes the generated audio file.
                    return None, response
                except Exception as e:
                    return None, f"Error generating speech: {str(e)}"
def clear_conversation():
"""Clear conversation history."""
return []
def update_status():
"""Update status display."""
status = self.agent.get_status()
voice_settings = self.agent.config.get("voice_settings", {})
return f"""
<div class="status-card" style="padding: 15px; border-radius: 8px;">
                    <h4>✅ Voice System Status</h4>
<p><strong>Agent:</strong> {status['name']}</p>
<p><strong>Status:</strong> {status['status']}</p>
<p><strong>Whisper:</strong> {voice_settings.get('whisper_model', 'whisper-1')}</p>
<p><strong>ElevenLabs:</strong> Active</p>
<p><strong>Languages:</strong> 5+ supported</p>
                    <p><strong>Security:</strong> {'🛡️ Enabled' if status['security_enabled'] else '❌ Disabled'}</p>
</div>
"""
# Connect events
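            # .then() chains the second step after the first completes, so the
            # user's message appears in the chat before the agent reply arrives.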
            send_btn.click(
                user,
                inputs=[msg_input, chatbot],
                outputs=[chatbot, msg_input]
            ).then(
                bot_response,
                inputs=[chatbot],
                outputs=[chatbot]
            )
            msg_input.submit(
                user,
                inputs=[msg_input, chatbot],
                outputs=[chatbot, msg_input]
            ).then(
                bot_response,
                inputs=[chatbot],
                outputs=[chatbot]
            )
            # Audio processing: echo the file back and post the reply in the chat
            audio_input.change(
                process_audio,
                inputs=[audio_input, chatbot],
                outputs=[audio_output, chatbot]
            )
            # Text-to-speech: `text_to_speech` above is not wired to a control
            # yet; connect it to a button (with voice_select and speed_slider as
            # inputs) once the agent returns a playable audio file.
clear_btn.click(clear_conversation, outputs=chatbot)
# Initial status update
app.load(update_status, outputs=status_display)
return app
def launch(self, **kwargs):
"""Launch the Gradio application."""
self.interface.launch(
server_name="0.0.0.0",
server_port=7863,
share=False,
show_error=True,
quiet=False,
**kwargs
)
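
# Usage sketch: launch() forwards extra keyword arguments straight to Gradio
# (any option not already fixed above), e.g. simple password protection via
# Gradio's standard `auth` parameter:
#
#   VoiceApp().launch(auth=("demo", "demo-password"))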
# Example usage and quick commands
EXAMPLE_QUERIES = [
"Transcribe this audio file",
"Say 'Hello, welcome to our voice AI' in a female voice",
"Start a voice conversation",
"Analyze the sentiment of this audio",
"Search for meeting recordings about project updates",
"Enable multilingual voice mode"
]
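# Each example is plain text: the UI passes it verbatim to
# VoiceAgent.handle_user_input(), which decides how to fulfil it.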
def main():
"""Main function to run the Voice Agent app."""
print("🎀 Starting Voice Agent...")
print("πŸŽ™οΈ Initializing Whisper (Speech-to-Text)...")
print("πŸ—£οΈ Loading ElevenLabs (Text-to-Speech)...")
print("🧠 Connecting AI models (GPT-4o, Gemini)...")
print("🌍 Setting up multilingual support...")
app = VoiceApp()
print("\n" + "="*60)
print("🎀 VOICE AGENT - SPEECH PROCESSING SUITE")
print("="*60)
print("\nπŸ’‘ Example voice requests you can try:")
for i, query in enumerate(EXAMPLE_QUERIES, 1):
print(f" {i}. {query}")
print("\nπŸŽ™οΈ Features:")
print(" β€’ Record your voice or upload audio files")
print(" β€’ Convert text to natural-sounding speech")
print(" β€’ Full voice conversations with AI")
print(" β€’ Multi-language support (English, Spanish, Nepali, etc.)")
print("\n🌐 Starting Gradio server...")
print("πŸ”— Open your browser to: http://localhost:7863")
print("\n" + "="*60)
app.launch()
if __name__ == "__main__":
main()