"""
Voice Worker for Modal Deployment

Handles voice processing tasks (transcription, synthesis, conversation,
language detection) on Modal infrastructure. External service calls are
currently mocked for testing.
"""
import asyncio
import base64
import json
import logging
from datetime import datetime, timezone
from typing import Any, Dict, List, Optional

import modal

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

app = modal.App("voice-worker")
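
# NOTE: The worker below is self-contained, so bare @app.function() decorators
# suffice. A real deployment would likely attach a container image with the
# client libraries plus secrets for API keys -- a minimal sketch (the package
# and secret names are assumptions, not part of this project):
#
#   image = modal.Image.debian_slim().pip_install("openai", "elevenlabs")
#
#   @app.function(image=image, secrets=[modal.Secret.from_name("openai-api-key")])
#   async def ...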


class VoiceWorker:
    """Voice processing worker for Modal deployment."""

    def __init__(self):
        self.config = {
            "whisper_model": "whisper-1",
            "voice_id": "pNInz6obpgDQGcFmaJgB",
            "language": "en",
            "response_format": "json",
        }

    async def process_whisper_transcription(self, audio_data: str, language: str = "auto") -> Dict[str, Any]:
        """Process audio with Whisper for transcription."""
        try:
            # Mock implementation: simulate API latency, then return a canned
            # transcription with word-level timestamps.
            await asyncio.sleep(0.1)

            mock_transcription = {
                "text": "Hello, this is a test of the voice transcription system.",
                "language": language,
                "duration": 4.2,
                "confidence": 0.97,
                "words": [
                    {"word": "Hello", "start": 0.0, "end": 0.5, "confidence": 0.99},
                    {"word": "this", "start": 0.6, "end": 0.8, "confidence": 0.95},
                    {"word": "is", "start": 0.9, "end": 1.1, "confidence": 0.98},
                    {"word": "a", "start": 1.2, "end": 1.3, "confidence": 0.94},
                    {"word": "test", "start": 1.4, "end": 1.8, "confidence": 0.99},
                    {"word": "of", "start": 1.9, "end": 2.1, "confidence": 0.96},
                    {"word": "the", "start": 2.2, "end": 2.4, "confidence": 0.98},
                    {"word": "voice", "start": 2.5, "end": 2.9, "confidence": 0.97},
                    {"word": "transcription", "start": 3.0, "end": 3.8, "confidence": 0.99},
                    {"word": "system", "start": 3.9, "end": 4.2, "confidence": 0.98},
                ],
            }

            logger.info(f"Whisper transcription completed: {len(mock_transcription['text'])} characters")
            return mock_transcription

        except Exception as e:
            logger.error(f"Whisper transcription error: {e}")
            return {"error": str(e), "text": None}

    async def process_elevenlabs_synthesis(
        self, text: str, voice_id: Optional[str] = None, stability: float = 0.5
    ) -> Dict[str, Any]:
        """Process text with ElevenLabs for voice synthesis."""
        try:
            # Fall back to the configured default voice when none is given.
            voice_id = voice_id or self.config["voice_id"]

            # Mock implementation: simulate API latency and estimate the output
            # duration and size from the text length.
            await asyncio.sleep(0.2)

            audio_duration = len(text) * 0.1  # rough estimate: ~0.1 s per character
            audio_size = len(text) * 0.5  # rough estimate: ~0.5 KB per character

            mock_audio_data = base64.b64encode(b"mock_audio_data").decode()

            voice_names = {
                "pNInz6obpgDQGcFmaJgB": "Adam (Male, Professional)",
                "21m00Tcm4TlvDq8ikWAM": "Rachel (Female, Warm)",
                "29vD33N1CtxCmqQRPOHJ": "Clyde (Male, Deep)",
            }

            mock_synthesis = {
                "audio_data": mock_audio_data,
                "duration": audio_duration,
                "voice_name": voice_names.get(voice_id, "Custom Voice"),
                "voice_id": voice_id,
                "model_id": "eleven_monolingual_v1",
                "settings": {
                    "stability": stability,
                    "similarity_boost": 0.5,
                    "style": 0.0,
                    "use_speaker_boost": True,
                },
                "file_size_kb": audio_size,
                "format": "mp3",
                "sample_rate": 44100,
            }

            logger.info(f"ElevenLabs synthesis completed: {audio_duration:.1f}s audio")
            return mock_synthesis

        except Exception as e:
            logger.error(f"ElevenLabs synthesis error: {e}")
            return {"error": str(e), "audio_data": None}

    async def process_gpt4o_conversation(self, user_input: str, context: Optional[List[Dict]] = None) -> Dict[str, Any]:
        """Process conversation with GPT-4o."""
        try:
            # Mock implementation: simulate API latency, then pick a canned
            # response by keyword matching. `context` is accepted for API
            # compatibility but is not used by the mock.
            await asyncio.sleep(0.15)

            lowered = user_input.lower()
            if any(word in lowered for word in ["hello", "hi", "hey"]):
                response = (
                    "Hello! I'm your voice AI assistant. How can I help you today? "
                    "I can transcribe audio, generate speech, or have a conversation with you."
                )
            elif any(word in lowered for word in ["transcribe", "speech to text"]):
                response = (
                    "I can transcribe your audio using Whisper AI. Please upload your audio file "
                    "or record directly, and I'll convert it to text with high accuracy."
                )
            elif any(word in lowered for word in ["speak", "say", "voice"]):
                response = (
                    "I can generate natural-sounding speech using ElevenLabs. What would you like "
                    "me to say? I have multiple voice options available."
                )
            elif any(word in lowered for word in ["translate", "language"]):
                response = (
                    "I support multiple languages including English, Spanish, French, and Nepali. "
                    "I can automatically detect the language and provide appropriate responses."
                )
            else:
                response = (
                    f"I understand you're asking about: '{user_input}'. As your voice AI, I can help "
                    "with transcription, speech synthesis, multilingual processing, and intelligent "
                    "conversations. What specific voice task would you like me to help with?"
                )

            mock_conversation = {
                "response": response,
                "model": "gpt-4o",
                "tokens_used": len(user_input.split()) + len(response.split()),
                "confidence": 0.95,
                "processing_time": 0.15,
                "context_aware": True,
                "timestamp": datetime.now(timezone.utc).isoformat(),
            }

            logger.info(f"GPT-4o conversation processed: {len(response)} character response")
            return mock_conversation

        except Exception as e:
            logger.error(f"GPT-4o conversation error: {e}")
            return {"error": str(e), "response": None}

    async def process_multilingual_detection(self, audio_data: str) -> Dict[str, Any]:
        """Detect language and process multilingual audio."""
        try:
            # Mock implementation: simulate API latency, then return a canned
            # detection result.
            await asyncio.sleep(0.1)

            mock_detection = {
                "detected_language": "en",
                "language_name": "English",
                "confidence": 0.94,
                "alternative_languages": [
                    {"language": "es", "confidence": 0.12},
                    {"language": "fr", "confidence": 0.08},
                    {"language": "ne", "confidence": 0.05},
                ],
                "auto_switch": True,
                "cultural_context": "Western business communication",
                "phonetic_features": {
                    "accent": "neutral",
                    "clarity": "high",
                    "speech_rate": "normal",
                },
            }

            logger.info(f"Language detection completed: {mock_detection['language_name']}")
            return mock_detection

        except Exception as e:
            logger.error(f"Language detection error: {e}")
            return {"error": str(e), "detected_language": None}


@app.function()
async def whisper_transcribe(audio_data: str, language: str = "auto") -> str:
    """Modal endpoint for Whisper transcription."""
    worker = VoiceWorker()
    result = await worker.process_whisper_transcription(audio_data, language)
    return json.dumps(result)


@app.function()
async def elevenlabs_synthesize(text: str, voice_id: str = "pNInz6obpgDQGcFmaJgB", stability: float = 0.5) -> str:
    """Modal endpoint for ElevenLabs voice synthesis."""
    worker = VoiceWorker()
    result = await worker.process_elevenlabs_synthesis(text, voice_id, stability)
    return json.dumps(result)


@app.function()
async def gpt4o_converse(user_input: str, context: str = "[]") -> str:
    """Modal endpoint for GPT-4o conversation."""
    worker = VoiceWorker()
    context_list = json.loads(context) or None  # empty list -> None
    result = await worker.process_gpt4o_conversation(user_input, context_list)
    return json.dumps(result)


@app.function()
async def detect_language(audio_data: str) -> str:
    """Modal endpoint for language detection."""
    worker = VoiceWorker()
    result = await worker.process_multilingual_detection(audio_data)
    return json.dumps(result)


@app.function()
async def voice_pipeline(audio_data: str, operation: str = "full", language: str = "auto") -> str:
    """Modal endpoint for the complete voice processing pipeline."""
    worker = VoiceWorker()

    try:
        if operation == "transcribe":
            result = await worker.process_whisper_transcription(audio_data, language)
        elif operation == "synthesize":
            # The mock synthesizes a fixed sample sentence with the default voice.
            text = "Hello, this is a test of the voice synthesis system."
            result = await worker.process_elevenlabs_synthesis(text)
        elif operation == "detect":
            result = await worker.process_multilingual_detection(audio_data)
        elif operation == "full":
            # Full pipeline: detect the language, transcribe, then generate a reply.
            detection = await worker.process_multilingual_detection(audio_data)
            transcription = await worker.process_whisper_transcription(
                audio_data, detection.get("detected_language", "en")
            )
            conversation = await worker.process_gpt4o_conversation(transcription.get("text", ""))

            result = {
                "pipeline": "complete",
                "language_detection": detection,
                "transcription": transcription,
                "conversation": conversation,
                "timestamp": datetime.now(timezone.utc).isoformat(),
            }
        else:
            result = {"error": f"Unknown operation: {operation}"}

        return json.dumps(result)

    except Exception as e:
        logger.error(f"Voice pipeline error: {e}")
        return json.dumps({"error": str(e), "operation": operation})


@app.function()
async def health_check() -> str:
    """Modal endpoint for health check."""
    # Static mock status; a real check would probe each downstream service.
    health_status = {
        "status": "healthy",
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "services": {
            "whisper": "available",
            "elevenlabs": "available",
            "gpt4o": "available",
            "language_detection": "available",
        },
        "version": "1.0.0",
        "uptime": "100%",
    }
    return json.dumps(health_status)
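

# Client-side usage sketch (not part of the worker): after deploying with
# `modal deploy <this file>`, the endpoints can be looked up and invoked
# remotely. The lookup API varies by Modal version; this is an illustrative
# assumption, not project-confirmed usage:
#
#   fn = modal.Function.from_name("voice-worker", "voice_pipeline")
#   result = json.loads(fn.remote(audio_b64, operation="full"))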


if __name__ == "__main__":

    async def test_voice_worker():
        worker = VoiceWorker()

        print("🎤 Testing Voice Worker...")

        print("\n1. Testing Whisper Transcription:")
        audio_data = base64.b64encode(b"mock_audio_data").decode()
        result = await worker.process_whisper_transcription(audio_data)
        print(f"   Result: {result.get('text', 'No text')}")

        print("\n2. Testing ElevenLabs Synthesis:")
        result = await worker.process_elevenlabs_synthesis("Hello, this is a test")
        print(f"   Voice: {result.get('voice_name', 'Unknown')}")
        print(f"   Duration: {result.get('duration', 0):.1f}s")

        print("\n3. Testing GPT-4o Conversation:")
        result = await worker.process_gpt4o_conversation("Hello, how can you help me?")
        print(f"   Response: {result.get('response', 'No response')[:100]}...")

        print("\n4. Testing Language Detection:")
        result = await worker.process_multilingual_detection(audio_data)
        print(f"   Language: {result.get('language_name', 'Unknown')} ({result.get('confidence', 0):.1%})")

        print("\n✅ Voice Worker tests completed!")

    asyncio.run(test_voice_worker())