"""
Multilingual Voice Processing Tools
STT and TTS with language support
"""
import whisper
import numpy as np
from gtts import gTTS
import edge_tts
import io
import asyncio
from typing import Tuple, Optional
from crewai.tools import BaseTool
import speech_recognition as sr
class MultilingualVoiceProcessor:
    """Handles multilingual STT and TTS"""

    def __init__(self):
        # Load Whisper model for multilingual STT
        self.whisper_model = whisper.load_model("base")
        # Language-to-voice mappings for Edge TTS
        self.voice_map = {
            "en": "en-US-AriaNeural",
            "es": "es-ES-ElviraNeural",
            "fr": "fr-FR-DeniseNeural",
            "de": "de-DE-KatjaNeural",
            "it": "it-IT-ElsaNeural",
            "pt": "pt-BR-FranciscaNeural",
            "hi": "hi-IN-SwaraNeural",
            "zh": "zh-CN-XiaoxiaoNeural",
            "ja": "ja-JP-NanamiNeural",
            "ko": "ko-KR-SunHiNeural",
            "ar": "ar-SA-ZariyahNeural",
            "ru": "ru-RU-SvetlanaNeural"
        }
    async def transcribe(
        self,
        audio_data: np.ndarray,
        language: Optional[str] = None
    ) -> Tuple[str, str]:
        """Transcribe audio to text with language detection"""
        try:
            # Gradio-style inputs may arrive as a (sample_rate, array) tuple
            if isinstance(audio_data, tuple):
                sample_rate, audio = audio_data
            else:
                audio = audio_data
                sample_rate = 16000

            # Normalize 16-bit PCM to float32 in [-1, 1]
            if audio.dtype != np.float32:
                audio = audio.astype(np.float32) / 32768.0

            # Whisper expects 16 kHz mono; do a simple linear resample if
            # the input rate differs
            if sample_rate != 16000:
                target_len = int(audio.shape[0] * 16000 / sample_rate)
                audio = np.interp(
                    np.linspace(0, audio.shape[0] - 1, target_len),
                    np.arange(audio.shape[0]),
                    audio
                ).astype(np.float32)

            # Transcribe with Whisper
            if language and language != "auto":
                result = self.whisper_model.transcribe(audio, language=language)
            else:
                # Auto-detect language
                result = self.whisper_model.transcribe(audio)

            text = result["text"].strip()
            detected_language = result["language"]
            return text, detected_language
        except Exception as e:
            print(f"Transcription error: {e}")
            return "Could not transcribe audio", "en"
    async def synthesize(
        self,
        text: str,
        language: str = "en",
        voice_type: str = "normal"
    ) -> Optional[bytes]:
        """Convert text to speech with voice modulation"""
        try:
            voice = self.voice_map.get(language, "en-US-AriaNeural")

            # Apply voice settings for a calmer meditation tone
            if voice_type == "meditation":
                rate = "-15%"    # Slower
                pitch = "-50Hz"  # Lower pitch
            else:
                rate = "+0%"
                pitch = "+0Hz"

            # Generate speech and collect the streamed audio chunks
            communicate = edge_tts.Communicate(text, voice, rate=rate, pitch=pitch)
            audio_data = b""
            async for chunk in communicate.stream():
                if chunk["type"] == "audio":
                    audio_data += chunk["data"]
            return audio_data
        except Exception as e:
            print(f"TTS error: {e}")
            # Fall back to gTTS, which offers no rate/pitch control
            try:
                tts = gTTS(text=text, lang=language[:2])
                fp = io.BytesIO()
                tts.write_to_fp(fp)
                return fp.getvalue()
            except Exception as fallback_error:
                print(f"gTTS fallback error: {fallback_error}")
                return None
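
# A minimal round-trip sketch (illustrative only, not part of the module's
# public surface): transcribe a clip, then speak a short reply in the
# detected language. Assumes a float32 mono numpy array at 16 kHz; the
# reply text below is a placeholder.
async def _demo_roundtrip(audio: np.ndarray) -> Optional[bytes]:
    processor = MultilingualVoiceProcessor()
    text, lang = await processor.transcribe(audio)
    return await processor.synthesize(f"You said: {text}", language=lang)
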
class TranscribeTool(BaseTool):
    name: str = "transcribe_audio"
    description: str = "Transcribe audio input to text with language detection"

    def _run(self, audio_data: np.ndarray, language: Optional[str] = None) -> dict:
        processor = MultilingualVoiceProcessor()
        text, detected_lang = asyncio.run(
            processor.transcribe(audio_data, language)
        )
        return {
            "text": text,
            "language": detected_lang
        }
class DetectEmotionTool(BaseTool):
    name: str = "detect_emotion"
    description: str = "Detect emotional state from text using Mistral"

    def _run(self, text: str) -> dict:
        # Use Mistral for emotion detection
        from models.mistral_model import MistralModel
        model = MistralModel()

        prompt = f"""
        Analyze the emotional state in this text: "{text}"
        Identify:
        1. Primary emotion (joy, sadness, anger, fear, anxiety, confusion, etc.)
        2. Emotional intensity (low, medium, high)
        3. Underlying feelings
        4. Key concerns
        Format as JSON with keys: primary_emotion, intensity, feelings, concerns
        """
        response = model.generate(prompt)

        # Parse the model's JSON reply; fall back to a neutral default
        # if the output is not valid JSON
        try:
            return json.loads(response)
        except (json.JSONDecodeError, TypeError):
            return {
                "primary_emotion": "neutral",
                "intensity": "medium",
                "feelings": [],
                "concerns": []
            }
class GenerateQuestionsTool(BaseTool):
    name: str = "generate_reflective_questions"
    description: str = "Generate empathetic reflective questions"

    def _run(self, context: dict) -> list:
        emotion = context.get("primary_emotion", "neutral")
        questions_map = {
            "anxiety": [
                "What specific thoughts are creating this anxiety?",
                "What would feeling calm look like in this situation?",
                "What has helped you manage anxiety before?"
            ],
            "sadness": [
                "What would comfort mean to you right now?",
                "What are you grieving or missing?",
                "How can you be gentle with yourself today?"
            ],
            "confusion": [
                "What would clarity feel like?",
                "What's the main question you're grappling with?",
                "What does your intuition tell you?"
            ]
        }
        return questions_map.get(emotion, [
            "How are you feeling in this moment?",
            "What would support look like for you?",
            "What's most important to explore right now?"
        ])
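

if __name__ == "__main__":
    # Quick smoke test (a sketch, no audio hardware or models needed):
    # exercise the question generator directly via its _run method.
    tool = GenerateQuestionsTool()
    for question in tool._run({"primary_emotion": "anxiety"}):
        print(question)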