"""
Multilingual Voice Processing Tools
STT and TTS with language support
"""

import io
import json
import asyncio
from typing import Tuple, Optional, Union

import whisper
import numpy as np
import edge_tts
from gtts import gTTS
from crewai.tools import BaseTool

class MultilingualVoiceProcessor:
    """Handles multilingual STT and TTS"""
    
    def __init__(self):
        # Load Whisper model for multilingual STT
        self.whisper_model = whisper.load_model("base")
        
        # Language voice mappings for Edge TTS
        self.voice_map = {
            "en": "en-US-AriaNeural",
            "es": "es-ES-ElviraNeural",
            "fr": "fr-FR-DeniseNeural",
            "de": "de-DE-KatjaNeural",
            "it": "it-IT-ElsaNeural",
            "pt": "pt-BR-FranciscaNeural",
            "hi": "hi-IN-SwaraNeural",
            "zh": "zh-CN-XiaoxiaoNeural",
            "ja": "ja-JP-NanamiNeural",
            "ko": "ko-KR-SunHiNeural",
            "ar": "ar-SA-ZariyahNeural",
            "ru": "ru-RU-SvetlanaNeural"
        }
    
    async def transcribe(
        self,
        audio_data: Union[np.ndarray, Tuple[int, np.ndarray]],
        language: Optional[str] = None
    ) -> Tuple[str, str]:
        """Transcribe audio to text with language detection"""
        try:
            # Process audio
            if isinstance(audio_data, tuple):
                sample_rate, audio = audio_data
            else:
                audio = audio_data
                sample_rate = 16000
            
            # Whisper expects mono float32 audio at 16 kHz; the incoming
            # sample rate is assumed to already match (no resampling here).
            if audio.ndim > 1:
                audio = audio.mean(axis=1)
            if np.issubdtype(audio.dtype, np.integer):
                audio = audio.astype(np.float32) / 32768.0
            elif audio.dtype != np.float32:
                audio = audio.astype(np.float32)
            
            # Transcribe with Whisper
            if language and language != "auto":
                result = self.whisper_model.transcribe(
                    audio, 
                    language=language
                )
            else:
                # Auto-detect language
                result = self.whisper_model.transcribe(audio)
            
            text = result["text"].strip()
            detected_language = result["language"]
            
            return text, detected_language
            
        except Exception as e:
            print(f"Transcription error: {e}")
            return "Could not transcribe audio", "en"
    
    async def synthesize(
        self, 
        text: str, 
        language: str = "en",
        voice_type: str = "normal"
    ) -> Optional[bytes]:
        """Convert text to speech with voice modulation"""
        try:
            voice = self.voice_map.get(language, "en-US-AriaNeural")
            
            # Apply voice settings for meditation tone
            if voice_type == "meditation":
                rate = "-15%"  # Slower
                pitch = "-50Hz"  # Lower pitch
            else:
                rate = "+0%"
                pitch = "+0Hz"
            
            # Generate speech
            communicate = edge_tts.Communicate(
                text,
                voice,
                rate=rate,
                pitch=pitch
            )
            
            audio_data = b""
            async for chunk in communicate.stream():
                if chunk["type"] == "audio":
                    audio_data += chunk["data"]
            
            return audio_data
            
        except Exception as e:
            print(f"TTS error: {e}")
            # Fallback to gTTS
            try:
                tts = gTTS(text=text, lang=language[:2])
                fp = io.BytesIO()
                tts.write_to_fp(fp)
                return fp.getvalue()
            except Exception:
                return None

class TranscribeTool(BaseTool):
    name: str = "transcribe_audio"
    description: str = "Transcribe audio input to text with language detection"
    
    def _run(self, audio_data: np.ndarray, language: Optional[str] = None) -> dict:
        # Note: this loads the Whisper model on every call; cache the
        # processor if the tool is invoked frequently.
        processor = MultilingualVoiceProcessor()
        text, detected_lang = asyncio.run(
            processor.transcribe(audio_data, language)
        )
        return {
            "text": text,
            "language": detected_lang
        }

class DetectEmotionTool(BaseTool):
    name: str = "detect_emotion"
    description: str = "Detect emotional state from text using Mistral"
    
    def _run(self, text: str) -> dict:
        # Use Mistral for emotion detection
        from models.mistral_model import MistralModel
        model = MistralModel()
        
        prompt = f"""
        Analyze the emotional state in this text: "{text}"
        
        Identify:
        1. Primary emotion (joy, sadness, anger, fear, anxiety, confusion, etc.)
        2. Emotional intensity (low, medium, high)
        3. Underlying feelings
        4. Key concerns
        
        Format as JSON with keys: primary_emotion, intensity, feelings, concerns
        """
        
        response = model.generate(prompt)
        
        # Parse the model's JSON output; fall back to neutral defaults
        # if the response is not valid JSON.
        try:
            return json.loads(response)
        except (json.JSONDecodeError, TypeError):
            return {
                "primary_emotion": "neutral",
                "intensity": "medium",
                "feelings": [],
                "concerns": []
            }

class GenerateQuestionsTool(BaseTool):
    name: str = "generate_reflective_questions"
    description: str = "Generate empathetic reflective questions"
    
    def _run(self, context: dict) -> list:
        emotion = context.get("primary_emotion", "neutral")
        
        questions_map = {
            "anxiety": [
                "What specific thoughts are creating this anxiety?",
                "What would feeling calm look like in this situation?",
                "What has helped you manage anxiety before?"
            ],
            "sadness": [
                "What would comfort mean to you right now?",
                "What are you grieving or missing?",
                "How can you be gentle with yourself today?"
            ],
            "confusion": [
                "What would clarity feel like?",
                "What's the main question you're grappling with?",
                "What does your intuition tell you?"
            ]
        }
        
        return questions_map.get(emotion, [
            "How are you feeling in this moment?",
            "What would support look like for you?",
            "What's most important to explore right now?"
        ])