"""
Multilingual Voice Processing Tools
STT and TTS with language support
"""
import whisper
import numpy as np
from gtts import gTTS
import edge_tts
import io
import asyncio
from typing import Tuple, Optional
from crewai.tools import BaseTool
import speech_recognition as sr
class MultilingualVoiceProcessor:
    """Handles multilingual STT and TTS"""

    def __init__(self):
        # Load Whisper model for multilingual STT
        self.whisper_model = whisper.load_model("base")
        # Language-to-voice mappings for Edge TTS
        self.voice_map = {
            "en": "en-US-AriaNeural",
            "es": "es-ES-ElviraNeural",
            "fr": "fr-FR-DeniseNeural",
            "de": "de-DE-KatjaNeural",
            "it": "it-IT-ElsaNeural",
            "pt": "pt-BR-FranciscaNeural",
            "hi": "hi-IN-SwaraNeural",
            "zh": "zh-CN-XiaoxiaoNeural",
            "ja": "ja-JP-NanamiNeural",
            "ko": "ko-KR-SunHiNeural",
            "ar": "ar-SA-ZariyahNeural",
            "ru": "ru-RU-SvetlanaNeural"
        }
    async def transcribe(
        self,
        audio_data: np.ndarray,
        language: Optional[str] = None
    ) -> Tuple[str, str]:
        """Transcribe audio to text with language detection"""
        try:
            # Gradio-style inputs may arrive as a (sample_rate, array) tuple
            if isinstance(audio_data, tuple):
                sample_rate, audio = audio_data
            else:
                audio = audio_data
                sample_rate = 16000

            # Normalize 16-bit PCM to float32 in [-1, 1]
            if audio.dtype != np.float32:
                audio = audio.astype(np.float32) / 32768.0

            # Whisper expects 16 kHz mono; do a simple linear resample if
            # the input rate differs
            if sample_rate != 16000:
                target_len = int(audio.shape[0] * 16000 / sample_rate)
                audio = np.interp(
                    np.linspace(0, audio.shape[0] - 1, target_len),
                    np.arange(audio.shape[0]),
                    audio
                ).astype(np.float32)

            # Transcribe with Whisper
            if language and language != "auto":
                result = self.whisper_model.transcribe(audio, language=language)
            else:
                # Auto-detect language
                result = self.whisper_model.transcribe(audio)

            text = result["text"].strip()
            detected_language = result["language"]
            return text, detected_language
        except Exception as e:
            print(f"Transcription error: {e}")
            return "Could not transcribe audio", "en"
    async def synthesize(
        self,
        text: str,
        language: str = "en",
        voice_type: str = "normal"
    ) -> Optional[bytes]:
        """Convert text to speech with voice modulation"""
        try:
            voice = self.voice_map.get(language, "en-US-AriaNeural")

            # Apply voice settings for a calmer meditation tone
            if voice_type == "meditation":
                rate = "-15%"    # Slower
                pitch = "-50Hz"  # Lower pitch
            else:
                rate = "+0%"
                pitch = "+0Hz"

            # Generate speech and collect the streamed audio chunks
            communicate = edge_tts.Communicate(text, voice, rate=rate, pitch=pitch)
            audio_data = b""
            async for chunk in communicate.stream():
                if chunk["type"] == "audio":
                    audio_data += chunk["data"]
            return audio_data
        except Exception as e:
            print(f"TTS error: {e}")
            # Fall back to gTTS, which offers no rate/pitch control
            try:
                tts = gTTS(text=text, lang=language[:2])
                fp = io.BytesIO()
                tts.write_to_fp(fp)
                return fp.getvalue()
            except Exception as fallback_error:
                print(f"gTTS fallback error: {fallback_error}")
                return None
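
# A minimal round-trip sketch (illustrative only, not part of the module's
# public surface): transcribe a clip, then speak a short reply in the
# detected language. Assumes a float32 mono numpy array at 16 kHz; the
# reply text below is a placeholder.
async def _demo_roundtrip(audio: np.ndarray) -> Optional[bytes]:
    processor = MultilingualVoiceProcessor()
    text, lang = await processor.transcribe(audio)
    return await processor.synthesize(f"You said: {text}", language=lang)
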
class TranscribeTool(BaseTool):
    name: str = "transcribe_audio"
    description: str = "Transcribe audio input to text with language detection"

    def _run(self, audio_data: np.ndarray, language: Optional[str] = None) -> dict:
        processor = MultilingualVoiceProcessor()
        text, detected_lang = asyncio.run(
            processor.transcribe(audio_data, language)
        )
        return {
            "text": text,
            "language": detected_lang
        }
class DetectEmotionTool(BaseTool):
    name: str = "detect_emotion"
    description: str = "Detect emotional state from text using Mistral"

    def _run(self, text: str) -> dict:
        # Use Mistral for emotion detection
        from models.mistral_model import MistralModel
        model = MistralModel()

        prompt = f"""
        Analyze the emotional state in this text: "{text}"
        Identify:
        1. Primary emotion (joy, sadness, anger, fear, anxiety, confusion, etc.)
        2. Emotional intensity (low, medium, high)
        3. Underlying feelings
        4. Key concerns
        Format as JSON with keys: primary_emotion, intensity, feelings, concerns
        """
        response = model.generate(prompt)

        # Parse the model's JSON reply; fall back to a neutral default
        # if the output is not valid JSON
        try:
            return json.loads(response)
        except (json.JSONDecodeError, TypeError):
            return {
                "primary_emotion": "neutral",
                "intensity": "medium",
                "feelings": [],
                "concerns": []
            }
class GenerateQuestionsTool(BaseTool):
    name: str = "generate_reflective_questions"
    description: str = "Generate empathetic reflective questions"

    def _run(self, context: dict) -> list:
        emotion = context.get("primary_emotion", "neutral")
        questions_map = {
            "anxiety": [
                "What specific thoughts are creating this anxiety?",
                "What would feeling calm look like in this situation?",
                "What has helped you manage anxiety before?"
            ],
            "sadness": [
                "What would comfort mean to you right now?",
                "What are you grieving or missing?",
                "How can you be gentle with yourself today?"
            ],
            "confusion": [
                "What would clarity feel like?",
                "What's the main question you're grappling with?",
                "What does your intuition tell you?"
            ]
        }
        return questions_map.get(emotion, [
            "How are you feeling in this moment?",
            "What would support look like for you?",
            "What's most important to explore right now?"
        ])
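

if __name__ == "__main__":
    # Quick smoke test (a sketch, no audio hardware or models needed):
    # exercise the question generator directly via its _run method.
    tool = GenerateQuestionsTool()
    for question in tool._run({"primary_emotion": "anxiety"}):
        print(question)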