Spaces:

datbkpro
/

voicebot

Sleeping

App Files Community

datbkpro committed on Oct 22

Commit

4b51187

verified ·

1 Parent(s): a1302e7

Update services/streaming_voice_service.py

Browse files

Files changed (1) hide show

services/streaming_voice_service.py +44 -18

services/streaming_voice_service.py CHANGED Viewed

@@ -181,11 +181,10 @@
 import io
 import numpy as np
 import soundfile as sf
-import threading
-import time
 import traceback
 from groq import Groq
-from typing import Optional, Callable
 from config.settings import settings
 from core.rag_system import EnhancedRAGSystem
 from core.tts_service import EnhancedTTSService
@@ -197,15 +196,11 @@ class StreamingVoiceService:
         self.rag_system = rag_system
         self.tts_service = tts_service
-        # Streaming state
-        self.is_listening = False
-        self.callback_handler = None
         # Conversation context
         self.conversation_history = []
         self.current_transcription = ""
-    def process_streaming_audio(self, audio_data: tuple) -> dict:
         """Xử lý audio streaming từ Gradio microphone component"""
         if not audio_data:
             return {
@@ -220,6 +215,14 @@ class StreamingVoiceService:
             print(f"🎯 Nhận audio: {len(audio_array)} samples, SR: {sample_rate}")
             # Chuyển đổi thành văn bản
             transcription = self._transcribe_audio(audio_array, sample_rate)
@@ -232,6 +235,9 @@ class StreamingVoiceService:
             print(f"📝 Đã chuyển đổi: {transcription}")
             # Tạo phản hồi AI
             response = self._generate_ai_response(transcription)
@@ -246,9 +252,10 @@ class StreamingVoiceService:
         except Exception as e:
             print(f"❌ Lỗi xử lý streaming audio: {e}")
             return {
                 'transcription': f"❌ Lỗi: {str(e)}",
-                'response': "Xin lỗi, có lỗi xảy ra",
                 'tts_audio': None
             }
@@ -259,14 +266,23 @@ class StreamingVoiceService:
             if audio_data.ndim > 1:
                 audio_data = np.mean(audio_data, axis=1)  # Chuyển sang mono
-            # Normalize
-            if np.max(np.abs(audio_data)) > 0:
-                audio_data = audio_data / np.max(np.abs(audio_data))
             buffer = io.BytesIO()
             sf.write(buffer, audio_data, sample_rate, format='wav', subtype='PCM_16')
             buffer.seek(0)
             transcription = self.client.audio.transcriptions.create(
                 model=settings.WHISPER_MODEL,
                 file=("speech.wav", buffer.read(), "audio/wav"),
@@ -276,11 +292,14 @@ class StreamingVoiceService:
             # Xử lý response
             if hasattr(transcription, 'text'):
-                return transcription.text.strip()
             elif isinstance(transcription, str):
-                return transcription.strip()
             else:
-                return str(transcription).strip()
         except Exception as e:
             print(f"❌ Lỗi transcription: {e}")
@@ -323,14 +342,19 @@ Thông tin tham khảo:
             return response
         except Exception as e:
-            return f"Xin lỗi, tôi gặp lỗi: {str(e)}"
     def _text_to_speech(self, text: str) -> Optional[str]:
         """Chuyển văn bản thành giọng nói"""
         try:
             tts_bytes = self.tts_service.text_to_speech(text, 'vi')
             if tts_bytes:
-                return self.tts_service.save_audio_to_file(tts_bytes)
         except Exception as e:
             print(f"❌ Lỗi TTS: {e}")
         return None
@@ -338,11 +362,13 @@ Thông tin tham khảo:
     def clear_conversation(self):
         """Xóa lịch sử hội thoại"""
         self.conversation_history = []
         print("🗑️ Đã xóa lịch sử hội thoại")
     def get_conversation_state(self) -> dict:
         """Lấy trạng thái hội thoại"""
         return {
             'history_length': len(self.conversation_history),
-            'current_transcription': self.current_transcription
         }

 import io
 import numpy as np
 import soundfile as sf
+import time  # THÊM IMPORT NÀY
 import traceback
 from groq import Groq
+from typing import Optional, Dict, Any
 from config.settings import settings
 from core.rag_system import EnhancedRAGSystem
 from core.tts_service import EnhancedTTSService
         self.rag_system = rag_system
         self.tts_service = tts_service
         # Conversation context
         self.conversation_history = []
         self.current_transcription = ""
+    def process_streaming_audio(self, audio_data: tuple) -> Dict[str, Any]:
         """Xử lý audio streaming từ Gradio microphone component"""
         if not audio_data:
             return {
             print(f"🎯 Nhận audio: {len(audio_array)} samples, SR: {sample_rate}")
+            # Kiểm tra audio có dữ liệu không
+            if len(audio_array) == 0 or np.max(np.abs(audio_array)) < 0.01:
+                return {
+                    'transcription': "❌ Âm thanh quá yếu",
+                    'response': "Xin vui lòng nói to hơn và rõ hơn",
+                    'tts_audio': None
+                }
             # Chuyển đổi thành văn bản
             transcription = self._transcribe_audio(audio_array, sample_rate)
             print(f"📝 Đã chuyển đổi: {transcription}")
+            # Cập nhật transcription hiện tại
+            self.current_transcription = transcription
             # Tạo phản hồi AI
             response = self._generate_ai_response(transcription)
         except Exception as e:
             print(f"❌ Lỗi xử lý streaming audio: {e}")
+            print(f"Chi tiết lỗi: {traceback.format_exc()}")
             return {
                 'transcription': f"❌ Lỗi: {str(e)}",
+                'response': "Xin lỗi, có lỗi xảy ra trong quá trình xử lý",
                 'tts_audio': None
             }
             if audio_data.ndim > 1:
                 audio_data = np.mean(audio_data, axis=1)  # Chuyển sang mono
+            # Normalize âm lượng
+            audio_max = np.max(np.abs(audio_data))
+            if audio_max > 0:
+                audio_data = audio_data / audio_max
+            # Giới hạn độ dài audio (tránh quá dài)
+            max_duration = 10  # giây
+            max_samples = sample_rate * max_duration
+            if len(audio_data) > max_samples:
+                audio_data = audio_data[:max_samples]
+                print(f"⚠️ Cắt audio xuống còn {max_duration} giây")
             buffer = io.BytesIO()
             sf.write(buffer, audio_data, sample_rate, format='wav', subtype='PCM_16')
             buffer.seek(0)
+            # Gọi API Whisper
             transcription = self.client.audio.transcriptions.create(
                 model=settings.WHISPER_MODEL,
                 file=("speech.wav", buffer.read(), "audio/wav"),
             # Xử lý response
             if hasattr(transcription, 'text'):
+                result = transcription.text.strip()
             elif isinstance(transcription, str):
+                result = transcription.strip()
             else:
+                result = str(transcription).strip()
+            print(f"✅ Transcription thành công: {result}")
+            return result
         except Exception as e:
             print(f"❌ Lỗi transcription: {e}")
             return response
         except Exception as e:
+            return f"Xin lỗi, tôi gặp lỗi khi tạo phản hồi: {str(e)}"
     def _text_to_speech(self, text: str) -> Optional[str]:
         """Chuyển văn bản thành giọng nói"""
         try:
+            if not text or text.startswith("❌") or text.startswith("Xin lỗi"):
+                return None
             tts_bytes = self.tts_service.text_to_speech(text, 'vi')
             if tts_bytes:
+                audio_path = self.tts_service.save_audio_to_file(tts_bytes)
+                print(f"✅ Đã tạo TTS: {audio_path}")
+                return audio_path
         except Exception as e:
             print(f"❌ Lỗi TTS: {e}")
         return None
     def clear_conversation(self):
         """Xóa lịch sử hội thoại"""
         self.conversation_history = []
+        self.current_transcription = ""
         print("🗑️ Đã xóa lịch sử hội thoại")
     def get_conversation_state(self) -> dict:
         """Lấy trạng thái hội thoại"""
         return {
             'history_length': len(self.conversation_history),
+            'current_transcription': self.current_transcription,
+            'last_update': time.strftime("%H:%M:%S")
         }