Spaces:
Paused
Paused
MacBook pro
committed on
Commit
·
fc4f80f
1
Parent(s):
cbbb792
feat(voice): add voice processor skeleton and integrate timing into audio metrics
Browse files- app.py +10 -2
- voice_processor.py +90 -0
app.py
CHANGED
|
@@ -6,6 +6,7 @@ import traceback
|
|
| 6 |
import time
|
| 7 |
from metrics import metrics as _metrics_singleton, Metrics
|
| 8 |
from config import config
|
|
|
|
| 9 |
|
| 10 |
app = FastAPI(title="Mirage Phase 1+2 Scaffold")
|
| 11 |
|
|
@@ -49,11 +50,18 @@ async def _echo_websocket(websocket: WebSocket, kind: str):
|
|
| 49 |
interval = None
|
| 50 |
if last_ts is not None:
|
| 51 |
interval = now - last_ts
|
| 52 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 53 |
last_ts = now
|
| 54 |
elif kind == "video":
|
| 55 |
metrics.record_video_frame(size_bytes=size)
|
| 56 |
-
# Echo straight back
|
| 57 |
await websocket.send_bytes(data)
|
| 58 |
except WebSocketDisconnect:
|
| 59 |
# Silent disconnect
|
|
|
|
| 6 |
import time
|
| 7 |
from metrics import metrics as _metrics_singleton, Metrics
|
| 8 |
from config import config
|
| 9 |
+
from voice_processor import voice_processor
|
| 10 |
|
| 11 |
app = FastAPI(title="Mirage Phase 1+2 Scaffold")
|
| 12 |
|
|
|
|
| 50 |
interval = None
|
| 51 |
if last_ts is not None:
|
| 52 |
interval = now - last_ts
|
| 53 |
+
|
| 54 |
+
infer_ms = None
|
| 55 |
+
if config.voice_enable:
|
| 56 |
+
# Run through voice processor (pass-through currently)
|
| 57 |
+
processed_view, infer_ms = voice_processor.process_pcm_int16(data, sample_rate=16000)
|
| 58 |
+
# Use processed bytes for echo (still original length)
|
| 59 |
+
data = processed_view.tobytes()
|
| 60 |
+
metrics.record_audio_chunk(size_bytes=size, loop_interval_ms=interval, infer_time_ms=infer_ms)
|
| 61 |
last_ts = now
|
| 62 |
elif kind == "video":
|
| 63 |
metrics.record_video_frame(size_bytes=size)
|
| 64 |
+
# Echo straight back (audio maybe processed)
|
| 65 |
await websocket.send_bytes(data)
|
| 66 |
except WebSocketDisconnect:
|
| 67 |
# Silent disconnect
|
voice_processor.py
ADDED
|
@@ -0,0 +1,90 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Voice Processor Skeleton.
|
| 2 |
+
|
| 3 |
+
Phase: B3
|
| 4 |
+
|
| 5 |
+
Provides a minimal singleton VoiceProcessor with a lazy load() and a
|
| 6 |
+
process_pcm_int16 method. For now it only measures timing and returns
|
| 7 |
+
pass-through audio.
|
| 8 |
+
|
| 9 |
+
Future expansion hooks:
|
| 10 |
+
- VAD / segmentation
|
| 11 |
+
- Feature extraction (MFCCs, log-mel)
|
| 12 |
+
- Model inference (ASR, voice conversion, TTS, etc.)
|
| 13 |
+
- Streaming state management
|
| 14 |
+
|
| 15 |
+
The design keeps the API intentionally small so upstream code can remain
|
| 16 |
+
stable while internals evolve.
|
| 17 |
+
"""
|
| 18 |
+
from __future__ import annotations
|
| 19 |
+
|
| 20 |
+
import threading
|
| 21 |
+
import time
|
| 22 |
+
from dataclasses import dataclass
|
| 23 |
+
from typing import Optional
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
@dataclass
class VoiceResult:
    """Output bundle produced by one voice-processing pass.

    The audio is currently echoed through unchanged; richer fields
    (tokens, transcript text, extracted features) may be added later.
    """

    # Zero-copy view over the processed int16 PCM samples.
    pcm: memoryview
    # Sample rate of ``pcm`` in Hz.
    sample_rate: int
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
class VoiceProcessor:
    """Singleton voice-processing pipeline (pass-through for Phase B3).

    Exposes a tiny, stable API so upstream code does not change while
    internals grow (VAD, feature extraction, model inference are
    planned hooks — see module docstring).
    """

    _instance: Optional["VoiceProcessor"] = None
    _instance_lock = threading.Lock()

    def __init__(self) -> None:
        self._loaded = False
        self._load_lock = threading.Lock()
        # Placeholder flag for model / pipeline objects to come.
        self._models_ready = False

    # ------------- Singleton Access -------------
    @classmethod
    def get(cls) -> "VoiceProcessor":
        """Return the process-wide instance, creating it on first call.

        Uses double-checked locking so the common (already-created)
        path takes no lock.
        """
        if cls._instance is None:
            with cls._instance_lock:
                if cls._instance is None:  # double-checked
                    cls._instance = cls()
        return cls._instance

    # ------------- Lifecycle -------------
    def load(self) -> None:
        """Lazily initialize models / resources (idempotent, thread-safe).

        Keep it extremely fast right now; real setup work arrives in a
        later phase.
        """
        if self._loaded:
            return
        with self._load_lock:
            if self._loaded:
                return
            # Simulate minimal setup work (no sleep to keep fast).
            self._models_ready = True
            self._loaded = True

    # ------------- Processing -------------
    def process_pcm_int16(
        self, pcm: bytes | bytearray | memoryview, sample_rate: int
    ) -> tuple[memoryview, float]:
        """Process an int16 PCM chunk (currently pass-through).

        Args:
            pcm: raw int16 PCM samples.
            sample_rate: sample rate in Hz (unused for now; kept for the
                future signal chain).

        Returns:
            Tuple of (processed_pcm_memoryview, elapsed_ms). The view
            aliases the caller's buffer — zero-copy, no bytes duplicated.
        """
        if not self._loaded:
            self.load()
        # FIX: use the monotonic perf_counter() for elapsed timing.
        # time.time() is wall-clock and can jump (NTP adjustments),
        # which could report negative or wildly wrong infer_ms values.
        start = time.perf_counter()
        # Pass-through: prefer a zero-copy memoryview over copying.
        mv = memoryview(pcm)
        # Placeholder for future signal chain (VAD, features, inference).
        elapsed_ms = (time.perf_counter() - start) * 1000.0
        return mv, elapsed_ms
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
# Export singleton accessor
# Module-level convenience handle: callers can simply
# `from voice_processor import voice_processor` rather than invoking
# VoiceProcessor.get() themselves. Created (but not loaded) at import
# time; load() is deferred until the first process_pcm_int16 call.
voice_processor = VoiceProcessor.get()
|