MacBook pro committed on
Commit
fc4f80f
·
1 Parent(s): cbbb792

feat(voice): add voice processor skeleton and integrate timing into audio metrics

Browse files
Files changed (2) hide show
  1. app.py +10 -2
  2. voice_processor.py +90 -0
app.py CHANGED
@@ -6,6 +6,7 @@ import traceback
6
  import time
7
  from metrics import metrics as _metrics_singleton, Metrics
8
  from config import config
 
9
 
10
  app = FastAPI(title="Mirage Phase 1+2 Scaffold")
11
 
@@ -49,11 +50,18 @@ async def _echo_websocket(websocket: WebSocket, kind: str):
49
  interval = None
50
  if last_ts is not None:
51
  interval = now - last_ts
52
- metrics.record_audio_chunk(size_bytes=size, loop_interval_ms=interval)
 
 
 
 
 
 
 
53
  last_ts = now
54
  elif kind == "video":
55
  metrics.record_video_frame(size_bytes=size)
56
- # Echo straight back
57
  await websocket.send_bytes(data)
58
  except WebSocketDisconnect:
59
  # Silent disconnect
 
6
  import time
7
  from metrics import metrics as _metrics_singleton, Metrics
8
  from config import config
9
+ from voice_processor import voice_processor
10
 
11
  app = FastAPI(title="Mirage Phase 1+2 Scaffold")
12
 
 
50
  interval = None
51
  if last_ts is not None:
52
  interval = now - last_ts
53
+
54
+ infer_ms = None
55
+ if config.voice_enable:
56
+ # Run through voice processor (pass-through currently)
57
+ processed_view, infer_ms = voice_processor.process_pcm_int16(data, sample_rate=16000)
58
+ # Use processed bytes for echo (still original length)
59
+ data = processed_view.tobytes()
60
+ metrics.record_audio_chunk(size_bytes=size, loop_interval_ms=interval, infer_time_ms=infer_ms)
61
  last_ts = now
62
  elif kind == "video":
63
  metrics.record_video_frame(size_bytes=size)
64
+ # Echo straight back (audio maybe processed)
65
  await websocket.send_bytes(data)
66
  except WebSocketDisconnect:
67
  # Silent disconnect
voice_processor.py ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Voice Processor Skeleton.
2
+
3
+ Phase: B3
4
+
5
+ Provides a minimal singleton VoiceProcessor with a lazy load() and a
6
+ process_pcm_int16 method. For now it only measures timing and returns
7
+ pass-through audio.
8
+
9
+ Future expansion hooks:
10
+ - VAD / segmentation
11
+ - Feature extraction (MFCCs, log-mel)
12
+ - Model inference (ASR, voice conversion, TTS, etc.)
13
+ - Streaming state management
14
+
15
+ The design keeps the API intentionally small so upstream code can remain
16
+ stable while internals evolve.
17
+ """
18
+ from __future__ import annotations
19
+
20
+ import threading
21
+ import time
22
+ from dataclasses import dataclass
23
+ from typing import Optional
24
+
25
+
26
@dataclass
class VoiceResult:
    """Container for voice processing output.

    For now it only carries the PCM data and its sample rate; no derived
    results (text, tokens, features) are attached yet.
    """

    pcm: memoryview  # zero-copy view of processed PCM int16 data
    sample_rate: int
    # Future: add tokens, text, features, etc.
35
+
36
+
37
class VoiceProcessor:
    """Shared voice-processing entry point with lazy resource loading.

    Use :meth:`get` to obtain the single shared instance. At present no
    signal processing is performed: :meth:`process_pcm_int16` returns a view
    over its input together with the time spent inside the processing
    section.
    """

    _instance: Optional["VoiceProcessor"] = None
    _instance_lock = threading.Lock()

    def __init__(self) -> None:
        self._loaded = False
        self._load_lock = threading.Lock()
        # Placeholder for model / pipeline objects
        self._models_ready = False

    # ------------- Singleton Access -------------
    @classmethod
    def get(cls) -> "VoiceProcessor":
        """Return the shared instance, creating it on first use."""
        if cls._instance is None:
            with cls._instance_lock:
                # Re-check under the lock so concurrent first callers
                # cannot each construct their own instance.
                if cls._instance is None:
                    cls._instance = cls()
        return cls._instance

    # ------------- Lifecycle -------------
    def load(self) -> None:
        """Lazily load models / resources.

        Only the first call does any work; later calls return immediately.
        The setup is currently trivial (it just flips the readiness flags).
        """
        if self._loaded:
            return
        with self._load_lock:
            # Another thread may have finished loading while we waited.
            if self._loaded:
                return
            # Simulate minimal setup work (no sleep to keep fast)
            self._models_ready = True
            self._loaded = True

    # ------------- Processing -------------
    def process_pcm_int16(self, pcm: bytes | bytearray | memoryview, sample_rate: int) -> tuple[memoryview, float]:
        """Process an int16 PCM chunk.

        Args:
            pcm: Raw PCM buffer. It is wrapped in a ``memoryview`` and not
                copied, so the returned view shares memory with it.
            sample_rate: Sample rate of ``pcm``. Accepted for API stability;
                the current pass-through implementation does not read it.

        Returns:
            A tuple ``(processed_pcm_memoryview, elapsed_ms)``. The elapsed
            time covers only the processing section, not the lazy ``load()``
            that may run first.
        """
        if not self._loaded:
            self.load()
        # perf_counter() is monotonic and high resolution; time.time() is a
        # wall clock that may be adjusted mid-measurement and can therefore
        # yield negative or wildly wrong durations.
        start = time.perf_counter()
        # Pass-through: we could copy but we prefer zero-copy memoryview
        mv = memoryview(pcm)
        # Placeholder for future signal chain
        elapsed_ms = (time.perf_counter() - start) * 1000.0
        return mv, elapsed_ms
87
+
88
+
89
+ # Export the shared singleton instance (created when this module is imported)
90
+ voice_processor = VoiceProcessor.get()