SalexAI committed on
Commit
9c5a6e7
·
verified ·
1 Parent(s): 277bec4

Update app/main.py

Browse files
Files changed (1) hide show
  1. app/main.py +205 -313
app/main.py CHANGED
@@ -1,335 +1,227 @@
1
- # app/main.py
2
- import os
3
- import json
4
  import asyncio
5
- from typing import Any, Dict, Optional, List
 
 
 
6
 
7
- from fastapi import FastAPI, WebSocket, WebSocketDisconnect
8
- from fastapi.responses import JSONResponse
9
  from dotenv import load_dotenv
10
- import websockets
11
-
12
- load_dotenv()
13
-
14
- app = FastAPI(title="Gemini Live Native-Audio WS Proxy", version="2.1.0")
15
-
16
- GEMINI_LIVE_WS_URL = (
17
- "wss://generativelanguage.googleapis.com/ws/"
18
- "google.ai.generativelanguage.v1beta.GenerativeService.BidiGenerateContent"
19
  )
20
 
21
- API_KEY = os.getenv("GEMINI_API_KEY", "").strip()
22
-
23
- # IMPORTANT: pick a REAL default model here (must support Live + native audio)
24
- # Put your known-working native audio model id below:
25
- FALLBACK_NATIVE_AUDIO_MODEL = "models/gemini-2.5-flash-native-audio-preview-12-2025"
26
-
27
- DEFAULT_MODEL = os.getenv("GEMINI_MODEL", FALLBACK_NATIVE_AUDIO_MODEL)
28
- DEFAULT_SYSTEM = os.getenv(
29
- "GEMINI_SYSTEM_INSTRUCTION",
30
- "You are an AI voice assitent named arrow, Please attempt to understand all commands as best you can, Be helpful and use inbulit functions instead of guessing."
31
  )
32
- DEFAULT_TEMPERATURE = float(os.getenv("GEMINI_TEMPERATURE", "0.7"))
33
- DEFAULT_MAX_TOKENS = int(os.getenv("GEMINI_MAX_OUTPUT_TOKENS", "1024"))
34
 
35
- DEFAULT_VOICE = os.getenv("GEMINI_VOICE_NAME", "Kore")
36
- DEFAULT_INPUT_RATE = int(os.getenv("GEMINI_INPUT_AUDIO_RATE", "16000"))
37
- DEFAULT_OUTPUT_RATE = int(os.getenv("GEMINI_OUTPUT_AUDIO_RATE", "24000"))
38
 
39
- DEBUG_GEMINI_RAW = os.getenv("DEBUG_GEMINI_RAW", "0").strip() == "1"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
 
42
- def _clean_str(x: Any) -> str:
43
- if not isinstance(x, str):
44
- return ""
45
- return x.strip()
46
 
 
 
47
 
48
- def _is_bad_model(s: str) -> bool:
49
- s2 = (s or "").strip().lower()
50
- return (not s2) or (s2 in {"undefined", "null", "none"})
51
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
 
53
- def _safe_model(model: Any) -> str:
54
- m = _clean_str(model)
55
- if _is_bad_model(m):
56
- m = _clean_str(DEFAULT_MODEL)
57
- if _is_bad_model(m):
58
- m = FALLBACK_NATIVE_AUDIO_MODEL
59
- return m
60
 
61
 
62
  @app.get("/health")
63
  async def health():
64
- model = _safe_model(DEFAULT_MODEL)
65
- ok = bool(API_KEY)
66
- return JSONResponse(
67
- {
68
- "ok": ok,
69
- "has_api_key": ok,
70
- "model": model,
71
- "voice": DEFAULT_VOICE,
72
- "input_rate": DEFAULT_INPUT_RATE,
73
- "output_rate": DEFAULT_OUTPUT_RATE,
74
- "debug_raw": DEBUG_GEMINI_RAW,
75
- }
76
- )
77
-
78
-
79
- def _extract_text_parts(content: Dict[str, Any]) -> str:
80
- parts = content.get("parts") or []
81
- out: List[str] = []
82
- for p in parts:
83
- if isinstance(p, dict) and isinstance(p.get("text"), str):
84
- out.append(p["text"])
85
- return "".join(out)
86
-
87
-
88
- def _extract_inline_audio_parts(content: Dict[str, Any]) -> List[Dict[str, str]]:
89
- parts = content.get("parts") or []
90
- out: List[Dict[str, str]] = []
91
- for p in parts:
92
- if not isinstance(p, dict):
93
- continue
94
- inline = p.get("inlineData")
95
- if isinstance(inline, dict):
96
- data = inline.get("data")
97
- mime = inline.get("mimeType")
98
- if isinstance(data, str) and isinstance(mime, str):
99
- out.append({"mime": mime, "data": data})
100
- return out
101
-
102
-
103
- async def _gemini_ws_connect(setup_payload: Dict[str, Any]):
104
- headers = {"x-goog-api-key": API_KEY}
105
- ws = await websockets.connect(
106
- GEMINI_LIVE_WS_URL,
107
- extra_headers=headers,
108
- max_size=32 * 1024 * 1024,
109
- ping_interval=20,
110
- ping_timeout=20,
111
- )
112
-
113
- await ws.send(json.dumps(setup_payload))
114
-
115
- while True:
116
- raw = await ws.recv()
117
- msg = json.loads(raw)
118
- if "setupComplete" in msg:
119
- return ws
120
- if "error" in msg:
121
- raise RuntimeError(f"Gemini setup error: {msg['error']}")
122
-
123
-
124
- @app.websocket("/ws")
125
- async def ws_proxy(client_ws: WebSocket):
126
- await client_ws.accept()
127
-
128
- if not API_KEY:
129
- await client_ws.send_text(json.dumps({"type": "error", "message": "Missing GEMINI_API_KEY on server."}))
130
- await client_ws.close(code=1011)
131
- return
132
-
133
- # Defaults per connection
134
- cfg = {
135
- "model": _safe_model(DEFAULT_MODEL),
136
- "system_instruction": _clean_str(DEFAULT_SYSTEM) or "You are helpful.",
137
- "temperature": DEFAULT_TEMPERATURE,
138
- "max_output_tokens": DEFAULT_MAX_TOKENS,
139
- "voice": _clean_str(DEFAULT_VOICE) or "Kore",
140
- "input_rate": DEFAULT_INPUT_RATE,
141
  }
142
 
143
- # Wait briefly for optional configure (FIRST message)
144
- pending_first: Optional[Dict[str, Any]] = None
145
- try:
146
- raw = await asyncio.wait_for(client_ws.receive_text(), timeout=1.2)
147
- first = json.loads(raw)
148
- if isinstance(first, dict) and first.get("type") == "configure":
149
- cfg["model"] = _safe_model(first.get("model"))
150
- si = _clean_str(first.get("system_instruction"))
151
- if si:
152
- cfg["system_instruction"] = si
153
- try:
154
- if first.get("temperature") is not None:
155
- cfg["temperature"] = float(first["temperature"])
156
- except Exception:
157
- pass
158
- try:
159
- if first.get("max_output_tokens") is not None:
160
- cfg["max_output_tokens"] = int(first["max_output_tokens"])
161
- except Exception:
162
- pass
163
- v = _clean_str(first.get("voice"))
164
- if v:
165
- cfg["voice"] = v
166
- try:
167
- if first.get("input_rate") is not None:
168
- cfg["input_rate"] = int(first["input_rate"])
169
- except Exception:
170
- pass
171
-
172
- await client_ws.send_text(json.dumps({"type": "configured"}))
173
- else:
174
- pending_first = first if isinstance(first, dict) else None
175
- except asyncio.TimeoutError:
176
- pass
177
- except Exception:
178
- pass
179
-
180
- # FINAL guard (this prevents “undefined” ever reaching Gemini)
181
- cfg["model"] = _safe_model(cfg["model"])
182
-
183
- # Build native-audio session setup
184
- setup_payload = {
185
- "setup": {
186
- "model": cfg["model"],
187
- "generationConfig": {
188
- "temperature": cfg["temperature"],
189
- "maxOutputTokens": cfg["max_output_tokens"],
190
- "responseModalities": ["AUDIO"],
191
- "speechConfig": {
192
- "voiceConfig": {
193
- "prebuiltVoiceConfig": {
194
- "voiceName": cfg["voice"],
195
- }
196
- }
197
- },
198
- },
199
- "inputAudioTranscription": {},
200
- "outputAudioTranscription": {},
201
- "systemInstruction": {
202
- "role": "system",
203
- "parts": [{"text": cfg["system_instruction"]}],
204
- },
205
- }
206
- }
207
-
208
- stop_event = asyncio.Event()
209
- gemini_ws = None
210
-
211
- try:
212
- gemini_ws = await _gemini_ws_connect(setup_payload)
213
- await client_ws.send_text(json.dumps({"type": "ready", "model": cfg["model"]}))
214
- except Exception as e:
215
- await client_ws.send_text(json.dumps({"type": "error", "message": f"Gemini setup failed: {e}"}))
216
- await client_ws.close(code=1011)
217
- return
218
-
219
- async def forward_client_to_gemini():
220
- nonlocal pending_first
221
- try:
222
- while not stop_event.is_set():
223
- if pending_first is not None:
224
- data = pending_first
225
- pending_first = None
226
- else:
227
- raw = await client_ws.receive_text()
228
- data = json.loads(raw)
229
-
230
- t = data.get("type")
231
-
232
- if t == "close":
233
- stop_event.set()
234
- return
235
-
236
- if t == "audio":
237
- b64 = data.get("data")
238
- rate = data.get("rate", cfg["input_rate"])
239
- if not isinstance(b64, str) or not b64:
240
- continue
241
- try:
242
- rate_i = int(rate)
243
- except Exception:
244
- rate_i = cfg["input_rate"]
245
-
246
- payload = {
247
- "realtimeInput": {
248
- "audio": {
249
- "data": b64,
250
- "mimeType": f"audio/pcm;rate={rate_i}",
251
- }
252
- }
253
- }
254
- await gemini_ws.send(json.dumps(payload))
255
- continue
256
-
257
- if t == "audio_end":
258
- await gemini_ws.send(json.dumps({"realtimeInput": {"audioStreamEnd": True}}))
259
- continue
260
-
261
- if t == "text":
262
- text = data.get("text", "")
263
- if isinstance(text, str) and text.strip():
264
- payload = {
265
- "clientContent": {
266
- "turns": [{"role": "user", "parts": [{"text": text.strip()}]}],
267
- "turnComplete": True,
268
- }
269
- }
270
- await gemini_ws.send(json.dumps(payload))
271
- continue
272
-
273
- await client_ws.send_text(json.dumps({"type": "error", "message": f"Unknown message type: {t}"}))
274
-
275
- except WebSocketDisconnect:
276
- stop_event.set()
277
- except Exception as e:
278
- stop_event.set()
279
- try:
280
- await client_ws.send_text(json.dumps({"type": "error", "message": str(e)}))
281
- except Exception:
282
- pass
283
 
284
- async def forward_gemini_to_client():
285
- try:
286
- while not stop_event.is_set():
287
- raw = await gemini_ws.recv()
288
- msg = json.loads(raw)
289
-
290
- if DEBUG_GEMINI_RAW:
291
- await client_ws.send_text(json.dumps({"type": "gemini_raw", "message": msg}))
292
-
293
- server_content = msg.get("serverContent")
294
- if isinstance(server_content, dict):
295
- model_turn = server_content.get("modelTurn")
296
- if isinstance(model_turn, dict):
297
- txt = _extract_text_parts(model_turn)
298
- if txt:
299
- await client_ws.send_text(json.dumps({"type": "text_delta", "text": txt}))
300
-
301
- audios = _extract_inline_audio_parts(model_turn)
302
- for a in audios:
303
- await client_ws.send_text(
304
- json.dumps({"type": "audio_delta", "mime": a["mime"], "data": a["data"]})
305
- )
306
-
307
- out_tx = server_content.get("outputTranscription")
308
- if isinstance(out_tx, dict) and isinstance(out_tx.get("text"), str):
309
- await client_ws.send_text(
310
- json.dumps({"type": "output_transcript_delta", "text": out_tx["text"]})
311
- )
312
-
313
- if server_content.get("generationComplete") is True:
314
- await client_ws.send_text(json.dumps({"type": "turn_complete"}))
315
-
316
- except Exception as e:
317
- stop_event.set()
318
- try:
319
- await client_ws.send_text(json.dumps({"type": "error", "message": f"Gemini link error: {e}"}))
320
- except Exception:
321
- pass
322
 
323
- try:
324
- await asyncio.gather(forward_client_to_gemini(), forward_gemini_to_client())
325
- finally:
326
- stop_event.set()
327
- try:
328
- if gemini_ws is not None:
329
- await gemini_ws.close()
330
- except Exception:
331
- pass
332
- try:
333
- await client_ws.close()
334
- except Exception:
335
- pass
 
 
 
 
1
  import asyncio
2
+ import base64
3
+ import json
4
+ import os
5
+ from typing import AsyncGenerator, Literal
6
 
7
+ import numpy as np
 
8
  from dotenv import load_dotenv
9
+ from fastapi import FastAPI
10
+ from fastapi.responses import StreamingResponse
11
+
12
+ from fastrtc import (
13
+ AdditionalOutputs,
14
+ AsyncStreamHandler,
15
+ Stream,
16
+ wait_for_item,
 
17
  )
18
 
19
+ from google import genai
20
+ from google.genai.types import (
21
+ LiveConnectConfig,
22
+ PrebuiltVoiceConfig,
23
+ SpeechConfig,
24
+ VoiceConfig,
 
 
 
 
25
  )
 
 
26
 
27
+ load_dotenv()
 
 
28
 
29
+ # ---------------------------
30
+ # Config (env vars)
31
+ # ---------------------------
32
+ # Put this in your HF Space "Secrets":
33
+ # GEMINI_API_KEY = "..."
34
+ GEMINI_API_KEY = os.getenv("GEMINI_API_KEY", "")
35
+
36
+ # Gemini realtime model (this is the one FastRTC uses in their Gemini demo Space)
37
+ # You can change this later to another Live-capable model.
38
+ GEMINI_LIVE_MODEL = os.getenv("GEMINI_LIVE_MODEL", "gemini-2.0-flash-exp")
39
+
40
+ # Voice name (FastRTC Gemini demo uses "Puck" by default)
41
+ DEFAULT_VOICE = os.getenv("GEMINI_VOICE", "Puck")
42
+
43
+ # Sample rates
44
+ OUTPUT_SAMPLE_RATE = int(os.getenv("OUTPUT_SAMPLE_RATE", "24000"))
45
+ INPUT_SAMPLE_RATE = int(os.getenv("INPUT_SAMPLE_RATE", "16000")) # matches the demo Space
46
+
47
+
48
+ def _encode_pcm16_mono_to_b64(data: np.ndarray) -> str:
49
+ """
50
+ Encodes int16 mono PCM to base64 for any custom debug endpoints.
51
+ """
52
+ if data.dtype != np.int16:
53
+ data = data.astype(np.int16)
54
+ return base64.b64encode(data.tobytes()).decode("utf-8")
55
+
56
+
57
+ class GeminiLiveAudioHandler(AsyncStreamHandler):
58
+ """
59
+ FastRTC AsyncStreamHandler that connects to Gemini Live and streams AUDIO back.
60
+
61
+ This is adapted from the official FastRTC Gemini demo Space code. :contentReference[oaicite:5]{index=5}
62
+ """
63
+
64
+ def __init__(
65
+ self,
66
+ expected_layout: Literal["mono"] = "mono",
67
+ output_sample_rate: int = OUTPUT_SAMPLE_RATE,
68
+ ) -> None:
69
+ super().__init__(
70
+ expected_layout=expected_layout,
71
+ output_sample_rate=output_sample_rate,
72
+ input_sample_rate=INPUT_SAMPLE_RATE,
73
+ )
74
+
75
+ self.input_queue: asyncio.Queue[bytes] = asyncio.Queue()
76
+ self.output_queue: asyncio.Queue[tuple[int, np.ndarray] | AdditionalOutputs] = asyncio.Queue()
77
+ self.quit = asyncio.Event()
78
+
79
+ def copy(self) -> "GeminiLiveAudioHandler":
80
+ # FastRTC uses .copy() to clone per-connection handlers
81
+ return GeminiLiveAudioHandler(
82
+ expected_layout="mono",
83
+ output_sample_rate=self.output_sample_rate,
84
+ )
85
+
86
+ async def start_up(self) -> None:
87
+ """
88
+ Connect to Gemini Live, then continuously:
89
+ - read user audio from self.stream()
90
+ - receive model audio chunks and push them to output_queue
91
+ """
92
+ # Optional: allow per-connection overrides via "additional_inputs"
93
+ # We wait for args to be set (FastRTC API docs show wait_for_args usage). :contentReference[oaicite:6]{index=6}
94
+ await self.wait_for_args()
95
+ # latest_args includes metadata at [0]; any custom inputs start at [1]
96
+ # We'll accept: voice_name (str) as the single custom arg, fallback to DEFAULT_VOICE.
97
+ voice_name = DEFAULT_VOICE
98
+ try:
99
+ if len(self.latest_args) >= 2 and isinstance(self.latest_args[1], str) and self.latest_args[1].strip():
100
+ voice_name = self.latest_args[1].strip()
101
+ except Exception:
102
+ pass
103
+
104
+ api_key = GEMINI_API_KEY
105
+ if not api_key:
106
+ # Fail early with a helpful message in the client.
107
+ await self.output_queue.put(
108
+ AdditionalOutputs({"type": "error", "message": "Missing GEMINI_API_KEY env var on the server."})
109
+ )
110
+ return
111
+
112
+ client = genai.Client(
113
+ api_key=api_key,
114
+ http_options={"api_version": "v1alpha"}, # matches FastRTC Gemini demo Space :contentReference[oaicite:7]{index=7}
115
+ )
116
+
117
+ config = LiveConnectConfig(
118
+ response_modalities=["AUDIO"], # AUDIO-only mode :contentReference[oaicite:8]{index=8}
119
+ speech_config=SpeechConfig(
120
+ voice_config=VoiceConfig(
121
+ prebuilt_voice_config=PrebuiltVoiceConfig(voice_name=voice_name)
122
+ )
123
+ ),
124
+ )
125
+
126
+ async with client.aio.live.connect(model=GEMINI_LIVE_MODEL, config=config) as session:
127
+ # session.start_stream takes an async generator of bytes
128
+ async for audio in session.start_stream(stream=self._stream_pcm(), mime_type="audio/pcm"):
129
+ if audio.data:
130
+ # Gemini returns pcm16 bytes; convert to int16 array
131
+ arr = np.frombuffer(audio.data, dtype=np.int16)
132
+ # FastRTC expects (sample_rate, np.ndarray) shaped like (1, n) or (n,) depending on handler usage.
133
+ self.output_queue.put_nowait((self.output_sample_rate, arr.reshape(1, -1)))
134
+
135
+ async def _stream_pcm(self) -> AsyncGenerator[bytes, None]:
136
+ """
137
+ Provides PCM bytes to Gemini Live continuously.
138
+ """
139
+ while not self.quit.is_set():
140
+ try:
141
+ chunk = await asyncio.wait_for(self.input_queue.get(), timeout=0.1)
142
+ yield chunk
143
+ except (asyncio.TimeoutError, TimeoutError):
144
+ pass
145
 
146
+ async def receive(self, frame: tuple[int, np.ndarray]) -> None:
147
+ """
148
+ Called by FastRTC as audio frames arrive from the client.
149
+ """
150
+ _, audio = frame
151
+ # Expect mono, int16-ish. Convert safely.
152
+ audio = np.asarray(audio)
153
+ if audio.ndim == 2:
154
+ audio = audio.squeeze()
155
+ if audio.dtype != np.int16:
156
+ audio = audio.astype(np.int16)
157
+
158
+ # Push raw PCM16 bytes to Gemini stream
159
+ self.input_queue.put_nowait(audio.tobytes())
160
+
161
+ async def emit(self) -> tuple[int, np.ndarray] | AdditionalOutputs | None:
162
+ """
163
+ Called by FastRTC to get the next outbound chunk (audio or structured outputs).
164
+ """
165
+ return await wait_for_item(self.output_queue)
166
+
167
+ async def shutdown(self) -> None:
168
+ self.quit.set()
169
+
170
+
171
+ # ---------------------------
172
+ # FastRTC Stream + FastAPI
173
+ # ---------------------------
174
+
175
+ # We expose one additional input: voice name
176
+ # Clients can set it via Stream.set_input(...) patterns described in the FastRTC API docs. :contentReference[oaicite:9]{index=9}
177
+ stream = Stream(
178
+ handler=GeminiLiveAudioHandler(),
179
+ modality="audio",
180
+ mode="send-receive",
181
+ additional_inputs=[
182
+ # Keep it simple: one string
183
+ # (FastRTC examples often use Gradio components here; in API mode we’ll set via set_input)
184
+ # We still define it so handler.wait_for_args() has something to wait on.
185
+ "voice_name"
186
+ ],
187
+ )
188
 
189
+ app = FastAPI()
 
 
 
190
 
191
+ # Mount FastRTC endpoints onto FastAPI (this is the core feature). :contentReference[oaicite:10]{index=10}
192
+ stream.mount(app)
193
 
 
 
 
194
 
195
# ---------------------------
# Optional: server-side outputs stream (SSE)
# Lets non-WebRTC clients (e.g. Scratch/JS) receive text/meta for a
# given connection via stream.output_stream(webrtc_id).
# ---------------------------
@app.get("/outputs")
async def outputs(webrtc_id: str):
    """SSE endpoint relaying each AdditionalOutputs item as an 'output' event."""

    async def event_stream():
        async for item in stream.output_stream(webrtc_id):
            # Each item is an AdditionalOutputs; forward its first arg as JSON.
            first_arg = item.args[0] if item.args else None
            yield "event: output\ndata: " + json.dumps(first_arg) + "\n\n"

    return StreamingResponse(event_stream(), media_type="text/event-stream")
 
 
 
 
 
 
211
 
212
 
213
@app.get("/health")
async def health():
    """Liveness probe reporting the active provider and audio configuration."""
    status = {"ok": True, "provider": "gemini_live_audio"}
    status["model"] = GEMINI_LIVE_MODEL
    status["output_sample_rate"] = OUTPUT_SAMPLE_RATE
    status["input_sample_rate"] = INPUT_SAMPLE_RATE
    return status
222
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
223
 
224
# Local/dev entry point; port 7860 is the Hugging Face Spaces default.
if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=int(os.getenv("PORT", "7860")))