jefffffff9 committed on
Commit
064d08b
·
1 Parent(s): cc82bd8

Per-turn telemetry + RAG few-shot on phrasebook miss

Browse files

Telemetry (src/engine/turn_logger.py)
Thread-safe JSONL append logger writing one self-contained row per turn
to data/field_turns.jsonl (overridable via FIELD_TURNS_PATH). Captures
ts, tab, langs, transcript/user_text, phrasebook hit details, llm model
+ ms, reply, tts ms, total ms, error. Foundation for field-test review,
phrasebook hit-rate measurement, and Stage-4 training-data curation.
Never raises — telemetry must not break the user-facing pipeline.

RAG few-shot (src/llm/phrasebook.py + src/llm/minimal_client.py)
- phrasebook.top_k(text, lang, k=3): nearest-K phrasebook entries
regardless of threshold, sorted by descending fuzzy score.
- MinimalClient.chat() now accepts extra_examples; _build_system_prompt
appends them as a second 'Additional reference phrases relevant to the
current user input' section after the 30-pair anchor list.

app_minimal.py
- Both run_pipeline (voice) and run_text_pipeline (text) now share a
single _resolve_reply() helper: phrasebook short-circuit first, on
miss inject top-3 nearest entries into the LLM call, log the turn.
- Stage timings (transcribe_ms, llm_ms, tts_ms, total_ms) captured per
turn and emitted via TurnLogger.

app_minimal.py CHANGED
@@ -36,9 +36,10 @@ except ImportError:
36
  # Local imports — the four modules the baseline-rebuild plan authorizes.
37
  # Everything else in src/ is intentionally unused here.
38
  from src.data.bam_normalize import normalize as bam_normalize
 
39
  from src.engine.whisper_base import WhisperBackbone
40
  from src.llm.minimal_client import MinimalClient
41
- from src.llm.phrasebook import lookup as phrasebook_lookup
42
  from src.tts.mms_tts import MMSTTSEngine
43
 
44
  logging.basicConfig(
@@ -71,9 +72,10 @@ LANG_TO_WHISPER_HINT = {
71
 
72
 
73
  # ── Service singletons (lazy-loaded) ────────────────────────────────────────
74
- _backbone: Optional[WhisperBackbone] = None
75
- _llm: Optional[MinimalClient] = None
76
- _tts: Optional[MMSTTSEngine] = None
 
77
 
78
 
79
  def _resolve_device() -> str:
@@ -180,6 +182,8 @@ def run_pipeline(
180
  Returns (transcript, reply_text, reply_audio). Graceful degradation: any
181
  stage failure yields a readable string and None audio instead of raising.
182
  """
 
 
183
  if audio is None:
184
  return "", "(no audio received)", None
185
 
@@ -187,44 +191,75 @@ def run_pipeline(
187
  if audio_np.size == 0:
188
  return "", "(empty audio)", None
189
 
 
 
190
  try:
191
  transcript = transcribe(audio_np, sample_rate, input_lang)
192
  except Exception as exc: # pragma: no cover — field-safety
193
  logger.exception("Transcription failed")
 
 
 
 
 
 
 
 
194
  return "", f"(STT error: {exc})", None
 
195
 
196
  if not transcript:
 
 
 
 
 
 
 
 
197
  return "", "(no speech detected)", None
198
 
199
- # ── Phrasebook short-circuit ──────────────────────────────────────────
200
- # Canonical greetings/courtesies hit the curated gold phrasebook directly,
201
- # skipping the LLM entirely. Only fires for bam/ful targets.
202
- hit = phrasebook_lookup(transcript, output_lang)
203
- if hit:
204
- logger.info(
205
- "Phrasebook hit (%s, score=%.2f): %r → %r [cat=%s]",
206
- hit["match"], hit["score"], transcript, hit["target"], hit["category"],
 
 
 
207
  )
208
- reply_text = hit["target"]
209
- else:
210
- try:
211
- # Dialect-anchored plain-string reply (see MinimalClient).
212
- reply_text = get_llm().chat(transcript, target_lang=output_lang)
213
- except Exception as exc: # pragma: no cover
214
- logger.exception("LLM call failed")
215
- return transcript, f"(LLM error: {exc})", None
216
-
217
- reply_text = reply_text or "(empty reply)"
218
 
 
 
 
 
 
219
  try:
220
  wav, sr = get_tts().synthesize(
221
  reply_text, language=output_lang, device=_resolve_device()
222
  )
 
 
223
  except Exception as exc:
224
  logger.exception("TTS failed")
225
- return transcript, reply_text, None
226
-
227
- return transcript, reply_text, (sr, wav)
 
 
 
 
 
 
 
 
 
 
 
228
 
229
 
230
  def run_text_pipeline(
@@ -241,36 +276,93 @@ def run_text_pipeline(
241
  reads it as-is and replies in `output_lang`. Skips Whisper entirely; this
242
  is the fast dev-loop path.
243
  """
 
 
244
  text = (text or "").strip()
245
  if not text:
246
  return "(no text entered)", None
247
 
248
- # ── Phrasebook short-circuit (see voice path above) ──────────────────
249
- hit = phrasebook_lookup(text, output_lang)
250
- if hit:
251
- logger.info(
252
- "Phrasebook hit (%s, score=%.2f): %r → %r [cat=%s]",
253
- hit["match"], hit["score"], text, hit["target"], hit["category"],
 
 
 
254
  )
255
- reply_text = hit["target"]
256
- else:
257
- try:
258
- reply_text = get_llm().chat(text, target_lang=output_lang)
259
- except Exception as exc: # pragma: no cover
260
- logger.exception("LLM call failed")
261
- return f"(LLM error: {exc})", None
262
-
263
- reply_text = reply_text or "(empty reply)"
264
 
 
 
 
 
265
  try:
266
  wav, sr = get_tts().synthesize(
267
  reply_text, language=output_lang, device=_resolve_device()
268
  )
 
 
269
  except Exception as exc:
270
  logger.exception("TTS failed")
271
- return reply_text, None
 
 
 
 
 
 
 
 
 
 
 
 
 
272
 
273
- return reply_text, (sr, wav)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
274
 
275
 
276
  # ── Gradio UI ────────────────────────────────────────────────────────────────
 
36
  # Local imports — the four modules the baseline-rebuild plan authorizes.
37
  # Everything else in src/ is intentionally unused here.
38
  from src.data.bam_normalize import normalize as bam_normalize
39
+ from src.engine.turn_logger import TurnLogger
40
  from src.engine.whisper_base import WhisperBackbone
41
  from src.llm.minimal_client import MinimalClient
42
+ from src.llm.phrasebook import lookup as phrasebook_lookup, top_k as phrasebook_top_k
43
  from src.tts.mms_tts import MMSTTSEngine
44
 
45
  logging.basicConfig(
 
72
 
73
 
74
  # ── Service singletons (lazy-loaded) ────────────────────────────────────────
75
+ _backbone: Optional[WhisperBackbone] = None
76
+ _llm: Optional[MinimalClient] = None
77
+ _tts: Optional[MMSTTSEngine] = None
78
+ _turn_logger: TurnLogger = TurnLogger()
79
 
80
 
81
  def _resolve_device() -> str:
 
182
  Returns (transcript, reply_text, reply_audio). Graceful degradation: any
183
  stage failure yields a readable string and None audio instead of raising.
184
  """
185
+ import time
186
+ t0 = time.perf_counter()
187
  if audio is None:
188
  return "", "(no audio received)", None
189
 
 
191
  if audio_np.size == 0:
192
  return "", "(empty audio)", None
193
 
194
+ # ── 1. Transcribe ─────────────────────────────────────────────────────
195
+ t_stt = time.perf_counter()
196
  try:
197
  transcript = transcribe(audio_np, sample_rate, input_lang)
198
  except Exception as exc: # pragma: no cover — field-safety
199
  logger.exception("Transcription failed")
200
+ _turn_logger.log(
201
+ tab="voice", input_lang=input_lang, output_lang=output_lang,
202
+ user_text=None, transcript=None, transcribe_ms=None,
203
+ phrasebook=None, llm_model=None, llm_ms=None,
204
+ reply_text=None, tts_ms=None,
205
+ total_ms=int((time.perf_counter() - t0) * 1000),
206
+ error=f"stt: {exc}",
207
+ )
208
  return "", f"(STT error: {exc})", None
209
+ transcribe_ms = int((time.perf_counter() - t_stt) * 1000)
210
 
211
  if not transcript:
212
+ _turn_logger.log(
213
+ tab="voice", input_lang=input_lang, output_lang=output_lang,
214
+ user_text=None, transcript="", transcribe_ms=transcribe_ms,
215
+ phrasebook=None, llm_model=None, llm_ms=None,
216
+ reply_text=None, tts_ms=None,
217
+ total_ms=int((time.perf_counter() - t0) * 1000),
218
+ error="no_speech",
219
+ )
220
  return "", "(no speech detected)", None
221
 
222
+ # ── 2. Phrasebook → LLM (with RAG few-shot on miss) → reply ──────────
223
+ reply_text, hit, llm_ms = _resolve_reply(transcript, output_lang)
224
+ if reply_text is None:
225
+ _turn_logger.log(
226
+ tab="voice", input_lang=input_lang, output_lang=output_lang,
227
+ user_text=transcript, transcript=transcript,
228
+ transcribe_ms=transcribe_ms,
229
+ phrasebook=hit, llm_model=LLM_MODEL_ID, llm_ms=llm_ms,
230
+ reply_text=None, tts_ms=None,
231
+ total_ms=int((time.perf_counter() - t0) * 1000),
232
+ error="llm_failed",
233
  )
234
+ return transcript, "(LLM error)", None
 
 
 
 
 
 
 
 
 
235
 
236
+ # ── 3. TTS ────────────────────────────────────────────────────────────
237
+ t_tts = time.perf_counter()
238
+ tts_ms: Optional[int] = None
239
+ audio_out: Optional[Tuple[int, np.ndarray]] = None
240
+ tts_error: Optional[str] = None
241
  try:
242
  wav, sr = get_tts().synthesize(
243
  reply_text, language=output_lang, device=_resolve_device()
244
  )
245
+ audio_out = (sr, wav)
246
+ tts_ms = int((time.perf_counter() - t_tts) * 1000)
247
  except Exception as exc:
248
  logger.exception("TTS failed")
249
+ tts_error = f"tts: {exc}"
250
+
251
+ _turn_logger.log(
252
+ tab="voice", input_lang=input_lang, output_lang=output_lang,
253
+ user_text=transcript, transcript=transcript,
254
+ transcribe_ms=transcribe_ms,
255
+ phrasebook=hit,
256
+ llm_model=None if hit else LLM_MODEL_ID,
257
+ llm_ms=llm_ms,
258
+ reply_text=reply_text, tts_ms=tts_ms,
259
+ total_ms=int((time.perf_counter() - t0) * 1000),
260
+ error=tts_error,
261
+ )
262
+ return transcript, reply_text, audio_out
263
 
264
 
265
  def run_text_pipeline(
 
276
  reads it as-is and replies in `output_lang`. Skips Whisper entirely; this
277
  is the fast dev-loop path.
278
  """
279
+ import time
280
+ t0 = time.perf_counter()
281
  text = (text or "").strip()
282
  if not text:
283
  return "(no text entered)", None
284
 
285
+ reply_text, hit, llm_ms = _resolve_reply(text, output_lang)
286
+ if reply_text is None:
287
+ _turn_logger.log(
288
+ tab="text", input_lang=None, output_lang=output_lang,
289
+ user_text=text, transcript=None, transcribe_ms=None,
290
+ phrasebook=hit, llm_model=LLM_MODEL_ID, llm_ms=llm_ms,
291
+ reply_text=None, tts_ms=None,
292
+ total_ms=int((time.perf_counter() - t0) * 1000),
293
+ error="llm_failed",
294
  )
295
+ return "(LLM error)", None
 
 
 
 
 
 
 
 
296
 
297
+ t_tts = time.perf_counter()
298
+ tts_ms: Optional[int] = None
299
+ audio_out: Optional[Tuple[int, np.ndarray]] = None
300
+ tts_error: Optional[str] = None
301
  try:
302
  wav, sr = get_tts().synthesize(
303
  reply_text, language=output_lang, device=_resolve_device()
304
  )
305
+ audio_out = (sr, wav)
306
+ tts_ms = int((time.perf_counter() - t_tts) * 1000)
307
  except Exception as exc:
308
  logger.exception("TTS failed")
309
+ tts_error = f"tts: {exc}"
310
+
311
+ _turn_logger.log(
312
+ tab="text", input_lang=None, output_lang=output_lang,
313
+ user_text=text, transcript=None, transcribe_ms=None,
314
+ phrasebook=hit,
315
+ llm_model=None if hit else LLM_MODEL_ID,
316
+ llm_ms=llm_ms,
317
+ reply_text=reply_text, tts_ms=tts_ms,
318
+ total_ms=int((time.perf_counter() - t0) * 1000),
319
+ error=tts_error,
320
+ )
321
+ return reply_text, audio_out
322
+
323
 
324
def _resolve_reply(
    user_text: str,
    output_lang: str,
) -> Tuple[Optional[str], Optional[dict], Optional[int]]:
    """Resolve one user utterance to a reply: phrasebook first, then LLM.

    Shared by both the voice and the text tabs. Returns a triple
    ``(reply_text, phrasebook_hit, llm_ms)``:

    - ``reply_text`` — a usable string (falling back to the "(empty reply)"
      sentinel), or ``None`` only when the LLM call itself raised.
    - ``phrasebook_hit`` — the lookup dict on a strict phrasebook hit,
      otherwise ``None``.
    - ``llm_ms`` — wall-clock LLM latency in ms, or ``None`` on the
      phrasebook short-circuit path.

    When the strict lookup misses, the nearest top-3 gold pairs are passed
    to the LLM as dynamic few-shot anchoring (RAG-style). For en/fr targets
    `top_k` yields nothing, so the LLM runs without extras.
    """
    import time

    # Strict short-circuit: a curated gold answer skips the LLM entirely.
    hit = phrasebook_lookup(user_text, output_lang)
    if hit is not None:
        logger.info(
            "Phrasebook hit (%s, score=%.2f): %r → %r [cat=%s]",
            hit["match"], hit["score"], user_text, hit["target"], hit["category"],
        )
        return (hit["target"] or "(empty reply)"), hit, None

    # Miss: gather nearest gold pairs as extra few-shot context (may be None).
    nearest = phrasebook_top_k(user_text, output_lang, k=3) or None
    if nearest:
        logger.info(
            "Phrasebook miss; RAG-injecting top-%d nearest (top score=%.2f)",
            len(nearest), nearest[0]["score"],
        )

    started = time.perf_counter()
    try:
        llm_reply = get_llm().chat(
            user_text, target_lang=output_lang, extra_examples=nearest,
        )
    except Exception:  # pragma: no cover
        logger.exception("LLM call failed")
        return None, None, int((time.perf_counter() - started) * 1000)

    elapsed = int((time.perf_counter() - started) * 1000)
    return (llm_reply or "(empty reply)"), None, elapsed
366
 
367
 
368
  # ── Gradio UI ────────────────────────────────────────────────────────────────
src/engine/turn_logger.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """TurnLogger — per-turn JSONL telemetry for the minimal baseline.
2
+
3
+ Every voice or text turn writes one self-contained line to
4
+ `data/field_turns.jsonl` (path overridable via FIELD_TURNS_PATH).
5
+
6
+ This is the foundation for:
7
+ - field-test review (read the JSONL after a session)
8
+ - phrasebook hit-rate measurement
9
+ - LLM A/B comparisons
10
+ - eventually, Stage-4 LoRA training-data curation
11
+ (every line already pairs an English/French input with a vetted
12
+ Bambara/Pular reply; we'll filter on phrasebook-hit + user-confirmed
13
+ turns later).
14
+
15
+ Schema (one JSON object per line):
16
+ {
17
+ "ts": "<ISO-8601 UTC>",
18
+ "tab": "voice" | "text",
19
+ "input_lang": "bam" | "ful" | "fr" | "en" | null,
20
+ "output_lang": "bam" | "ful" | "fr" | "en",
21
+ "user_text": "<raw input from text tab, or transcript for voice tab>",
22
+ "transcript": "<whisper output, voice tab only>" | null,
23
+ "transcribe_ms": <int> | null,
24
+ "phrasebook": { match, score, category, source, target } | null,
25
+ "llm_model": "<model id>" | null,
26
+ "llm_ms": <int> | null,
27
+ "reply_text": "<final reply that fed TTS>",
28
+ "tts_ms": <int> | null,
29
+ "total_ms": <int>,
30
+ "error": "<short error string>" | null
31
+ }
32
+
33
+ Notes:
34
+ - File path is gitignored (data/ is excluded by .gitignore).
35
+ - Append mode + line-buffered + lock — safe for the single-process Gradio
36
+ server. Not designed for multi-worker writes.
37
+ """
38
+ from __future__ import annotations
39
+
40
+ import json
41
+ import logging
42
+ import os
43
+ import threading
44
+ from datetime import datetime, timezone
45
+ from pathlib import Path
46
+ from typing import Any, Optional
47
+
48
logger = logging.getLogger(__name__)

# Default sink: <repo root>/data/field_turns.jsonl. This module lives in
# src/engine/, so three .parent hops reach the repository root.
_DEFAULT_PATH = (
    Path(__file__).resolve().parent.parent.parent / "data" / "field_turns.jsonl"
)


def _utcnow_iso() -> str:
    """Return the current UTC time as a second-resolution ISO-8601 'Z' string."""
    return datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")


class TurnLogger:
    """Append-only JSONL logger. Thread-safe for one process.

    Both construction and :meth:`log` are best-effort: telemetry must never
    break the user-facing pipeline, so filesystem errors are logged and
    swallowed instead of propagated.
    """

    def __init__(self, path: Optional[str] = None) -> None:
        """Resolve the sink path: explicit arg > FIELD_TURNS_PATH env > default.

        Never raises. The app builds a module-level singleton at import time,
        so a read-only or permission-restricted filesystem must not abort
        startup — directory creation failures are downgraded to a warning and
        per-row writes will simply keep failing softly in `log()`.
        """
        env_path = os.environ.get("FIELD_TURNS_PATH")
        self.path = Path(path or env_path or _DEFAULT_PATH)
        self._lock = threading.Lock()
        try:
            self.path.parent.mkdir(parents=True, exist_ok=True)
        except OSError as exc:
            # Keep going; log() warns per-row if writes also fail.
            logger.warning(
                "TurnLogger could not create %s: %s", self.path.parent, exc
            )
        logger.info("TurnLogger writing to %s", self.path)

    def log(self, **fields: Any) -> None:
        """Write one row. Always sets ts; leaves the rest to the caller.

        Never raises — telemetry must not break the user-facing pipeline.
        """
        row = {"ts": _utcnow_iso(), **fields}
        try:
            line = json.dumps(row, ensure_ascii=False)
            # Lock + append keeps concurrent Gradio handler threads from
            # interleaving partial lines; not safe across processes.
            with self._lock, self.path.open("a", encoding="utf-8") as fh:
                fh.write(line + "\n")
        except Exception as exc:  # pragma: no cover
            logger.warning("TurnLogger.log failed: %s", exc)
src/llm/minimal_client.py CHANGED
@@ -76,8 +76,17 @@ def _load_anchors(lang: str) -> list[dict]:
76
  return data.get("pairs", [])
77
 
78
 
79
- def _build_system_prompt(target_lang: str) -> str:
80
- """Assemble the per-call system prompt for a target output language."""
 
 
 
 
 
 
 
 
 
81
  full = LANG_FULL_NAME.get(target_lang, "English")
82
  forbidden = FORBIDDEN_DRIFT.get(target_lang, "")
83
  anchors = _load_anchors(target_lang)
@@ -114,6 +123,19 @@ def _build_system_prompt(target_lang: str) -> str:
114
  if src and tgt:
115
  lines.append(f"- {src} → {tgt}")
116
 
 
 
 
 
 
 
 
 
 
 
 
 
 
117
  lines += [
118
  "",
119
  f"Always reply in {full}, even if the user writes to you in English, "
@@ -146,13 +168,23 @@ class MinimalClient:
146
  self._client = InferenceClient(token=self.hf_token)
147
  return self._client
148
 
149
- def chat(self, user_text: str, target_lang: str = "bam") -> str:
 
 
 
 
 
150
  """Return a plain-text reply in `target_lang`.
151
 
 
 
 
 
 
152
  On any error returns a short parenthetical error string so the caller
153
  can still feed something into TTS / display.
154
  """
155
- system_prompt = _build_system_prompt(target_lang)
156
  try:
157
  client = self._get_client()
158
  completion = client.chat_completion(
 
76
  return data.get("pairs", [])
77
 
78
 
79
+ def _build_system_prompt(
80
+ target_lang: str,
81
+ extra_examples: Optional[list[dict]] = None,
82
+ ) -> str:
83
+ """Assemble the per-call system prompt for a target output language.
84
+
85
+ `extra_examples`, when supplied, are appended after the curated 30-pair
86
+ gold list as additional dynamic few-shot anchoring — used by app_minimal
87
+ to inject the top-K nearest phrasebook entries when the strict short-
88
+ circuit misses.
89
+ """
90
  full = LANG_FULL_NAME.get(target_lang, "English")
91
  forbidden = FORBIDDEN_DRIFT.get(target_lang, "")
92
  anchors = _load_anchors(target_lang)
 
123
  if src and tgt:
124
  lines.append(f"- {src} → {tgt}")
125
 
126
+ if extra_examples:
127
+ lines += [
128
+ "",
129
+ "Additional reference phrases relevant to the current user input "
130
+ f"(curated gold {full} translations — use the same orthography and "
131
+ "style):",
132
+ ]
133
+ for item in extra_examples:
134
+ src = (item.get("source") or "").strip()
135
+ tgt = (item.get("target") or "").strip()
136
+ if src and tgt:
137
+ lines.append(f"- {src} → {tgt}")
138
+
139
  lines += [
140
  "",
141
  f"Always reply in {full}, even if the user writes to you in English, "
 
168
  self._client = InferenceClient(token=self.hf_token)
169
  return self._client
170
 
171
+ def chat(
172
+ self,
173
+ user_text: str,
174
+ target_lang: str = "bam",
175
+ extra_examples: Optional[list[dict]] = None,
176
+ ) -> str:
177
  """Return a plain-text reply in `target_lang`.
178
 
179
+ `extra_examples` (optional) — list of {source, target} dicts that get
180
+ appended to the system prompt as additional dynamic few-shot. Used by
181
+ app_minimal to RAG-inject the top-K nearest phrasebook entries when
182
+ the strict phrasebook short-circuit misses.
183
+
184
  On any error returns a short parenthetical error string so the caller
185
  can still feed something into TTS / display.
186
  """
187
+ system_prompt = _build_system_prompt(target_lang, extra_examples)
188
  try:
189
  client = self._get_client()
190
  completion = client.chat_completion(
src/llm/phrasebook.py CHANGED
@@ -121,3 +121,35 @@ def lookup(
121
  "match": "fuzzy",
122
  }
123
  return None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
121
  "match": "fuzzy",
122
  }
123
  return None
124
+
125
+
126
def top_k(user_text: str, target_lang: str, k: int = 3) -> list[dict]:
    """Return the k closest phrasebook entries to `user_text` regardless of threshold.

    Used as RAG-style few-shot context when the strict `lookup()` misses but we
    still want to anchor the LLM with locally relevant gold pairs.

    Args:
        user_text: Raw user input; normalized the same way `lookup()`
            normalizes, so scores are comparable.
        target_lang: Which phrasebook to search; a language with no loaded
            pairs yields an empty list.
        k: Number of entries to return. Values <= 0 return an empty list
            (previously a negative k silently returned all-but-the-worst
            entries via slice semantics).

    Returns:
        Up to `k` dicts with keys source/target/category/score, sorted by
        descending fuzzy score. Never raises for well-formed inputs.
    """
    import heapq

    if k <= 0:
        return []
    pairs = _load_phrasebook(target_lang)
    if not pairs:
        return []
    q = _normalize(user_text)
    if not q:
        return []

    # Score every entry that has a precomputed normalized source; exact
    # normalized match pins the score to 1.0 like lookup() does.
    scored = [
        (1.0 if src == q else SequenceMatcher(None, q, src).ratio(), p)
        for p in pairs
        if (src := p.get("_norm", ""))
    ]
    # nlargest == sorted(..., reverse=True)[:k] (stable on ties) but only
    # O(n log k) instead of sorting the whole phrasebook.
    best = heapq.nlargest(k, scored, key=lambda item: item[0])
    return [
        {
            "source": p.get("source"),
            "target": p.get("target"),
            "category": p.get("category"),
            "score": round(score, 3),
        }
        for score, p in best
    ]