jefffffff9 committed on
Commit
064d08b
·
1 Parent(s): cc82bd8

Per-turn telemetry + RAG few-shot on phrasebook miss

Browse files

Telemetry (src/engine/turn_logger.py)
Thread-safe JSONL append logger writing one self-contained row per turn
to data/field_turns.jsonl (overridable via FIELD_TURNS_PATH). Captures
ts, tab, langs, transcript/user_text, phrasebook hit details, llm model
+ ms, reply, tts ms, total ms, error. Foundation for field-test review,
phrasebook hit-rate measurement, and Stage-4 training-data curation.
Never raises — telemetry must not break the user-facing pipeline.

RAG few-shot (src/llm/phrasebook.py + src/llm/minimal_client.py)
- phrasebook.top_k(text, lang, k=3): nearest-K phrasebook entries
regardless of threshold, sorted by descending fuzzy score.
- MinimalClient.chat() now accepts extra_examples; _build_system_prompt
appends them as a second 'Additional reference phrases relevant to the
current user input' section after the 30-pair anchor list.

app_minimal.py
- Both run_pipeline (voice) and run_text_pipeline (text) now share a
single _resolve_reply() helper: phrasebook short-circuit first, on
miss inject top-3 nearest entries into the LLM call, log the turn.
- Stage timings (transcribe_ms, llm_ms, tts_ms, total_ms) captured per
turn and emitted via TurnLogger.

app_minimal.py CHANGED
@@ -36,9 +36,10 @@ except ImportError:
36
  # Local imports — the four modules the baseline-rebuild plan authorizes.
37
  # Everything else in src/ is intentionally unused here.
38
  from src.data.bam_normalize import normalize as bam_normalize
 
39
  from src.engine.whisper_base import WhisperBackbone
40
  from src.llm.minimal_client import MinimalClient
41
- from src.llm.phrasebook import lookup as phrasebook_lookup
42
  from src.tts.mms_tts import MMSTTSEngine
43
 
44
  logging.basicConfig(
@@ -71,9 +72,10 @@ LANG_TO_WHISPER_HINT = {
71
 
72
 
73
  # ── Service singletons (lazy-loaded) ────────────────────────────────────────
74
- _backbone: Optional[WhisperBackbone] = None
75
- _llm: Optional[MinimalClient] = None
76
- _tts: Optional[MMSTTSEngine] = None
 
77
 
78
 
79
  def _resolve_device() -> str:
@@ -180,6 +182,8 @@ def run_pipeline(
180
  Returns (transcript, reply_text, reply_audio). Graceful degradation: any
181
  stage failure yields a readable string and None audio instead of raising.
182
  """
 
 
183
  if audio is None:
184
  return "", "(no audio received)", None
185
 
@@ -187,44 +191,75 @@ def run_pipeline(
187
  if audio_np.size == 0:
188
  return "", "(empty audio)", None
189
 
 
 
190
  try:
191
  transcript = transcribe(audio_np, sample_rate, input_lang)
192
  except Exception as exc: # pragma: no cover — field-safety
193
  logger.exception("Transcription failed")
 
 
 
 
 
 
 
 
194
  return "", f"(STT error: {exc})", None
 
195
 
196
  if not transcript:
 
 
 
 
 
 
 
 
197
  return "", "(no speech detected)", None
198
 
199
- # ── Phrasebook short-circuit ──────────────────────────────────────────
200
- # Canonical greetings/courtesies hit the curated gold phrasebook directly,
201
- # skipping the LLM entirely. Only fires for bam/ful targets.
202
- hit = phrasebook_lookup(transcript, output_lang)
203
- if hit:
204
- logger.info(
205
- "Phrasebook hit (%s, score=%.2f): %r → %r [cat=%s]",
206
- hit["match"], hit["score"], transcript, hit["target"], hit["category"],
 
 
 
207
  )
208
- reply_text = hit["target"]
209
- else:
210
- try:
211
- # Dialect-anchored plain-string reply (see MinimalClient).
212
- reply_text = get_llm().chat(transcript, target_lang=output_lang)
213
- except Exception as exc: # pragma: no cover
214
- logger.exception("LLM call failed")
215
- return transcript, f"(LLM error: {exc})", None
216
-
217
- reply_text = reply_text or "(empty reply)"
218
 
 
 
 
 
 
219
  try:
220
  wav, sr = get_tts().synthesize(
221
  reply_text, language=output_lang, device=_resolve_device()
222
  )
 
 
223
  except Exception as exc:
224
  logger.exception("TTS failed")
225
- return transcript, reply_text, None
226
-
227
- return transcript, reply_text, (sr, wav)
 
 
 
 
 
 
 
 
 
 
 
228
 
229
 
230
  def run_text_pipeline(
@@ -241,36 +276,93 @@ def run_text_pipeline(
241
  reads it as-is and replies in `output_lang`. Skips Whisper entirely; this
242
  is the fast dev-loop path.
243
  """
 
 
244
  text = (text or "").strip()
245
  if not text:
246
  return "(no text entered)", None
247
 
248
- # ── Phrasebook short-circuit (see voice path above) ──────────────────
249
- hit = phrasebook_lookup(text, output_lang)
250
- if hit:
251
- logger.info(
252
- "Phrasebook hit (%s, score=%.2f): %r → %r [cat=%s]",
253
- hit["match"], hit["score"], text, hit["target"], hit["category"],
 
 
 
254
  )
255
- reply_text = hit["target"]
256
- else:
257
- try:
258
- reply_text = get_llm().chat(text, target_lang=output_lang)
259
- except Exception as exc: # pragma: no cover
260
- logger.exception("LLM call failed")
261
- return f"(LLM error: {exc})", None
262
-
263
- reply_text = reply_text or "(empty reply)"
264
 
 
 
 
 
265
  try:
266
  wav, sr = get_tts().synthesize(
267
  reply_text, language=output_lang, device=_resolve_device()
268
  )
 
 
269
  except Exception as exc:
270
  logger.exception("TTS failed")
271
- return reply_text, None
 
 
 
 
 
 
 
 
 
 
 
 
 
272
 
273
- return reply_text, (sr, wav)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
274
 
275
 
276
  # ── Gradio UI ────────────────────────────────────────────────────────────────
 
36
  # Local imports — the four modules the baseline-rebuild plan authorizes.
37
  # Everything else in src/ is intentionally unused here.
38
  from src.data.bam_normalize import normalize as bam_normalize
39
+ from src.engine.turn_logger import TurnLogger
40
  from src.engine.whisper_base import WhisperBackbone
41
  from src.llm.minimal_client import MinimalClient
42
+ from src.llm.phrasebook import lookup as phrasebook_lookup, top_k as phrasebook_top_k
43
  from src.tts.mms_tts import MMSTTSEngine
44
 
45
  logging.basicConfig(
 
72
 
73
 
74
  # ── Service singletons (lazy-loaded) ────────────────────────────────────────
75
+ _backbone: Optional[WhisperBackbone] = None
76
+ _llm: Optional[MinimalClient] = None
77
+ _tts: Optional[MMSTTSEngine] = None
78
+ _turn_logger: TurnLogger = TurnLogger()
79
 
80
 
81
  def _resolve_device() -> str:
 
182
  Returns (transcript, reply_text, reply_audio). Graceful degradation: any
183
  stage failure yields a readable string and None audio instead of raising.
184
  """
185
+ import time
186
+ t0 = time.perf_counter()
187
  if audio is None:
188
  return "", "(no audio received)", None
189
 
 
191
  if audio_np.size == 0:
192
  return "", "(empty audio)", None
193
 
194
+ # ── 1. Transcribe ─────────────────────────────────────────────────────
195
+ t_stt = time.perf_counter()
196
  try:
197
  transcript = transcribe(audio_np, sample_rate, input_lang)
198
  except Exception as exc: # pragma: no cover — field-safety
199
  logger.exception("Transcription failed")
200
+ _turn_logger.log(
201
+ tab="voice", input_lang=input_lang, output_lang=output_lang,
202
+ user_text=None, transcript=None, transcribe_ms=None,
203
+ phrasebook=None, llm_model=None, llm_ms=None,
204
+ reply_text=None, tts_ms=None,
205
+ total_ms=int((time.perf_counter() - t0) * 1000),
206
+ error=f"stt: {exc}",
207
+ )
208
  return "", f"(STT error: {exc})", None
209
+ transcribe_ms = int((time.perf_counter() - t_stt) * 1000)
210
 
211
  if not transcript:
212
+ _turn_logger.log(
213
+ tab="voice", input_lang=input_lang, output_lang=output_lang,
214
+ user_text=None, transcript="", transcribe_ms=transcribe_ms,
215
+ phrasebook=None, llm_model=None, llm_ms=None,
216
+ reply_text=None, tts_ms=None,
217
+ total_ms=int((time.perf_counter() - t0) * 1000),
218
+ error="no_speech",
219
+ )
220
  return "", "(no speech detected)", None
221
 
222
+ # ── 2. Phrasebook → LLM (with RAG few-shot on miss) → reply ──────────
223
+ reply_text, hit, llm_ms = _resolve_reply(transcript, output_lang)
224
+ if reply_text is None:
225
+ _turn_logger.log(
226
+ tab="voice", input_lang=input_lang, output_lang=output_lang,
227
+ user_text=transcript, transcript=transcript,
228
+ transcribe_ms=transcribe_ms,
229
+ phrasebook=hit, llm_model=LLM_MODEL_ID, llm_ms=llm_ms,
230
+ reply_text=None, tts_ms=None,
231
+ total_ms=int((time.perf_counter() - t0) * 1000),
232
+ error="llm_failed",
233
  )
234
+ return transcript, "(LLM error)", None
 
 
 
 
 
 
 
 
 
235
 
236
+ # ── 3. TTS ────────────────────────────────────────────────────────────
237
+ t_tts = time.perf_counter()
238
+ tts_ms: Optional[int] = None
239
+ audio_out: Optional[Tuple[int, np.ndarray]] = None
240
+ tts_error: Optional[str] = None
241
  try:
242
  wav, sr = get_tts().synthesize(
243
  reply_text, language=output_lang, device=_resolve_device()
244
  )
245
+ audio_out = (sr, wav)
246
+ tts_ms = int((time.perf_counter() - t_tts) * 1000)
247
  except Exception as exc:
248
  logger.exception("TTS failed")
249
+ tts_error = f"tts: {exc}"
250
+
251
+ _turn_logger.log(
252
+ tab="voice", input_lang=input_lang, output_lang=output_lang,
253
+ user_text=transcript, transcript=transcript,
254
+ transcribe_ms=transcribe_ms,
255
+ phrasebook=hit,
256
+ llm_model=None if hit else LLM_MODEL_ID,
257
+ llm_ms=llm_ms,
258
+ reply_text=reply_text, tts_ms=tts_ms,
259
+ total_ms=int((time.perf_counter() - t0) * 1000),
260
+ error=tts_error,
261
+ )
262
+ return transcript, reply_text, audio_out
263
 
264
 
265
  def run_text_pipeline(
 
276
  reads it as-is and replies in `output_lang`. Skips Whisper entirely; this
277
  is the fast dev-loop path.
278
  """
279
+ import time
280
+ t0 = time.perf_counter()
281
  text = (text or "").strip()
282
  if not text:
283
  return "(no text entered)", None
284
 
285
+ reply_text, hit, llm_ms = _resolve_reply(text, output_lang)
286
+ if reply_text is None:
287
+ _turn_logger.log(
288
+ tab="text", input_lang=None, output_lang=output_lang,
289
+ user_text=text, transcript=None, transcribe_ms=None,
290
+ phrasebook=hit, llm_model=LLM_MODEL_ID, llm_ms=llm_ms,
291
+ reply_text=None, tts_ms=None,
292
+ total_ms=int((time.perf_counter() - t0) * 1000),
293
+ error="llm_failed",
294
  )
295
+ return "(LLM error)", None
 
 
 
 
 
 
 
 
296
 
297
+ t_tts = time.perf_counter()
298
+ tts_ms: Optional[int] = None
299
+ audio_out: Optional[Tuple[int, np.ndarray]] = None
300
+ tts_error: Optional[str] = None
301
  try:
302
  wav, sr = get_tts().synthesize(
303
  reply_text, language=output_lang, device=_resolve_device()
304
  )
305
+ audio_out = (sr, wav)
306
+ tts_ms = int((time.perf_counter() - t_tts) * 1000)
307
  except Exception as exc:
308
  logger.exception("TTS failed")
309
+ tts_error = f"tts: {exc}"
310
+
311
+ _turn_logger.log(
312
+ tab="text", input_lang=None, output_lang=output_lang,
313
+ user_text=text, transcript=None, transcribe_ms=None,
314
+ phrasebook=hit,
315
+ llm_model=None if hit else LLM_MODEL_ID,
316
+ llm_ms=llm_ms,
317
+ reply_text=reply_text, tts_ms=tts_ms,
318
+ total_ms=int((time.perf_counter() - t0) * 1000),
319
+ error=tts_error,
320
+ )
321
+ return reply_text, audio_out
322
+
323
 
324
def _resolve_reply(
    user_text: str,
    output_lang: str,
) -> Tuple[Optional[str], Optional[dict], Optional[int]]:
    """Resolve one user utterance to a reply: phrasebook first, then LLM.

    Shared by both the voice and the text tabs. Returns a triple
    ``(reply_text, phrasebook_hit, llm_ms)``:

    - ``reply_text`` — a usable string (falling back to the "(empty reply)"
      sentinel), or ``None`` only when the LLM call itself raised.
    - ``phrasebook_hit`` — the lookup dict on a strict phrasebook hit,
      otherwise ``None``.
    - ``llm_ms`` — wall-clock LLM latency in ms, or ``None`` on the
      phrasebook short-circuit path.

    When the strict lookup misses, the nearest top-3 gold pairs are passed
    to the LLM as dynamic few-shot anchoring (RAG-style). For en/fr targets
    `top_k` yields nothing, so the LLM runs without extras.
    """
    import time

    # Strict short-circuit: a curated gold answer skips the LLM entirely.
    hit = phrasebook_lookup(user_text, output_lang)
    if hit is not None:
        logger.info(
            "Phrasebook hit (%s, score=%.2f): %r → %r [cat=%s]",
            hit["match"], hit["score"], user_text, hit["target"], hit["category"],
        )
        return (hit["target"] or "(empty reply)"), hit, None

    # Miss: gather nearest gold pairs as extra few-shot context (may be None).
    nearest = phrasebook_top_k(user_text, output_lang, k=3) or None
    if nearest:
        logger.info(
            "Phrasebook miss; RAG-injecting top-%d nearest (top score=%.2f)",
            len(nearest), nearest[0]["score"],
        )

    started = time.perf_counter()
    try:
        llm_reply = get_llm().chat(
            user_text, target_lang=output_lang, extra_examples=nearest,
        )
    except Exception:  # pragma: no cover
        logger.exception("LLM call failed")
        return None, None, int((time.perf_counter() - started) * 1000)

    elapsed = int((time.perf_counter() - started) * 1000)
    return (llm_reply or "(empty reply)"), None, elapsed
366
 
367
 
368
  # ── Gradio UI ────────────────────────────────────────────────────────────────
src/engine/turn_logger.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """TurnLogger — per-turn JSONL telemetry for the minimal baseline.
2
+
3
+ Every voice or text turn writes one self-contained line to
4
+ `data/field_turns.jsonl` (path overridable via FIELD_TURNS_PATH).
5
+
6
+ This is the foundation for:
7
+ - field-test review (read the JSONL after a session)
8
+ - phrasebook hit-rate measurement
9
+ - LLM A/B comparisons
10
+ - eventually, Stage-4 LoRA training-data curation
11
+ (every line already pairs an English/French input with a vetted
12
+ Bambara/Pular reply; we'll filter on phrasebook-hit + user-confirmed
13
+ turns later).
14
+
15
+ Schema (one JSON object per line):
16
+ {
17
+ "ts": "<ISO-8601 UTC>",
18
+ "tab": "voice" | "text",
19
+ "input_lang": "bam" | "ful" | "fr" | "en" | null,
20
+ "output_lang": "bam" | "ful" | "fr" | "en",
21
+ "user_text": "<raw input from text tab, or transcript for voice tab>",
22
+ "transcript": "<whisper output, voice tab only>" | null,
23
+ "transcribe_ms": <int> | null,
24
+ "phrasebook": { match, score, category, source, target } | null,
25
+ "llm_model": "<model id>" | null,
26
+ "llm_ms": <int> | null,
27
+ "reply_text": "<final reply that fed TTS>",
28
+ "tts_ms": <int> | null,
29
+ "total_ms": <int>,
30
+ "error": "<short error string>" | null
31
+ }
32
+
33
+ Notes:
34
+ - File path is gitignored (data/ is excluded by .gitignore).
35
+ - Append mode + line-buffered + lock — safe for the single-process Gradio
36
+ server. Not designed for multi-worker writes.
37
+ """
38
+ from __future__ import annotations
39
+
40
+ import json
41
+ import logging
42
+ import os
43
+ import threading
44
+ from datetime import datetime, timezone
45
+ from pathlib import Path
46
+ from typing import Any, Optional
47
+
48
logger = logging.getLogger(__name__)

# Default sink: <repo root>/data/field_turns.jsonl. This module lives in
# src/engine/, so three .parent hops reach the repository root.
_DEFAULT_PATH = (
    Path(__file__).resolve().parent.parent.parent / "data" / "field_turns.jsonl"
)


def _utcnow_iso() -> str:
    """Return the current UTC time as a second-resolution ISO-8601 'Z' string."""
    return datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")


class TurnLogger:
    """Append-only JSONL logger. Thread-safe for one process.

    Both construction and :meth:`log` are best-effort: telemetry must never
    break the user-facing pipeline, so filesystem errors are logged and
    swallowed instead of propagated.
    """

    def __init__(self, path: Optional[str] = None) -> None:
        """Resolve the sink path: explicit arg > FIELD_TURNS_PATH env > default.

        Never raises. The app builds a module-level singleton at import time,
        so a read-only or permission-restricted filesystem must not abort
        startup — directory creation failures are downgraded to a warning and
        per-row writes will simply keep failing softly in `log()`.
        """
        env_path = os.environ.get("FIELD_TURNS_PATH")
        self.path = Path(path or env_path or _DEFAULT_PATH)
        self._lock = threading.Lock()
        try:
            self.path.parent.mkdir(parents=True, exist_ok=True)
        except OSError as exc:
            # Keep going; log() warns per-row if writes also fail.
            logger.warning(
                "TurnLogger could not create %s: %s", self.path.parent, exc
            )
        logger.info("TurnLogger writing to %s", self.path)

    def log(self, **fields: Any) -> None:
        """Write one row. Always sets ts; leaves the rest to the caller.

        Never raises — telemetry must not break the user-facing pipeline.
        """
        row = {"ts": _utcnow_iso(), **fields}
        try:
            line = json.dumps(row, ensure_ascii=False)
            # Lock + append keeps concurrent Gradio handler threads from
            # interleaving partial lines; not safe across processes.
            with self._lock, self.path.open("a", encoding="utf-8") as fh:
                fh.write(line + "\n")
        except Exception as exc:  # pragma: no cover
            logger.warning("TurnLogger.log failed: %s", exc)
src/llm/minimal_client.py CHANGED
@@ -76,8 +76,17 @@ def _load_anchors(lang: str) -> list[dict]:
76
  return data.get("pairs", [])
77
 
78
 
79
- def _build_system_prompt(target_lang: str) -> str:
80
- """Assemble the per-call system prompt for a target output language."""
 
 
 
 
 
 
 
 
 
81
  full = LANG_FULL_NAME.get(target_lang, "English")
82
  forbidden = FORBIDDEN_DRIFT.get(target_lang, "")
83
  anchors = _load_anchors(target_lang)
@@ -114,6 +123,19 @@ def _build_system_prompt(target_lang: str) -> str:
114
  if src and tgt:
115
  lines.append(f"- {src} → {tgt}")
116
 
 
 
 
 
 
 
 
 
 
 
 
 
 
117
  lines += [
118
  "",
119
  f"Always reply in {full}, even if the user writes to you in English, "
@@ -146,13 +168,23 @@ class MinimalClient:
146
  self._client = InferenceClient(token=self.hf_token)
147
  return self._client
148
 
149
- def chat(self, user_text: str, target_lang: str = "bam") -> str:
 
 
 
 
 
150
  """Return a plain-text reply in `target_lang`.
151
 
 
 
 
 
 
152
  On any error returns a short parenthetical error string so the caller
153
  can still feed something into TTS / display.
154
  """
155
- system_prompt = _build_system_prompt(target_lang)
156
  try:
157
  client = self._get_client()
158
  completion = client.chat_completion(
 
76
  return data.get("pairs", [])
77
 
78
 
79
+ def _build_system_prompt(
80
+ target_lang: str,
81
+ extra_examples: Optional[list[dict]] = None,
82
+ ) -> str:
83
+ """Assemble the per-call system prompt for a target output language.
84
+
85
+ `extra_examples`, when supplied, are appended after the curated 30-pair
86
+ gold list as additional dynamic few-shot anchoring — used by app_minimal
87
+ to inject the top-K nearest phrasebook entries when the strict short-
88
+ circuit misses.
89
+ """
90
  full = LANG_FULL_NAME.get(target_lang, "English")
91
  forbidden = FORBIDDEN_DRIFT.get(target_lang, "")
92
  anchors = _load_anchors(target_lang)
 
123
  if src and tgt:
124
  lines.append(f"- {src} → {tgt}")
125
 
126
+ if extra_examples:
127
+ lines += [
128
+ "",
129
+ "Additional reference phrases relevant to the current user input "
130
+ f"(curated gold {full} translations — use the same orthography and "
131
+ "style):",
132
+ ]
133
+ for item in extra_examples:
134
+ src = (item.get("source") or "").strip()
135
+ tgt = (item.get("target") or "").strip()
136
+ if src and tgt:
137
+ lines.append(f"- {src} → {tgt}")
138
+
139
  lines += [
140
  "",
141
  f"Always reply in {full}, even if the user writes to you in English, "
 
168
  self._client = InferenceClient(token=self.hf_token)
169
  return self._client
170
 
171
+ def chat(
172
+ self,
173
+ user_text: str,
174
+ target_lang: str = "bam",
175
+ extra_examples: Optional[list[dict]] = None,
176
+ ) -> str:
177
  """Return a plain-text reply in `target_lang`.
178
 
179
+ `extra_examples` (optional) — list of {source, target} dicts that get
180
+ appended to the system prompt as additional dynamic few-shot. Used by
181
+ app_minimal to RAG-inject the top-K nearest phrasebook entries when
182
+ the strict phrasebook short-circuit misses.
183
+
184
  On any error returns a short parenthetical error string so the caller
185
  can still feed something into TTS / display.
186
  """
187
+ system_prompt = _build_system_prompt(target_lang, extra_examples)
188
  try:
189
  client = self._get_client()
190
  completion = client.chat_completion(
src/llm/phrasebook.py CHANGED
@@ -121,3 +121,35 @@ def lookup(
121
  "match": "fuzzy",
122
  }
123
  return None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
121
  "match": "fuzzy",
122
  }
123
  return None
124
+
125
+
126
def top_k(user_text: str, target_lang: str, k: int = 3) -> list[dict]:
    """Return the k closest phrasebook entries to `user_text` regardless of threshold.

    Used as RAG-style few-shot context when the strict `lookup()` misses but we
    still want to anchor the LLM with locally relevant gold pairs.

    Args:
        user_text: Raw user input; normalized the same way `lookup()`
            normalizes, so scores are comparable.
        target_lang: Which phrasebook to search; a language with no loaded
            pairs yields an empty list.
        k: Number of entries to return. Values <= 0 return an empty list
            (previously a negative k silently returned all-but-the-worst
            entries via slice semantics).

    Returns:
        Up to `k` dicts with keys source/target/category/score, sorted by
        descending fuzzy score. Never raises for well-formed inputs.
    """
    import heapq

    if k <= 0:
        return []
    pairs = _load_phrasebook(target_lang)
    if not pairs:
        return []
    q = _normalize(user_text)
    if not q:
        return []

    # Score every entry that has a precomputed normalized source; exact
    # normalized match pins the score to 1.0 like lookup() does.
    scored = [
        (1.0 if src == q else SequenceMatcher(None, q, src).ratio(), p)
        for p in pairs
        if (src := p.get("_norm", ""))
    ]
    # nlargest == sorted(..., reverse=True)[:k] (stable on ties) but only
    # O(n log k) instead of sorting the whole phrasebook.
    best = heapq.nlargest(k, scored, key=lambda item: item[0])
    return [
        {
            "source": p.get("source"),
            "target": p.get("target"),
            "category": p.get("category"),
            "score": round(score, 3),
        }
        for score, p in best
    ]