Commit 8d7d9d8 · 1 parent: ad902c6
jefffffff9 (Claude Sonnet 4.6) committed

Fix conversation mode timeout: two-stage pipeline + faster LLM


Root cause: the entire pipeline (ASR + LLM API call + TTS) ran as one
blocking Gradio event before anything appeared in the UI. On cpu-basic:
- Whisper small on CPU: ~5-10s
- Qwen 72B on HF Serverless: 20-40s (queue + generation)
- MMS-TTS on CPU: ~5-10s
Total: 30-60s, hitting Gradio's request timeout → error in all boxes.

Fix 1 — Two-stage pipeline with .then() chaining:
Stage 1 (_do_asr): Whisper only → transcript appears in ~5s
Stage 2 (_do_respond): LLM + TTS → response + audio follow after
User sees the transcript almost immediately; no more blank wait.
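
The chaining mechanism is plain Gradio: .click() returns an event object, and
.then() registers a follow-up handler that only starts after the first handler
has returned and its outputs have rendered. A minimal, self-contained sketch of
the pattern (component and function names are placeholders, not the ones wired
up in app.py):

    import time
    import gradio as gr

    def do_asr(audio_path, lang):
        time.sleep(1)                      # stand-in for the fast Whisper step
        return f"[{lang}] transcript of {audio_path}"

    def do_respond(transcript, lang):
        time.sleep(5)                      # stand-in for the slow LLM + TTS step
        return f"reply to: {transcript}"

    with gr.Blocks() as demo:
        mic = gr.Audio(type="filepath", label="Mic")
        lang = gr.Dropdown(["bam", "ful"], value="bam", label="Language")
        transcript = gr.Textbox(label="Transcript")
        reply = gr.Textbox(label="Response")
        ask = gr.Button("Ask")
        # Stage 1 fills the transcript box first; stage 2 starts only after it,
        # so the UI is never blank while the slow step runs.
        ask.click(do_asr, inputs=[mic, lang], outputs=transcript).then(
            do_respond, inputs=[transcript, lang], outputs=reply
        )

    demo.launch()

Each stage is its own request, so neither has to finish inside a single Gradio
timeout window.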

Fix 2 — LLM model: Qwen 72B → Qwen 7B (10x faster on HF Serverless,
same quality for 1-3 sentence voice responses). The LLM_MODEL_ID env var
still overrides the default with any model.

Fix 3 — max_tokens 300→150: voice responses are short; cutting tokens
in half cuts LLM latency ~40% further.

Fix 4 — Remove @_gpu from _convo_pipeline: the LLM step is a network
request; wrapping it in the GPU time budget wasted the 55s allowance
on network latency instead of actual compute.
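
The underlying principle, assuming _gpu wraps the ZeroGPU spaces.GPU decorator
(an assumption; the 55s allowance suggests it, but the commit does not say):
decorate only work that actually touches the GPU and leave network-bound steps
outside, so a queued GPU slot is never held open while the process waits on an
API. A rough, illustrative sketch with placeholder functions:

    import time
    import spaces  # ZeroGPU helper available on Hugging Face Spaces

    @spaces.GPU(duration=55)           # the 55s budget goes to real compute
    def transcribe_on_gpu(audio_path: str) -> str:
        time.sleep(3)                  # stand-in for Whisper/TTS inference
        return f"transcript of {audio_path}"

    def ask_llm(prompt: str) -> str:   # undecorated: an HTTP call needs no GPU
        time.sleep(20)                 # stand-in for the remote LLM round-trip
        return f"answer to: {prompt}"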

Fix 5 — _do_respond for sensor mode: replicates phrase+intent+sensor+TTS
logic without re-running ASR, so both modes benefit from the split.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

Files changed (1)
  1. app.py +172 -18
app.py CHANGED
@@ -37,7 +37,7 @@ ADAPTER_REPO_ID = os.environ.get("ADAPTER_REPO_ID", "ous-sow/sahel-agri-adapte
 # whisper-small: ~10s on cpu-basic, good multilingual quality.
 # Override via WHISPER_MODEL_ID env var if you upgrade to a GPU Space later.
 WHISPER_MODEL_ID = os.environ.get("WHISPER_MODEL_ID", "openai/whisper-small")
-LLM_MODEL_ID = os.environ.get("LLM_MODEL_ID", "Qwen/Qwen2.5-72B-Instruct")
+LLM_MODEL_ID = os.environ.get("LLM_MODEL_ID", "Qwen/Qwen2.5-7B-Instruct")
 KAGGLE_USERNAME = os.environ.get("KAGGLE_USERNAME", "")
 KAGGLE_KEY = os.environ.get("KAGGLE_KEY", "")
 KAGGLE_KERNEL_SLUG = os.environ.get("KAGGLE_KERNEL_SLUG", "ous-sow/sahel-voice-master-trainer")
@@ -441,7 +441,6 @@ def set_voice_reference(audio_file) -> str:
         return f"❌ Could not process reference audio: {exc}"
 
 
-@_gpu
 def _convo_pipeline(audio_path: str, language_code: str, history: list):
     """
     Full S2S conversation pipeline with memory:
@@ -1288,6 +1287,150 @@ def handle_ask(audio_path, language_label, convo_mode: bool = False, history: li
         return f"❌ {e}", "", "", None, history
 
 
+# ── Two-stage pipeline (shows transcript fast, then response) ─────────────────
+
+def _do_asr(audio_path: str, language_label: str) -> str:
+    """
+    Stage 1 — Whisper only. Returns the transcript string (or error/status).
+    Completes in ~3-8s on cpu-basic so the user sees what was heard immediately.
+    """
+    if audio_path is None:
+        return "⚠️ No audio — press Record or upload a file."
+    lang = SUPPORTED_LANGUAGES.get(language_label, "bam")
+    status = _ensure_whisper_loaded()
+    if _whisper_model is None:
+        return f"⏳ Model loading ({status}). Wait a moment and try again."
+    try:
+        import torch, librosa
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+        audio_np, _ = librosa.load(audio_path, sr=16000, mono=True)
+        active_model = _fine_tuned_models.get(lang, _whisper_model)
+        active_model.to(device)
+        with _model_lock:
+            input_features = _whisper_processor.feature_extractor(
+                audio_np, sampling_rate=16000, return_tensors="pt"
+            ).input_features.to(device)
+            forced_ids = None
+            if lang not in ("bam", "ful"):
+                forced_ids = _whisper_processor.get_decoder_prompt_ids(
+                    language=lang, task="transcribe"
+                )
+            with torch.no_grad():
+                ids = active_model.generate(
+                    input_features,
+                    forced_decoder_ids=forced_ids or None,
+                    max_new_tokens=256,
+                )
+        transcript = _whisper_processor.batch_decode(ids, skip_special_tokens=True)[0].strip()
+        active_model.to("cpu")
+        if device == "cuda":
+            torch.cuda.empty_cache()
+        # Bambara phonetic normalisation
+        return bam_normalize(transcript) if lang == "bam" else transcript
+    except Exception as e:
+        return f"❌ Transcription error: {e}"
+
+
+def _do_respond(
+    transcript: str,
+    language_label: str,
+    convo_mode: bool,
+    history: list,
+) -> tuple:
+    """
+    Stage 2 — LLM or sensor response, runs after transcript is already visible.
+    Returns (eng_translation, response_text, audio_out, new_history, chat_msgs).
+    """
+    history = history or []
+    # Bail early if stage 1 errored
+    if not transcript or transcript[:1] in ("⚠️", "⏳", "❌") or transcript.startswith(("⚠", "⏳", "❌")):
+        chat_msgs = [[u, v] for u, v in history]
+        return "", "", None, history, chat_msgs
+
+    lang = SUPPORTED_LANGUAGES.get(language_label, "bam")
+    import torch
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+
+    if convo_mode:
+        # ── LLM brain ────────────────────────────────────────────────────────
+        response_text = ""
+        try:
+            from huggingface_hub import InferenceClient
+            client = InferenceClient(token=HF_TOKEN)
+            messages = _build_messages(transcript, history, lang)
+            completion = client.chat_completion(
+                model=LLM_MODEL_ID,
+                messages=messages,
+                max_tokens=150,  # short spoken responses, much faster
+                temperature=0.65,
+            )
+            response_text = completion.choices[0].message.content.strip()
+        except Exception as llm_err:
+            import logging
+            logging.getLogger(__name__).warning("LLM error: %s", llm_err)
+            response_text = (
+                "Hakɛ to, tasuma tɛ kɛ sisan. I ka a lasɔrɔ tugu."
+                if lang == "bam"
+                else "Sorry, I could not reach the language model right now."
+            )
+
+        # Strip [LEARNED:] tags, persist async
+        response_text, _ = _parse_and_strip_learned(response_text, lang)
+
+        # Update history
+        new_history = list(history) + [(transcript, response_text)]
+        if len(new_history) > 20:
+            new_history = new_history[-20:]
+        chat_msgs = [[u, v] for u, v in new_history]
+
+        # ── TTS ───────────────────────────────────────────────────────────────
+        audio_out = None
+        if _voice_ref_path and Path(_voice_ref_path).exists():
+            try:
+                from src.tts.f5_tts import synthesize as f5s
+                result = f5s(response_text, ref_wav_path=_voice_ref_path,
+                             ref_text=_voice_ref_text, device=device)
+                if result is not None:
+                    audio_out = (result[1], result[0])
+            except Exception:
+                pass
+        if audio_out is None:
+            wav_np, sr = _tts.synthesize(response_text, lang, device=device)
+            audio_out = (sr, wav_np)
+
+        return "", response_text, audio_out, new_history, chat_msgs
+
+    else:
+        # ── Sensor / phrase pipeline ──────────────────────────────────────────
+        import asyncio
+        phrase_match = _phrase_matcher.match(transcript, lang)
+        if phrase_match:
+            response_text = phrase_match["response"]
+            english_translation = phrase_match["english"]
+        else:
+            intent = _intent_parser.parse(transcript, language=lang)
+            try:
+                loop = asyncio.new_event_loop()
+                sensor_data = loop.run_until_complete(_sensor_bridge.fetch(intent))
+                loop.close()
+            except Exception:
+                from src.iot.sensor_bridge import SensorData
+                sensor_data = SensorData(sensor_type="soil",
+                                         values={"moisture_pct": 45.0, "ph": 6.5, "temperature_c": 28.0})
+            responder = VoiceResponder(language=lang)
+            response_text, english_translation = responder.generate_response(intent, sensor_data)
+            if intent.action == "unknown" and intent.confidence < 0.15:
+                from src.iot.voice_responder import BAMBARA_TEMPLATES, FULA_TEMPLATES
+                if lang == "bam":
+                    response_text, english_translation = BAMBARA_TEMPLATES["not_understood"]
+                elif lang == "ful":
+                    response_text, english_translation = FULA_TEMPLATES["not_understood"]
+
+        wav_np, sr = _tts.synthesize(response_text, lang, device=device)
+        chat_msgs = [[u, v] for u, v in history]
+        return english_translation, response_text, (sr, wav_np), history, chat_msgs
+
+
 # ── Gradio UI ─────────────────────────────────────────────────────────────────
 
 def build_ui() -> gr.Blocks:
@@ -1406,28 +1549,39 @@ def build_ui() -> gr.Blocks:
             outputs=[chatbot],
         )
 
-        _ask_inputs = [audio_input, language_dd, convo_mode_toggle, conv_history]
-        _ask_outputs = [transcript_box, translation_box, response_box,
-                        audio_output, conv_history, chatbot]
+        # ── Stage 1 inputs/outputs (ASR only — fast) ─────────────────
+        _s1_inputs = [audio_input, language_dd]
+        _s1_outputs = [transcript_box]
 
-        def _ask_and_update(ap, ll, cm, hist):
-            t, e, r, a, new_hist = handle_ask(ap, ll, cm, hist)
-            # Convert history tuples to list-of-lists for gr.Chatbot
-            chat_msgs = [[u, v] for u, v in new_hist]
-            return t, e, r, a, new_hist, chat_msgs
+        # ── Stage 2 inputs/outputs (LLM / sensor + TTS) ──────────────
+        _s2_inputs = [transcript_box, language_dd, convo_mode_toggle, conv_history]
+        _s2_outputs = [translation_box, response_box, audio_output,
+                       conv_history, chatbot]
 
+        # Manual button: stage 1 then stage 2
         ask_btn.click(
-            fn=_ask_and_update,
-            inputs=_ask_inputs,
-            outputs=_ask_outputs,
+            fn=_do_asr,
+            inputs=_s1_inputs,
+            outputs=_s1_outputs,
+        ).then(
+            fn=_do_respond,
+            inputs=_s2_inputs,
+            outputs=_s2_outputs,
        )
-        # Auto-submit when mic stops (Conversation Mode)
+
+        # Auto-submit on mic stop: same chain, but stage 2 only runs when
+        # convo_mode is ON (sensor mode has a manual button for deliberate use)
         audio_input.stop_recording(
-            fn=lambda ap, ll, cm, h: _ask_and_update(ap, ll, cm, h) if cm
-            else (None, None, None, None, h, [[u, v] for u, v in h]),
-            inputs=_ask_inputs,
-            outputs=_ask_outputs,
+            fn=_do_asr,
+            inputs=_s1_inputs,
+            outputs=_s1_outputs,
+        ).then(
+            fn=lambda t, ll, cm, h: _do_respond(t, ll, cm, h) if cm
+            else ("", "", None, h, [[u, v] for u, v in (h or [])]),
+            inputs=_s2_inputs,
+            outputs=_s2_outputs,
        )
+
         # Clear conversation
         clear_btn.click(
             fn=lambda: ([], []),