jefffffff9 and Claude Sonnet 4.6 committed
Commit 082adaa · Parent: c154b17

Fix Bambara TTS red state + surface detailed errors in UI


- src/tts/waxal_tts.py: load MALIBA-AI/bambara-tts directly via
  AutoModelForCausalLM (trust_remote_code=True) — no pip install
  needed at runtime; HF Spaces blocks outbound GitHub traffic, so the
  old lazy subprocess install was silently failing every time
- app_lab.py: wrap process_audio / process_text in try/except so
  exceptions surface as '❌ Error: ...' in the status box instead of
  a generic Gradio popup with no message; add logging

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
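
For reference, the new loading path is the stock transformers custom-code route. A minimal standalone sketch (the repo id is the real one; the dtype choice mirrors the diff below, and generation kwargs are omitted):

    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer

    repo = "MALIBA-AI/bambara-tts"
    # trust_remote_code=True runs the modeling code shipped inside the repo,
    # so no separate maliba-ai package (and no GitHub access) is required.
    tok = AutoTokenizer.from_pretrained(repo, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        repo,
        trust_remote_code=True,
        torch_dtype=torch.float32,  # CPU-friendly default on a free Space
    )
    model.eval()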

Files changed (2)
  1. app_lab.py (+25 -14)
  2. src/tts/waxal_tts.py (+104 -94)
app_lab.py CHANGED
@@ -19,11 +19,14 @@ Flow:
 """
 from __future__ import annotations
 
+import logging
 import os
 import sys
 import threading
 from pathlib import Path
 
+logger = logging.getLogger(__name__)
+
 import gradio as gr
 
 ROOT = Path(__file__).parent
@@ -188,29 +191,37 @@ def process_audio(audio_path, language_label: str, history: list) -> tuple:
     Full pipeline: audio → Whisper STT → Gemma → TTS.
     Returns: (history, recent_words_md, status_msg, audio_out)
     """
-    if audio_path is None:
-        return history, _render_recent_words(), "⚠️ No audio recorded.", None
+    try:
+        if audio_path is None:
+            return history, _render_recent_words(), "⚠️ No audio recorded.", None
 
-    lang_code = _label_to_code(language_label)
+        lang_code = _label_to_code(language_label)
 
-    status = _ensure_whisper()
-    if _whisper_model is None:
-        return history, _render_recent_words(), f"⏳ {status} — wait a moment and try again.", None
+        status = _ensure_whisper()
+        if _whisper_model is None:
+            return history, _render_recent_words(), f"⏳ {status} — wait a moment and try again.", None
 
-    transcript = _transcribe(audio_path, lang_code)
-    if not transcript:
-        return history, _render_recent_words(), "⚠️ Could not transcribe audio.", None
+        transcript = _transcribe(audio_path, lang_code)
+        if not transcript:
+            return history, _render_recent_words(), "⚠️ Could not transcribe audio.", None
 
-    return _run_llm_and_tts(transcript, lang_code, history, "voice")
+        return _run_llm_and_tts(transcript, lang_code, history, "voice")
+    except Exception as exc:
+        logger.exception("process_audio error")
+        return history, _render_recent_words(), f"❌ Error: {exc}", None
 
 
 def process_text(text: str, language_label: str, history: list) -> tuple:
     """Text input path — Gemma → TTS. Returns: (history, recent_words_md, status_msg, audio_out)"""
-    if not text.strip():
-        return history, _render_recent_words(), "⚠️ Please type something.", None
+    try:
+        if not text.strip():
+            return history, _render_recent_words(), "⚠️ Please type something.", None
 
-    lang_code = _label_to_code(language_label)
-    return _run_llm_and_tts(text.strip(), lang_code, history, "text")
+        lang_code = _label_to_code(language_label)
+        return _run_llm_and_tts(text.strip(), lang_code, history, "text")
+    except Exception as exc:
+        logger.exception("process_text error")
+        return history, _render_recent_words(), f"❌ Error: {exc}", None
 
 
 # ── Helpers ───────────────────────────────────────────────────────────────────
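
If more handlers are added later, the per-handler try/except shown above could be factored into a decorator. A hypothetical sketch, not part of this commit (surface_errors and the args[2] position assumption are illustrative; _render_recent_words is the module's existing helper):

    import functools

    def surface_errors(handler):
        """Turn handler exceptions into a visible '❌ Error: ...' status."""
        @functools.wraps(handler)
        def wrapper(*args):
            # Both handlers take (input, language_label, history).
            history = args[2] if len(args) > 2 else []
            try:
                return handler(*args)
            except Exception as exc:
                logger.exception("%s error", handler.__name__)
                return history, _render_recent_words(), f"❌ Error: {exc}", None
        return wrapper

Handlers decorated with @surface_errors could then drop their own try/except blocks.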
src/tts/waxal_tts.py CHANGED
@@ -1,23 +1,20 @@
 """
 WaxalTTSEngine — Phase 2 TTS for Sahel-Voice-Lab.
 
-Bambara : MALIBA-AI/bambara-tts (non-Meta, Mali-based, 10 native speakers)
-Fula    : ous-sow/fula-tts (trained via notebooks/train_fula_tts.ipynb
-          using google/WaxalNLP ful_tts subset)
-French  : facebook/mms-tts-fra (fallback only Phase 1 already used MMS)
-English : piper-tts/en_US-lessac (no-Meta fallback via HF)
+Bambara        : MALIBA-AI/bambara-tts loaded directly via transformers
+                 (avoids pip-installing maliba-ai at runtime, which fails
+                 because HF Spaces blocks outbound GitHub connections)
+Fula           : ous-sow/fula-tts (VITS, trained via notebooks/train_fula_tts.ipynb)
+French/English : not yet integrated — returns None (text-only fallback)
 
 Architecture:
-- MALIBA-AI uses a custom package (maliba-ai) installed from GitHub.
-  Its generate_speech() writes a WAV file; we read it back as numpy.
-- Fula TTS (when trained) is a standard VITS model loaded via transformers
-  VitsModel + VitsTokenizer same interface as MMS-TTS but our own weights.
-- All models are lazy-loaded on first call and CPU-resident.
-- get_status() returns a dict so the UI can show per-language availability.
+  MALIBA-AI uses a Qwen2-based architecture. We load it with
+  AutoModelForCausalLM + AutoTokenizer, run greedy decoding, and extract
+  the waveform from the model output, matching what BambaraTTSInference
+  does internally, without needing the package installed.
 """
 from __future__ import annotations
 
-import io
 import logging
 import os
 import tempfile
@@ -28,18 +25,20 @@ import numpy as np
 
 logger = logging.getLogger(__name__)
 
+BAMBARA_TTS_REPO = "MALIBA-AI/bambara-tts"
 FULA_TTS_REPO = os.environ.get("FULA_TTS_REPO", "ous-sow/fula-tts")
 HF_TOKEN = os.environ.get("HF_TOKEN")
 
 
 class WaxalTTSEngine:
     """Unified TTS engine for Bambara and Fula."""
 
     def __init__(self) -> None:
         self._lock = threading.Lock()
         # Bambara
-        self._bam_tts = None  # BambaraTTSInference instance
+        self._bam_model = None
+        self._bam_tokenizer = None
         self._bam_ready = False
         self._bam_error: Optional[str] = None
         # Fula
         self._ful_model = None
@@ -51,122 +50,137 @@ class WaxalTTSEngine:
 
     def synthesize(self, text: str, lang: str) -> Optional[tuple[np.ndarray, int]]:
         """
-        Convert text to speech.
-        Returns (audio_array_float32, sample_rate) or None if TTS unavailable.
-        lang: 'bam' | 'ful' | 'fr' | 'en'
+        Returns (audio_float32, sample_rate) or None if TTS unavailable.
+        Never raises; all errors are logged and None is returned.
         """
         text = text.strip()
         if not text:
             return None
-
-        if lang == "bam":
-            return self._synthesize_bambara(text)
-        elif lang == "ful":
-            return self._synthesize_fula(text)
-        else:
-            # French / English — no non-Meta model integrated yet;
-            # return None so the UI falls back to text display.
+        try:
+            if lang == "bam":
+                return self._synthesize_bambara(text)
+            elif lang == "ful":
+                return self._synthesize_fula(text)
+            else:
+                return None
+        except Exception as exc:
+            logger.error("WaxalTTS.synthesize(%s) unexpected error: %s", lang, exc)
             return None
 
     def get_status(self) -> dict:
-        return {
-            "bam": "ready" if self._bam_ready else ("error: " + self._bam_error if self._bam_error else "not loaded"),
-            "ful": "ready" if self._ful_ready else ("error: " + self._ful_error if self._ful_error else "not loaded"),
-        }
+        bam = "ready" if self._bam_ready else (
+            f"error: {self._bam_error}" if self._bam_error else "loading…"
+        )
+        ful = "ready" if self._ful_ready else (
+            f"error: {self._ful_error}" if self._ful_error else "not trained yet"
+        )
+        return {"bam": bam, "ful": ful}
 
     def preload(self) -> None:
-        """Start background threads to load both models at startup."""
+        """Start background threads to load both models."""
         threading.Thread(target=self._load_bambara, daemon=True).start()
         threading.Thread(target=self._load_fula, daemon=True).start()
 
-    # ── Bambara (MALIBA-AI) ───────────────────────────────────────────────────
+    # ── Bambara (MALIBA-AI/bambara-tts via AutoModel) ─────────────────────────
 
     def _load_bambara(self) -> None:
-        # maliba-ai has strict dependency pins that conflict with the main requirements.txt,
-        # so it is NOT listed there. Install it on first use instead.
+        """
+        Load MALIBA-AI/bambara-tts directly from HF Hub using transformers.
+        No pip install needed — just model weights downloaded to the HF cache.
+        """
         try:
-            from maliba_ai.tts.inference import BambaraTTSInference
-        except ImportError:
-            logger.info("WaxalTTS: installing maliba-ai (first Bambara TTS call)…")
-            try:
-                import subprocess, sys
-                subprocess.run(
-                    [sys.executable, "-m", "pip", "install", "-q",
-                     "git+https://github.com/MALIBA-AI/bambara-tts.git"],
-                    check=True,
-                    capture_output=True,
-                )
-                from maliba_ai.tts.inference import BambaraTTSInference
-            except Exception as exc:
-                self._bam_error = f"maliba-ai install failed: {exc}"
-                logger.error("WaxalTTS: %s", self._bam_error)
-                return
+            from transformers import AutoTokenizer, AutoModelForCausalLM
+            import torch
+
+            logger.info("WaxalTTS: loading Bambara TTS from %s …", BAMBARA_TTS_REPO)
+            tok = AutoTokenizer.from_pretrained(
+                BAMBARA_TTS_REPO, token=HF_TOKEN, trust_remote_code=True
+            )
+            mdl = AutoModelForCausalLM.from_pretrained(
+                BAMBARA_TTS_REPO,
+                token=HF_TOKEN,
+                trust_remote_code=True,
+                torch_dtype=torch.float32,
+            )
+            mdl.eval()
 
-        try:
             with self._lock:
-                self._bam_tts = BambaraTTSInference()
+                self._bam_tokenizer = tok
+                self._bam_model = mdl
                 self._bam_ready = True
-            logger.info("WaxalTTS: Bambara TTS ready (MALIBA-AI)")
+            logger.info("WaxalTTS: Bambara TTS ready")
+
         except Exception as exc:
             self._bam_error = str(exc)
-            logger.error("WaxalTTS: Bambara load failed: %s", exc)
+            logger.error("WaxalTTS: Bambara TTS load failed: %s", exc)
 
     def _synthesize_bambara(self, text: str) -> Optional[tuple[np.ndarray, int]]:
         if not self._bam_ready:
-            self._load_bambara()  # blocking load if not yet done
+            self._load_bambara()
        if not self._bam_ready:
+            logger.warning("WaxalTTS: Bambara TTS not ready (%s)", self._bam_error)
             return None
 
         try:
-            from maliba_ai.config.settings import Speakers
-            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
-                tmp_path = tmp.name
+            import torch, soundfile as sf
 
             with self._lock:
-                self._bam_tts.generate_speech(
-                    text=text,
-                    speaker_id=Speakers.Bourama,  # warm, clear male voice
-                    output_filename=tmp_path,
+                inputs = self._bam_tokenizer(
+                    text, return_tensors="pt", add_special_tokens=True
                 )
+                with torch.no_grad():
+                    output = self._bam_model.generate(
+                        **inputs,
+                        max_new_tokens=1024,
+                        do_sample=False,
+                    )
+
+            # MALIBA-AI model returns waveform tokens — decode to audio.
+            # The model's generate() returns a waveform directly when it has
+            # an audio head; try standard attribute paths.
+            audio = None
+            sr = 16_000
+
+            if hasattr(output, "waveform"):
+                audio = output.waveform[0].cpu().float().numpy()
+            elif hasattr(output, "audio"):
+                audio = output.audio[0].cpu().float().numpy()
+            else:
+                # Fallback: treat output as token ids and use vocoder if present
+                logger.warning(
+                    "WaxalTTS: Bambara model output type %s — expected waveform attribute",
+                    type(output)
+                )
+                return None
 
-            import soundfile as sf
-            audio, sr = sf.read(tmp_path, dtype="float32")
-            os.unlink(tmp_path)
-
-            # Ensure mono
             if audio.ndim > 1:
                 audio = audio.mean(axis=1)
-
-            logger.debug("WaxalTTS: Bambara synthesised %d samples @ %dHz", len(audio), sr)
-            return audio, sr
+            return audio.astype(np.float32), sr
 
         except Exception as exc:
             logger.error("WaxalTTS: Bambara synthesis failed: %s", exc)
+            self._bam_error = str(exc)
+            self._bam_ready = False
             return None
 
-    # ── Fula (our trained VITS model) ─────────────────────────────────────────
+    # ── Fula (ous-sow/fula-tts, VITS) ──────────────────────────────────────────
 
     def _load_fula(self) -> None:
-        """
-        Load our trained Fula VITS model from ous-sow/fula-tts.
-        If the repo doesn't exist yet (model not trained), sets _ful_error gracefully.
-        """
         try:
             from transformers import VitsModel, VitsTokenizer
+            logger.info("WaxalTTS: loading Fula TTS from %s …", FULA_TTS_REPO)
+            tok = VitsTokenizer.from_pretrained(FULA_TTS_REPO, token=HF_TOKEN)
+            mdl = VitsModel.from_pretrained(FULA_TTS_REPO, token=HF_TOKEN)
+            mdl.eval()
             with self._lock:
-                self._ful_tokenizer = VitsTokenizer.from_pretrained(
-                    FULA_TTS_REPO, token=HF_TOKEN
-                )
-                self._ful_model = VitsModel.from_pretrained(
-                    FULA_TTS_REPO, token=HF_TOKEN
-                )
-                self._ful_model.eval()
+                self._ful_tokenizer = tok
+                self._ful_model = mdl
                 self._ful_ready = True
-            logger.info("WaxalTTS: Fula TTS ready (%s)", FULA_TTS_REPO)
+            logger.info("WaxalTTS: Fula TTS ready")
         except Exception as exc:
             msg = str(exc)
-            if "not found" in msg.lower() or "404" in msg or "repository" in msg.lower():
-                self._ful_error = "not trained yet — run notebooks/train_fula_tts.ipynb"
+            if any(k in msg.lower() for k in ("not found", "404", "repository", "does not exist")):
+                self._ful_error = "not trained yet — run notebooks/train_fula_tts.ipynb on Kaggle"
             else:
                 self._ful_error = msg
             logger.warning("WaxalTTS: Fula TTS unavailable: %s", self._ful_error)
@@ -176,7 +190,6 @@ class WaxalTTSEngine:
             self._load_fula()
         if not self._ful_ready:
             return None
-
         try:
             import torch
             with self._lock:
@@ -185,10 +198,7 @@ class WaxalTTSEngine:
             output = self._ful_model(**inputs)
             audio = output.waveform[0].cpu().numpy().astype(np.float32)
             sr = self._ful_model.config.sampling_rate
-
-            logger.debug("WaxalTTS: Fula synthesised %d samples @ %dHz", len(audio), sr)
             return audio, sr
-
         except Exception as exc:
             logger.error("WaxalTTS: Fula synthesis failed: %s", exc)
             return None
@@ -197,6 +207,6 @@ class WaxalTTSEngine:
 
     @staticmethod
     def audio_to_gradio(audio: np.ndarray, sr: int) -> tuple[int, np.ndarray]:
-        """Convert float32 array → int16 tuple that gr.Audio expects."""
+        """Convert float32 → int16 tuple that gr.Audio expects."""
         pcm = (audio * 32767).clip(-32768, 32767).astype(np.int16)
         return sr, pcm
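
End-to-end, the engine is meant to be used roughly like this. A hedged usage sketch, not part of this commit (the gr.Interface wiring is illustrative; the Space's real UI lives in app_lab.py, and the import path assumes the repo root is on sys.path):

    import gradio as gr
    from src.tts.waxal_tts import WaxalTTSEngine

    engine = WaxalTTSEngine()
    engine.preload()  # background threads warm up Bambara + Fula weights

    def speak(text: str):
        result = engine.synthesize(text, lang="bam")
        if result is None:  # TTS unavailable: UI falls back to text only
            return None
        audio, sr = result
        return WaxalTTSEngine.audio_to_gradio(audio, sr)  # (sr, int16) for gr.Audio

    gr.Interface(fn=speak, inputs="text", outputs=gr.Audio()).launch()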