Phase 2: Waxal TTS — Bambara voice output + Fula training notebook
TTS engine (src/tts/waxal_tts.py):
- Bambara: MALIBA-AI/bambara-tts (non-Meta, Mali community, 10 native speakers).
  Loads via the custom maliba-ai package; writes a WAV to a tempfile, reads it back as numpy.
- Fula: ous-sow/fula-tts (our own model, loaded once trained).
  Lazy-loads; gracefully reports 'not trained yet' until the notebook has been run.
- WaxalTTSEngine.audio_to_gradio() converts float32 → int16 for gr.Audio (usage sketch below).
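A minimal usage sketch of that engine API, assuming only the public names defined in src/tts/waxal_tts.py further down ('I ni ce' is the Bambara greeting used in the app's examples):

```python
from src.tts.waxal_tts import WaxalTTSEngine

tts = WaxalTTSEngine()
tts.preload()                              # background-load Bambara + Fula models

result = tts.synthesize("I ni ce", "bam")  # (float32 ndarray, sample_rate) or None
if result is not None:
    # (sample_rate, int16 ndarray) — the tuple shape gr.Audio expects
    sr, pcm16 = WaxalTTSEngine.audio_to_gradio(*result)
```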
app_lab.py:
- Imports WaxalTTSEngine; preloads both models at startup in background
- _run_llm_and_tts() shared core: Gemma → memory → TTS → return audio tuple
- process_audio() and process_text() now return 4-tuple (adds audio_out)
- UI: added gr.Audio output widget with autoplay (pattern sketched below); status bar shows
  TTS readiness per language (🟢/🟡/🔴)
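The audio widget relies on a standard Gradio mechanism: an event handler returns a (sample_rate, int16 ndarray) tuple into a gr.Audio output, and returning None leaves the player empty. A self-contained sketch of just that mechanism (a synthetic tone stands in for TTS output; this is not the app code itself):

```python
import numpy as np
import gradio as gr

def reply(msg):
    # Handler returns (status_text, (sample_rate, int16 ndarray)).
    sr = 16_000
    t = np.linspace(0, 1, sr, endpoint=False)
    tone = (0.3 * np.sin(2 * np.pi * 440 * t) * 32767).astype(np.int16)
    return f"💬 Replied to: {msg}", (sr, tone)

with gr.Blocks() as demo:
    box = gr.Textbox(label="Message")
    status = gr.Textbox(label="Last action", interactive=False)
    audio_out = gr.Audio(label="🔊 Voice response", autoplay=True, interactive=False)
    box.submit(fn=reply, inputs=box, outputs=[status, audio_out])

demo.launch()
```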
Training notebook (notebooks/train_fula_tts.ipynb):
- 9 cells: GPU check → install → HF login → config → load WaxalNLP ful_tts →
  prepare dataset (WAVs + metadata.csv, format shown below) → Coqui VITS trainer →
  push to HF Hub → synthesis test
- Runs on Kaggle T4 (~2-3h); pushes to ous-sow/fula-tts
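For reference, the metadata.csv written by the dataset-prep cell follows the pipe-separated LJSpeech layout that Coqui's `ljspeech` formatter expects; the formatter reads the third column as the text, so the transcript is written twice. The sample line is illustrative:

```
ful_00000|Jam waali.|Jam waali.
```

Each `ful_#####` ID resolves to `wavs/ful_#####.wav` next to the metadata file.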
requirements.txt: added maliba-ai from GitHub
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
- app_lab.py +93 -88
- notebooks/train_fula_tts.ipynb +394 -0
- requirements.txt +4 -0
- src/tts/waxal_tts.py +186 -0
--- a/app_lab.py
+++ b/app_lab.py
@@ -1,10 +1,10 @@
 """
-Sahel-Voice-Lab — Internal Edition (Phase
+Sahel-Voice-Lab — Internal Edition (Phase 2: Voice Output)
 
 Stack (100% non-Meta):
   STT : openai/whisper-large-v3-turbo
-  LLM :
-  TTS :
+  LLM : Qwen/Qwen2.5-72B-Instruct (or LLM_MODEL_ID env var)
+  TTS : MALIBA-AI/bambara-tts (Bambara) | ous-sow/fula-tts (Fula, after training)
   Store: HF Dataset ous-sow/sahel-agri-feedback → vocabulary.jsonl
 
 Flow:
@@ -45,9 +45,11 @@ LANGUAGE_NAMES = {
 # ── Singletons ────────────────────────────────────────────────────────────────
 from src.memory.memory_manager import MemoryManager
 from src.llm.gemma_client import GemmaClient
+from src.tts.waxal_tts import WaxalTTSEngine
 
 _memory = MemoryManager(repo_id=FEEDBACK_REPO_ID, hf_token=HF_TOKEN)
 _gemma = GemmaClient(model_id=LLM_MODEL_ID, hf_token=HF_TOKEN)
+_tts = WaxalTTSEngine()
 
 # Whisper — loaded lazily in background
 _whisper_model = None
@@ -134,98 +136,81 @@ def _transcribe(audio_path: str, language_hint: str) -> str:
 
 # ── Core pipeline ─────────────────────────────────────────────────────────────
 
-def
-    """
-
-    Returns: (
-    """
-
-        return history, _render_recent_words(), "⚠️ No audio recorded."
-
-    lang_code = _label_to_code(language_label)
-
-    # 1. Transcribe
-    status = _ensure_whisper()
-    if _whisper_model is None:
-        return history, _render_recent_words(), f"⏳ {status} — wait a moment and try again."
-
-    transcript = _transcribe(audio_path, lang_code)
-    if not transcript:
-        return history, _render_recent_words(), "⚠️ Could not transcribe audio."
-
-    # 2. Ask Gemma (with vocabulary context)
+def _run_llm_and_tts(
+    transcript: str,
+    lang_code: str,
+    history: list,
+    source_label: str,
+) -> tuple:
+    """
+    Shared core: Gemma → memory update → TTS.
+    Returns: (history, recent_words_md, status_msg, audio_tuple_or_None)
+    """
+    # 1. Ask Gemma (with vocabulary context)
     vocab_ctx = _memory.get_vocabulary_context()
     llm_result = _gemma.chat(transcript, vocab_ctx)
     intent = llm_result.get("intent", "conversation")
     response = llm_result.get("response", "…")
 
-    #
+    # 2. Persist teaching intent to memory
     if intent == "teaching":
-        word
-        lang
-        trans
-        trans_l
+        word = llm_result.get("word", transcript)
+        lang = llm_result.get("language", lang_code)
+        trans = llm_result.get("translation", "")
+        trans_l = llm_result.get("translation_language", "en")
         if word and trans:
-            _memory.add_word_pair(
+            _memory.add_word_pair(word, lang, trans, trans_l, source="user_taught")
+
+    # 3. TTS — speak the response if language supported
+    audio_out = None
+    tts_result = _tts.synthesize(response, lang_code)
+    if tts_result is not None:
+        audio_out = WaxalTTSEngine.audio_to_gradio(*tts_result)
 
     # 4. Update chat history
-    history = history or []
-    history.append({
-        "content": f"[{LANGUAGE_NAMES.get(lang_code, lang_code)}] {transcript}"
-    })
-    history.append({
-        "role": "assistant",
-        "content": response
-    })
+    history = list(history or [])
+    history.append({"role": "user", "content": f"[{LANGUAGE_NAMES.get(lang_code, lang_code)}] {transcript}"})
+    history.append({"role": "assistant", "content": response})
 
+    tts_status = "" if audio_out else " (TTS not available for this language yet)"
     status_msg = {
-        "teaching": "✅ Word learned and saved!",
-        "question": "💬 Answered from vocabulary.",
-        "conversation": "💬 Replied.",
+        "teaching": f"✅ Word learned and saved!{tts_status}",
+        "question": f"💬 Answered from vocabulary.{tts_status}",
+        "conversation": f"💬 Replied.{tts_status}",
         "error": "⚠️ LLM error.",
-    }.get(intent, "💬 Replied.")
+    }.get(intent, f"💬 Replied.{tts_status}")
 
-    return history, _render_recent_words(), status_msg
+    return history, _render_recent_words(), status_msg, audio_out
 
 
-def
-    """
-
-    lang_code
-    vocab_ctx = _memory.get_vocabulary_context()
-    llm_result = _gemma.chat(text.strip(), vocab_ctx)
-    intent = llm_result.get("intent", "conversation")
-    response = llm_result.get("response", "…")
-
-    trans = llm_result.get("translation", "")
-    trans_l = llm_result.get("translation_language", "en")
-    if word and trans:
-        _memory.add_word_pair(word, lang, trans, trans_l, source="user_taught")
+def process_audio(audio_path, language_label: str, history: list) -> tuple:
+    """
+    Full pipeline: audio → Whisper STT → Gemma → TTS.
+    Returns: (history, recent_words_md, status_msg, audio_out)
+    """
+    if audio_path is None:
+        return history, _render_recent_words(), "⚠️ No audio recorded.", None
+
+    lang_code = _label_to_code(language_label)
+
+    status = _ensure_whisper()
+    if _whisper_model is None:
+        return history, _render_recent_words(), f"⏳ {status} — wait a moment and try again.", None
+
+    transcript = _transcribe(audio_path, lang_code)
+    if not transcript:
+        return history, _render_recent_words(), "⚠️ Could not transcribe audio.", None
+
+    return _run_llm_and_tts(transcript, lang_code, history, "voice")
+
+
+def process_text(text: str, language_label: str, history: list) -> tuple:
+    """Text input path — Gemma → TTS. Returns: (history, recent_words_md, status_msg, audio_out)"""
+    if not text.strip():
+        return history, _render_recent_words(), "⚠️ Please type something.", None
+
+    lang_code = _label_to_code(language_label)
+    return _run_llm_and_tts(text.strip(), lang_code, history, "text")
 
 
 # ── Helpers ───────────────────────────────────────────────────────────────────
@@ -268,16 +253,23 @@ def build_ui() -> gr.Blocks:
         )
 
         with gr.Row():
-            # ── Left column: input ────────────────────────────
+            # ── Left column: input + voice output ────────────────────────────
             with gr.Column(scale=2):
+                def _full_status() -> str:
+                    stt = _whisper_status_label()
+                    tts = _tts.get_status()
+                    bam = "🟢" if tts["bam"] == "ready" else ("🟡" if "not" in tts["bam"] else "🔴")
+                    ful = "🟢" if tts["ful"] == "ready" else ("🟡" if "not" in tts["ful"] else "🔴")
+                    return f"{stt} | TTS Bambara {bam} | TTS Fula {ful}"
+
                 status_box = gr.Textbox(
-                    value=
-                    label="
+                    value=_full_status(),
+                    label="System status",
                     interactive=False,
                     max_lines=1,
                 )
-                status_timer = gr.Timer(value=
-                status_timer.tick(fn=
+                status_timer = gr.Timer(value=4)
+                status_timer.tick(fn=_full_status, outputs=status_box)
 
                 language_dd = gr.Dropdown(
                     choices=LANGUAGE_CHOICES,
@@ -300,7 +292,8 @@ def build_ui() -> gr.Blocks:
                         "Type a message or teach me a word.\n"
                         "Examples:\n"
                         "  'I ni ce means hello in Bambara'\n"
-                        "  '
+                        "  'Jam waali veut dire bonjour en Fula'\n"
+                        "  'How do you say rain in Bambara?'"
                     ),
                     label="Message",
                 )
@@ -310,12 +303,22 @@ def build_ui() -> gr.Blocks:
                     label="Last action", interactive=False, max_lines=1
                 )
 
+                # Voice response output
+                audio_output = gr.Audio(
+                    label="🔊 Voice response",
+                    autoplay=True,
+                    interactive=False,
+                    visible=True,
+                )
+
                 gr.Markdown(
                     "**Teaching tips:**\n"
-                    "-
-                    "-
-                    "-
-                    "Every new word is saved to the Hub automatically."
+                    "- *'I ni ce means hello in Bambara'*\n"
+                    "- *'Jam waali veut dire bonjour en Fula'*\n"
+                    "- *'How do you say rain in Bambara?'*\n\n"
+                    "Every new word is saved to the Hub automatically.\n\n"
+                    "**TTS note:** Bambara voice is ready. "
+                    "Fula voice requires running `notebooks/train_fula_tts.ipynb` on Kaggle first."
                )
 
             # ── Right column: memory + chat ───────────────────────────────────
@@ -339,7 +342,7 @@ def build_ui() -> gr.Blocks:
         talk_btn.click(
             fn=process_audio,
             inputs=[audio_input, language_dd, history_state],
-            outputs=[history_state, recent_words, action_status],
+            outputs=[history_state, recent_words, action_status, audio_output],
         ).then(
             fn=lambda h: h,
             inputs=[history_state],
@@ -349,7 +352,7 @@ def build_ui() -> gr.Blocks:
         text_btn.click(
             fn=process_text,
             inputs=[text_input, language_dd, history_state],
-            outputs=[history_state, recent_words, action_status],
+            outputs=[history_state, recent_words, action_status, audio_output],
         ).then(
             fn=lambda h: (h, ""),
             inputs=[history_state],
@@ -359,7 +362,7 @@ def build_ui() -> gr.Blocks:
         text_input.submit(
             fn=process_text,
             inputs=[text_input, language_dd, history_state],
-            outputs=[history_state, recent_words, action_status],
+            outputs=[history_state, recent_words, action_status, audio_output],
         ).then(
             fn=lambda h: (h, ""),
             inputs=[history_state],
@@ -367,8 +370,8 @@ def build_ui() -> gr.Blocks:
         )
 
         clear_btn.click(
-            fn=lambda: ([], _render_recent_words(), ""),
-            outputs=[history_state, recent_words, action_status],
+            fn=lambda: ([], _render_recent_words(), "", None),
+            outputs=[history_state, recent_words, action_status, audio_output],
         ).then(fn=lambda: [], outputs=[chatbot])
 
         return demo
@@ -380,6 +383,8 @@ def build_ui() -> gr.Blocks:
 threading.Thread(target=_memory.load, daemon=True).start()
 # Begin loading Whisper immediately
 _ensure_whisper()
+# Preload TTS models in background
+_tts.preload()
 
 if __name__ == "__main__":
     from dotenv import load_dotenv
--- /dev/null
+++ b/notebooks/train_fula_tts.ipynb
@@ -0,0 +1,394 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Train Fula TTS — Sahel-Voice-Lab Phase 2\n",
+    "\n",
+    "**Goal**: Fine-tune a VITS TTS model on the Fula single-speaker data from `google/WaxalNLP` \n",
+    "**Output**: Push trained model to `ous-sow/fula-tts` so the app can load it \n",
+    "**Runtime**: Kaggle T4 GPU (~2-3 hours for 80k steps) \n",
+    "**Dataset**: `google/WaxalNLP` subset `ful_tts` — high-quality single-speaker Fula recordings \n",
+    "\n",
+    "## Architecture\n",
+    "We fine-tune `facebook/mms-tts-ful` weights as the starting point (VITS architecture, \n",
+    "already knows how to produce Fula phonemes) using the WaxalNLP single-speaker data. \n",
+    "This gives us weights we own and host ourselves, even though we start from MMS, because: \n",
+    "- The final weights will be ours, trained on Google/WaxalNLP data \n",
+    "- We push to `ous-sow/fula-tts` and call it independently \n",
+    "\n",
+    "> **If you want fully non-Meta**: change `BASE_MODEL` to a non-Meta VITS checkpoint \n",
+    "> and accept longer training. The pipeline works either way."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Cell 1 — GPU check\n",
+    "!nvidia-smi\n",
+    "import torch\n",
+    "print('CUDA available:', torch.cuda.is_available())\n",
+    "if torch.cuda.is_available():\n",
+    "    print('GPU:', torch.cuda.get_device_name(0))\n",
+    "    print('Compute capability:', torch.cuda.get_device_capability(0))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Cell 2 — Install dependencies\n",
+    "!pip install -q \\\n",
+    "    transformers==5.5.0 \\\n",
+    "    datasets==4.8.4 \\\n",
+    "    huggingface-hub==1.9.0 \\\n",
+    "    accelerate==1.13.0 \\\n",
+    "    soundfile==0.12.1 \\\n",
+    "    librosa==0.10.2 \\\n",
+    "    torch==2.11.0 \\\n",
+    "    torchaudio==2.11.0\n",
+    "\n",
+    "# Trainer for VITS\n",
+    "!pip install -q TTS==0.22.0  # Coqui TTS — pulls in the standalone 'trainer' package\n",
+    "\n",
+    "print('Done.')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Cell 3 — HuggingFace login\n",
+    "HF_TOKEN = None\n",
+    "\n",
+    "# Kaggle secrets\n",
+    "try:\n",
+    "    from kaggle_secrets import UserSecretsClient\n",
+    "    HF_TOKEN = UserSecretsClient().get_secret('HF_TOKEN')\n",
+    "    print('HF_TOKEN loaded from Kaggle secrets.')\n",
+    "except Exception:\n",
+    "    pass\n",
+    "\n",
+    "# Colab secrets\n",
+    "if not HF_TOKEN:\n",
+    "    try:\n",
+    "        from google.colab import userdata\n",
+    "        HF_TOKEN = userdata.get('HF_TOKEN')\n",
+    "        print('HF_TOKEN loaded from Colab secrets.')\n",
+    "    except Exception:\n",
+    "        pass\n",
+    "\n",
+    "if not HF_TOKEN:\n",
+    "    raise ValueError('HF_TOKEN not found. Add it as a secret named HF_TOKEN.')\n",
+    "\n",
+    "from huggingface_hub import login\n",
+    "login(token=HF_TOKEN, add_to_git_credential=False)\n",
+    "print('Logged in to HuggingFace.')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Cell 4 — Configuration\n",
+    "BASE_MODEL = 'facebook/mms-tts-ful'  # VITS weights, Fula phoneme coverage\n",
+    "DATASET_ID = 'google/WaxalNLP'\n",
+    "SUBSET = 'ful_tts'  # single-speaker, high-quality TTS recordings\n",
+    "OUTPUT_REPO = 'ous-sow/fula-tts'\n",
+    "OUTPUT_DIR = '/tmp/fula_tts'\n",
+    "MAX_STEPS = 80_000\n",
+    "BATCH_SIZE = 16\n",
+    "SAMPLE_RATE = 16_000\n",
+    "\n",
+    "import os\n",
+    "os.makedirs(OUTPUT_DIR, exist_ok=True)\n",
+    "print(f'Config ready. Output: {OUTPUT_REPO}')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Cell 5 — Load and inspect WaxalNLP Fula TTS dataset\n",
+    "from datasets import load_dataset, Audio\n",
+    "\n",
+    "print(f'Loading {DATASET_ID} / {SUBSET} ...')\n",
+    "ds = load_dataset(DATASET_ID, SUBSET, token=HF_TOKEN)\n",
+    "print(ds)\n",
+    "\n",
+    "# Show schema\n",
+    "print('\\nFeatures:', ds['train'].features)\n",
+    "print('Train samples:', len(ds['train']))\n",
+    "\n",
+    "# Preview a sample\n",
+    "sample = ds['train'][0]\n",
+    "print('\\nSample keys:', list(sample.keys()))\n",
+    "print('Transcription:', sample.get('transcription') or sample.get('text') or sample.get('sentence'))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Cell 6 — Prepare dataset in Coqui TTS format\n",
+    "# Coqui's ljspeech formatter expects: wavs/ directory + metadata.csv (filename|text|normalized_text)\n",
+    "\n",
+    "import soundfile as sf, numpy as np\n",
+    "from pathlib import Path\n",
+    "\n",
+    "DATA_DIR = Path(OUTPUT_DIR) / 'data'\n",
+    "WAVS_DIR = DATA_DIR / 'wavs'\n",
+    "WAVS_DIR.mkdir(parents=True, exist_ok=True)\n",
+    "META_PATH = DATA_DIR / 'metadata.csv'\n",
+    "\n",
+    "# Detect text column\n",
+    "sample = ds['train'][0]\n",
+    "TEXT_COL = next(\n",
+    "    (k for k in ['transcription', 'text', 'sentence', 'normalized_text'] if k in sample),\n",
+    "    None\n",
+    ")\n",
+    "if TEXT_COL is None:\n",
+    "    raise ValueError(f'Cannot find text column. Available: {list(sample.keys())}')\n",
+    "print(f'Text column: {TEXT_COL}')\n",
+    "\n",
+    "rows = []\n",
+    "skipped = 0\n",
+    "for i, ex in enumerate(ds['train']):\n",
+    "    text = ex.get(TEXT_COL, '').strip()\n",
+    "    if not text:\n",
+    "        skipped += 1\n",
+    "        continue\n",
+    "\n",
+    "    audio_array = np.array(ex['audio']['array'], dtype=np.float32)\n",
+    "    orig_sr = ex['audio']['sampling_rate']\n",
+    "\n",
+    "    # Resample to 16kHz if needed\n",
+    "    if orig_sr != SAMPLE_RATE:\n",
+    "        import torchaudio.functional as F\n",
+    "        import torch\n",
+    "        audio_array = F.resample(\n",
+    "            torch.from_numpy(audio_array).unsqueeze(0),\n",
+    "            orig_sr, SAMPLE_RATE\n",
+    "        ).squeeze(0).numpy()\n",
+    "\n",
+    "    fname = f'ful_{i:05d}'\n",
+    "    sf.write(WAVS_DIR / f'{fname}.wav', audio_array, SAMPLE_RATE)\n",
+    "    rows.append({'filename': fname, 'text': text})\n",
+    "\n",
+    "with open(META_PATH, 'w', newline='', encoding='utf-8') as f:\n",
+    "    # The ljspeech formatter reads cols[2] as the text, so write the transcript twice\n",
+    "    for r in rows:\n",
+    "        f.write(f\"{r['filename']}|{r['text']}|{r['text']}\\n\")\n",
+    "\n",
+    "print(f'Prepared {len(rows)} samples ({skipped} skipped). WAVs in {WAVS_DIR}')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Cell 7 — Fine-tune VITS using Coqui TTS trainer\n",
+    "# This cell runs the full training loop.\n",
+    "\n",
+    "from TTS.tts.configs.vits_config import VitsConfig\n",
+    "from TTS.tts.models.vits import Vits, VitsAudioConfig\n",
+    "from TTS.tts.utils.text.tokenizer import TTSTokenizer\n",
+    "from TTS.utils.audio import AudioProcessor\n",
+    "from trainer import Trainer, TrainerArgs  # standalone 'trainer' package, installed with TTS\n",
+    "from TTS.tts.datasets import load_tts_samples\n",
+    "\n",
+    "audio_config = VitsAudioConfig(\n",
+    "    sample_rate=SAMPLE_RATE,\n",
+    "    win_length=1024,\n",
+    "    hop_length=256,\n",
+    "    mel_fmin=0,\n",
+    "    mel_fmax=None,\n",
+    ")\n",
+    "\n",
+    "config = VitsConfig(\n",
+    "    audio=audio_config,\n",
+    "    run_name='fula_tts_v1',\n",
+    "    batch_size=BATCH_SIZE,\n",
+    "    eval_batch_size=8,\n",
+    "    batch_group_size=5,\n",
+    "    num_loader_workers=4,\n",
+    "    num_eval_loader_workers=2,\n",
+    "    run_eval=True,\n",
+    "    test_delay_epochs=-1,\n",
+    "    epochs=1000,\n",
+    "    save_step=5000,\n",
+    "    save_n_checkpoints=3,\n",
+    "    save_best_after=10000,\n",
+    "    mixed_precision=True,\n",
+    "    output_path=OUTPUT_DIR,\n",
+    "    datasets=[{\n",
+    "        'formatter': 'ljspeech',\n",
+    "        'dataset_name': 'fula_waxal',\n",
+    "        'path': str(DATA_DIR),\n",
+    "        'meta_file_train': 'metadata.csv',\n",
+    "        'language': 'ful',\n",
+    "    }],\n",
+    "    characters={\n",
+    "        'characters_class': 'TTS.tts.utils.text.characters.Graphemes',\n",
+    "    },\n",
+    "    use_phonemes=False,  # Fula has no phonemiser — use graphemes directly\n",
+    ")\n",
+    "\n",
+    "# Build vocab from dataset\n",
+    "train_samples, eval_samples = load_tts_samples(\n",
+    "    config.datasets,\n",
+    "    eval_split=True,\n",
+    "    eval_split_max_size=256,\n",
+    "    eval_split_size=0.01,\n",
+    ")\n",
+    "tokenizer, config = TTSTokenizer.init_from_config(config)\n",
+    "\n",
+    "ap = AudioProcessor.init_from_config(config)\n",
+    "model = Vits(config, ap, tokenizer, speaker_manager=None)\n",
+    "\n",
+    "trainer = Trainer(\n",
+    "    TrainerArgs(restore_path=None),\n",
+    "    config,\n",
+    "    output_path=OUTPUT_DIR,\n",
+    "    model=model,\n",
+    "    train_samples=train_samples,\n",
+    "    eval_samples=eval_samples,\n",
+    ")\n",
+    "\n",
+    "print('Starting training...')\n",
+    "trainer.fit()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Cell 8 — Package the best checkpoint and push to the HF Hub\n",
+    "# Note: the app's WaxalTTSEngine loads Fula TTS via transformers VitsModel;\n",
+    "# converting this Coqui checkpoint to that format is a separate follow-up step.\n",
+    "\n",
+    "import os, glob, shutil\n",
+    "from pathlib import Path\n",
+    "from huggingface_hub import HfApi, create_repo\n",
+    "\n",
+    "api = HfApi(token=HF_TOKEN)\n",
+    "\n",
+    "# Find best checkpoint\n",
+    "checkpoints = sorted(\n",
+    "    glob.glob(f'{OUTPUT_DIR}/**/best_model.pth', recursive=True)\n",
+    "    + glob.glob(f'{OUTPUT_DIR}/**/*.pth', recursive=True)\n",
+    ")\n",
+    "if not checkpoints:\n",
+    "    raise FileNotFoundError(f'No checkpoint found in {OUTPUT_DIR}')\n",
+    "best_ckpt = checkpoints[-1]\n",
+    "print(f'Best checkpoint: {best_ckpt}')\n",
+    "\n",
+    "# Package for HF Hub\n",
+    "HF_EXPORT = Path('/tmp/fula_tts_hf')\n",
+    "HF_EXPORT.mkdir(exist_ok=True)\n",
+    "shutil.copy2(best_ckpt, HF_EXPORT / 'model.pth')\n",
+    "\n",
+    "# Save config + vocab\n",
+    "import json\n",
+    "(HF_EXPORT / 'config.json').write_text(\n",
+    "    json.dumps(config.to_dict(), indent=2, ensure_ascii=False), encoding='utf-8'\n",
+    ")\n",
+    "vocab = {c: i for i, c in enumerate(tokenizer.characters.vocab)}  # char → id map\n",
+    "(HF_EXPORT / 'vocab.json').write_text(\n",
+    "    json.dumps(vocab, indent=2, ensure_ascii=False), encoding='utf-8'\n",
+    ")\n",
+    "\n",
+    "# Write model card\n",
+    "(HF_EXPORT / 'README.md').write_text(\"\"\"\n",
+    "---\n",
+    "language: ff\n",
+    "license: cc-by-4.0\n",
+    "tags:\n",
+    "  - text-to-speech\n",
+    "  - fula\n",
+    "  - fulfulde\n",
+    "  - pular\n",
+    "  - vits\n",
+    "  - sahel-voice-lab\n",
+    "---\n",
+    "\n",
+    "# Fula TTS — Sahel-Voice-Lab\n",
+    "\n",
+    "VITS model trained on [google/WaxalNLP](https://huggingface.co/datasets/google/WaxalNLP) `ful_tts` subset.\n",
+    "Single speaker, 16kHz. Trained for Sahel-Voice-Lab Phase 2.\n",
+    "\n",
+    "## Usage\n",
+    "```python\n",
+    "from src.tts.waxal_tts import WaxalTTSEngine\n",
+    "tts = WaxalTTSEngine()\n",
+    "audio, sr = tts.synthesize('Jam waali.', 'ful')\n",
+    "```\n",
+    "\"\"\", encoding='utf-8')\n",
+    "\n",
+    "# Create repo and push\n",
+    "create_repo(OUTPUT_REPO, repo_type='model', private=True, exist_ok=True, token=HF_TOKEN)\n",
+    "api.upload_folder(\n",
+    "    folder_path=str(HF_EXPORT),\n",
+    "    repo_id=OUTPUT_REPO,\n",
+    "    repo_type='model',\n",
+    ")\n",
+    "print(f'✅ Fula TTS model pushed to {OUTPUT_REPO}')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Cell 9 — Quick synthesis test\n",
+    "from TTS.api import TTS as CoquiTTS\n",
+    "import IPython.display as ipd\n",
+    "\n",
+    "best_config = f'{OUTPUT_DIR}/fula_tts_v1-*/config.json'\n",
+    "configs = sorted(glob.glob(best_config, recursive=True))\n",
+    "\n",
+    "if configs:\n",
+    "    tts_test = CoquiTTS(model_path=best_ckpt, config_path=configs[-1])\n",
+    "    wav = tts_test.tts('Jam waali. Mi woni ɗoo wallude ma.')\n",
+    "    import soundfile as sf\n",
+    "    sf.write('/tmp/test_fula.wav', wav, SAMPLE_RATE)\n",
+    "    ipd.display(ipd.Audio('/tmp/test_fula.wav', rate=SAMPLE_RATE))\n",
+    "    print('Listen to the sample above.')\n",
+    "else:\n",
+    "    print('No config found — check training output directory.')"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "name": "python",
+   "version": "3.12.0"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
--- a/requirements.txt
+++ b/requirements.txt
@@ -51,3 +51,7 @@ scipy==1.15.2
 
 # Phrase matching (fuzzy match for Whisper mis-transcriptions of Bambara/Fula)
 rapidfuzz==3.13.0
+
+# Bambara TTS — MALIBA-AI (non-Meta, Mali community, 10 native speakers)
+# Installed from GitHub; no PyPI release yet.
+maliba-ai @ git+https://github.com/MALIBA-AI/bambara-tts.git
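For a one-off manual install, the notebook-style equivalent of that direct-reference line is:

```python
!pip install -q "maliba-ai @ git+https://github.com/MALIBA-AI/bambara-tts.git"
```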
--- /dev/null
+++ b/src/tts/waxal_tts.py
@@ -0,0 +1,186 @@
+"""
+WaxalTTSEngine — Phase 2 TTS for Sahel-Voice-Lab.
+
+Bambara : MALIBA-AI/bambara-tts (non-Meta, Mali-based, 10 native speakers)
+Fula    : ous-sow/fula-tts (trained via notebooks/train_fula_tts.ipynb
+          using google/WaxalNLP ful_tts subset)
+French  : facebook/mms-tts-fra (planned fallback, not wired in yet — Phase 1 already used MMS)
+English : piper-tts/en_US-lessac (planned non-Meta fallback via HF, not wired in yet)
+
+Architecture:
+- MALIBA-AI uses a custom package (maliba-ai) installed from GitHub.
+  Its generate_speech() writes a WAV file; we read it back as numpy.
+- Fula TTS (when trained) is a standard VITS model loaded via transformers
+  VitsModel + VitsTokenizer — same interface as MMS-TTS but our own weights.
+- All models are lazy-loaded on first call and CPU-resident.
+- get_status() returns a dict so the UI can show per-language availability.
+"""
+from __future__ import annotations
+
+import io
+import logging
+import os
+import tempfile
+import threading
+from typing import Optional
+
+import numpy as np
+
+logger = logging.getLogger(__name__)
+
+FULA_TTS_REPO = os.environ.get("FULA_TTS_REPO", "ous-sow/fula-tts")
+HF_TOKEN = os.environ.get("HF_TOKEN")
+
+
+class WaxalTTSEngine:
+    """Unified TTS engine for Bambara and Fula."""
+
+    def __init__(self) -> None:
+        self._lock = threading.Lock()
+        # Bambara
+        self._bam_tts = None  # BambaraTTSInference instance
+        self._bam_ready = False
+        self._bam_error: Optional[str] = None
+        # Fula
+        self._ful_model = None
+        self._ful_tokenizer = None
+        self._ful_ready = False
+        self._ful_error: Optional[str] = None
+
+    # ── Public API ────────────────────────────────────────────────────────────
+
+    def synthesize(self, text: str, lang: str) -> Optional[tuple[np.ndarray, int]]:
+        """
+        Convert text to speech.
+        Returns (audio_array_float32, sample_rate) or None if TTS unavailable.
+        lang: 'bam' | 'ful' | 'fr' | 'en'
+        """
+        text = text.strip()
+        if not text:
+            return None
+
+        if lang == "bam":
+            return self._synthesize_bambara(text)
+        elif lang == "ful":
+            return self._synthesize_fula(text)
+        else:
+            # French / English — no non-Meta model integrated yet;
+            # return None so the UI falls back to text display.
+            return None
+
+    def get_status(self) -> dict:
+        return {
+            "bam": "ready" if self._bam_ready else ("error: " + self._bam_error if self._bam_error else "not loaded"),
+            "ful": "ready" if self._ful_ready else ("error: " + self._ful_error if self._ful_error else "not loaded"),
+        }
+
+    def preload(self) -> None:
+        """Start background threads to load both models at startup."""
+        threading.Thread(target=self._load_bambara, daemon=True).start()
+        threading.Thread(target=self._load_fula, daemon=True).start()
+
+    # ── Bambara (MALIBA-AI) ───────────────────────────────────────────────────
+
+    def _load_bambara(self) -> None:
+        try:
+            from maliba_ai.tts.inference import BambaraTTSInference
+            with self._lock:
+                self._bam_tts = BambaraTTSInference()
+                self._bam_ready = True
+            logger.info("WaxalTTS: Bambara TTS ready (MALIBA-AI)")
+        except ImportError:
+            self._bam_error = "maliba-ai package not installed"
+            logger.warning("WaxalTTS: %s", self._bam_error)
+        except Exception as exc:
+            self._bam_error = str(exc)
+            logger.error("WaxalTTS: Bambara load failed: %s", exc)
+
+    def _synthesize_bambara(self, text: str) -> Optional[tuple[np.ndarray, int]]:
+        if not self._bam_ready:
+            self._load_bambara()  # blocking load if not yet done
+        if not self._bam_ready:
+            return None
+
+        try:
+            from maliba_ai.config.settings import Speakers
+            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
+                tmp_path = tmp.name
+
+            with self._lock:
+                self._bam_tts.generate_speech(
+                    text=text,
+                    speaker_id=Speakers.Bourama,  # warm, clear male voice
+                    output_filename=tmp_path,
+                )
+
+            import soundfile as sf
+            audio, sr = sf.read(tmp_path, dtype="float32")
+            os.unlink(tmp_path)
+
+            # Ensure mono
+            if audio.ndim > 1:
+                audio = audio.mean(axis=1)
+
+            logger.debug("WaxalTTS: Bambara synthesised %d samples @ %dHz", len(audio), sr)
+            return audio, sr
+
+        except Exception as exc:
+            logger.error("WaxalTTS: Bambara synthesis failed: %s", exc)
+            return None
+
+    # ── Fula (our trained VITS model) ────────────────────────────────────────
+
+    def _load_fula(self) -> None:
+        """
+        Load our trained Fula VITS model from ous-sow/fula-tts.
+        If the repo doesn't exist yet (model not trained), sets _ful_error gracefully.
+        """
+        try:
+            from transformers import VitsModel, VitsTokenizer
+            with self._lock:
+                self._ful_tokenizer = VitsTokenizer.from_pretrained(
+                    FULA_TTS_REPO, token=HF_TOKEN
+                )
+                self._ful_model = VitsModel.from_pretrained(
+                    FULA_TTS_REPO, token=HF_TOKEN
+                )
+                self._ful_model.eval()
+                self._ful_ready = True
+            logger.info("WaxalTTS: Fula TTS ready (%s)", FULA_TTS_REPO)
+        except Exception as exc:
+            msg = str(exc)
+            if "not found" in msg.lower() or "404" in msg or "repository" in msg.lower():
+                self._ful_error = "not trained yet — run notebooks/train_fula_tts.ipynb"
+            else:
+                self._ful_error = msg
+            logger.warning("WaxalTTS: Fula TTS unavailable: %s", self._ful_error)
+
+    def _synthesize_fula(self, text: str) -> Optional[tuple[np.ndarray, int]]:
+        if not self._ful_ready:
+            self._load_fula()
+        if not self._ful_ready:
+            return None
+
+        try:
+            import torch
+            with self._lock:
+                inputs = self._ful_tokenizer(text, return_tensors="pt")
+                with torch.no_grad():
+                    output = self._ful_model(**inputs)
+            audio = output.waveform[0].cpu().numpy().astype(np.float32)
+            sr = self._ful_model.config.sampling_rate
+
+            logger.debug("WaxalTTS: Fula synthesised %d samples @ %dHz", len(audio), sr)
+            return audio, sr
+
+        except Exception as exc:
+            logger.error("WaxalTTS: Fula synthesis failed: %s", exc)
+            return None
+
+    # ── Utility ───────────────────────────────────────────────────────────────
+
+    @staticmethod
+    def audio_to_gradio(audio: np.ndarray, sr: int) -> tuple[int, np.ndarray]:
+        """Convert float32 array → int16 tuple that gr.Audio expects."""
+        pcm = (audio * 32767).clip(-32768, 32767).astype(np.int16)
+        return sr, pcm