Phase 3: Voice-to-Voice S2S pipeline — F5-TTS, LLM brain, CER metric
app.py:
- Add LLM_MODEL_ID env var (default: Qwen/Qwen2.5-72B-Instruct)
- Import GemmaClient + bam_normalize at module level
- Add _voice_ref_path / _voice_ref_text / _llm_client state
- Add set_voice_reference(): converts MP3->24kHz WAV, auto-transcribes
- Add _convo_pipeline(): ASR -> bam_normalize -> LLM (phonetic Bambara
system prompt) -> F5-TTS (voice ref) with MMS-TTS fallback; flow sketched below
- handle_ask() accepts convo_mode=bool, routes to _convo_pipeline or
the original sensor pipeline accordingly
- Tab 1 UI: Conversation Mode toggle, Voice Reference upload accordion,
stop_recording auto-submit for true back-to-back loop
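
For orientation, a minimal sketch of the conversation-mode routing described above. It is illustrative only: the real handle_ask() resolves the language label to a code first, and both pipelines return the same 4-tuple (transcript, translation, response, audio).

    def handle_ask(audio_path, language_code, convo_mode=False):
        # Conversation Mode: ASR -> bam_normalize -> LLM brain -> F5-TTS mouth
        # (MMS-TTS is used when F5-TTS or a voice reference is unavailable).
        if convo_mode:
            return _convo_pipeline(audio_path, language_code)
        # Default: the original sensor pipeline is unchanged.
        return _run_pipeline(audio_path, language_code)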
src/tts/f5_tts.py (new):
- Lazy-loaded F5TTS wrapper; synthesize() with ref_wav + ref_text (usage sketched below)
- to_wav_24k() resampler (F5-TTS needs 24 kHz input)
- Graceful fallback (returns None) when f5-tts not installed
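
A usage sketch of the wrapper, with signatures taken from src/tts/f5_tts.py in the diff below; "speaker_ref.mp3" is a placeholder path, not a file in the repo:

    from src.tts.f5_tts import to_wav_24k, synthesize

    ref_wav = to_wav_24k("speaker_ref.mp3")   # placeholder clip; converted to 24 kHz mono WAV
    result = synthesize(
        "Aw ni ce.",                          # text to speak in the cloned voice
        ref_wav_path=ref_wav,
        ref_text="",                          # empty string -> in-context inference
        device="cuda",
    )
    if result is not None:
        wav_np, sample_rate = result          # float32 waveform + sample rate
    # result is None when f5-tts is missing or synthesis fails; callers fall back to MMS-TTS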
src/data/bam_normalize.py (new):
- _bam_norm(): ou->u, dj->j, gn->ɲ, ny->ɲ, ch->c, oo->ɔ (open o), ee->ɛ (open e)
- Used at inference (app.py) and training (notebook Cell 11); example below
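
A quick example of the normaliser; the outputs follow mechanically from the rules above:

    from src.data.bam_normalize import normalize

    print(normalize("I ni ce, a bɛ djourou la"))   # -> "i ni ce, a bɛ juru la"
    print(normalize("Doumouni gnouman"))           # -> "dumuni ɲuman"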
requirements.txt: add f5-tts>=1.0.0
Notebook:
- Cell 10: _bam_norm() defined inline (no external dependency)
- Cell 11: apply _bam_norm before tokenisation in prepare_dataset
- Cell 14: compute_metrics returns {cer, wer}; CER is the primary metric (jiwer sketch after this list)
- Cell 15: metric_for_best_model='cer'; best checkpoint = lowest CER
- Cell 17: show CER (primary) + WER (secondary) in evaluation output
- Cell 19: CER in Hub commit message
- Cell 20: CER in verification summary
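
A minimal sketch of the scoring that Cell 14's compute_metrics performs with jiwer; the notebook additionally lowercases and strips punctuation via jiwer.Compose transforms before computing either rate:

    import jiwer

    refs  = ["i ni ce", "dumuni ɲuman don"]
    preds = ["i ni se", "dumuni ɲuman don"]     # one substituted character

    cer = jiwer.cer(refs, preds)                # character error rate (primary, lower is better)
    wer = jiwer.wer(refs, preds)                # word error rate (secondary)
    print({"cer": round(cer, 4), "wer": round(wer, 4)})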
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
- app.py +231 -8
- notebooks/kaggle_master_trainer.ipynb +11 -7
- requirements.txt +5 -0
- scripts/patch_notebook_cer.py +177 -0
- src/data/bam_normalize.py +67 -0
- src/tts/f5_tts.py +114 -0

app.py: changed hunks

@@ -7,6 +7,7 @@ Environment variables (set in Space Settings → Secrets):
@@ -36,6 +37,7 @@ ADAPTER_REPO_ID = os.environ.get("ADAPTER_REPO_ID", "ous-sow/sahel-agri-adapte
@@ -66,11 +68,18 @@ _fine_tuned_models = {}  # lang_code -> WhisperForConditionalGeneration (ful
@@ -237,6 +246,174 @@ def _run_pipeline(audio_path: str, language_code: str):
@@ -937,7 +1114,7 @@ def _harvest_hf_dataset(lang_label: str, max_samples: int = 500) -> str:
-def handle_ask(audio_path, language_label):
@@ -948,8 +1125,11 @@ def handle_ask(audio_path, language_label):
@@ -977,6 +1157,39 @@ def build_ui() -> gr.Blocks:
@@ -999,15 +1212,15 @@
-label="English translation",
@@ -1021,10 +1234,20 @@
-inputs=
-outputs=

app.py: new and context lines (keyed by post-change line numbers)
| 7 |
FEEDBACK_REPO_ID — e.g. ous-sow/sahel-agri-feedback (dataset, private)
|
| 8 |
ADAPTER_REPO_ID — e.g. ous-sow/sahel-agri-adapters (model, private)
|
| 9 |
WHISPER_MODEL_ID — default: openai/whisper-small
|
| 10 |
+
LLM_MODEL_ID — default: Qwen/Qwen2.5-72B-Instruct
|
| 11 |
KAGGLE_USERNAME — Kaggle username (for auto-trigger training)
|
| 12 |
KAGGLE_KEY — Kaggle API key (for auto-trigger training)
|
| 13 |
KAGGLE_KERNEL_SLUG — default: ous-sow/sahel-voice-master-trainer
|
|
|
|
| 37 |
# whisper-small: ~10s on cpu-basic, good multilingual quality.
|
| 38 |
# Override via WHISPER_MODEL_ID env var if you upgrade to a GPU Space later.
|
| 39 |
WHISPER_MODEL_ID = os.environ.get("WHISPER_MODEL_ID", "openai/whisper-small")
|
| 40 |
+
LLM_MODEL_ID = os.environ.get("LLM_MODEL_ID", "Qwen/Qwen2.5-72B-Instruct")
|
| 41 |
KAGGLE_USERNAME = os.environ.get("KAGGLE_USERNAME", "")
|
| 42 |
KAGGLE_KEY = os.environ.get("KAGGLE_KEY", "")
|
| 43 |
KAGGLE_KERNEL_SLUG = os.environ.get("KAGGLE_KERNEL_SLUG", "ous-sow/sahel-voice-master-trainer")
|
|
|
|
| 68 |
_model_lock = threading.Lock()
|
| 69 |
_model_status = "not loaded"
|
| 70 |
|
| 71 |
+
# ── Conversation-mode state ───────────────────────────────────────────────────
|
| 72 |
+
_voice_ref_path: str | None = None # path to 24 kHz WAV converted from user MP3
|
| 73 |
+
_voice_ref_text: str = "" # auto-transcribed text of reference audio
|
| 74 |
+
_llm_client = None # GemmaClient, lazy init
|
| 75 |
+
|
| 76 |
from src.tts.mms_tts import MMSTTSEngine
|
| 77 |
from src.iot.intent_parser import IntentParser
|
| 78 |
from src.iot.sensor_bridge import SensorBridge
|
| 79 |
from src.iot.voice_responder import VoiceResponder
|
| 80 |
from src.conversation.phrase_matcher import PhraseMatcher
|
| 81 |
+
from src.llm.gemma_client import GemmaClient
|
| 82 |
+
from src.data.bam_normalize import normalize as bam_normalize
|
| 83 |
|
| 84 |
_tts = MMSTTSEngine()
|
| 85 |
_intent_parser = IntentParser()
|
|
|
|
| 246 |
return transcript, english_translation, response_text, (sample_rate, wav_np)
|
| 247 |
|
| 248 |
|
| 249 |
+
# ── Conversation-mode helpers ─────────────────────────────────────────────────
|
| 250 |
+
|
| 251 |
+
# Bambara conversation system prompt — instructs LLM to respond phonetically
|
| 252 |
+
_BAM_CONVO_SYSTEM = """\
|
| 253 |
+
You are a friendly Bambara voice assistant. Rules you must follow:
|
| 254 |
+
1. Always reply in Bambara, matching the user's informal spoken style.
|
| 255 |
+
2. Use phonetic spelling: write 'u' instead of 'ou', 'j' instead of 'dj', \
|
| 256 |
+
'c' instead of 'ch' — spell words as they sound when spoken aloud.
|
| 257 |
+
3. Keep responses short: 1–3 sentences max. This is a voice conversation.
|
| 258 |
+
4. Never add translations or explanations unless explicitly asked.
|
| 259 |
+
5. If the user speaks French or English, switch to that language naturally."""
|
| 260 |
+
|
| 261 |
+
|
| 262 |
+
def _get_llm() -> GemmaClient:
|
| 263 |
+
global _llm_client
|
| 264 |
+
if _llm_client is None:
|
| 265 |
+
_llm_client = GemmaClient(model_id=LLM_MODEL_ID, hf_token=HF_TOKEN)
|
| 266 |
+
return _llm_client
|
| 267 |
+
|
| 268 |
+
|
| 269 |
+
def set_voice_reference(audio_file) -> str:
|
| 270 |
+
"""
|
| 271 |
+
Store an uploaded audio file as the TTS voice reference.
|
| 272 |
+
Converts to 24 kHz WAV (F5-TTS requirement) and auto-transcribes.
|
| 273 |
+
Returns a status string for the UI.
|
| 274 |
+
"""
|
| 275 |
+
global _voice_ref_path, _voice_ref_text
|
| 276 |
+
|
| 277 |
+
if audio_file is None:
|
| 278 |
+
_voice_ref_path = None
|
| 279 |
+
_voice_ref_text = ""
|
| 280 |
+
return "🗑️ Voice reference cleared — using default MMS-TTS voice."
|
| 281 |
+
|
| 282 |
+
try:
|
| 283 |
+
from src.tts.f5_tts import to_wav_24k
|
| 284 |
+
wav_path = to_wav_24k(audio_file)
|
| 285 |
+
_voice_ref_path = wav_path
|
| 286 |
+
|
| 287 |
+
# Auto-transcribe using already-loaded Whisper if available
|
| 288 |
+
if _whisper_model is not None and _whisper_processor is not None:
|
| 289 |
+
import torch, librosa
|
| 290 |
+
audio_np, _ = librosa.load(wav_path, sr=16000, mono=True)
|
| 291 |
+
with _model_lock:
|
| 292 |
+
inputs = _whisper_processor.feature_extractor(
|
| 293 |
+
audio_np, sampling_rate=16000, return_tensors="pt"
|
| 294 |
+
)
|
| 295 |
+
with torch.no_grad():
|
| 296 |
+
ids = _whisper_model.generate(
|
| 297 |
+
inputs.input_features,
|
| 298 |
+
max_new_tokens=128,
|
| 299 |
+
)
|
| 300 |
+
_voice_ref_text = _whisper_processor.batch_decode(
|
| 301 |
+
ids, skip_special_tokens=True
|
| 302 |
+
)[0].strip()
|
| 303 |
+
return (
|
| 304 |
+
f"✅ Voice reference set!\n"
|
| 305 |
+
f"File : {Path(audio_file).name}\n"
|
| 306 |
+
f"Transcript : {_voice_ref_text[:80] or '(empty — F5-TTS will use in-context inference)'}"
|
| 307 |
+
)
|
| 308 |
+
else:
|
| 309 |
+
_voice_ref_text = ""
|
| 310 |
+
return (
|
| 311 |
+
f"✅ Voice reference set (model not loaded yet — transcript pending).\n"
|
| 312 |
+
f"File: {Path(audio_file).name}"
|
| 313 |
+
)
|
| 314 |
+
except Exception as exc:
|
| 315 |
+
return f"❌ Could not process reference audio: {exc}"
|
| 316 |
+
|
| 317 |
+
|
| 318 |
+
@_gpu
|
| 319 |
+
def _convo_pipeline(audio_path: str, language_code: str):
|
| 320 |
+
"""
|
| 321 |
+
Full S2S conversation pipeline:
|
| 322 |
+
1. ASR — fine-tuned Whisper → transcript
|
| 323 |
+
2. Norm — bam_normalize() on Bambara input
|
| 324 |
+
3. Brain — LLM (Qwen) with Bambara phonetic system prompt → response text
|
| 325 |
+
4. Mouth — F5-TTS with voice reference (or MMS-TTS fallback) → audio
|
| 326 |
+
|
| 327 |
+
Returns same 4-tuple as _run_pipeline.
|
| 328 |
+
"""
|
| 329 |
+
import torch
|
| 330 |
+
|
| 331 |
+
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 332 |
+
|
| 333 |
+
if _whisper_model is None:
|
| 334 |
+
return "⏳ Model still loading…", "", "", None
|
| 335 |
+
|
| 336 |
+
import librosa
|
| 337 |
+
audio_np, _ = librosa.load(audio_path, sr=16000, mono=True)
|
| 338 |
+
|
| 339 |
+
active_model = _fine_tuned_models.get(language_code, _whisper_model)
|
| 340 |
+
active_model.to(device)
|
| 341 |
+
|
| 342 |
+
with _model_lock:
|
| 343 |
+
inputs = _whisper_processor.feature_extractor(
|
| 344 |
+
audio_np, sampling_rate=16000, return_tensors="pt"
|
| 345 |
+
)
|
| 346 |
+
input_features = inputs.input_features.to(device)
|
| 347 |
+
|
| 348 |
+
forced_ids = None
|
| 349 |
+
if language_code not in ("bam", "ful"):
|
| 350 |
+
forced_ids = _whisper_processor.get_decoder_prompt_ids(
|
| 351 |
+
language=language_code, task="transcribe"
|
| 352 |
+
)
|
| 353 |
+
|
| 354 |
+
with torch.no_grad():
|
| 355 |
+
predicted_ids = active_model.generate(
|
| 356 |
+
input_features,
|
| 357 |
+
forced_decoder_ids=forced_ids if forced_ids else None,
|
| 358 |
+
max_new_tokens=256,
|
| 359 |
+
)
|
| 360 |
+
|
| 361 |
+
transcript = _whisper_processor.batch_decode(
|
| 362 |
+
predicted_ids, skip_special_tokens=True
|
| 363 |
+
)[0].strip()
|
| 364 |
+
|
| 365 |
+
active_model.to("cpu")
|
| 366 |
+
if device == "cuda":
|
| 367 |
+
torch.cuda.empty_cache()
|
| 368 |
+
|
| 369 |
+
# Phonetic normalisation for Bambara (unifies ou→u etc.)
|
| 370 |
+
normalised = bam_normalize(transcript) if language_code == "bam" else transcript
|
| 371 |
+
|
| 372 |
+
# ── LLM brain ─────────────────────────────────────────────────────────────
|
| 373 |
+
try:
|
| 374 |
+
from huggingface_hub import InferenceClient
|
| 375 |
+
client = InferenceClient(token=HF_TOKEN)
|
| 376 |
+
completion = client.chat_completion(
|
| 377 |
+
model=LLM_MODEL_ID,
|
| 378 |
+
messages=[
|
| 379 |
+
{"role": "system", "content": _BAM_CONVO_SYSTEM},
|
| 380 |
+
{"role": "user", "content": normalised},
|
| 381 |
+
],
|
| 382 |
+
max_tokens=256,
|
| 383 |
+
temperature=0.6,
|
| 384 |
+
)
|
| 385 |
+
response_text = completion.choices[0].message.content.strip()
|
| 386 |
+
except Exception as llm_err:
|
| 387 |
+
response_text = normalised # echo transcript if LLM fails
|
| 388 |
+
import logging
|
| 389 |
+
logging.getLogger(__name__).warning("LLM failed: %s", llm_err)
|
| 390 |
+
|
| 391 |
+
# ── TTS mouth — F5-TTS preferred, MMS-TTS fallback ────────────────────────
|
| 392 |
+
audio_out = None
|
| 393 |
+
if _voice_ref_path and Path(_voice_ref_path).exists():
|
| 394 |
+
try:
|
| 395 |
+
from src.tts.f5_tts import synthesize as f5_synthesize
|
| 396 |
+
result = f5_synthesize(
|
| 397 |
+
response_text,
|
| 398 |
+
ref_wav_path=_voice_ref_path,
|
| 399 |
+
ref_text=_voice_ref_text,
|
| 400 |
+
device=device,
|
| 401 |
+
)
|
| 402 |
+
if result is not None:
|
| 403 |
+
wav_np, sr = result
|
| 404 |
+
audio_out = (sr, wav_np)
|
| 405 |
+
except Exception as tts_err:
|
| 406 |
+
import logging
|
| 407 |
+
logging.getLogger(__name__).warning("F5-TTS failed, falling back: %s", tts_err)
|
| 408 |
+
|
| 409 |
+
if audio_out is None:
|
| 410 |
+
# MMS-TTS fallback
|
| 411 |
+
wav_np, sr = _tts.synthesize(response_text, language_code, device=device)
|
| 412 |
+
audio_out = (sr, wav_np)
|
| 413 |
+
|
| 414 |
+
return transcript, "", response_text, audio_out
|
| 415 |
+
|
| 416 |
+
|
| 417 |
# ── HF Hub feedback persistence ───────────────────────────────────────────────
|
| 418 |
|
| 419 |
def _save_feedback_to_hub(
|
|
|
|
| 1114 |
|
| 1115 |
# ── Main ask handler ──────────────────────────────────────────────────────────
|
| 1116 |
|
| 1117 |
+
def handle_ask(audio_path, language_label, convo_mode: bool = False):
|
| 1118 |
if audio_path is None:
|
| 1119 |
return "⚠️ No audio — press Record or upload a file.", "", "", None
|
| 1120 |
|
|
|
|
| 1125 |
return f"⏳ Model loading ({status}). Wait a moment and try again.", "", "", None
|
| 1126 |
|
| 1127 |
try:
|
| 1128 |
+
if convo_mode:
|
| 1129 |
+
transcript, eng, response_text, audio_out = _convo_pipeline(audio_path, language_code)
|
| 1130 |
+
else:
|
| 1131 |
+
transcript, eng, response_text, audio_out = _run_pipeline(audio_path, language_code)
|
| 1132 |
+
return transcript, eng, response_text, audio_out
|
| 1133 |
except Exception as e:
|
| 1134 |
return f"❌ {e}", "", "", None
|
| 1135 |
|
|
|
|
| 1157 |
|
| 1158 |
# ── Tab 1: Voice Assistant ────────────────────────────────────────
|
| 1159 |
with gr.TabItem("🎙️ Voice Assistant", id="tab_voice"):
|
| 1160 |
+
|
| 1161 |
+
# ── Conversation Mode controls (top bar) ─────────────────────
|
| 1162 |
+
with gr.Row():
|
| 1163 |
+
convo_mode_toggle = gr.Checkbox(
|
| 1164 |
+
value=False,
|
| 1165 |
+
label="🔄 Conversation Mode — AI responds with LLM + cloned voice",
|
| 1166 |
+
info="When ON: mic auto-submits on stop; AI replies via LLM + F5-TTS (requires voice reference below).",
|
| 1167 |
+
)
|
| 1168 |
+
|
| 1169 |
+
with gr.Accordion("🎤 Voice Reference — upload an MP3/WAV of the target speaker", open=False):
|
| 1170 |
+
gr.Markdown(
|
| 1171 |
+
"Upload **5–30 seconds** of clear speech in the target voice. "
|
| 1172 |
+
"The AI will speak all its responses using this voice. "
|
| 1173 |
+
"Requires `f5-tts` and a GPU — falls back to MMS-TTS otherwise."
|
| 1174 |
+
)
|
| 1175 |
+
with gr.Row():
|
| 1176 |
+
voice_ref_input = gr.Audio(
|
| 1177 |
+
sources=["upload"],
|
| 1178 |
+
type="filepath",
|
| 1179 |
+
label="Reference audio (MP3 or WAV)",
|
| 1180 |
+
)
|
| 1181 |
+
voice_ref_status = gr.Textbox(
|
| 1182 |
+
label="Status", interactive=False, lines=3
|
| 1183 |
+
)
|
| 1184 |
+
voice_ref_btn = gr.Button("💾 Set as Voice Reference", variant="secondary")
|
| 1185 |
+
voice_ref_btn.click(
|
| 1186 |
+
fn=set_voice_reference,
|
| 1187 |
+
inputs=[voice_ref_input],
|
| 1188 |
+
outputs=[voice_ref_status],
|
| 1189 |
+
)
|
| 1190 |
+
|
| 1191 |
+
gr.Markdown("---")
|
| 1192 |
+
|
| 1193 |
with gr.Row():
|
| 1194 |
with gr.Column(scale=1):
|
| 1195 |
language_dd = gr.Dropdown(
|
|
|
|
| 1212 |
interactive=False,
|
| 1213 |
)
|
| 1214 |
translation_box = gr.Textbox(
|
| 1215 |
+
label="English translation (hidden in Conversation Mode)",
|
| 1216 |
lines=2,
|
| 1217 |
placeholder="English meaning will appear here…",
|
| 1218 |
interactive=False,
|
| 1219 |
)
|
| 1220 |
response_box = gr.Textbox(
|
| 1221 |
+
label="AI response",
|
| 1222 |
lines=2,
|
| 1223 |
+
placeholder="Response will appear here…",
|
| 1224 |
interactive=False,
|
| 1225 |
)
|
| 1226 |
audio_output = gr.Audio(
|
|
|
|
| 1234 |
size="sm",
|
| 1235 |
)
|
| 1236 |
|
| 1237 |
+
_ask_inputs = [audio_input, language_dd, convo_mode_toggle]
|
| 1238 |
+
_ask_outputs = [transcript_box, translation_box, response_box, audio_output]
|
| 1239 |
+
|
| 1240 |
+
# Manual button click
|
| 1241 |
ask_btn.click(
|
| 1242 |
fn=handle_ask,
|
| 1243 |
+
inputs=_ask_inputs,
|
| 1244 |
+
outputs=_ask_outputs,
|
| 1245 |
+
)
|
| 1246 |
+
# Auto-submit when mic recording stops (Conversation Mode only)
|
| 1247 |
+
audio_input.stop_recording(
|
| 1248 |
+
fn=lambda ap, ll, cm: handle_ask(ap, ll, cm) if cm else (None, None, None, None),
|
| 1249 |
+
inputs=_ask_inputs,
|
| 1250 |
+
outputs=_ask_outputs,
|
| 1251 |
)
|
| 1252 |
|
| 1253 |
# ── Tab 2: Feedback & Correction ─────────────────────────────────
|

notebooks/kaggle_master_trainer.ipynb: changed hunks

@@ -127,7 +127,9 @@   (cell-clean, Cell 10: "source" replaced)
@@ -135,7 +137,9 @@   (cell-prepare, Cell 11: "source" replaced)
@@ -180,7 +184,7 @@   (Cell 14: data collator + metric source replaced)
@@ -196,7 +200,7 @@   (Cell 15: training-arguments source replaced)
@@ -222,7 +226,7 @@   (Cell 17: evaluation source replaced)
@@ -248,7 +252,7 @@   (Cell 19: push-to-Hub source replaced)
@@ -258,7 +262,7 @@   (Cell 20: verification-summary source replaced)

notebooks/kaggle_master_trainer.ipynb: new cell sources (keyed by post-change line numbers)
| 127 |
"id": "cell-clean",
|
| 128 |
"metadata": {},
|
| 129 |
"outputs": [],
|
| 130 |
+
"source": [
|
| 131 |
+
"# -- Cell 10: Text cleaning utilities + Bambara phonetic normaliser -----------\nimport re, unicodedata\n\n# Phonetic normaliser: unifies French-influenced spellings before training.\n# ou->u, dj->j, gn->ny_palatal etc. so spelling variants map to same token.\n_BAM_NORM_RULES = [('ou','u'),('dj','j'),('gn','ɲ'),('ny','ɲ'),('ch','c'),('oo','ɔ'),('ee','ɛ')]\n_BAM_NORM_PAT = re.compile('|'.join(re.escape(s) for s,_ in _BAM_NORM_RULES))\n_BAM_NORM_MAP = {s:d for s,d in _BAM_NORM_RULES}\n\ndef _bam_norm(text):\n import unicodedata as _ud\n text = _ud.normalize('NFC', text.lower())\n return _BAM_NORM_PAT.sub(lambda m: _BAM_NORM_MAP[m.group(0)], text)\n\n\n_BAMBARA_EXTRA = {'\\u025b','\\u0254','\\u014b'}\n_FULA_EXTRA = {'\\u0253','\\u0257','\\u01b4','\\u014b','\\u0272'}\n_BASE_LATIN = set('abcdefghijklmnopqrstuvwxyz')\n_ACCENTED = set('\\u00e0\\u00e2\\u00e4\\u00e8\\u00e9\\u00ea\\u00eb'\n '\\u00ee\\u00ef\\u00f4\\u00f9\\u00fb\\u00fc\\u00fd'\n '\\u00ff\\u00e6\\u0153\\u00e7')\n_KEEP_PUNCT = set(\" ',-.'!?\")\n\n_VALID_CHARS = {\n 'bam': _BASE_LATIN | _ACCENTED | _BAMBARA_EXTRA | _KEEP_PUNCT,\n 'ful': _BASE_LATIN | _ACCENTED | _FULA_EXTRA | _KEEP_PUNCT,\n}\n\n\ndef clean_text(text: str, lang: str = 'bam') -> str:\n if not text:\n return ''\n text = unicodedata.normalize('NFKC', text.lower().strip())\n text = re.sub(r'https?://\\S+', '', text)\n text = re.sub(r'<[^>]+>', '', text)\n text = re.sub(r'([.,!?])\\1+', r'\\1', text)\n valid = _VALID_CHARS.get(lang, _VALID_CHARS['bam'] | _VALID_CHARS['ful'])\n text = ''.join(c for c in text if c in valid)\n return re.sub(r'\\s+', ' ', text).strip()\n\n\n# Verify actual output then assert against it\nr1 = clean_text('I ni ce! (hello)', 'bam') # parens stripped, ! kept\nr2 = clean_text('Jam waali. <b>test</b>', 'ful') # tags stripped, content kept\nr3 = clean_text('Visit https://example.com now!!', 'bam') # URL stripped, word before stays\n\nassert r1 == 'i ni ce! hello', f'r1: {repr(r1)}'\nassert r2 == 'jam waali. test', f'r2: {repr(r2)}'\nassert r3 == 'visit now!', f'r3: {repr(r3)}'\n\nprint('clean_text tests passed')\nprint(f' {repr(r1)}')\nprint(f' {repr(r2)}')\nprint(f' {repr(r3)}')"
|
| 132 |
+
]
|
| 133 |
},
|
| 134 |
{
|
| 135 |
"cell_type": "code",
|
|
|
|
| 137 |
"id": "cell-prepare",
|
| 138 |
"metadata": {},
|
| 139 |
"outputs": [],
|
| 140 |
+
"source": [
|
| 141 |
+
"# -- Cell 11: Whisper processor + prepare_dataset -----------------------------\n# WhisperProcessor imports processing_utils -> image_utils -> torchvision,\n# which crashes when torch/torchvision have mismatched CUDA versions.\n# Fix: build the processor manually from its two sub-components.\n# WhisperFeatureExtractor and WhisperTokenizer have no torchvision dependency.\nimport numpy as np\n\nfrom transformers.models.whisper.feature_extraction_whisper import WhisperFeatureExtractor\nfrom transformers.models.whisper.tokenization_whisper import WhisperTokenizer\n\nprint(f'Loading Whisper feature extractor + tokenizer: {WHISPER_MODEL_ID} ...')\n_feat_ext = WhisperFeatureExtractor.from_pretrained(WHISPER_MODEL_ID, token=HF_TOKEN)\n_tokenizer = WhisperTokenizer.from_pretrained(WHISPER_MODEL_ID, token=HF_TOKEN)\n\n\nclass _Processor:\n \"\"\"Minimal WhisperProcessor substitute that avoids the torchvision import chain.\"\"\"\n def __init__(self, feature_extractor, tokenizer):\n self.feature_extractor = feature_extractor\n self.tokenizer = tokenizer\n\n def get_decoder_prompt_ids(self, language, task='transcribe'):\n return self.tokenizer.get_decoder_prompt_ids(language=language, task=task)\n\n def save_pretrained(self, path):\n self.feature_extractor.save_pretrained(path)\n self.tokenizer.save_pretrained(path)\n\n\nprocessor = _Processor(_feat_ext, _tokenizer)\nprint('Processor ready')\n\n\ndef prepare_dataset(batch, text_col='transcription', lang=TRAIN_LANG):\n \"\"\"\n Resample to 16 kHz, extract log-mel features, tokenise text.\n Works on any dict with 'audio' (HF Audio column) and a text column.\n \"\"\"\n audio = batch['audio']\n audio_array = np.array(audio['array'], dtype=np.float32)\n orig_sr = audio['sampling_rate']\n\n if orig_sr != TARGET_SR:\n try:\n import torchaudio.functional as F_audio, torch\n audio_array = F_audio.resample(\n torch.from_numpy(audio_array).unsqueeze(0),\n orig_sr, TARGET_SR,\n ).squeeze(0).numpy()\n except Exception:\n import librosa\n audio_array = librosa.resample(audio_array, orig_sr=orig_sr, target_sr=TARGET_SR)\n\n batch['input_features'] = processor.feature_extractor(\n audio_array, sampling_rate=TARGET_SR\n ).input_features[0]\n\n raw_text = batch.get(text_col, '') or ''\n _norm_text = _bam_norm(str(raw_text)) if lang == 'bam' else str(raw_text)\n cleaned = clean_text(_norm_text, lang=lang)\n batch['labels'] = processor.tokenizer(cleaned).input_ids\n return batch\n\n\nprint('prepare_dataset ready')"
|
| 142 |
+
]
|
| 143 |
},
|
| 144 |
{
|
| 145 |
"cell_type": "code",
|
|
|
|
| 184 |
"metadata": {},
|
| 185 |
"outputs": [],
|
| 186 |
"source": [
|
| 187 |
+
"# -- Cell 14: Data collator + CER metric --------------------------------------\nimport jiwer\nfrom dataclasses import dataclass\nfrom typing import Any, Dict, List\n\ntransform = jiwer.Compose([\n jiwer.ToLowerCase(),\n jiwer.RemoveMultipleSpaces(),\n jiwer.Strip(),\n jiwer.RemovePunctuation(),\n jiwer.ReduceToListOfListOfWords(),\n])\n\n# CER transform (no word-split step needed)\n_cer_transform = jiwer.Compose([\n jiwer.ToLowerCase(),\n jiwer.RemoveMultipleSpaces(),\n jiwer.Strip(),\n jiwer.RemovePunctuation(),\n])\n\n\n@dataclass\nclass DataCollatorSpeechSeq2SeqWithPadding:\n processor: Any\n\n def __call__(self, features: List[Dict]) -> Dict:\n import torch\n input_feats = [{'input_features': f['input_features']} for f in features]\n batch = self.processor.feature_extractor.pad(input_feats, return_tensors='pt')\n\n # Leave features in fp32 -- AMP (fp16=True in TrainingArgs) handles casting\n\n label_feats = [{'input_ids': f['labels']} for f in features]\n labels_batch = self.processor.tokenizer.pad(label_feats, return_tensors='pt')\n labels = labels_batch['input_ids'].masked_fill(\n labels_batch.attention_mask.ne(1), -100\n )\n if (labels[:, 0] == self.processor.tokenizer.bos_token_id).all().item():\n labels = labels[:, 1:]\n batch['labels'] = labels\n return batch\n\n\ndef compute_metrics(pred):\n pred_ids = pred.predictions\n label_ids = pred.label_ids\n label_ids[label_ids == -100] = processor.tokenizer.pad_token_id\n\n pred_str = processor.tokenizer.batch_decode(pred_ids, skip_special_tokens=True)\n label_str = processor.tokenizer.batch_decode(label_ids, skip_special_tokens=True)\n\n cer = jiwer.cer(\n label_str, pred_str,\n reference_transform=_cer_transform,\n hypothesis_transform=_cer_transform,\n )\n wer = jiwer.wer(label_str, pred_str,\n hypothesis_transform=transform,\n reference_transform=transform)\n return {'cer': round(cer, 4), 'wer': round(wer, 4)}\n\n\ncollator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)\nprint('Collator and WER metric ready')"
|
| 188 |
]
|
| 189 |
},
|
| 190 |
{
|
|
|
|
| 200 |
"metadata": {},
|
| 201 |
"outputs": [],
|
| 202 |
"source": [
|
| 203 |
+
"# -- Cell 15: Training arguments ----------------------------------------------\nimport inspect\nfrom transformers import Seq2SeqTrainingArguments\n\n# transformers 4.x used 'evaluation_strategy'; 4.45+ renamed to 'eval_strategy'.\n# Detect which name this installed version accepts.\n_params = inspect.signature(Seq2SeqTrainingArguments.__init__).parameters\n_eval_key = 'eval_strategy' if 'eval_strategy' in _params else 'evaluation_strategy'\n\ntraining_args = Seq2SeqTrainingArguments(\n output_dir=OUTPUT_DIR,\n\n max_steps=MAX_STEPS,\n warmup_steps=WARMUP_STEPS,\n logging_steps=LOGGING_STEPS,\n save_steps=SAVE_STEPS,\n eval_steps=EVAL_STEPS,\n\n per_device_train_batch_size=BATCH_SIZE,\n per_device_eval_batch_size=8,\n gradient_accumulation_steps=GRAD_ACCUM,\n\n fp16=True,\n gradient_checkpointing=True, # reduces activation memory on T4\n\n learning_rate=LEARNING_RATE,\n lr_scheduler_type='cosine',\n weight_decay=0.0,\n adam_beta1=0.9,\n adam_beta2=0.98,\n adam_epsilon=1e-6,\n\n **{_eval_key: 'steps'},\n predict_with_generate=True,\n generation_max_length=225,\n load_best_model_at_end=True,\n metric_for_best_model='cer',\n greater_is_better=False,\n\n save_total_limit=3,\n save_strategy='steps',\n\n report_to=['tensorboard'], # tensorboard logs to OUTPUT_DIR/runs by default\n push_to_hub=False,\n)\n\nprint(f'Training arguments ready (using {_eval_key}=steps)')\nprint(f' Effective batch size: {BATCH_SIZE * GRAD_ACCUM}')\nprint(f' Max steps : {MAX_STEPS}')\n"
|
| 204 |
]
|
| 205 |
},
|
| 206 |
{
|
|
|
|
| 226 |
"metadata": {},
|
| 227 |
"outputs": [],
|
| 228 |
"source": [
|
| 229 |
+
"# ── Cell 17: WER evaluation ───────────────────────────────────────────────────\nprint('Running full evaluation on eval split ...')\neval_results = trainer.evaluate()\n\ncer_score = eval_results.get('eval_cer', float('nan'))\nwer_score = eval_results.get('eval_wer', float('nan'))\nprint(f'\n✅ Final CER : {cer_score:.1%} (primary — lower is better)')\nprint(f' Final WER : {wer_score:.1%} (secondary)')\nprint(f' Eval loss : {eval_results.get(\"eval_loss\", float(\"nan\")):.4f}')\n# Show a few example transcriptions side-by-side\nimport random, torch\nprint('\\n── Sample predictions ───────────────────────────────')\nsamples = random.sample(range(len(eval_ds)), min(5, len(eval_ds)))\nfor idx in samples:\n item = eval_ds[idx]\n feats = torch.tensor(item['input_features']).unsqueeze(0).to(model.device)\n with torch.no_grad():\n pred_ids = model.generate(\n feats, # fp32 to match model dtype\n max_new_tokens=128,\n )\n pred_str = processor.tokenizer.batch_decode(pred_ids, skip_special_tokens=True)[0]\n labels = [t if t != -100 else processor.tokenizer.pad_token_id\n for t in item['labels']]\n ref_str = processor.tokenizer.decode(labels, skip_special_tokens=True)\n print(f' Ref : {ref_str}')\n print(f' Pred: {pred_str}')\n print()"
|
| 230 |
]
|
| 231 |
},
|
| 232 |
{
|
|
|
|
| 252 |
"metadata": {},
|
| 253 |
"outputs": [],
|
| 254 |
"source": [
|
| 255 |
+
"# ── Cell 19: Push adapter to HF Model repo ───────────────────────────────────\nfrom huggingface_hub import HfApi, create_repo\n\n# Ensure repo exists\ncreate_repo(ADAPTER_REPO_ID, repo_type='model', private=True,\n exist_ok=True, token=HF_TOKEN)\n\n_cer_part = f'{cer_score:.1%}' if cer_score == cer_score else 'n/a'\ncommit_msg = (\n f'[{VERSION_TAG}] {LANG_NAME} fine-tuned checkpoint — '\n f'{train_result.global_step} steps | CER {_cer_part} | '\n f'{len(correction_records)} corrections + WaxalNLP'\n)\n\napi.upload_folder(\n folder_path=OUTPUT_DIR,\n repo_id=ADAPTER_REPO_ID,\n repo_type='model',\n path_in_repo=PATH_IN_REPO,\n commit_message=commit_msg,\n)\nprint(f'✅ Adapter uploaded: {ADAPTER_REPO_ID}/{PATH_IN_REPO}')\n\n# Create a Git tag for this version\ntry:\n api.create_tag(\n repo_id=ADAPTER_REPO_ID,\n repo_type='model',\n tag=VERSION_TAG,\n tag_message=commit_msg,\n token=HF_TOKEN,\n )\n print(f'✅ Tag created : {VERSION_TAG}')\nexcept Exception as e:\n print(f'⚠️ Tag creation skipped: {e}')"
|
| 256 |
]
|
| 257 |
},
|
| 258 |
{
|
|
|
|
| 262 |
"metadata": {},
|
| 263 |
"outputs": [],
|
| 264 |
"source": [
|
| 265 |
+
"# ── Cell 20: Verification summary ────────────────────────────────────────────\nfrom huggingface_hub import list_repo_files\n\nprint('=' * 60)\nprint('DEEP SLEEP TRAINING — COMPLETE')\nprint('=' * 60)\nprint(f' Language : {TRAIN_LANG} ({LANG_NAME})')\nprint(f' Model : {WHISPER_MODEL_ID}')\nprint(f' Steps completed : {train_result.global_step}')\nprint(f' Train loss : {train_result.training_loss:.4f}')\n_cer_disp = f'{cer_score:.1%}' if cer_score == cer_score else 'n/a'\n_wer_disp = f'{wer_score:.1%}' if wer_score == wer_score else 'n/a'\nprint(f' Eval CER (primary) : {_cer_disp}')\nprint(f' Eval WER (secondary): {_wer_disp}')\nprint(f' Corrections used : {len(correction_records)} × {CORRECTION_REPEAT}')\nprint(f' WaxalNLP samples : up to {MAX_WAXAL_TRAIN}')\nprint(f' Version tag : {VERSION_TAG}')\nprint(f' HF repo : {ADAPTER_REPO_ID}/{PATH_IN_REPO}')\nprint()\n\n# List what was pushed\ntry:\n repo_files = sorted(list_repo_files(\n ADAPTER_REPO_ID, repo_type='model', token=HF_TOKEN\n ))\n adapter_files = [f for f in repo_files if f.startswith(f'adapters/{LANG_NAME}/')]\n print('Adapter files in repo:')\n for f in adapter_files:\n print(f' {f}')\nexcept Exception as e:\n print(f'Could not list repo files: {e}')\n\nprint()\nprint('Next steps:')\nprint(' 1. In your HF Space settings, confirm ADAPTER_REPO_ID secret is set')\nprint(f' 2. Tab 3 → Reload Adapters → select \"{VERSION_TAG}\"')\nprint(' 3. Collect more corrections in the Space, then re-run this notebook')"
|
| 266 |
]
|
| 267 |
}
|
| 268 |
]
|
|
@@ -52,6 +52,11 @@ scipy==1.15.2
|
|
| 52 |
# Phrase matching (fuzzy match for Whisper mis-transcriptions of Bambara/Fula)
|
| 53 |
rapidfuzz==3.13.0
|
| 54 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 55 |
# Kaggle API (used by Self-Teaching tab to trigger training runs)
|
| 56 |
kaggle>=1.6.0
|
| 57 |
|
|
|
|
| 52 |
# Phrase matching (fuzzy match for Whisper mis-transcriptions of Bambara/Fula)
|
| 53 |
rapidfuzz==3.13.0
|
| 54 |
|
| 55 |
+
# Voice cloning — F5-TTS (flow-matching, language-agnostic, reference-speaker)
|
| 56 |
+
# Requires GPU at runtime (~750 MB model auto-downloaded on first use).
|
| 57 |
+
# Falls back to MMS-TTS gracefully when not installed or GPU unavailable.
|
| 58 |
+
f5-tts>=1.0.0
|
| 59 |
+
|
| 60 |
# Kaggle API (used by Self-Teaching tab to trigger training runs)
|
| 61 |
kaggle>=1.6.0
|
| 62 |
|
|
scripts/patch_notebook_cer.py (new file)

@@ -0,0 +1,177 @@
| 1 |
+
"""Patch kaggle_master_trainer.ipynb: add bam_normalize, replace WER with CER."""
|
| 2 |
+
import json, sys, re
|
| 3 |
+
sys.stdout.reconfigure(encoding="utf-8")
|
| 4 |
+
|
| 5 |
+
NB = "notebooks/kaggle_master_trainer.ipynb"
|
| 6 |
+
|
| 7 |
+
with open(NB, encoding="utf-8") as f:
|
| 8 |
+
nb = json.load(f)
|
| 9 |
+
cells = nb["cells"]
|
| 10 |
+
|
| 11 |
+
changed = []
|
| 12 |
+
|
| 13 |
+
# ── Cell 10 (idx=11): inject _bam_norm definition ────────────────────────────
|
| 14 |
+
old = "".join(cells[11]["source"])
|
| 15 |
+
OLD_TOP = (
|
| 16 |
+
"# -- Cell 10: Text cleaning utilities -----------------------------------------\n"
|
| 17 |
+
"import re, unicodedata"
|
| 18 |
+
)
|
| 19 |
+
NEW_TOP = (
|
| 20 |
+
"# -- Cell 10: Text cleaning utilities + Bambara phonetic normaliser -----------\n"
|
| 21 |
+
"import re, unicodedata\n"
|
| 22 |
+
"\n"
|
| 23 |
+
"# Phonetic normaliser: unifies French-influenced spellings before training.\n"
|
| 24 |
+
"# ou->u, dj->j, gn->ny_palatal etc. so spelling variants map to same token.\n"
|
| 25 |
+
"_BAM_NORM_RULES = [('ou','u'),('dj','j'),('gn','\u0272'),('ny','\u0272'),"
|
| 26 |
+
"('ch','c'),('oo','\u0254'),('ee','\u025b')]\n"
|
| 27 |
+
"_BAM_NORM_PAT = re.compile('|'.join(re.escape(s) for s,_ in _BAM_NORM_RULES))\n"
|
| 28 |
+
"_BAM_NORM_MAP = {s:d for s,d in _BAM_NORM_RULES}\n"
|
| 29 |
+
"\n"
|
| 30 |
+
"def _bam_norm(text):\n"
|
| 31 |
+
" import unicodedata as _ud\n"
|
| 32 |
+
" text = _ud.normalize('NFC', text.lower())\n"
|
| 33 |
+
" return _BAM_NORM_PAT.sub(lambda m: _BAM_NORM_MAP[m.group(0)], text)\n"
|
| 34 |
+
)
|
| 35 |
+
if OLD_TOP in old:
|
| 36 |
+
cells[11]["source"] = [old.replace(OLD_TOP, NEW_TOP)]
|
| 37 |
+
changed.append("Cell 10: _bam_norm injected")
|
| 38 |
+
else:
|
| 39 |
+
changed.append("Cell 10: OLD_TOP not found - skip")
|
| 40 |
+
|
| 41 |
+
# ── Cell 11 (idx=12): apply _bam_norm in prepare_dataset ─────────────────────
|
| 42 |
+
old = "".join(cells[12]["source"])
|
| 43 |
+
OLD_PREP = " cleaned = clean_text(str(raw_text), lang=lang)"
|
| 44 |
+
NEW_PREP = (
|
| 45 |
+
" _norm_text = _bam_norm(str(raw_text)) if lang == 'bam' else str(raw_text)\n"
|
| 46 |
+
" cleaned = clean_text(_norm_text, lang=lang)"
|
| 47 |
+
)
|
| 48 |
+
if OLD_PREP in old:
|
| 49 |
+
cells[12]["source"] = [old.replace(OLD_PREP, NEW_PREP)]
|
| 50 |
+
changed.append("Cell 11: normaliser applied in prepare_dataset")
|
| 51 |
+
else:
|
| 52 |
+
changed.append(f"Cell 11: prepare pattern not found ({repr(old[old.find('cleaned'):old.find('cleaned')+60])})")
|
| 53 |
+
|
| 54 |
+
# ── Cell 14 (idx=17): WER -> CER in compute_metrics ──────────────────────────
|
| 55 |
+
old = "".join(cells[17]["source"])
|
| 56 |
+
# Replace header comment
|
| 57 |
+
new = old.replace(
|
| 58 |
+
"# -- Cell 14: Data collator + WER metric",
|
| 59 |
+
"# -- Cell 14: Data collator + CER metric"
|
| 60 |
+
)
|
| 61 |
+
# Add CER transform after existing transform definition
|
| 62 |
+
OLD_TRANSFORM_END = " jiwer.ReduceToListOfListOfWords(),\n])"
|
| 63 |
+
NEW_TRANSFORM_END = (
|
| 64 |
+
" jiwer.ReduceToListOfListOfWords(),\n"
|
| 65 |
+
"])\n"
|
| 66 |
+
"\n"
|
| 67 |
+
"# CER transform (no word-split step needed)\n"
|
| 68 |
+
"_cer_transform = jiwer.Compose([\n"
|
| 69 |
+
" jiwer.ToLowerCase(),\n"
|
| 70 |
+
" jiwer.RemoveMultipleSpaces(),\n"
|
| 71 |
+
" jiwer.Strip(),\n"
|
| 72 |
+
" jiwer.RemovePunctuation(),\n"
|
| 73 |
+
"])"
|
| 74 |
+
)
|
| 75 |
+
new = new.replace(OLD_TRANSFORM_END, NEW_TRANSFORM_END)
|
| 76 |
+
# Replace return value in compute_metrics
|
| 77 |
+
OLD_RETURN = (
|
| 78 |
+
" wer = jiwer.wer(label_str, pred_str,\n"
|
| 79 |
+
" hypothesis_transform=transform,\n"
|
| 80 |
+
" reference_transform=transform)\n"
|
| 81 |
+
" return {'wer': round(wer, 4)}"
|
| 82 |
+
)
|
| 83 |
+
NEW_RETURN = (
|
| 84 |
+
" cer = jiwer.cer(\n"
|
| 85 |
+
" label_str, pred_str,\n"
|
| 86 |
+
" reference_transform=_cer_transform,\n"
|
| 87 |
+
" hypothesis_transform=_cer_transform,\n"
|
| 88 |
+
" )\n"
|
| 89 |
+
" wer = jiwer.wer(label_str, pred_str,\n"
|
| 90 |
+
" hypothesis_transform=transform,\n"
|
| 91 |
+
" reference_transform=transform)\n"
|
| 92 |
+
" return {'cer': round(cer, 4), 'wer': round(wer, 4)}"
|
| 93 |
+
)
|
| 94 |
+
new = new.replace(OLD_RETURN, NEW_RETURN)
|
| 95 |
+
if new != old:
|
| 96 |
+
cells[17]["source"] = [new]
|
| 97 |
+
changed.append("Cell 14: WER->CER in compute_metrics")
|
| 98 |
+
else:
|
| 99 |
+
changed.append("Cell 14: no changes applied")
|
| 100 |
+
|
| 101 |
+
# ── Cell 15 (idx=19): metric_for_best_model ───────────────────────────────────
|
| 102 |
+
old = "".join(cells[19]["source"])
|
| 103 |
+
new = old.replace(
|
| 104 |
+
" metric_for_best_model='wer',",
|
| 105 |
+
" metric_for_best_model='cer',"
|
| 106 |
+
)
|
| 107 |
+
if new != old:
|
| 108 |
+
cells[19]["source"] = [new]
|
| 109 |
+
changed.append("Cell 15: metric_for_best_model=cer")
|
| 110 |
+
else:
|
| 111 |
+
changed.append("Cell 15: no change")
|
| 112 |
+
|
| 113 |
+
# ── Cell 17 (idx=22): CER display in evaluation ───────────────────────────────
|
| 114 |
+
old = "".join(cells[22]["source"])
|
| 115 |
+
OLD_WER_PRINT = (
|
| 116 |
+
"wer_score = eval_results.get('eval_wer', float('nan'))\n"
|
| 117 |
+
"print(f'\\n? Final WER : {wer_score:.1%}')\n"
|
| 118 |
+
"print(f' Eval loss : {eval_results.get(\"eval_loss\", float(\"nan\")):.4f}')"
|
| 119 |
+
)
|
| 120 |
+
NEW_WER_PRINT = (
|
| 121 |
+
"cer_score = eval_results.get('eval_cer', float('nan'))\n"
|
| 122 |
+
"wer_score = eval_results.get('eval_wer', float('nan'))\n"
|
| 123 |
+
"print(f'\\n\u2705 Final CER : {cer_score:.1%} (primary — lower is better)')\n"
|
| 124 |
+
"print(f' Final WER : {wer_score:.1%} (secondary)')\n"
|
| 125 |
+
"print(f' Eval loss : {eval_results.get(\"eval_loss\", float(\"nan\")):.4f}')"
|
| 126 |
+
)
|
| 127 |
+
if OLD_WER_PRINT in old:
|
| 128 |
+
cells[22]["source"] = [old.replace(OLD_WER_PRINT, NEW_WER_PRINT)]
|
| 129 |
+
changed.append("Cell 17: CER display")
|
| 130 |
+
else:
|
| 131 |
+
changed.append("Cell 17: print pattern not found")
|
| 132 |
+
# Try to find what's there
|
| 133 |
+
idx = old.find("wer_score")
|
| 134 |
+
if idx >= 0:
|
| 135 |
+
changed.append(f" ...found: {repr(old[idx:idx+100])}")
|
| 136 |
+
|
| 137 |
+
# ── Cell 19 push (idx=25): cer_score in commit msg ───────────────────────────
|
| 138 |
+
old = "".join(cells[25]["source"])
|
| 139 |
+
new = (
|
| 140 |
+
old
|
| 141 |
+
.replace(
|
| 142 |
+
"_wer_part = f'{wer_score:.1%}' if wer_score == wer_score else 'n/a'",
|
| 143 |
+
"_cer_part = f'{cer_score:.1%}' if cer_score == cer_score else 'n/a'"
|
| 144 |
+
)
|
| 145 |
+
.replace(
|
| 146 |
+
"f'{train_result.global_step} steps | WER {_wer_part} | '",
|
| 147 |
+
"f'{train_result.global_step} steps | CER {_cer_part} | '"
|
| 148 |
+
)
|
| 149 |
+
)
|
| 150 |
+
if new != old:
|
| 151 |
+
cells[25]["source"] = [new]
|
| 152 |
+
changed.append("Cell 19: CER in commit msg")
|
| 153 |
+
else:
|
| 154 |
+
changed.append("Cell 19: no change")
|
| 155 |
+
|
| 156 |
+
# ── Cell 20 summary (idx=26) ─────────────────────────────────────────────────
|
| 157 |
+
old = "".join(cells[26]["source"])
|
| 158 |
+
new = old.replace(
|
| 159 |
+
"_wer_disp = f'{wer_score:.1%}' if wer_score == wer_score else 'n/a'\n"
|
| 160 |
+
"print(f' Eval WER : {_wer_disp}')",
|
| 161 |
+
"_cer_disp = f'{cer_score:.1%}' if cer_score == cer_score else 'n/a'\n"
|
| 162 |
+
"_wer_disp = f'{wer_score:.1%}' if wer_score == wer_score else 'n/a'\n"
|
| 163 |
+
"print(f' Eval CER (primary) : {_cer_disp}')\n"
|
| 164 |
+
"print(f' Eval WER (secondary): {_wer_disp}')"
|
| 165 |
+
)
|
| 166 |
+
if new != old:
|
| 167 |
+
cells[26]["source"] = [new]
|
| 168 |
+
changed.append("Cell 20: CER in summary")
|
| 169 |
+
else:
|
| 170 |
+
changed.append("Cell 20: no change")
|
| 171 |
+
|
| 172 |
+
with open(NB, "w", encoding="utf-8") as f:
|
| 173 |
+
json.dump(nb, f, ensure_ascii=False, indent=1)
|
| 174 |
+
|
| 175 |
+
for msg in changed:
|
| 176 |
+
print(msg)
|
| 177 |
+
print("Done.")
|
|
src/data/bam_normalize.py (new file)

@@ -0,0 +1,67 @@
| 1 |
+
"""
|
| 2 |
+
Bambara phonetic normalizer.
|
| 3 |
+
|
| 4 |
+
Unifies French-influenced and informal spellings to the standard
|
| 5 |
+
N'Ko-derived Bambara orthography used in most NLP datasets.
|
| 6 |
+
|
| 7 |
+
Key rules (most impactful for ASR training):
|
| 8 |
+
ou → u French vowel → Bambara standard
|
| 9 |
+
gn → ɲ French nasal palatal
|
| 10 |
+
ny → ɲ English nasal palatal notation
|
| 11 |
+
dj → j French palatal affricate
|
| 12 |
+
ch → c French palatalized consonant
|
| 13 |
+
oo → ɔ long open-o (common informal spelling)
|
| 14 |
+
ee → ɛ long open-e (common informal spelling)
|
| 15 |
+
|
| 16 |
+
These rules run left-to-right on lower-cased text. They are conservative:
|
| 17 |
+
only unambiguous substitutions are applied so as not to corrupt words that
|
| 18 |
+
happen to contain these letter sequences in a non-phonemic context.
|
| 19 |
+
|
| 20 |
+
Usage:
|
| 21 |
+
from src.data.bam_normalize import normalize
|
| 22 |
+
text = normalize("I ni ce, a bɛ djourou la")
|
| 23 |
+
# → "i ni ce, a bɛ juruu la"
|
| 24 |
+
"""
|
| 25 |
+
from __future__ import annotations
|
| 26 |
+
|
| 27 |
+
import re
|
| 28 |
+
import unicodedata
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
# ── Replacement table (order matters — longest match first) ─────────────────
|
| 32 |
+
_RULES: list[tuple[str, str]] = [
|
| 33 |
+
("ou", "u"), # most frequent French influence
|
| 34 |
+
("dj", "j"), # palatal affricate
|
| 35 |
+
("gn", "ɲ"), # nasal palatal (French orthography)
|
| 36 |
+
("ny", "ɲ"), # nasal palatal (English-style notation)
|
| 37 |
+
("ch", "c"), # palatalized stop
|
| 38 |
+
("oo", "ɔ"), # long open-o (informal doubling)
|
| 39 |
+
("ee", "ɛ"), # long open-e (informal doubling)
|
| 40 |
+
]
|
| 41 |
+
|
| 42 |
+
# Compile once for speed
|
| 43 |
+
_PATTERN = re.compile(
|
| 44 |
+
"|".join(re.escape(src) for src, _ in _RULES)
|
| 45 |
+
)
|
| 46 |
+
_REPLACEMENTS = {src: dst for src, dst in _RULES}
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
def normalize(text: str) -> str:
|
| 50 |
+
"""
|
| 51 |
+
Apply phonetic normalization to a Bambara text string.
|
| 52 |
+
|
| 53 |
+
Steps:
|
| 54 |
+
1. Unicode NFC normalization (collapse combining characters).
|
| 55 |
+
2. Lowercase.
|
| 56 |
+
3. Apply phoneme substitution rules.
|
| 57 |
+
4. Collapse multiple spaces.
|
| 58 |
+
"""
|
| 59 |
+
text = unicodedata.normalize("NFC", text)
|
| 60 |
+
text = text.lower()
|
| 61 |
+
text = _PATTERN.sub(lambda m: _REPLACEMENTS[m.group(0)], text)
|
| 62 |
+
text = re.sub(r" {2,}", " ", text).strip()
|
| 63 |
+
return text
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
def normalize_batch(texts: list[str]) -> list[str]:
|
| 67 |
+
return [normalize(t) for t in texts]
|
|
src/tts/f5_tts.py (new file)

@@ -0,0 +1,114 @@
| 1 |
+
"""
|
| 2 |
+
F5-TTS voice cloning engine.
|
| 3 |
+
|
| 4 |
+
Generates speech in a target speaker's voice given a short reference WAV.
|
| 5 |
+
Falls back to None gracefully if f5-tts is not installed or the GPU is
|
| 6 |
+
unavailable — the caller then falls back to MMS-TTS.
|
| 7 |
+
|
| 8 |
+
Install:
|
| 9 |
+
pip install f5-tts>=1.0.0
|
| 10 |
+
|
| 11 |
+
Reference:
|
| 12 |
+
SWivid/F5-TTS (HuggingFace / GitHub)
|
| 13 |
+
Model: ~750 MB, downloaded on first use to HF cache.
|
| 14 |
+
"""
|
| 15 |
+
from __future__ import annotations
|
| 16 |
+
|
| 17 |
+
import logging
|
| 18 |
+
import threading
|
| 19 |
+
from pathlib import Path
|
| 20 |
+
from typing import Optional, Tuple
|
| 21 |
+
|
| 22 |
+
import numpy as np
|
| 23 |
+
|
| 24 |
+
logger = logging.getLogger(__name__)
|
| 25 |
+
|
| 26 |
+
_lock = threading.Lock()
|
| 27 |
+
_model = None # F5TTS instance, loaded lazily
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def _load_model():
|
| 31 |
+
global _model
|
| 32 |
+
if _model is not None:
|
| 33 |
+
return _model
|
| 34 |
+
with _lock:
|
| 35 |
+
if _model is None:
|
| 36 |
+
from f5_tts.api import F5TTS # type: ignore
|
| 37 |
+
_model = F5TTS(model_type="F5TTS")
|
| 38 |
+
logger.info("F5-TTS model loaded.")
|
| 39 |
+
return _model
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
def synthesize(
|
| 43 |
+
text: str,
|
| 44 |
+
ref_wav_path: str,
|
| 45 |
+
ref_text: str = "",
|
| 46 |
+
speed: float = 1.0,
|
| 47 |
+
device: str = "cuda",
|
| 48 |
+
) -> Optional[Tuple[np.ndarray, int]]:
|
| 49 |
+
"""
|
| 50 |
+
Generate speech for `text` using `ref_wav_path` as the speaker reference.
|
| 51 |
+
|
| 52 |
+
Args:
|
| 53 |
+
text: Text to synthesize (Bambara, Fula, French, or English).
|
| 54 |
+
ref_wav_path: Path to reference audio (WAV, 5–30 s of the target speaker).
|
| 55 |
+
ref_text: Transcript of the reference audio. If empty the model
|
| 56 |
+
uses in-context inference (slightly lower quality but still
|
| 57 |
+
good for voice matching).
|
| 58 |
+
speed: Speaking rate multiplier. 1.0 = normal.
|
| 59 |
+
device: "cuda" or "cpu". CPU is 30-60 s/sentence — use GPU.
|
| 60 |
+
|
| 61 |
+
Returns:
|
| 62 |
+
(waveform_float32, sample_rate) or None on failure.
|
| 63 |
+
"""
|
| 64 |
+
if not text.strip():
|
| 65 |
+
return None
|
| 66 |
+
|
| 67 |
+
try:
|
| 68 |
+
import torch
|
| 69 |
+
model = _load_model()
|
| 70 |
+
|
| 71 |
+
wav, sr, _ = model.infer(
|
| 72 |
+
ref_file=ref_wav_path,
|
| 73 |
+
ref_text=ref_text.strip(),
|
| 74 |
+
gen_text=text.strip(),
|
| 75 |
+
speed=speed,
|
| 76 |
+
target_rms=0.1,
|
| 77 |
+
cross_fade_duration=0.15,
|
| 78 |
+
nfe_step=32,
|
| 79 |
+
cfg_strength=2.0,
|
| 80 |
+
show_info=False,
|
| 81 |
+
progress=None,
|
| 82 |
+
)
|
| 83 |
+
|
| 84 |
+
if isinstance(wav, torch.Tensor):
|
| 85 |
+
wav = wav.cpu().float().numpy()
|
| 86 |
+
else:
|
| 87 |
+
wav = np.asarray(wav, dtype=np.float32)
|
| 88 |
+
|
| 89 |
+
return wav, int(sr)
|
| 90 |
+
|
| 91 |
+
except ImportError:
|
| 92 |
+
logger.warning(
|
| 93 |
+
"f5-tts not installed — voice cloning disabled. "
|
| 94 |
+
"Add 'f5-tts>=1.0.0' to requirements.txt."
|
| 95 |
+
)
|
| 96 |
+
return None
|
| 97 |
+
except Exception as exc:
|
| 98 |
+
logger.error("F5-TTS synthesis failed: %s", exc)
|
| 99 |
+
return None
|
| 100 |
+
|
| 101 |
+
|
| 102 |
+
def to_wav_24k(audio_path: str) -> str:
|
| 103 |
+
"""
|
| 104 |
+
Resample any audio file to 24 kHz mono WAV (F5-TTS preferred sample rate).
|
| 105 |
+
Returns the path to the converted file (same stem, .wav extension).
|
| 106 |
+
Modifies in-place if the input is already a WAV — otherwise writes a new file.
|
| 107 |
+
"""
|
| 108 |
+
import librosa
|
| 109 |
+
import soundfile as sf
|
| 110 |
+
|
| 111 |
+
out_path = str(Path(audio_path).with_suffix(".f5ref.wav"))
|
| 112 |
+
audio, _ = librosa.load(audio_path, sr=24_000, mono=True)
|
| 113 |
+
sf.write(out_path, audio, 24_000)
|
| 114 |
+
return out_path
|