jefffffff9 Claude Sonnet 4.6 commited on
Commit
40cf84d
·
1 Parent(s): 618eab5

Fix language mixing: per-language prompts + Mali Bambara / Guinea Pular context

Browse files

Root cause: one generic system prompt for all languages with no language code
injected. The LLM had no instructions to stay in one dialect, so it mixed
Bambara and Fula words freely.

app.py:
- SUPPORTED_LANGUAGES: labels now say "Bambara — Mali" and "Fula / Pular — Guinea"
- LANG_CONTEXT dict: per-language country, region, script, phonetic rules, and
explicit "do_not_mix" field naming the languages to never blend in
- _build_system_prompt(language_code, vocab): generates a language-specific
system prompt that opens with "You MUST respond exclusively in {lang} ({country})"
and lists exactly which other languages must NOT appear in responses
- _get_vocab_context_for(language_code): filters vocabulary cache to only entries
tagged [bam] or [ful] — LLM never sees the other language's words in its context
- _build_messages: uses _build_system_prompt + filtered vocab instead of the
single generic template
- LLM fallback messages now cover bam (Mali Bambara), ful (Guinea Pular), and fr

Notebook Cell 4: LANG_COUNTRY + LANG_DIALECT constants added
bam → Mali / "Standard Bambara (Bamako/Ségou) — Malian orthography"
ful → Guinea / "Pular (Labé/Mamou dialects) — Guinean orthography"
Notebook Cell 19: country and dialect included in Hub commit message

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

Files changed (2) hide show
  1. app.py +119 -39
  2. notebooks/kaggle_master_trainer.ipynb +2 -2
app.py CHANGED
@@ -47,10 +47,54 @@ AUTO_TRAIN_THRESHOLD = int(os.environ.get("AUTO_TRAIN_THRESHOLD", "50"))
47
  _ON_SPACES = os.environ.get("SPACE_ID") is not None
48
 
49
  SUPPORTED_LANGUAGES = {
50
- "Bambara (bam)": "bam",
51
- "Fula (ful)": "ful",
52
- "French / Français": "fr",
53
- "English": "en",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54
  }
55
 
56
  # ── ZeroGPU decorator (no-op locally) ────────────────────────────────────────
@@ -385,34 +429,68 @@ def _parse_and_strip_learned(text: str, lang: str) -> tuple[str, list[tuple[str,
385
 
386
 
387
  # System prompt — includes vocabulary context + conversation rules
388
- _CONVO_SYSTEM_TEMPLATE = """\
389
- You are a helpful voice assistant for Bambara and Fula speakers. \
390
- You are talking, not writing keep every response to 1–3 short sentences.
391
-
392
- YOUR KNOWLEDGE BASE (words and phrases you have learned from users):
393
- {vocab}
394
-
395
- RULES you must always follow:
396
- 1. Reply in whatever language the user speaks (Bambara, Fula, French, or English).
397
- 2. When speaking Bambara, use phonetic spelling: 'u' not 'ou', 'j' not 'dj', 'c' not 'ch'.
398
- 3. Keep responses SHORT — this is voice, not text.
399
- 4. If you do not understand something, ask ONE specific follow-up question \
400
- (e.g. "Mun ye o fileli ye?" = "What does that mean?").
401
- 5. If the user teaches you a word or phrase (says "X means Y" or "X se dit Y in Bambara"), \
402
- confirm warmly then add exactly: [LEARNED: word="X" meaning="Y"]
403
- 6. Remember the full conversation refer to earlier messages naturally \
404
- (e.g. "As you said earlier…", "I ka kuma fɔlen don…").
405
- 7. Never invent words you do not know. Honest uncertainty is always better than wrong answers."""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
406
 
407
 
408
  def _build_messages(user_text: str, history: list, language_code: str) -> list[dict]:
409
- """Build the full message list: system (with vocab) + history + new user turn."""
410
- vocab = _get_vocab_context()
411
- system_content = _CONVO_SYSTEM_TEMPLATE.format(
412
- vocab=vocab if vocab else "(no vocabulary recorded yet — you can teach me words!)"
413
- )
414
- messages: list[dict] = [{"role": "system", "content": system_content}]
415
- # Inject conversation history (last 20 turns max)
416
  for u, a in history[-20:]:
417
  messages.append({"role": "user", "content": u})
418
  messages.append({"role": "assistant", "content": a})
@@ -541,11 +619,12 @@ def _convo_pipeline(audio_path: str, language_code: str, history: list):
541
  except Exception as llm_err:
542
  log.warning("LLM failed: %s", llm_err)
543
  # Graceful degradation: tell user LLM is unavailable, ask them to try again
544
- response_text = (
545
- "Hakɛ to, n bɛ sɔrɔ cogo dɔ la."
546
- if language_code == "bam"
547
- else "Sorry, I could not reach the language model. Please try again."
548
- )
 
549
 
550
  # ── Parse and strip [LEARNED:] tags — save async to Hub ──────────────────
551
  response_text, learned_pairs = _parse_and_strip_learned(response_text, language_code)
@@ -1394,11 +1473,12 @@ def _do_respond(
1394
  except Exception as llm_err:
1395
  import logging
1396
  logging.getLogger(__name__).warning("LLM error: %s", llm_err)
1397
- response_text = (
1398
- "Hakɛ to, tasuma tɛ kɛ sisan. I ka a lasɔrɔ tugu."
1399
- if lang == "bam"
1400
- else "Sorry, I could not reach the language model right now."
1401
- )
 
1402
 
1403
  # Strip [LEARNED:] tags, persist async
1404
  response_text, _ = _parse_and_strip_learned(response_text, lang)
 
47
  _ON_SPACES = os.environ.get("SPACE_ID") is not None
48
 
49
SUPPORTED_LANGUAGES = {
    # Dropdown label shown in the UI → language code used by the pipeline.
    "Bambara — Mali (bam)": "bam",
    "Fula / Pular — Guinea (ful)": "ful",
    "French / Français": "fr",
    "English": "en",
}

# Country and dialect context used in prompts and training metadata.
# Each entry is consumed by _build_system_prompt():
#   name          — human-readable language name injected into the prompt
#   country       — anchors the dialect (Mali Bambara vs Guinea Pular)
#   region        — optional dialect detail, rendered in parentheses
#   script        — writing-system note (informational)
#   phonetic_note — orthography rules quoted verbatim in the prompt
#   do_not_mix    — languages the LLM is explicitly told never to blend in
LANG_CONTEXT = {
    "bam": {
        "name": "Bambara",
        "country": "Mali",
        "region": "West Africa (Bamako, Ségou, Mopti dialects)",
        "script": "Latin with special characters (ɛ, ɔ, ŋ, ɲ)",
        "phonetic_note": (
            "Use standard Malian orthography: 'u' not 'ou', 'j' not 'dj', "
            "'c' not 'ch', 'ɲ' not 'gn' or 'ny', 'ɔ' not 'oo', 'ɛ' not 'ee'. "
            "This is Bambara as spoken in Mali, NOT Dioula or other dialects."
        ),
        "do_not_mix": "Fula (Pulaar/Pular), Wolof, Dioula, or any other language",
    },
    "ful": {
        "name": "Pular (Fula of Guinea)",
        "country": "Guinea",
        "region": "West Africa (Labé, Mamou, Kankan dialects)",
        "script": "Latin with special characters (ɓ, ɗ, ŋ, ɲ, ƴ)",
        "phonetic_note": (
            "Use standard Guinean Pular orthography. "
            "This is the Fula variety spoken in Guinea (Pular/Pulaar), "
            "NOT Fulfulde from Niger/Nigeria nor Wolof."
        ),
        "do_not_mix": "Bambara, Soussou, Malinké, or any other language",
    },
    "fr": {
        "name": "French",
        "country": "France / West Africa",
        "region": "",
        "script": "Latin",
        "phonetic_note": "Standard French.",
        "do_not_mix": "other languages unless the user switches",
    },
    "en": {
        "name": "English",
        # NOTE(review): country/region are intentionally empty — English has
        # no single country anchor here; prompt code must tolerate "".
        "country": "",
        "region": "",
        "script": "Latin",
        "phonetic_note": "Standard English.",
        "do_not_mix": "other languages unless the user switches",
    },
}
99
 
100
  # ── ZeroGPU decorator (no-op locally) ────────────────────────────────────────
 
429
 
430
 
431
  # System prompt — includes vocabulary context + conversation rules
432
def _build_system_prompt(language_code: str, vocab: str) -> str:
    """
    Build a language-specific system prompt that makes the LLM stay strictly
    in the correct dialect (Mali Bambara vs Guinea Pular) and never mix them.

    Parameters
    ----------
    language_code : str
        Key into LANG_CONTEXT ("bam", "ful", "fr", "en"); unknown codes fall
        back to the French context.
    vocab : str
        Newline-separated learned-vocabulary lines already filtered to this
        language, or "" when nothing has been taught yet.

    Returns
    -------
    str
        The complete content of the system message.
    """
    ctx = LANG_CONTEXT.get(language_code, LANG_CONTEXT["fr"])
    lang_name = ctx["name"]
    country = ctx["country"]
    region = ctx["region"]
    phon_note = ctx["phonetic_note"]
    do_not_mix = ctx["do_not_mix"]

    # Optional qualifiers: the "en" entry has no country/region, so render
    # nothing rather than the broken "as used in ." / "English ()" forms.
    region_line = f" ({region})" if region else ""
    as_used_in = f" as used in {country}{region_line}" if country else region_line
    country_tag = f" ({country})" if country else ""

    vocab_section = (
        f"WORDS AND PHRASES YOU HAVE LEARNED FOR {lang_name.upper()}:\n{vocab}"
        if vocab
        # em-dash restored: without it this user-visible line was a run-on
        else f"(No {lang_name} vocabulary recorded yet — the user can teach you words.)"
    )

    return f"""\
You are a voice assistant that speaks ONLY {lang_name}{as_used_in}.

CRITICAL LANGUAGE RULE:
- You MUST respond exclusively in {lang_name}{country_tag}.
- NEVER mix in words from {do_not_mix}.
- If the user writes in another language, gently ask them to switch to {lang_name}.
- If you are unsure of a word in {lang_name}, say so honestly — do not substitute \
a word from another language.

ORTHOGRAPHY ({lang_name}):
{phon_note}

{vocab_section}

CONVERSATION RULES:
1. Keep every response to 1–3 short spoken sentences. This is voice, not text.
2. If you do not understand, ask ONE short follow-up question in {lang_name}.
3. If the user teaches you a word ("X means Y"), confirm warmly, then append \
exactly: [LEARNED: word="X" meaning="Y"]
4. Refer back to earlier messages naturally when relevant.
5. Never invent vocabulary. Honest uncertainty is always correct."""
474
+
475
+
476
def _get_vocab_context_for(language_code: str) -> str:
    """Return only the cached vocabulary lines tagged for *language_code*.

    Entries in the shared vocabulary cache carry a language tag such as
    "[bam]" or "[ful]"; filtering here keeps the other language's words
    out of the LLM context entirely.
    """
    # Snapshot the cache under the lock, then filter outside it so the
    # lock is held as briefly as possible.
    with _vocab_lock:
        snapshot = _vocab_context_cache
    if not snapshot:
        return ""
    tag = f"[{language_code}]"
    kept = []
    for entry in snapshot.splitlines():
        if tag in entry:
            kept.append(entry)
    return "\n".join(kept)
487
 
488
 
489
  def _build_messages(user_text: str, history: list, language_code: str) -> list[dict]:
490
+ """Build the full message list: system (with lang-filtered vocab) + history + new turn."""
491
+ vocab = _get_vocab_context_for(language_code)
492
+ system = _build_system_prompt(language_code, vocab)
493
+ messages: list[dict] = [{"role": "system", "content": system}]
 
 
 
494
  for u, a in history[-20:]:
495
  messages.append({"role": "user", "content": u})
496
  messages.append({"role": "assistant", "content": a})
 
619
  except Exception as llm_err:
620
  log.warning("LLM failed: %s", llm_err)
621
  # Graceful degradation: tell user LLM is unavailable, ask them to try again
622
+ _fallbacks = {
623
+ "bam": "Hakɛ to, n bɛ sɔrɔ cogo dɔ la.", # Bambara (Mali)
624
+ "ful": "Hakke, mi waawaa jogaade modèl oo jooni.", # Pular (Guinea)
625
+ "fr": "Désolé, je n'ai pas pu joindre le modèle.",
626
+ }
627
+ response_text = _fallbacks.get(language_code, "Sorry, the language model is unavailable.")
628
 
629
  # ── Parse and strip [LEARNED:] tags — save async to Hub ──────────────────
630
  response_text, learned_pairs = _parse_and_strip_learned(response_text, language_code)
 
1473
  except Exception as llm_err:
1474
  import logging
1475
  logging.getLogger(__name__).warning("LLM error: %s", llm_err)
1476
+ _fallbacks = {
1477
+ "bam": "Hakɛ to, tasuma tɛ kɛ sisan. I ka a lasɔrɔ tugu.",
1478
+ "ful": "Hakke, mi waawaa jogaade modèl oo jooni. Njaɓɓu.",
1479
+ "fr": "Désolé, le modèle est indisponible pour l'instant.",
1480
+ }
1481
+ response_text = _fallbacks.get(lang, "Sorry, the language model is unavailable.")
1482
 
1483
  # Strip [LEARNED:] tags, persist async
1484
  response_text, _ = _parse_and_strip_learned(response_text, lang)
notebooks/kaggle_master_trainer.ipynb CHANGED
@@ -62,7 +62,7 @@
62
  "metadata": {},
63
  "outputs": [],
64
  "source": [
65
- "# ── Cell 3: CONFIGURATION — edit these before each run ───────────────────────\nimport os\n\n# ─── Language to train ───────────────────────────────────────────────────────\n# 'bam' = Bambara 'ful' = Fula\nTRAIN_LANG = 'bam'\n\n# ─── Model ───────────────────────────────────────────────────────────────────\nWHISPER_MODEL_ID = 'openai/whisper-small'\nTARGET_SR = 16_000\n\n# ─── HuggingFace repos ───────────────────────────────────────────────────────\nHF_USERNAME = 'ous-sow'\nFEEDBACK_REPO_ID = f'{HF_USERNAME}/sahel-agri-feedback'\nADAPTER_REPO_ID = f'{HF_USERNAME}/sahel-agri-adapters'\n\n# ─── Training hyper-parameters ───────────────────────────────────────────────\nMAX_STEPS = 4_000 # T4 ~45 min; set 8000 for a deeper run\nBATCH_SIZE = 16\nGRAD_ACCUM = 2 # effective batch = 32\nLEARNING_RATE = 1e-3\nWARMUP_STEPS = 200\nSAVE_STEPS = 500\nEVAL_STEPS = 500\nLOGGING_STEPS = 50\nMAX_WAXAL_TRAIN = 5_000 # cap WaxalNLP samples (streaming budget)\nCORRECTION_REPEAT= 3 # upsample user corrections Nx for emphasis\n\n# ─── Paths (Kaggle working dir) ───────────────────────────────────────────────\nWORKING_DIR = '/kaggle/working'\nOUTPUT_DIR = f'{WORKING_DIR}/adapter_{TRAIN_LANG}'\nDATA_DIR = f'{WORKING_DIR}/data'\nAUDIO_DIR = f'{WORKING_DIR}/audio_feedback'\n\nLANG_NAME = {'bam': 'bambara', 'ful': 'fula'}.get(TRAIN_LANG, TRAIN_LANG)\n\nprint(f'Language : {TRAIN_LANG} ({LANG_NAME})')\nprint(f'Model : {WHISPER_MODEL_ID}')\nprint(f'Output : {OUTPUT_DIR}')\nprint(f'Max steps : {MAX_STEPS}')"
66
  ]
67
  },
68
  {
@@ -252,7 +252,7 @@
252
  "metadata": {},
253
  "outputs": [],
254
  "source": [
255
- "# ── Cell 19: Push adapter to HF Model repo ───────────────────���───────────────\nfrom huggingface_hub import HfApi, create_repo\n\n# Ensure repo exists\ncreate_repo(ADAPTER_REPO_ID, repo_type='model', private=True,\n exist_ok=True, token=HF_TOKEN)\n\n_cer_part = f'{cer_score:.1%}' if cer_score == cer_score else 'n/a'\ncommit_msg = (\n f'[{VERSION_TAG}] {LANG_NAME} fine-tuned checkpoint — '\n f'{train_result.global_step} steps | CER {_cer_part} | '\n f'{len(correction_records)} corrections + WaxalNLP'\n)\n\napi.upload_folder(\n folder_path=OUTPUT_DIR,\n repo_id=ADAPTER_REPO_ID,\n repo_type='model',\n path_in_repo=PATH_IN_REPO,\n commit_message=commit_msg,\n)\nprint(f'✅ Adapter uploaded: {ADAPTER_REPO_ID}/{PATH_IN_REPO}')\n\n# Create a Git tag for this version\ntry:\n api.create_tag(\n repo_id=ADAPTER_REPO_ID,\n repo_type='model',\n tag=VERSION_TAG,\n tag_message=commit_msg,\n token=HF_TOKEN,\n )\n print(f'✅ Tag created : {VERSION_TAG}')\nexcept Exception as e:\n print(f'⚠️ Tag creation skipped: {e}')"
256
  ]
257
  },
258
  {
 
62
  "metadata": {},
63
  "outputs": [],
64
  "source": [
65
+ "# ── Cell 3: CONFIGURATION — edit these before each run ───────────────────────\nimport os\n\n# ─── Language to train ───────────────────────────────────────────────────────\n# 'bam' = Bambara 'ful' = Fula\nTRAIN_LANG = 'bam'\n\n# ─── Model ───────────────────────────────────────────────────────────────────\nWHISPER_MODEL_ID = 'openai/whisper-small'\nTARGET_SR = 16_000\n\n# ─── HuggingFace repos ───────────────────────────────────────────────────────\nHF_USERNAME = 'ous-sow'\nFEEDBACK_REPO_ID = f'{HF_USERNAME}/sahel-agri-feedback'\nADAPTER_REPO_ID = f'{HF_USERNAME}/sahel-agri-adapters'\n\n# ─── Training hyper-parameters ───────────────────────────────────────────────\nMAX_STEPS = 4_000 # T4 ~45 min; set 8000 for a deeper run\nBATCH_SIZE = 16\nGRAD_ACCUM = 2 # effective batch = 32\nLEARNING_RATE = 1e-3\nWARMUP_STEPS = 200\nSAVE_STEPS = 500\nEVAL_STEPS = 500\nLOGGING_STEPS = 50\nMAX_WAXAL_TRAIN = 5_000 # cap WaxalNLP samples (streaming budget)\nCORRECTION_REPEAT= 3 # upsample user corrections Nx for emphasis\n\n# ─── Paths (Kaggle working dir) ───────────────────────────────────────────────\nWORKING_DIR = '/kaggle/working'\nOUTPUT_DIR = f'{WORKING_DIR}/adapter_{TRAIN_LANG}'\nDATA_DIR = f'{WORKING_DIR}/data'\nAUDIO_DIR = f'{WORKING_DIR}/audio_feedback'\n\nLANG_NAME = {'bam': 'bambara', 'ful': 'fula'}.get(TRAIN_LANG, TRAIN_LANG)\nLANG_COUNTRY = {'bam': 'Mali', 'ful': 'Guinea'}.get(TRAIN_LANG, '')\nLANG_DIALECT = {\n 'bam': 'Standard Bambara (Bamako/Ségou) — Malian orthography',\n 'ful': 'Pular (Labé/Mamou dialects) — Guinean orthography',\n}.get(TRAIN_LANG, '')\n\nprint(f'Language : {TRAIN_LANG} ({LANG_NAME}) — {LANG_COUNTRY}')\nprint(f'Dialect : {LANG_DIALECT}')\nprint(f'Model : {WHISPER_MODEL_ID}')\nprint(f'Output : {OUTPUT_DIR}')\nprint(f'Max steps : {MAX_STEPS}')"
66
  ]
67
  },
68
  {
 
252
  "metadata": {},
253
  "outputs": [],
254
  "source": [
255
+ "# ── Cell 19: Push adapter to HF Model repo ──────────────────────────────────\nfrom huggingface_hub import HfApi, create_repo\n\n# Ensure repo exists\ncreate_repo(ADAPTER_REPO_ID, repo_type='model', private=True,\n exist_ok=True, token=HF_TOKEN)\n\n_cer_part = f'{cer_score:.1%}' if cer_score == cer_score else 'n/a'\ncommit_msg = (\n f'[{VERSION_TAG}] {LANG_NAME} ({LANG_COUNTRY}) fine-tuned checkpoint — '\n f'{train_result.global_step} steps | CER {_cer_part} | '\n f'{len(correction_records)} corrections + WaxalNLP | {LANG_DIALECT}'\n)\n\napi.upload_folder(\n folder_path=OUTPUT_DIR,\n repo_id=ADAPTER_REPO_ID,\n repo_type='model',\n path_in_repo=PATH_IN_REPO,\n commit_message=commit_msg,\n)\nprint(f'✅ Adapter uploaded: {ADAPTER_REPO_ID}/{PATH_IN_REPO}')\n\n# Create a Git tag for this version\ntry:\n api.create_tag(\n repo_id=ADAPTER_REPO_ID,\n repo_type='model',\n tag=VERSION_TAG,\n tag_message=commit_msg,\n token=HF_TOKEN,\n )\n print(f'✅ Tag created : {VERSION_TAG}')\nexcept Exception as e:\n print(f'⚠️ Tag creation skipped: {e}')"
256
  ]
257
  },
258
  {