Spaces:
Running
Fix language mixing: per-language prompts + Mali Bambara / Guinea Pular context
Browse files
Root cause: one generic system prompt for all languages with no language code
injected. The LLM had no instructions to stay in one dialect, so it mixed
Bambara and Fula words freely.
app.py:
- SUPPORTED_LANGUAGES: labels now say "Bambara — Mali" and "Fula / Pular — Guinea"
- LANG_CONTEXT dict: per-language country, region, script, phonetic rules, and
explicit "do_not_mix" field naming the languages to never blend in
- _build_system_prompt(language_code, vocab): generates a language-specific
system prompt that opens with "You MUST respond exclusively in {lang} ({country})"
and lists exactly which other languages must NOT appear in responses
- _get_vocab_context_for(language_code): filters vocabulary cache to only entries
tagged [bam] or [ful] — LLM never sees the other language's words in its context
- _build_messages: uses _build_system_prompt + filtered vocab instead of the
single generic template
- LLM fallback messages now cover bam (Mali Bambara), ful (Guinea Pular), and fr
Notebook Cell 4: LANG_COUNTRY + LANG_DIALECT constants added
bam → Mali / "Standard Bambara (Bamako/Ségou) — Malian orthography"
ful → Guinea / "Pular (Labé/Mamou dialects) — Guinean orthography"
Notebook Cell 19: country and dialect included in Hub commit message
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
- app.py +119 -39
- notebooks/kaggle_master_trainer.ipynb +2 -2
|
@@ -47,10 +47,54 @@ AUTO_TRAIN_THRESHOLD = int(os.environ.get("AUTO_TRAIN_THRESHOLD", "50"))
|
|
| 47 |
_ON_SPACES = os.environ.get("SPACE_ID") is not None
|
| 48 |
|
| 49 |
SUPPORTED_LANGUAGES = {
|
| 50 |
-
"Bambara (bam)":
|
| 51 |
-
"Fula (ful)":
|
| 52 |
-
"French / Français":
|
| 53 |
-
"English":
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 54 |
}
|
| 55 |
|
| 56 |
# ── ZeroGPU decorator (no-op locally) ────────────────────────────────────────
|
|
@@ -385,34 +429,68 @@ def _parse_and_strip_learned(text: str, lang: str) -> tuple[str, list[tuple[str,
|
|
| 385 |
|
| 386 |
|
| 387 |
# System prompt — includes vocabulary context + conversation rules
|
| 388 |
-
|
| 389 |
-
|
| 390 |
-
|
| 391 |
-
|
| 392 |
-
|
| 393 |
-
|
| 394 |
-
|
| 395 |
-
|
| 396 |
-
|
| 397 |
-
|
| 398 |
-
|
| 399 |
-
|
| 400 |
-
|
| 401 |
-
|
| 402 |
-
|
| 403 |
-
|
| 404 |
-
|
| 405 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 406 |
|
| 407 |
|
| 408 |
def _build_messages(user_text: str, history: list, language_code: str) -> list[dict]:
|
| 409 |
-
"""Build the full message list: system (with vocab) + history + new
|
| 410 |
-
vocab
|
| 411 |
-
|
| 412 |
-
|
| 413 |
-
)
|
| 414 |
-
messages: list[dict] = [{"role": "system", "content": system_content}]
|
| 415 |
-
# Inject conversation history (last 20 turns max)
|
| 416 |
for u, a in history[-20:]:
|
| 417 |
messages.append({"role": "user", "content": u})
|
| 418 |
messages.append({"role": "assistant", "content": a})
|
|
@@ -541,11 +619,12 @@ def _convo_pipeline(audio_path: str, language_code: str, history: list):
|
|
| 541 |
except Exception as llm_err:
|
| 542 |
log.warning("LLM failed: %s", llm_err)
|
| 543 |
# Graceful degradation: tell user LLM is unavailable, ask them to try again
|
| 544 |
-
|
| 545 |
-
"Hakɛ to, n bɛ sɔrɔ cogo dɔ la."
|
| 546 |
-
|
| 547 |
-
|
| 548 |
-
|
|
|
|
| 549 |
|
| 550 |
# ── Parse and strip [LEARNED:] tags — save async to Hub ──────────────────
|
| 551 |
response_text, learned_pairs = _parse_and_strip_learned(response_text, language_code)
|
|
@@ -1394,11 +1473,12 @@ def _do_respond(
|
|
| 1394 |
except Exception as llm_err:
|
| 1395 |
import logging
|
| 1396 |
logging.getLogger(__name__).warning("LLM error: %s", llm_err)
|
| 1397 |
-
|
| 1398 |
-
"Hakɛ to, tasuma tɛ kɛ sisan. I ka a lasɔrɔ tugu."
|
| 1399 |
-
|
| 1400 |
-
|
| 1401 |
-
|
|
|
|
| 1402 |
|
| 1403 |
# Strip [LEARNED:] tags, persist async
|
| 1404 |
response_text, _ = _parse_and_strip_learned(response_text, lang)
|
|
|
|
| 47 |
_ON_SPACES = os.environ.get("SPACE_ID") is not None
|
| 48 |
|
| 49 |
SUPPORTED_LANGUAGES = {
|
| 50 |
+
"Bambara — Mali (bam)": "bam",
|
| 51 |
+
"Fula / Pular — Guinea (ful)": "ful",
|
| 52 |
+
"French / Français": "fr",
|
| 53 |
+
"English": "en",
|
| 54 |
+
}
|
| 55 |
+
|
| 56 |
+
# Country and dialect context used in prompts and training metadata
|
| 57 |
+
LANG_CONTEXT = {
|
| 58 |
+
"bam": {
|
| 59 |
+
"name": "Bambara",
|
| 60 |
+
"country": "Mali",
|
| 61 |
+
"region": "West Africa (Bamako, Ségou, Mopti dialects)",
|
| 62 |
+
"script": "Latin with special characters (ɛ, ɔ, ŋ, ɲ)",
|
| 63 |
+
"phonetic_note": (
|
| 64 |
+
"Use standard Malian orthography: 'u' not 'ou', 'j' not 'dj', "
|
| 65 |
+
"'c' not 'ch', 'ɲ' not 'gn' or 'ny', 'ɔ' not 'oo', 'ɛ' not 'ee'. "
|
| 66 |
+
"This is Bambara as spoken in Mali, NOT Dioula or other dialects."
|
| 67 |
+
),
|
| 68 |
+
"do_not_mix": "Fula (Pulaar/Pular), Wolof, Dioula, or any other language",
|
| 69 |
+
},
|
| 70 |
+
"ful": {
|
| 71 |
+
"name": "Pular (Fula of Guinea)",
|
| 72 |
+
"country": "Guinea",
|
| 73 |
+
"region": "West Africa (Labé, Mamou, Kankan dialects)",
|
| 74 |
+
"script": "Latin with special characters (ɓ, ɗ, ŋ, ɲ, ƴ)",
|
| 75 |
+
"phonetic_note": (
|
| 76 |
+
"Use standard Guinean Pular orthography. "
|
| 77 |
+
"This is the Fula variety spoken in Guinea (Pular/Pulaar), "
|
| 78 |
+
"NOT Fulfulde from Niger/Nigeria nor Wolof."
|
| 79 |
+
),
|
| 80 |
+
"do_not_mix": "Bambara, Soussou, Malinké, or any other language",
|
| 81 |
+
},
|
| 82 |
+
"fr": {
|
| 83 |
+
"name": "French",
|
| 84 |
+
"country": "France / West Africa",
|
| 85 |
+
"region": "",
|
| 86 |
+
"script": "Latin",
|
| 87 |
+
"phonetic_note": "Standard French.",
|
| 88 |
+
"do_not_mix": "other languages unless the user switches",
|
| 89 |
+
},
|
| 90 |
+
"en": {
|
| 91 |
+
"name": "English",
|
| 92 |
+
"country": "",
|
| 93 |
+
"region": "",
|
| 94 |
+
"script": "Latin",
|
| 95 |
+
"phonetic_note": "Standard English.",
|
| 96 |
+
"do_not_mix": "other languages unless the user switches",
|
| 97 |
+
},
|
| 98 |
}
|
| 99 |
|
| 100 |
# ── ZeroGPU decorator (no-op locally) ────────────────────────────────────────
|
|
|
|
| 429 |
|
| 430 |
|
| 431 |
# System prompt — includes vocabulary context + conversation rules
|
| 432 |
+
def _build_system_prompt(language_code: str, vocab: str) -> str:
|
| 433 |
+
"""
|
| 434 |
+
Build a language-specific system prompt that makes the LLM stay strictly
|
| 435 |
+
in the correct dialect (Mali Bambara vs Guinea Pular) and never mix them.
|
| 436 |
+
"""
|
| 437 |
+
ctx = LANG_CONTEXT.get(language_code, LANG_CONTEXT["fr"])
|
| 438 |
+
lang_name = ctx["name"]
|
| 439 |
+
country = ctx["country"]
|
| 440 |
+
region = ctx["region"]
|
| 441 |
+
phon_note = ctx["phonetic_note"]
|
| 442 |
+
do_not_mix = ctx["do_not_mix"]
|
| 443 |
+
|
| 444 |
+
region_line = f" ({region})" if region else ""
|
| 445 |
+
|
| 446 |
+
vocab_section = (
|
| 447 |
+
f"WORDS AND PHRASES YOU HAVE LEARNED FOR {lang_name.upper()}:\n{vocab}"
|
| 448 |
+
if vocab
|
| 449 |
+
else f"(No {lang_name} vocabulary recorded yet — the user can teach you words.)"
|
| 450 |
+
)
|
| 451 |
+
|
| 452 |
+
return f"""\
|
| 453 |
+
You are a voice assistant that speaks ONLY {lang_name} as used in {country}{region_line}.
|
| 454 |
+
|
| 455 |
+
CRITICAL LANGUAGE RULE:
|
| 456 |
+
- You MUST respond exclusively in {lang_name} ({country}).
|
| 457 |
+
- NEVER mix in words from {do_not_mix}.
|
| 458 |
+
- If the user writes in another language, gently ask them to switch to {lang_name}.
|
| 459 |
+
- If you are unsure of a word in {lang_name}, say so honestly — do not substitute \
|
| 460 |
+
a word from another language.
|
| 461 |
+
|
| 462 |
+
ORTHOGRAPHY ({lang_name}):
|
| 463 |
+
{phon_note}
|
| 464 |
+
|
| 465 |
+
{vocab_section}
|
| 466 |
+
|
| 467 |
+
CONVERSATION RULES:
|
| 468 |
+
1. Keep every response to 1–3 short spoken sentences. This is voice, not text.
|
| 469 |
+
2. If you do not understand, ask ONE short follow-up question in {lang_name}.
|
| 470 |
+
3. If the user teaches you a word ("X means Y"), confirm warmly, then append \
|
| 471 |
+
exactly: [LEARNED: word="X" meaning="Y"]
|
| 472 |
+
4. Refer back to earlier messages naturally when relevant.
|
| 473 |
+
5. Never invent vocabulary. Honest uncertainty is always correct."""
|
| 474 |
+
|
| 475 |
+
|
| 476 |
+
def _get_vocab_context_for(language_code: str) -> str:
|
| 477 |
+
"""Return only vocabulary entries for the given language code."""
|
| 478 |
+
with _vocab_lock:
|
| 479 |
+
raw = _vocab_context_cache
|
| 480 |
+
if not raw:
|
| 481 |
+
return ""
|
| 482 |
+
lines = [
|
| 483 |
+
line for line in raw.splitlines()
|
| 484 |
+
if f"[{language_code}]" in line
|
| 485 |
+
]
|
| 486 |
+
return "\n".join(lines)
|
| 487 |
|
| 488 |
|
| 489 |
def _build_messages(user_text: str, history: list, language_code: str) -> list[dict]:
|
| 490 |
+
"""Build the full message list: system (with lang-filtered vocab) + history + new turn."""
|
| 491 |
+
vocab = _get_vocab_context_for(language_code)
|
| 492 |
+
system = _build_system_prompt(language_code, vocab)
|
| 493 |
+
messages: list[dict] = [{"role": "system", "content": system}]
|
|
|
|
|
|
|
|
|
|
| 494 |
for u, a in history[-20:]:
|
| 495 |
messages.append({"role": "user", "content": u})
|
| 496 |
messages.append({"role": "assistant", "content": a})
|
|
|
|
| 619 |
except Exception as llm_err:
|
| 620 |
log.warning("LLM failed: %s", llm_err)
|
| 621 |
# Graceful degradation: tell user LLM is unavailable, ask them to try again
|
| 622 |
+
_fallbacks = {
|
| 623 |
+
"bam": "Hakɛ to, n bɛ sɔrɔ cogo dɔ la.", # Bambara (Mali)
|
| 624 |
+
"ful": "Hakke, mi waawaa jogaade modèl oo jooni.", # Pular (Guinea)
|
| 625 |
+
"fr": "Désolé, je n'ai pas pu joindre le modèle.",
|
| 626 |
+
}
|
| 627 |
+
response_text = _fallbacks.get(language_code, "Sorry, the language model is unavailable.")
|
| 628 |
|
| 629 |
# ── Parse and strip [LEARNED:] tags — save async to Hub ──────────────────
|
| 630 |
response_text, learned_pairs = _parse_and_strip_learned(response_text, language_code)
|
|
|
|
| 1473 |
except Exception as llm_err:
|
| 1474 |
import logging
|
| 1475 |
logging.getLogger(__name__).warning("LLM error: %s", llm_err)
|
| 1476 |
+
_fallbacks = {
|
| 1477 |
+
"bam": "Hakɛ to, tasuma tɛ kɛ sisan. I ka a lasɔrɔ tugu.",
|
| 1478 |
+
"ful": "Hakke, mi waawaa jogaade modèl oo jooni. Njaɓɓu.",
|
| 1479 |
+
"fr": "Désolé, le modèle est indisponible pour l'instant.",
|
| 1480 |
+
}
|
| 1481 |
+
response_text = _fallbacks.get(lang, "Sorry, the language model is unavailable.")
|
| 1482 |
|
| 1483 |
# Strip [LEARNED:] tags, persist async
|
| 1484 |
response_text, _ = _parse_and_strip_learned(response_text, lang)
|
|
@@ -62,7 +62,7 @@
|
|
| 62 |
"metadata": {},
|
| 63 |
"outputs": [],
|
| 64 |
"source": [
|
| 65 |
-
"# ── Cell 3: CONFIGURATION — edit these before each run ───────────────────────\nimport os\n\n# ─── Language to train ───────────────────────────────────────────────────────\n# 'bam' = Bambara 'ful' = Fula\nTRAIN_LANG = 'bam'\n\n# ─── Model ───────────────────────────────────────────────────────────────────\nWHISPER_MODEL_ID = 'openai/whisper-small'\nTARGET_SR = 16_000\n\n# ─── HuggingFace repos ───────────────────────────────────────────────────────\nHF_USERNAME = 'ous-sow'\nFEEDBACK_REPO_ID = f'{HF_USERNAME}/sahel-agri-feedback'\nADAPTER_REPO_ID = f'{HF_USERNAME}/sahel-agri-adapters'\n\n# ─── Training hyper-parameters ───────────────────────────────────────────────\nMAX_STEPS = 4_000 # T4 ~45 min; set 8000 for a deeper run\nBATCH_SIZE = 16\nGRAD_ACCUM = 2 # effective batch = 32\nLEARNING_RATE = 1e-3\nWARMUP_STEPS = 200\nSAVE_STEPS = 500\nEVAL_STEPS = 500\nLOGGING_STEPS = 50\nMAX_WAXAL_TRAIN = 5_000 # cap WaxalNLP samples (streaming budget)\nCORRECTION_REPEAT= 3 # upsample user corrections Nx for emphasis\n\n# ─── Paths (Kaggle working dir) ───────────────────────────────────────────────\nWORKING_DIR = '/kaggle/working'\nOUTPUT_DIR = f'{WORKING_DIR}/adapter_{TRAIN_LANG}'\nDATA_DIR = f'{WORKING_DIR}/data'\nAUDIO_DIR = f'{WORKING_DIR}/audio_feedback'\n\nLANG_NAME = {'bam': 'bambara', 'ful': 'fula'}.get(TRAIN_LANG, TRAIN_LANG)\n\nprint(f'Language : {TRAIN_LANG} ({LANG_NAME})')\nprint(f'Model : {WHISPER_MODEL_ID}')\nprint(f'Output : {OUTPUT_DIR}')\nprint(f'Max steps : {MAX_STEPS}')"
|
| 66 |
]
|
| 67 |
},
|
| 68 |
{
|
|
@@ -252,7 +252,7 @@
|
|
| 252 |
"metadata": {},
|
| 253 |
"outputs": [],
|
| 254 |
"source": [
|
| 255 |
-
"# ── Cell 19: Push adapter to HF Model repo ───────────────────
|
| 256 |
]
|
| 257 |
},
|
| 258 |
{
|
|
|
|
| 62 |
"metadata": {},
|
| 63 |
"outputs": [],
|
| 64 |
"source": [
|
| 65 |
+
"# ── Cell 3: CONFIGURATION — edit these before each run ───────────────────────\nimport os\n\n# ─── Language to train ───────────────────────────────────────────────────────\n# 'bam' = Bambara 'ful' = Fula\nTRAIN_LANG = 'bam'\n\n# ─── Model ───────────────────────────────────────────────────────────────────\nWHISPER_MODEL_ID = 'openai/whisper-small'\nTARGET_SR = 16_000\n\n# ─── HuggingFace repos ───────────────────────────────────────────────────────\nHF_USERNAME = 'ous-sow'\nFEEDBACK_REPO_ID = f'{HF_USERNAME}/sahel-agri-feedback'\nADAPTER_REPO_ID = f'{HF_USERNAME}/sahel-agri-adapters'\n\n# ─── Training hyper-parameters ───────────────────────────────────────────────\nMAX_STEPS = 4_000 # T4 ~45 min; set 8000 for a deeper run\nBATCH_SIZE = 16\nGRAD_ACCUM = 2 # effective batch = 32\nLEARNING_RATE = 1e-3\nWARMUP_STEPS = 200\nSAVE_STEPS = 500\nEVAL_STEPS = 500\nLOGGING_STEPS = 50\nMAX_WAXAL_TRAIN = 5_000 # cap WaxalNLP samples (streaming budget)\nCORRECTION_REPEAT= 3 # upsample user corrections Nx for emphasis\n\n# ─── Paths (Kaggle working dir) ───────────────────────────────────────────────\nWORKING_DIR = '/kaggle/working'\nOUTPUT_DIR = f'{WORKING_DIR}/adapter_{TRAIN_LANG}'\nDATA_DIR = f'{WORKING_DIR}/data'\nAUDIO_DIR = f'{WORKING_DIR}/audio_feedback'\n\nLANG_NAME = {'bam': 'bambara', 'ful': 'fula'}.get(TRAIN_LANG, TRAIN_LANG)\nLANG_COUNTRY = {'bam': 'Mali', 'ful': 'Guinea'}.get(TRAIN_LANG, '')\nLANG_DIALECT = {\n 'bam': 'Standard Bambara (Bamako/Ségou) — Malian orthography',\n 'ful': 'Pular (Labé/Mamou dialects) — Guinean orthography',\n}.get(TRAIN_LANG, '')\n\nprint(f'Language : {TRAIN_LANG} ({LANG_NAME}) — {LANG_COUNTRY}')\nprint(f'Dialect : {LANG_DIALECT}')\nprint(f'Model : {WHISPER_MODEL_ID}')\nprint(f'Output : {OUTPUT_DIR}')\nprint(f'Max steps : {MAX_STEPS}')"
|
| 66 |
]
|
| 67 |
},
|
| 68 |
{
|
|
|
|
| 252 |
"metadata": {},
|
| 253 |
"outputs": [],
|
| 254 |
"source": [
|
| 255 |
+
"# ── Cell 19: Push adapter to HF Model repo ───────────────────────────────────\nfrom huggingface_hub import HfApi, create_repo\n\n# Ensure repo exists\ncreate_repo(ADAPTER_REPO_ID, repo_type='model', private=True,\n exist_ok=True, token=HF_TOKEN)\n\n_cer_part = f'{cer_score:.1%}' if cer_score == cer_score else 'n/a'\ncommit_msg = (\n f'[{VERSION_TAG}] {LANG_NAME} ({LANG_COUNTRY}) fine-tuned checkpoint — '\n f'{train_result.global_step} steps | CER {_cer_part} | '\n f'{len(correction_records)} corrections + WaxalNLP | {LANG_DIALECT}'\n)\n\napi.upload_folder(\n folder_path=OUTPUT_DIR,\n repo_id=ADAPTER_REPO_ID,\n repo_type='model',\n path_in_repo=PATH_IN_REPO,\n commit_message=commit_msg,\n)\nprint(f'✅ Adapter uploaded: {ADAPTER_REPO_ID}/{PATH_IN_REPO}')\n\n# Create a Git tag for this version\ntry:\n api.create_tag(\n repo_id=ADAPTER_REPO_ID,\n repo_type='model',\n tag=VERSION_TAG,\n tag_message=commit_msg,\n token=HF_TOKEN,\n )\n print(f'✅ Tag created : {VERSION_TAG}')\nexcept Exception as e:\n print(f'⚠️ Tag creation skipped: {e}')"
|
| 256 |
]
|
| 257 |
},
|
| 258 |
{
|