jefffffff9 Claude Sonnet 4.6 commited on
Commit
40cf84d
·
1 Parent(s): 618eab5

Fix language mixing: per-language prompts + Mali Bambara / Guinea Pular context

Browse files

Root cause: one generic system prompt for all languages with no language code
injected. The LLM had no instructions to stay in one dialect, so it mixed
Bambara and Fula words freely.

app.py:
- SUPPORTED_LANGUAGES: labels now say "Bambara — Mali" and "Fula / Pular — Guinea"
- LANG_CONTEXT dict: per-language country, region, script, phonetic rules, and
explicit "do_not_mix" field naming the languages to never blend in
- _build_system_prompt(language_code, vocab): generates a language-specific
system prompt that opens with "You MUST respond exclusively in {lang} ({country})"
and lists exactly which other languages must NOT appear in responses
- _get_vocab_context_for(language_code): filters vocabulary cache to only entries
tagged [bam] or [ful] — LLM never sees the other language's words in its context
- _build_messages: uses _build_system_prompt + filtered vocab instead of the
single generic template
- LLM fallback messages now cover bam (Mali Bambara), ful (Guinea Pular), and fr

Notebook Cell 4: LANG_COUNTRY + LANG_DIALECT constants added
bam → Mali / "Standard Bambara (Bamako/Ségou) — Malian orthography"
ful → Guinea / "Pular (Labé/Mamou dialects) — Guinean orthography"
Notebook Cell 19: country and dialect included in Hub commit message

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

Files changed (2) hide show
  1. app.py +119 -39
  2. notebooks/kaggle_master_trainer.ipynb +2 -2
app.py CHANGED
@@ -47,10 +47,54 @@ AUTO_TRAIN_THRESHOLD = int(os.environ.get("AUTO_TRAIN_THRESHOLD", "50"))
47
  _ON_SPACES = os.environ.get("SPACE_ID") is not None
48
 
49
  SUPPORTED_LANGUAGES = {
50
- "Bambara (bam)": "bam",
51
- "Fula (ful)": "ful",
52
- "French / Français": "fr",
53
- "English": "en",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54
  }
55
 
56
  # ── ZeroGPU decorator (no-op locally) ────────────────────────────────────────
@@ -385,34 +429,68 @@ def _parse_and_strip_learned(text: str, lang: str) -> tuple[str, list[tuple[str,
385
 
386
 
387
  # System prompt — includes vocabulary context + conversation rules
388
- _CONVO_SYSTEM_TEMPLATE = """\
389
- You are a helpful voice assistant for Bambara and Fula speakers. \
390
- You are talking, not writing keep every response to 1–3 short sentences.
391
-
392
- YOUR KNOWLEDGE BASE (words and phrases you have learned from users):
393
- {vocab}
394
-
395
- RULES you must always follow:
396
- 1. Reply in whatever language the user speaks (Bambara, Fula, French, or English).
397
- 2. When speaking Bambara, use phonetic spelling: 'u' not 'ou', 'j' not 'dj', 'c' not 'ch'.
398
- 3. Keep responses SHORT — this is voice, not text.
399
- 4. If you do not understand something, ask ONE specific follow-up question \
400
- (e.g. "Mun ye o fileli ye?" = "What does that mean?").
401
- 5. If the user teaches you a word or phrase (says "X means Y" or "X se dit Y in Bambara"), \
402
- confirm warmly then add exactly: [LEARNED: word="X" meaning="Y"]
403
- 6. Remember the full conversation refer to earlier messages naturally \
404
- (e.g. "As you said earlier…", "I ka kuma fɔlen don…").
405
- 7. Never invent words you do not know. Honest uncertainty is always better than wrong answers."""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
406
 
407
 
408
  def _build_messages(user_text: str, history: list, language_code: str) -> list[dict]:
409
- """Build the full message list: system (with vocab) + history + new user turn."""
410
- vocab = _get_vocab_context()
411
- system_content = _CONVO_SYSTEM_TEMPLATE.format(
412
- vocab=vocab if vocab else "(no vocabulary recorded yet — you can teach me words!)"
413
- )
414
- messages: list[dict] = [{"role": "system", "content": system_content}]
415
- # Inject conversation history (last 20 turns max)
416
  for u, a in history[-20:]:
417
  messages.append({"role": "user", "content": u})
418
  messages.append({"role": "assistant", "content": a})
@@ -541,11 +619,12 @@ def _convo_pipeline(audio_path: str, language_code: str, history: list):
541
  except Exception as llm_err:
542
  log.warning("LLM failed: %s", llm_err)
543
  # Graceful degradation: tell user LLM is unavailable, ask them to try again
544
- response_text = (
545
- "Hakɛ to, n bɛ sɔrɔ cogo dɔ la."
546
- if language_code == "bam"
547
- else "Sorry, I could not reach the language model. Please try again."
548
- )
 
549
 
550
  # ── Parse and strip [LEARNED:] tags — save async to Hub ──────────────────
551
  response_text, learned_pairs = _parse_and_strip_learned(response_text, language_code)
@@ -1394,11 +1473,12 @@ def _do_respond(
1394
  except Exception as llm_err:
1395
  import logging
1396
  logging.getLogger(__name__).warning("LLM error: %s", llm_err)
1397
- response_text = (
1398
- "Hakɛ to, tasuma tɛ kɛ sisan. I ka a lasɔrɔ tugu."
1399
- if lang == "bam"
1400
- else "Sorry, I could not reach the language model right now."
1401
- )
 
1402
 
1403
  # Strip [LEARNED:] tags, persist async
1404
  response_text, _ = _parse_and_strip_learned(response_text, lang)
 
47
  _ON_SPACES = os.environ.get("SPACE_ID") is not None
48
 
49
SUPPORTED_LANGUAGES = {
    # Dropdown label shown in the UI → language code used by the pipeline.
    "Bambara — Mali (bam)": "bam",
    "Fula / Pular — Guinea (ful)": "ful",
    "French / Français": "fr",
    "English": "en",
}

# Country and dialect context used in prompts and training metadata.
# Each entry is consumed by _build_system_prompt():
#   name          — human-readable language name injected into the prompt
#   country       — anchors the dialect (Mali Bambara vs Guinea Pular)
#   region        — optional dialect detail, rendered in parentheses
#   script        — writing-system note (informational)
#   phonetic_note — orthography rules quoted verbatim in the prompt
#   do_not_mix    — languages the LLM is explicitly told never to blend in
LANG_CONTEXT = {
    "bam": {
        "name": "Bambara",
        "country": "Mali",
        "region": "West Africa (Bamako, Ségou, Mopti dialects)",
        "script": "Latin with special characters (ɛ, ɔ, ŋ, ɲ)",
        "phonetic_note": (
            "Use standard Malian orthography: 'u' not 'ou', 'j' not 'dj', "
            "'c' not 'ch', 'ɲ' not 'gn' or 'ny', 'ɔ' not 'oo', 'ɛ' not 'ee'. "
            "This is Bambara as spoken in Mali, NOT Dioula or other dialects."
        ),
        "do_not_mix": "Fula (Pulaar/Pular), Wolof, Dioula, or any other language",
    },
    "ful": {
        "name": "Pular (Fula of Guinea)",
        "country": "Guinea",
        "region": "West Africa (Labé, Mamou, Kankan dialects)",
        "script": "Latin with special characters (ɓ, ɗ, ŋ, ɲ, ƴ)",
        "phonetic_note": (
            "Use standard Guinean Pular orthography. "
            "This is the Fula variety spoken in Guinea (Pular/Pulaar), "
            "NOT Fulfulde from Niger/Nigeria nor Wolof."
        ),
        "do_not_mix": "Bambara, Soussou, Malinké, or any other language",
    },
    "fr": {
        "name": "French",
        "country": "France / West Africa",
        "region": "",
        "script": "Latin",
        "phonetic_note": "Standard French.",
        "do_not_mix": "other languages unless the user switches",
    },
    "en": {
        "name": "English",
        # NOTE(review): country/region are intentionally empty — English has
        # no single country anchor here; prompt code must tolerate "".
        "country": "",
        "region": "",
        "script": "Latin",
        "phonetic_note": "Standard English.",
        "do_not_mix": "other languages unless the user switches",
    },
}
99
 
100
  # ── ZeroGPU decorator (no-op locally) ────────────────────────────────────────
 
429
 
430
 
431
  # System prompt — includes vocabulary context + conversation rules
432
def _build_system_prompt(language_code: str, vocab: str) -> str:
    """
    Build a language-specific system prompt that makes the LLM stay strictly
    in the correct dialect (Mali Bambara vs Guinea Pular) and never mix them.

    Parameters
    ----------
    language_code : str
        Key into LANG_CONTEXT ("bam", "ful", "fr", "en"); unknown codes fall
        back to the French context.
    vocab : str
        Newline-separated learned-vocabulary lines already filtered to this
        language, or "" when nothing has been taught yet.

    Returns
    -------
    str
        The complete content of the system message.
    """
    ctx = LANG_CONTEXT.get(language_code, LANG_CONTEXT["fr"])
    lang_name = ctx["name"]
    country = ctx["country"]
    region = ctx["region"]
    phon_note = ctx["phonetic_note"]
    do_not_mix = ctx["do_not_mix"]

    # Optional qualifiers: the "en" entry has no country/region, so render
    # nothing rather than the broken "as used in ." / "English ()" forms.
    region_line = f" ({region})" if region else ""
    as_used_in = f" as used in {country}{region_line}" if country else region_line
    country_tag = f" ({country})" if country else ""

    vocab_section = (
        f"WORDS AND PHRASES YOU HAVE LEARNED FOR {lang_name.upper()}:\n{vocab}"
        if vocab
        # em-dash restored: without it this user-visible line was a run-on
        else f"(No {lang_name} vocabulary recorded yet — the user can teach you words.)"
    )

    return f"""\
You are a voice assistant that speaks ONLY {lang_name}{as_used_in}.

CRITICAL LANGUAGE RULE:
- You MUST respond exclusively in {lang_name}{country_tag}.
- NEVER mix in words from {do_not_mix}.
- If the user writes in another language, gently ask them to switch to {lang_name}.
- If you are unsure of a word in {lang_name}, say so honestly — do not substitute \
a word from another language.

ORTHOGRAPHY ({lang_name}):
{phon_note}

{vocab_section}

CONVERSATION RULES:
1. Keep every response to 1–3 short spoken sentences. This is voice, not text.
2. If you do not understand, ask ONE short follow-up question in {lang_name}.
3. If the user teaches you a word ("X means Y"), confirm warmly, then append \
exactly: [LEARNED: word="X" meaning="Y"]
4. Refer back to earlier messages naturally when relevant.
5. Never invent vocabulary. Honest uncertainty is always correct."""
474
+
475
+
476
def _get_vocab_context_for(language_code: str) -> str:
    """Return only the cached vocabulary lines tagged for *language_code*.

    Entries in the shared vocabulary cache carry a language tag such as
    "[bam]" or "[ful]"; filtering here keeps the other language's words
    out of the LLM context entirely.
    """
    # Snapshot the cache under the lock, then filter outside it so the
    # lock is held as briefly as possible.
    with _vocab_lock:
        snapshot = _vocab_context_cache
    if not snapshot:
        return ""
    tag = f"[{language_code}]"
    kept = []
    for entry in snapshot.splitlines():
        if tag in entry:
            kept.append(entry)
    return "\n".join(kept)
487
 
488
 
489
  def _build_messages(user_text: str, history: list, language_code: str) -> list[dict]:
490
+ """Build the full message list: system (with lang-filtered vocab) + history + new turn."""
491
+ vocab = _get_vocab_context_for(language_code)
492
+ system = _build_system_prompt(language_code, vocab)
493
+ messages: list[dict] = [{"role": "system", "content": system}]
 
 
 
494
  for u, a in history[-20:]:
495
  messages.append({"role": "user", "content": u})
496
  messages.append({"role": "assistant", "content": a})
 
619
  except Exception as llm_err:
620
  log.warning("LLM failed: %s", llm_err)
621
  # Graceful degradation: tell user LLM is unavailable, ask them to try again
622
+ _fallbacks = {
623
+ "bam": "Hakɛ to, n bɛ sɔrɔ cogo dɔ la.", # Bambara (Mali)
624
+ "ful": "Hakke, mi waawaa jogaade modèl oo jooni.", # Pular (Guinea)
625
+ "fr": "Désolé, je n'ai pas pu joindre le modèle.",
626
+ }
627
+ response_text = _fallbacks.get(language_code, "Sorry, the language model is unavailable.")
628
 
629
  # ── Parse and strip [LEARNED:] tags — save async to Hub ──────────────────
630
  response_text, learned_pairs = _parse_and_strip_learned(response_text, language_code)
 
1473
  except Exception as llm_err:
1474
  import logging
1475
  logging.getLogger(__name__).warning("LLM error: %s", llm_err)
1476
+ _fallbacks = {
1477
+ "bam": "Hakɛ to, tasuma tɛ kɛ sisan. I ka a lasɔrɔ tugu.",
1478
+ "ful": "Hakke, mi waawaa jogaade modèl oo jooni. Njaɓɓu.",
1479
+ "fr": "Désolé, le modèle est indisponible pour l'instant.",
1480
+ }
1481
+ response_text = _fallbacks.get(lang, "Sorry, the language model is unavailable.")
1482
 
1483
  # Strip [LEARNED:] tags, persist async
1484
  response_text, _ = _parse_and_strip_learned(response_text, lang)
notebooks/kaggle_master_trainer.ipynb CHANGED
@@ -62,7 +62,7 @@
62
  "metadata": {},
63
  "outputs": [],
64
  "source": [
65
- "# ── Cell 3: CONFIGURATION — edit these before each run ───────────────────────\nimport os\n\n# ─── Language to train ───────────────────────────────────────────────────────\n# 'bam' = Bambara 'ful' = Fula\nTRAIN_LANG = 'bam'\n\n# ─── Model ───────────────────────────────────────────────────────────────────\nWHISPER_MODEL_ID = 'openai/whisper-small'\nTARGET_SR = 16_000\n\n# ─── HuggingFace repos ───────────────────────────────────────────────────────\nHF_USERNAME = 'ous-sow'\nFEEDBACK_REPO_ID = f'{HF_USERNAME}/sahel-agri-feedback'\nADAPTER_REPO_ID = f'{HF_USERNAME}/sahel-agri-adapters'\n\n# ─── Training hyper-parameters ───────────────────────────────────────────────\nMAX_STEPS = 4_000 # T4 ~45 min; set 8000 for a deeper run\nBATCH_SIZE = 16\nGRAD_ACCUM = 2 # effective batch = 32\nLEARNING_RATE = 1e-3\nWARMUP_STEPS = 200\nSAVE_STEPS = 500\nEVAL_STEPS = 500\nLOGGING_STEPS = 50\nMAX_WAXAL_TRAIN = 5_000 # cap WaxalNLP samples (streaming budget)\nCORRECTION_REPEAT= 3 # upsample user corrections Nx for emphasis\n\n# ─── Paths (Kaggle working dir) ───────────────────────────────────────────────\nWORKING_DIR = '/kaggle/working'\nOUTPUT_DIR = f'{WORKING_DIR}/adapter_{TRAIN_LANG}'\nDATA_DIR = f'{WORKING_DIR}/data'\nAUDIO_DIR = f'{WORKING_DIR}/audio_feedback'\n\nLANG_NAME = {'bam': 'bambara', 'ful': 'fula'}.get(TRAIN_LANG, TRAIN_LANG)\n\nprint(f'Language : {TRAIN_LANG} ({LANG_NAME})')\nprint(f'Model : {WHISPER_MODEL_ID}')\nprint(f'Output : {OUTPUT_DIR}')\nprint(f'Max steps : {MAX_STEPS}')"
66
  ]
67
  },
68
  {
@@ -252,7 +252,7 @@
252
  "metadata": {},
253
  "outputs": [],
254
  "source": [
255
- "# ── Cell 19: Push adapter to HF Model repo ───────────────────���───────────────\nfrom huggingface_hub import HfApi, create_repo\n\n# Ensure repo exists\ncreate_repo(ADAPTER_REPO_ID, repo_type='model', private=True,\n exist_ok=True, token=HF_TOKEN)\n\n_cer_part = f'{cer_score:.1%}' if cer_score == cer_score else 'n/a'\ncommit_msg = (\n f'[{VERSION_TAG}] {LANG_NAME} fine-tuned checkpoint — '\n f'{train_result.global_step} steps | CER {_cer_part} | '\n f'{len(correction_records)} corrections + WaxalNLP'\n)\n\napi.upload_folder(\n folder_path=OUTPUT_DIR,\n repo_id=ADAPTER_REPO_ID,\n repo_type='model',\n path_in_repo=PATH_IN_REPO,\n commit_message=commit_msg,\n)\nprint(f'✅ Adapter uploaded: {ADAPTER_REPO_ID}/{PATH_IN_REPO}')\n\n# Create a Git tag for this version\ntry:\n api.create_tag(\n repo_id=ADAPTER_REPO_ID,\n repo_type='model',\n tag=VERSION_TAG,\n tag_message=commit_msg,\n token=HF_TOKEN,\n )\n print(f'✅ Tag created : {VERSION_TAG}')\nexcept Exception as e:\n print(f'⚠️ Tag creation skipped: {e}')"
256
  ]
257
  },
258
  {
 
62
  "metadata": {},
63
  "outputs": [],
64
  "source": [
65
+ "# ── Cell 3: CONFIGURATION — edit these before each run ───────────────────────\nimport os\n\n# ─── Language to train ───────────────────────────────────────────────────────\n# 'bam' = Bambara 'ful' = Fula\nTRAIN_LANG = 'bam'\n\n# ─── Model ───────────────────────────────────────────────────────────────────\nWHISPER_MODEL_ID = 'openai/whisper-small'\nTARGET_SR = 16_000\n\n# ─── HuggingFace repos ───────────────────────────────────────────────────────\nHF_USERNAME = 'ous-sow'\nFEEDBACK_REPO_ID = f'{HF_USERNAME}/sahel-agri-feedback'\nADAPTER_REPO_ID = f'{HF_USERNAME}/sahel-agri-adapters'\n\n# ─── Training hyper-parameters ───────────────────────────────────────────────\nMAX_STEPS = 4_000 # T4 ~45 min; set 8000 for a deeper run\nBATCH_SIZE = 16\nGRAD_ACCUM = 2 # effective batch = 32\nLEARNING_RATE = 1e-3\nWARMUP_STEPS = 200\nSAVE_STEPS = 500\nEVAL_STEPS = 500\nLOGGING_STEPS = 50\nMAX_WAXAL_TRAIN = 5_000 # cap WaxalNLP samples (streaming budget)\nCORRECTION_REPEAT= 3 # upsample user corrections Nx for emphasis\n\n# ─── Paths (Kaggle working dir) ───────────────────────────────────────────────\nWORKING_DIR = '/kaggle/working'\nOUTPUT_DIR = f'{WORKING_DIR}/adapter_{TRAIN_LANG}'\nDATA_DIR = f'{WORKING_DIR}/data'\nAUDIO_DIR = f'{WORKING_DIR}/audio_feedback'\n\nLANG_NAME = {'bam': 'bambara', 'ful': 'fula'}.get(TRAIN_LANG, TRAIN_LANG)\nLANG_COUNTRY = {'bam': 'Mali', 'ful': 'Guinea'}.get(TRAIN_LANG, '')\nLANG_DIALECT = {\n 'bam': 'Standard Bambara (Bamako/Ségou) — Malian orthography',\n 'ful': 'Pular (Labé/Mamou dialects) — Guinean orthography',\n}.get(TRAIN_LANG, '')\n\nprint(f'Language : {TRAIN_LANG} ({LANG_NAME}) — {LANG_COUNTRY}')\nprint(f'Dialect : {LANG_DIALECT}')\nprint(f'Model : {WHISPER_MODEL_ID}')\nprint(f'Output : {OUTPUT_DIR}')\nprint(f'Max steps : {MAX_STEPS}')"
66
  ]
67
  },
68
  {
 
252
  "metadata": {},
253
  "outputs": [],
254
  "source": [
255
+ "# ── Cell 19: Push adapter to HF Model repo ──────────────────────────────────\nfrom huggingface_hub import HfApi, create_repo\n\n# Ensure repo exists\ncreate_repo(ADAPTER_REPO_ID, repo_type='model', private=True,\n exist_ok=True, token=HF_TOKEN)\n\n_cer_part = f'{cer_score:.1%}' if cer_score == cer_score else 'n/a'\ncommit_msg = (\n f'[{VERSION_TAG}] {LANG_NAME} ({LANG_COUNTRY}) fine-tuned checkpoint — '\n f'{train_result.global_step} steps | CER {_cer_part} | '\n f'{len(correction_records)} corrections + WaxalNLP | {LANG_DIALECT}'\n)\n\napi.upload_folder(\n folder_path=OUTPUT_DIR,\n repo_id=ADAPTER_REPO_ID,\n repo_type='model',\n path_in_repo=PATH_IN_REPO,\n commit_message=commit_msg,\n)\nprint(f'✅ Adapter uploaded: {ADAPTER_REPO_ID}/{PATH_IN_REPO}')\n\n# Create a Git tag for this version\ntry:\n api.create_tag(\n repo_id=ADAPTER_REPO_ID,\n repo_type='model',\n tag=VERSION_TAG,\n tag_message=commit_msg,\n token=HF_TOKEN,\n )\n print(f'✅ Tag created : {VERSION_TAG}')\nexcept Exception as e:\n print(f'⚠️ Tag creation skipped: {e}')"
256
  ]
257
  },
258
  {