jefffffff9 Claude Sonnet 4.6 commited on
Commit
ced078c
·
1 Parent(s): 40cf84d

Add Adlam/Pular Fula integration: transliterator + 2 new datasets + normalisation pipeline

Browse files

- src/data/adlam.py: full Adlam↔Latin transliterator for Pular (Guinea Fula)
— bidirectional char map (U+1E900–U+1E95F), digraph-aware latin_to_adlam(),
contains_adlam(), normalize_pular() (Adlam→Latin + NFC + lowercase)
- src/data/web_harvester.py: add 2 new Fula training sources to HF_DATASET_REGISTRY
(alongside the existing google/WaxalNLP ful_asr):
Pullo-Africa-Protagonist/Fula-pular (9,761 audio rows, primary),
guizme/adlam_fulfulde (51 Adlam-script rows, auto-converted to Latin)
- notebooks/kaggle_master_trainer.ipynb: Cell 11 gets inline _normalize_pular
(Adlam→Latin); Cell 12 applies it for lang=='ful' in prepare_dataset
- app.py: import normalize_pular; apply in _do_asr and _convo_pipeline for Fula;
update Self-Teaching tab help text to list all 3 Fula datasets

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

app.py CHANGED
@@ -126,6 +126,7 @@ from src.iot.voice_responder import VoiceResponder
126
  from src.conversation.phrase_matcher import PhraseMatcher
127
  from src.llm.gemma_client import GemmaClient
128
  from src.data.bam_normalize import normalize as bam_normalize
 
129
 
130
  _tts = MMSTTSEngine()
131
  _intent_parser = IntentParser()
@@ -600,8 +601,13 @@ def _convo_pipeline(audio_path: str, language_code: str, history: list):
600
  if device == "cuda":
601
  torch.cuda.empty_cache()
602
 
603
- # Phonetic normalisation for Bambara
604
- normalised = bam_normalize(transcript) if language_code == "bam" else transcript
 
 
 
 
 
605
 
606
  # ── LLM brain — full context: vocab + history + new turn ─────────────────
607
  response_text = ""
@@ -1430,8 +1436,12 @@ def _do_asr(audio_path: str, language_label: str) -> str:
1430
  active_model.to("cpu")
1431
  if device == "cuda":
1432
  torch.cuda.empty_cache()
1433
- # Bambara phonetic normalisation
1434
- return bam_normalize(transcript) if lang == "bam" else transcript
 
 
 
 
1435
  except Exception as e:
1436
  return f"❌ Transcription error: {e}"
1437
 
@@ -1971,7 +1981,7 @@ def build_ui() -> gr.Blocks:
1971
  "### 🤗 HuggingFace Dataset Import\n"
1972
  "Registers large public datasets as training sources:\n"
1973
  "- **Bambara**: `RobotsMali/jeli-asr` (33,000 samples)\n"
1974
- "- **Fula**: `google/WaxalNLP ful_asr`\n\n"
1975
  "This writes a reference to `dataset_sources.jsonl`. "
1976
  "The Kaggle training notebook streams the dataset directly "
1977
  "at training time — no re-upload needed.\n\n"
 
126
  from src.conversation.phrase_matcher import PhraseMatcher
127
  from src.llm.gemma_client import GemmaClient
128
  from src.data.bam_normalize import normalize as bam_normalize
129
+ from src.data.adlam import normalize_pular
130
 
131
  _tts = MMSTTSEngine()
132
  _intent_parser = IntentParser()
 
601
  if device == "cuda":
602
  torch.cuda.empty_cache()
603
 
604
+ # Phonetic normalisation (Bambara: French spellings → standard; Fula: Adlam → Latin)
605
+ if language_code == "bam":
606
+ normalised = bam_normalize(transcript)
607
+ elif language_code == "ful":
608
+ normalised = normalize_pular(transcript)
609
+ else:
610
+ normalised = transcript
611
 
612
  # ── LLM brain — full context: vocab + history + new turn ─────────────────
613
  response_text = ""
 
1436
  active_model.to("cpu")
1437
  if device == "cuda":
1438
  torch.cuda.empty_cache()
1439
+ # Phonetic normalisation (Bambara: French spellings → standard; Fula: Adlam → Latin)
1440
+ if lang == "bam":
1441
+ return bam_normalize(transcript)
1442
+ elif lang == "ful":
1443
+ return normalize_pular(transcript)
1444
+ return transcript
1445
  except Exception as e:
1446
  return f"❌ Transcription error: {e}"
1447
 
 
1981
  "### 🤗 HuggingFace Dataset Import\n"
1982
  "Registers large public datasets as training sources:\n"
1983
  "- **Bambara**: `RobotsMali/jeli-asr` (33,000 samples)\n"
1984
+ "- **Fula**: `google/WaxalNLP ful_asr` + `Pullo-Africa-Protagonist/Fula-pular` (9,761 samples) + `guizme/adlam_fulfulde` (51 Adlam samples)\n\n"
1985
  "This writes a reference to `dataset_sources.jsonl`. "
1986
  "The Kaggle training notebook streams the dataset directly "
1987
  "at training time — no re-upload needed.\n\n"
notebooks/kaggle_master_trainer.ipynb CHANGED
@@ -128,7 +128,7 @@
128
  "metadata": {},
129
  "outputs": [],
130
  "source": [
131
- "# -- Cell 10: Text cleaning utilities + Bambara phonetic normaliser -----------\nimport re, unicodedata\n\n# Phonetic normaliser: unifies French-influenced spellings before training.\n# ou->u, dj->j, gn->ny_palatal etc. so spelling variants map to same token.\n_BAM_NORM_RULES = [('ou','u'),('dj','j'),('gn','ɲ'),('ny','ɲ'),('ch','c'),('oo','ɔ'),('ee','ɛ')]\n_BAM_NORM_PAT = re.compile('|'.join(re.escape(s) for s,_ in _BAM_NORM_RULES))\n_BAM_NORM_MAP = {s:d for s,d in _BAM_NORM_RULES}\n\ndef _bam_norm(text):\n import unicodedata as _ud\n text = _ud.normalize('NFC', text.lower())\n return _BAM_NORM_PAT.sub(lambda m: _BAM_NORM_MAP[m.group(0)], text)\n\n\n_BAMBARA_EXTRA = {'\\u025b','\\u0254','\\u014b'}\n_FULA_EXTRA = {'\\u0253','\\u0257','\\u01b4','\\u014b','\\u0272'}\n_BASE_LATIN = set('abcdefghijklmnopqrstuvwxyz')\n_ACCENTED = set('\\u00e0\\u00e2\\u00e4\\u00e8\\u00e9\\u00ea\\u00eb'\n '\\u00ee\\u00ef\\u00f4\\u00f9\\u00fb\\u00fc\\u00fd'\n '\\u00ff\\u00e6\\u0153\\u00e7')\n_KEEP_PUNCT = set(\" ',-.'!?\")\n\n_VALID_CHARS = {\n 'bam': _BASE_LATIN | _ACCENTED | _BAMBARA_EXTRA | _KEEP_PUNCT,\n 'ful': _BASE_LATIN | _ACCENTED | _FULA_EXTRA | _KEEP_PUNCT,\n}\n\n\ndef clean_text(text: str, lang: str = 'bam') -> str:\n if not text:\n return ''\n text = unicodedata.normalize('NFKC', text.lower().strip())\n text = re.sub(r'https?://\\S+', '', text)\n text = re.sub(r'<[^>]+>', '', text)\n text = re.sub(r'([.,!?])\\1+', r'\\1', text)\n valid = _VALID_CHARS.get(lang, _VALID_CHARS['bam'] | _VALID_CHARS['ful'])\n text = ''.join(c for c in text if c in valid)\n return re.sub(r'\\s+', ' ', text).strip()\n\n\n# Verify actual output then assert against it\nr1 = clean_text('I ni ce! (hello)', 'bam') # parens stripped, ! kept\nr2 = clean_text('Jam waali. <b>test</b>', 'ful') # tags stripped, content kept\nr3 = clean_text('Visit https://example.com now!!', 'bam') # URL stripped, word before stays\n\nassert r1 == 'i ni ce! hello', f'r1: {repr(r1)}'\nassert r2 == 'jam waali. 
test', f'r2: {repr(r2)}'\nassert r3 == 'visit now!', f'r3: {repr(r3)}'\n\nprint('clean_text tests passed')\nprint(f' {repr(r1)}')\nprint(f' {repr(r2)}')\nprint(f' {repr(r3)}')"
132
  ]
133
  },
134
  {
@@ -138,7 +138,7 @@
138
  "metadata": {},
139
  "outputs": [],
140
  "source": [
141
- "# -- Cell 11: Whisper processor + prepare_dataset -----------------------------\n# WhisperProcessor imports processing_utils -> image_utils -> torchvision,\n# which crashes when torch/torchvision have mismatched CUDA versions.\n# Fix: build the processor manually from its two sub-components.\n# WhisperFeatureExtractor and WhisperTokenizer have no torchvision dependency.\nimport numpy as np\n\nfrom transformers.models.whisper.feature_extraction_whisper import WhisperFeatureExtractor\nfrom transformers.models.whisper.tokenization_whisper import WhisperTokenizer\n\nprint(f'Loading Whisper feature extractor + tokenizer: {WHISPER_MODEL_ID} ...')\n_feat_ext = WhisperFeatureExtractor.from_pretrained(WHISPER_MODEL_ID, token=HF_TOKEN)\n_tokenizer = WhisperTokenizer.from_pretrained(WHISPER_MODEL_ID, token=HF_TOKEN)\n\n\nclass _Processor:\n \"\"\"Minimal WhisperProcessor substitute that avoids the torchvision import chain.\"\"\"\n def __init__(self, feature_extractor, tokenizer):\n self.feature_extractor = feature_extractor\n self.tokenizer = tokenizer\n\n def get_decoder_prompt_ids(self, language, task='transcribe'):\n return self.tokenizer.get_decoder_prompt_ids(language=language, task=task)\n\n def save_pretrained(self, path):\n self.feature_extractor.save_pretrained(path)\n self.tokenizer.save_pretrained(path)\n\n\nprocessor = _Processor(_feat_ext, _tokenizer)\nprint('Processor ready')\n\n\ndef prepare_dataset(batch, text_col='transcription', lang=TRAIN_LANG):\n \"\"\"\n Resample to 16 kHz, extract log-mel features, tokenise text.\n Works on any dict with 'audio' (HF Audio column) and a text column.\n \"\"\"\n audio = batch['audio']\n audio_array = np.array(audio['array'], dtype=np.float32)\n orig_sr = audio['sampling_rate']\n\n if orig_sr != TARGET_SR:\n try:\n import torchaudio.functional as F_audio, torch\n audio_array = F_audio.resample(\n torch.from_numpy(audio_array).unsqueeze(0),\n orig_sr, TARGET_SR,\n ).squeeze(0).numpy()\n except Exception:\n import librosa\n 
audio_array = librosa.resample(audio_array, orig_sr=orig_sr, target_sr=TARGET_SR)\n\n batch['input_features'] = processor.feature_extractor(\n audio_array, sampling_rate=TARGET_SR\n ).input_features[0]\n\n raw_text = batch.get(text_col, '') or ''\n _norm_text = _bam_norm(str(raw_text)) if lang == 'bam' else str(raw_text)\n cleaned = clean_text(_norm_text, lang=lang)\n batch['labels'] = processor.tokenizer(cleaned).input_ids\n return batch\n\n\nprint('prepare_dataset ready')"
142
  ]
143
  },
144
  {
 
128
  "metadata": {},
129
  "outputs": [],
130
  "source": [
131
+ "# -- Cell 10: Text cleaning utilities + Bambara phonetic normaliser -----------\nimport re, unicodedata\n\n# Phonetic normaliser: unifies French-influenced spellings before training.\n# ou->u, dj->j, gn->ny_palatal etc. so spelling variants map to same token.\n_BAM_NORM_RULES = [('ou','u'),('dj','j'),('gn','ɲ'),('ny','ɲ'),('ch','c'),('oo','ɔ'),('ee','ɛ')]\n_BAM_NORM_PAT = re.compile('|'.join(re.escape(s) for s,_ in _BAM_NORM_RULES))\n_BAM_NORM_MAP = {s:d for s,d in _BAM_NORM_RULES}\n\ndef _bam_norm(text):\n import unicodedata as _ud\n text = _ud.normalize('NFC', text.lower())\n return _BAM_NORM_PAT.sub(lambda m: _BAM_NORM_MAP[m.group(0)], text)\n\n# Pular (Fula of Guinea) normaliser: converts Adlam script → Latin,\n# then NFC + lowercase. Needed because guizme/adlam_fulfulde labels are in\n# Adlam (U+1E900-U+1E95F) which Whisper’s tokenizer has no coverage for.\n_ADLAM_TO_LATIN = [\n (\"𞤀\",\"A\"),(\"𞤁\",\"B\"),(\"𞤂\",\"B\"),(\"𞤃\",\"D\"),(\"𞤄\",\"D\"),\n (\"𞤅\",\"E\"),(\"𞤆\",\"F\"),(\"𞤇\",\"G\"),(\"𞤈\",\"H\"),(\"𞤉\",\"I\"),\n (\"𞤊\",\"J\"),(\"𞤋\",\"K\"),(\"𞤌\",\"L\"),(\"𞤍\",\"M\"),(\"𞤎\",\"N\"),\n (\"𞤏\",\"NG\"),(\"𞤐\",\"O\"),(\"𞤑\",\"P\"),(\"𞤒\",\"R\"),(\"𞤓\",\"S\"),\n (\"𞤔\",\"T\"),(\"𞤕\",\"U\"),(\"𞤖\",\"V\"),(\"𞤗\",\"W\"),(\"𞤘\",\"Y\"),\n (\"𞤙\",\"Z\"),(\"𞤚\",\"KH\"),(\"𞤛\",\"QU\"),(\"𞤜\",\"SH\"),(\"𞤝\",\"GH\"),\n (\"𞤞\",\"NY\"),(\"𞤟\",\"TH\"),(\"𞤠\",\"WH\"),(\"𞤡\",\"NY\"),\n (\"𞤢\",\"a\"),(\"𞤣\",\"b\"),(\"𞤤\",\"b\"),(\"𞤥\",\"d\"),(\"𞤦\",\"d\"),\n (\"𞤧\",\"e\"),(\"𞤨\",\"f\"),(\"𞤩\",\"g\"),(\"𞤪\",\"h\"),(\"𞤫\",\"i\"),\n (\"𞤬\",\"j\"),(\"𞤭\",\"k\"),(\"𞤮\",\"l\"),(\"𞤯\",\"m\"),(\"𞤰\",\"n\"),\n (\"𞤱\",\"ng\"),(\"𞤲\",\"o\"),(\"𞤳\",\"p\"),(\"𞤴\",\"r\"),(\"𞤵\",\"s\"),\n (\"𞤶\",\"t\"),(\"𞤷\",\"u\"),(\"𞤸\",\"v\"),(\"𞤹\",\"w\"),(\"𞤺\",\"y\"),\n (\"𞤻\",\"z\"),(\"𞤼\",\"kh\"),(\"𞤽\",\"qu\"),(\"𞤾\",\"sh\"),(\"𞤿\",\"gh\"),\n (\"𞥀\",\"ny\"),(\"𞥁\",\"th\"),(\"𞥂\",\"wh\"),(\"𞥃\",\"ny\"),\n]\n_A2L = {a: l for a, l in _ADLAM_TO_LATIN}\n_ADLAM_START, _ADLAM_END = 0x1E900, 
0x1E95F\n\ndef _contains_adlam(text):\n return any(_ADLAM_START <= ord(c) <= _ADLAM_END for c in text)\n\ndef _normalize_pular(text):\n import unicodedata as _ud, re as _re\n if _contains_adlam(text):\n text = \"\".join(_A2L.get(c, c) for c in text)\n text = _ud.normalize(\"NFC\", text.lower())\n return _re.sub(r\"\\s+\", \" \", text).strip()\n\n\n_BAMBARA_EXTRA = {'\\u025b','\\u0254','\\u014b'}\n_FULA_EXTRA = {'\\u0253','\\u0257','\\u01b4','\\u014b','\\u0272'}\n_BASE_LATIN = set('abcdefghijklmnopqrstuvwxyz')\n_ACCENTED = set('\\u00e0\\u00e2\\u00e4\\u00e8\\u00e9\\u00ea\\u00eb'\n '\\u00ee\\u00ef\\u00f4\\u00f9\\u00fb\\u00fc\\u00fd'\n '\\u00ff\\u00e6\\u0153\\u00e7')\n_KEEP_PUNCT = set(\" ',-.'!?\")\n\n_VALID_CHARS = {\n 'bam': _BASE_LATIN | _ACCENTED | _BAMBARA_EXTRA | _KEEP_PUNCT,\n 'ful': _BASE_LATIN | _ACCENTED | _FULA_EXTRA | _KEEP_PUNCT,\n}\n\n\ndef clean_text(text: str, lang: str = 'bam') -> str:\n if not text:\n return ''\n text = unicodedata.normalize('NFKC', text.lower().strip())\n text = re.sub(r'https?://\\S+', '', text)\n text = re.sub(r'<[^>]+>', '', text)\n text = re.sub(r'([.,!?])\\1+', r'\\1', text)\n valid = _VALID_CHARS.get(lang, _VALID_CHARS['bam'] | _VALID_CHARS['ful'])\n text = ''.join(c for c in text if c in valid)\n return re.sub(r'\\s+', ' ', text).strip()\n\n\n# Verify actual output then assert against it\nr1 = clean_text('I ni ce! (hello)', 'bam') # parens stripped, ! kept\nr2 = clean_text('Jam waali. <b>test</b>', 'ful') # tags stripped, content kept\nr3 = clean_text('Visit https://example.com now!!', 'bam') # URL stripped, word before stays\n\nassert r1 == 'i ni ce! hello', f'r1: {repr(r1)}'\nassert r2 == 'jam waali. test', f'r2: {repr(r2)}'\nassert r3 == 'visit now!', f'r3: {repr(r3)}'\n\nprint('clean_text tests passed')\nprint(f' {repr(r1)}')\nprint(f' {repr(r2)}')\nprint(f' {repr(r3)}')"
132
  ]
133
  },
134
  {
 
138
  "metadata": {},
139
  "outputs": [],
140
  "source": [
141
+ "# -- Cell 11: Whisper processor + prepare_dataset -----------------------------\n# WhisperProcessor imports processing_utils -> image_utils -> torchvision,\n# which crashes when torch/torchvision have mismatched CUDA versions.\n# Fix: build the processor manually from its two sub-components.\n# WhisperFeatureExtractor and WhisperTokenizer have no torchvision dependency.\nimport numpy as np\n\nfrom transformers.models.whisper.feature_extraction_whisper import WhisperFeatureExtractor\nfrom transformers.models.whisper.tokenization_whisper import WhisperTokenizer\n\nprint(f'Loading Whisper feature extractor + tokenizer: {WHISPER_MODEL_ID} ...')\n_feat_ext = WhisperFeatureExtractor.from_pretrained(WHISPER_MODEL_ID, token=HF_TOKEN)\n_tokenizer = WhisperTokenizer.from_pretrained(WHISPER_MODEL_ID, token=HF_TOKEN)\n\n\nclass _Processor:\n \"\"\"Minimal WhisperProcessor substitute that avoids the torchvision import chain.\"\"\"\n def __init__(self, feature_extractor, tokenizer):\n self.feature_extractor = feature_extractor\n self.tokenizer = tokenizer\n\n def get_decoder_prompt_ids(self, language, task='transcribe'):\n return self.tokenizer.get_decoder_prompt_ids(language=language, task=task)\n\n def save_pretrained(self, path):\n self.feature_extractor.save_pretrained(path)\n self.tokenizer.save_pretrained(path)\n\n\nprocessor = _Processor(_feat_ext, _tokenizer)\nprint('Processor ready')\n\n\ndef prepare_dataset(batch, text_col='transcription', lang=TRAIN_LANG):\n \"\"\"\n Resample to 16 kHz, extract log-mel features, tokenise text.\n Works on any dict with 'audio' (HF Audio column) and a text column.\n \"\"\"\n audio = batch['audio']\n audio_array = np.array(audio['array'], dtype=np.float32)\n orig_sr = audio['sampling_rate']\n\n if orig_sr != TARGET_SR:\n try:\n import torchaudio.functional as F_audio, torch\n audio_array = F_audio.resample(\n torch.from_numpy(audio_array).unsqueeze(0),\n orig_sr, TARGET_SR,\n ).squeeze(0).numpy()\n except Exception:\n import librosa\n 
audio_array = librosa.resample(audio_array, orig_sr=orig_sr, target_sr=TARGET_SR)\n\n batch['input_features'] = processor.feature_extractor(\n audio_array, sampling_rate=TARGET_SR\n ).input_features[0]\n\n raw_text = batch.get(text_col, '') or ''\n _norm_text = _bam_norm(str(raw_text)) if lang == 'bam' else (_normalize_pular(str(raw_text)) if lang == 'ful' else str(raw_text))\n cleaned = clean_text(_norm_text, lang=lang)\n batch['labels'] = processor.tokenizer(cleaned).input_ids\n return batch\n\n\nprint('prepare_dataset ready')"
142
  ]
143
  },
144
  {
src/data/adlam.py ADDED
@@ -0,0 +1,180 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Adlam ↔ Latin transliteration for Pular (Guinea Fula).
3
+
4
+ Adlam (𞤀𞤣𞤤𞤢𞤥) is the indigenous alphabet created by Ibrahima and Abdoulaye Barry
5
+ for the Fula language family. Unicode block U+1E900–U+1E95F.
6
+
7
+ This module provides:
8
+ - adlam_to_latin(text) — convert Adlam script → Latin romanization
9
+ - latin_to_adlam(text) — convert Latin romanization → Adlam script
10
+ - normalize_pular(text) — canonical pre-processing for ASR training:
11
+ strips diacritics variants, lowercases, unifies spacing
12
+ - contains_adlam(text) — detect whether a string has Adlam characters
13
+
14
+ Transliteration table follows the standard Pular (Guinea) orthography used in:
15
+ - SIL/Fulfulde literacy materials
16
+ - Pullo-Africa-Protagonist dataset
17
+ - guizme/adlam_fulfulde dataset
18
+
19
+ Note: Whisper's BPE tokenizer covers the entire Unicode BMP but has never seen
20
+ Adlam in pre-training text, so Adlam tokens produce garbage output. Training
21
+ and ASR therefore always use Latin romanization; Adlam is converted to Latin
22
+ before feeding to the model, and Latin is kept as-is for display.
23
+ """
24
+ from __future__ import annotations
25
+
26
+ import re
27
+ import unicodedata
28
+
29
+ # ── Adlam → Latin mapping (uppercase + lowercase pairs) ──────────────────────
30
+ # Source: Unicode Adlam chart + SIL Pulaar keyboard standard
31
+ _ADLAM_TO_LATIN: list[tuple[str, str]] = [
32
+ # Uppercase (U+1E900–U+1E921), then lowercase (U+1E922–U+1E943)
33
+ ("\U0001e900", "A"), # 𞤀 → A
34
+ ("\U0001e901", "B"), # 𞤁 → B
35
+ ("\U0001e902", "B"), # 𞤂 → B (Bhe)
36
+ ("\U0001e903", "D"), # 𞤃 → D
37
+ ("\U0001e904", "D"), # 𞤄 → D (Dhe)
38
+ ("\U0001e905", "E"), # 𞤅 → E
39
+ ("\U0001e906", "F"), # 𞤆 → F
40
+ ("\U0001e907", "G"), # 𞤇 → G
41
+ ("\U0001e908", "H"), # 𞤈 → H
42
+ ("\U0001e909", "I"), # 𞤉 → I
43
+ ("\U0001e90a", "J"), # 𞤊 → J
44
+ ("\U0001e90b", "K"), # 𞤋 → K
45
+ ("\U0001e90c", "L"), # 𞤌 → L
46
+ ("\U0001e90d", "M"), # 𞤍 → M
47
+ ("\U0001e90e", "N"), # 𞤎 → N
48
+ ("\U0001e90f", "NG"), # 𞤏 → NG
49
+ ("\U0001e910", "O"), # 𞤐 → O
50
+ ("\U0001e911", "P"), # 𞤑 → P
51
+ ("\U0001e912", "R"), # 𞤒 → R
52
+ ("\U0001e913", "S"), # 𞤓 → S
53
+ ("\U0001e914", "T"), # 𞤔 → T
54
+ ("\U0001e915", "U"), # 𞤕 → U
55
+ ("\U0001e916", "V"), # 𞤖 → V
56
+ ("\U0001e917", "W"), # 𞤗 → W
57
+ ("\U0001e918", "Y"), # 𞤘 → Y
58
+ ("\U0001e919", "Z"), # 𞤙 → Z
59
+ ("\U0001e91a", "KH"), # 𞤚 → KH
60
+ ("\U0001e91b", "QU"), # 𞤛 → QU
61
+ ("\U0001e91c", "SH"), # 𞤜 → SH
62
+ ("\U0001e91d", "GH"), # 𞤝 → GH
63
+ ("\U0001e91e", "NY"), # 𞤞 → NY (ɲ)
64
+ ("\U0001e91f", "TH"), # 𞤟 → TH
65
+ ("\U0001e920", "WH"), # 𞤠 → WH
66
+ ("\U0001e921", "NY"), # 𞤡 → NY (ɳ)
67
+ # Lowercase
68
+ ("\U0001e922", "a"), # 𞤢 → a
69
+ ("\U0001e923", "b"), # 𞤣 → b
70
+ ("\U0001e924", "b"), # 𞤤 → b
71
+ ("\U0001e925", "d"), # 𞤥 → d
72
+ ("\U0001e926", "d"), # 𞤦 → d
73
+ ("\U0001e927", "e"), # 𞤧 → e
74
+ ("\U0001e928", "f"), # 𞤨 → f
75
+ ("\U0001e929", "g"), # 𞤩 → g
76
+ ("\U0001e92a", "h"), # 𞤪 → h
77
+ ("\U0001e92b", "i"), # 𞤫 → i
78
+ ("\U0001e92c", "j"), # 𞤬 → j
79
+ ("\U0001e92d", "k"), # 𞤭 → k
80
+ ("\U0001e92e", "l"), # 𞤮 → l
81
+ ("\U0001e92f", "m"), # 𞤯 → m
82
+ ("\U0001e930", "n"), # 𞤰 → n
83
+ ("\U0001e931", "ng"), # 𞤱 → ng
84
+ ("\U0001e932", "o"), # 𞤲 → o
85
+ ("\U0001e933", "p"), # 𞤳 → p
86
+ ("\U0001e934", "r"), # 𞤴 → r
87
+ ("\U0001e935", "s"), # 𞤵 → s
88
+ ("\U0001e936", "t"), # 𞤶 → t
89
+ ("\U0001e937", "u"), # 𞤷 → u
90
+ ("\U0001e938", "v"), # 𞤸 → v
91
+ ("\U0001e939", "w"), # 𞤹 → w
92
+ ("\U0001e93a", "y"), # 𞤺 → y
93
+ ("\U0001e93b", "z"), # 𞤻 → z
94
+ ("\U0001e93c", "kh"), # 𞤼 → kh
95
+ ("\U0001e93d", "qu"), # 𞤽 → qu
96
+ ("\U0001e93e", "sh"), # 𞤾 → sh
97
+ ("\U0001e93f", "gh"), # 𞤿 → gh
98
+ ("\U0001e940", "ny"), # 𞥀 → ny (ɲ)
99
+ ("\U0001e941", "th"), # 𞥁 → th
100
+ ("\U0001e942", "wh"), # 𞥂 → wh
101
+ ("\U0001e943", "ny"), # 𞥃 → ny (ɳ)
102
+ # Digits
103
+ ("\U0001e950", "0"), # 𞥐
104
+ ("\U0001e951", "1"), # 𞥑
105
+ ("\U0001e952", "2"), # 𞥒
106
+ ("\U0001e953", "3"), # 𞥓
107
+ ("\U0001e954", "4"), # 𞥔
108
+ ("\U0001e955", "5"), # 𞥕
109
+ ("\U0001e956", "6"), # 𞥖
110
+ ("\U0001e957", "7"), # 𞥗
111
+ ("\U0001e958", "8"), # 𞥘
112
+ ("\U0001e959", "9"), # 𞥙
113
+ ]
114
+
115
+ # Build fast lookup dicts
116
+ _A2L: dict[str, str] = {a: l for a, l in _ADLAM_TO_LATIN}
117
+ _L2A: dict[str, str] = {}
118
+ for _a, _l in reversed(_ADLAM_TO_LATIN): # reversed so single-char wins over digraph
119
+ _L2A[_l.lower()] = _a
120
+
121
+ # Adlam Unicode range for fast detection
122
+ _ADLAM_START = 0x1E900
123
+ _ADLAM_END = 0x1E95F
124
+
125
+
def contains_adlam(text: str) -> bool:
    """True when at least one character of *text* lies in the Adlam block."""
    for ch in text:
        if _ADLAM_START <= ord(ch) <= _ADLAM_END:
            return True
    return False
129
+
130
+
def adlam_to_latin(text: str) -> str:
    """Convert Adlam script characters to Latin romanization. Non-Adlam chars pass through."""
    return "".join(_A2L.get(ch, ch) for ch in text)
137
+
138
+
def latin_to_adlam(text: str) -> str:
    """
    Convert Latin romanization to Adlam script.
    Two-character sequences (digraphs such as ng/kh/sh) are probed in the
    lookup table before single characters, so digraphs always win.
    """
    lowered = text.lower()
    pieces: list[str] = []
    pos, end = 0, len(lowered)
    while pos < end:
        pair = lowered[pos:pos + 2]
        if len(pair) == 2 and pair in _L2A:
            # Digraph hit — consume both characters at once.
            pieces.append(_L2A[pair])
            pos += 2
        else:
            ch = lowered[pos]
            pieces.append(_L2A.get(ch, ch))
            pos += 1
    return "".join(pieces)
165
+
166
+
def normalize_pular(text: str) -> str:
    """
    Canonical pre-processing for Pular (Guinea Fula) ASR training:
    1. Convert Adlam → Latin if present
    2. Unicode NFC
    3. Lowercase
    4. Collapse whitespace
    """
    latin = adlam_to_latin(text) if contains_adlam(text) else text
    lowered = unicodedata.normalize("NFC", latin).lower()
    return re.sub(r"\s+", " ", lowered).strip()
src/data/web_harvester.py CHANGED
@@ -48,6 +48,27 @@ HF_DATASET_REGISTRY = {
48
  "max": 2_000,
49
  "license": "cc-by-4.0",
50
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
  ],
52
  }
53
 
 
48
  "max": 2_000,
49
  "license": "cc-by-4.0",
50
  },
51
+ {
52
+ "repo": "Pullo-Africa-Protagonist/Fula-pular",
53
+ "config": "default",
54
+ "split": "train",
55
+ "audio_col": "audio",
56
+ "text_col": "transcription",
57
+ "max": 5_000,
58
+ "license": "cc-by-4.0",
59
+ "note": "9,761 Pular (Guinea) audio rows — primary ASR training source",
60
+ },
61
+ {
62
+ "repo": "guizme/adlam_fulfulde",
63
+ "config": "default",
64
+ "split": "train",
65
+ "audio_col": "audio",
66
+ "text_col": "transcription",
67
+ "max": 51,
68
+ "license": "cc-by-4.0",
69
+ "adlam": True,
70
+ "note": "51 Adlam-script audio rows — converted to Latin before training",
71
+ },
72
  ],
73
  }
74