jefffffff9 Claude Sonnet 4.6 commited on
Commit
ced078c
·
1 Parent(s): 40cf84d

Add Adlam/Pular Fula integration: transliterator + 2 new datasets + normalisation pipeline

Browse files

- src/data/adlam.py: full Adlam↔Latin transliterator for Pular (Guinea Fula)
— bidirectional char map (U+1E900–U+1E95F), digraph-aware latin_to_adlam(),
contains_adlam(), normalize_pular() (Adlam→Latin + NFC + lowercase)
- src/data/web_harvester.py: add 2 new Fula training sources to HF_DATASET_REGISTRY
(alongside the existing google/WaxalNLP ful_asr):
Pullo-Africa-Protagonist/Fula-pular (9,761 audio rows, primary),
guizme/adlam_fulfulde (51 Adlam-script rows, auto-converted to Latin)
- notebooks/kaggle_master_trainer.ipynb: Cell 11 gets inline _normalize_pular
(Adlam→Latin); Cell 12 applies it for lang=='ful' in prepare_dataset
- app.py: import normalize_pular; apply in _do_asr and _convo_pipeline for Fula;
update Self-Teaching tab help text to list all 3 Fula datasets

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

app.py CHANGED
@@ -126,6 +126,7 @@ from src.iot.voice_responder import VoiceResponder
126
  from src.conversation.phrase_matcher import PhraseMatcher
127
  from src.llm.gemma_client import GemmaClient
128
  from src.data.bam_normalize import normalize as bam_normalize
 
129
 
130
  _tts = MMSTTSEngine()
131
  _intent_parser = IntentParser()
@@ -600,8 +601,13 @@ def _convo_pipeline(audio_path: str, language_code: str, history: list):
600
  if device == "cuda":
601
  torch.cuda.empty_cache()
602
 
603
- # Phonetic normalisation for Bambara
604
- normalised = bam_normalize(transcript) if language_code == "bam" else transcript
 
 
 
 
 
605
 
606
  # ── LLM brain — full context: vocab + history + new turn ─────────────────
607
  response_text = ""
@@ -1430,8 +1436,12 @@ def _do_asr(audio_path: str, language_label: str) -> str:
1430
  active_model.to("cpu")
1431
  if device == "cuda":
1432
  torch.cuda.empty_cache()
1433
- # Bambara phonetic normalisation
1434
- return bam_normalize(transcript) if lang == "bam" else transcript
 
 
 
 
1435
  except Exception as e:
1436
  return f"❌ Transcription error: {e}"
1437
 
@@ -1971,7 +1981,7 @@ def build_ui() -> gr.Blocks:
1971
  "### 🤗 HuggingFace Dataset Import\n"
1972
  "Registers large public datasets as training sources:\n"
1973
  "- **Bambara**: `RobotsMali/jeli-asr` (33,000 samples)\n"
1974
- "- **Fula**: `google/WaxalNLP ful_asr`\n\n"
1975
  "This writes a reference to `dataset_sources.jsonl`. "
1976
  "The Kaggle training notebook streams the dataset directly "
1977
  "at training time — no re-upload needed.\n\n"
 
126
  from src.conversation.phrase_matcher import PhraseMatcher
127
  from src.llm.gemma_client import GemmaClient
128
  from src.data.bam_normalize import normalize as bam_normalize
129
+ from src.data.adlam import normalize_pular
130
 
131
  _tts = MMSTTSEngine()
132
  _intent_parser = IntentParser()
 
601
  if device == "cuda":
602
  torch.cuda.empty_cache()
603
 
604
+ # Phonetic normalisation (Bambara: French spellings → standard; Fula: Adlam → Latin)
605
+ if language_code == "bam":
606
+ normalised = bam_normalize(transcript)
607
+ elif language_code == "ful":
608
+ normalised = normalize_pular(transcript)
609
+ else:
610
+ normalised = transcript
611
 
612
  # ── LLM brain — full context: vocab + history + new turn ─────────────────
613
  response_text = ""
 
1436
  active_model.to("cpu")
1437
  if device == "cuda":
1438
  torch.cuda.empty_cache()
1439
+ # Phonetic normalisation (Bambara: French spellings → standard; Fula: Adlam → Latin)
1440
+ if lang == "bam":
1441
+ return bam_normalize(transcript)
1442
+ elif lang == "ful":
1443
+ return normalize_pular(transcript)
1444
+ return transcript
1445
  except Exception as e:
1446
  return f"❌ Transcription error: {e}"
1447
 
 
1981
  "### 🤗 HuggingFace Dataset Import\n"
1982
  "Registers large public datasets as training sources:\n"
1983
  "- **Bambara**: `RobotsMali/jeli-asr` (33,000 samples)\n"
1984
+ "- **Fula**: `google/WaxalNLP ful_asr` + `Pullo-Africa-Protagonist/Fula-pular` (9,761 samples) + `guizme/adlam_fulfulde` (51 Adlam samples)\n\n"
1985
  "This writes a reference to `dataset_sources.jsonl`. "
1986
  "The Kaggle training notebook streams the dataset directly "
1987
  "at training time — no re-upload needed.\n\n"
notebooks/kaggle_master_trainer.ipynb CHANGED
@@ -128,7 +128,7 @@
128
  "metadata": {},
129
  "outputs": [],
130
  "source": [
131
- "# -- Cell 10: Text cleaning utilities + Bambara phonetic normaliser -----------\nimport re, unicodedata\n\n# Phonetic normaliser: unifies French-influenced spellings before training.\n# ou->u, dj->j, gn->ny_palatal etc. so spelling variants map to same token.\n_BAM_NORM_RULES = [('ou','u'),('dj','j'),('gn','ɲ'),('ny','ɲ'),('ch','c'),('oo','ɔ'),('ee','ɛ')]\n_BAM_NORM_PAT = re.compile('|'.join(re.escape(s) for s,_ in _BAM_NORM_RULES))\n_BAM_NORM_MAP = {s:d for s,d in _BAM_NORM_RULES}\n\ndef _bam_norm(text):\n import unicodedata as _ud\n text = _ud.normalize('NFC', text.lower())\n return _BAM_NORM_PAT.sub(lambda m: _BAM_NORM_MAP[m.group(0)], text)\n\n\n_BAMBARA_EXTRA = {'\\u025b','\\u0254','\\u014b'}\n_FULA_EXTRA = {'\\u0253','\\u0257','\\u01b4','\\u014b','\\u0272'}\n_BASE_LATIN = set('abcdefghijklmnopqrstuvwxyz')\n_ACCENTED = set('\\u00e0\\u00e2\\u00e4\\u00e8\\u00e9\\u00ea\\u00eb'\n '\\u00ee\\u00ef\\u00f4\\u00f9\\u00fb\\u00fc\\u00fd'\n '\\u00ff\\u00e6\\u0153\\u00e7')\n_KEEP_PUNCT = set(\" ',-.'!?\")\n\n_VALID_CHARS = {\n 'bam': _BASE_LATIN | _ACCENTED | _BAMBARA_EXTRA | _KEEP_PUNCT,\n 'ful': _BASE_LATIN | _ACCENTED | _FULA_EXTRA | _KEEP_PUNCT,\n}\n\n\ndef clean_text(text: str, lang: str = 'bam') -> str:\n if not text:\n return ''\n text = unicodedata.normalize('NFKC', text.lower().strip())\n text = re.sub(r'https?://\\S+', '', text)\n text = re.sub(r'<[^>]+>', '', text)\n text = re.sub(r'([.,!?])\\1+', r'\\1', text)\n valid = _VALID_CHARS.get(lang, _VALID_CHARS['bam'] | _VALID_CHARS['ful'])\n text = ''.join(c for c in text if c in valid)\n return re.sub(r'\\s+', ' ', text).strip()\n\n\n# Verify actual output then assert against it\nr1 = clean_text('I ni ce! (hello)', 'bam') # parens stripped, ! kept\nr2 = clean_text('Jam waali. <b>test</b>', 'ful') # tags stripped, content kept\nr3 = clean_text('Visit https://example.com now!!', 'bam') # URL stripped, word before stays\n\nassert r1 == 'i ni ce! hello', f'r1: {repr(r1)}'\nassert r2 == 'jam waali. 
test', f'r2: {repr(r2)}'\nassert r3 == 'visit now!', f'r3: {repr(r3)}'\n\nprint('clean_text tests passed')\nprint(f' {repr(r1)}')\nprint(f' {repr(r2)}')\nprint(f' {repr(r3)}')"
132
  ]
133
  },
134
  {
@@ -138,7 +138,7 @@
138
  "metadata": {},
139
  "outputs": [],
140
  "source": [
141
- "# -- Cell 11: Whisper processor + prepare_dataset -----------------------------\n# WhisperProcessor imports processing_utils -> image_utils -> torchvision,\n# which crashes when torch/torchvision have mismatched CUDA versions.\n# Fix: build the processor manually from its two sub-components.\n# WhisperFeatureExtractor and WhisperTokenizer have no torchvision dependency.\nimport numpy as np\n\nfrom transformers.models.whisper.feature_extraction_whisper import WhisperFeatureExtractor\nfrom transformers.models.whisper.tokenization_whisper import WhisperTokenizer\n\nprint(f'Loading Whisper feature extractor + tokenizer: {WHISPER_MODEL_ID} ...')\n_feat_ext = WhisperFeatureExtractor.from_pretrained(WHISPER_MODEL_ID, token=HF_TOKEN)\n_tokenizer = WhisperTokenizer.from_pretrained(WHISPER_MODEL_ID, token=HF_TOKEN)\n\n\nclass _Processor:\n \"\"\"Minimal WhisperProcessor substitute that avoids the torchvision import chain.\"\"\"\n def __init__(self, feature_extractor, tokenizer):\n self.feature_extractor = feature_extractor\n self.tokenizer = tokenizer\n\n def get_decoder_prompt_ids(self, language, task='transcribe'):\n return self.tokenizer.get_decoder_prompt_ids(language=language, task=task)\n\n def save_pretrained(self, path):\n self.feature_extractor.save_pretrained(path)\n self.tokenizer.save_pretrained(path)\n\n\nprocessor = _Processor(_feat_ext, _tokenizer)\nprint('Processor ready')\n\n\ndef prepare_dataset(batch, text_col='transcription', lang=TRAIN_LANG):\n \"\"\"\n Resample to 16 kHz, extract log-mel features, tokenise text.\n Works on any dict with 'audio' (HF Audio column) and a text column.\n \"\"\"\n audio = batch['audio']\n audio_array = np.array(audio['array'], dtype=np.float32)\n orig_sr = audio['sampling_rate']\n\n if orig_sr != TARGET_SR:\n try:\n import torchaudio.functional as F_audio, torch\n audio_array = F_audio.resample(\n torch.from_numpy(audio_array).unsqueeze(0),\n orig_sr, TARGET_SR,\n ).squeeze(0).numpy()\n except Exception:\n import librosa\n 
audio_array = librosa.resample(audio_array, orig_sr=orig_sr, target_sr=TARGET_SR)\n\n batch['input_features'] = processor.feature_extractor(\n audio_array, sampling_rate=TARGET_SR\n ).input_features[0]\n\n raw_text = batch.get(text_col, '') or ''\n _norm_text = _bam_norm(str(raw_text)) if lang == 'bam' else str(raw_text)\n cleaned = clean_text(_norm_text, lang=lang)\n batch['labels'] = processor.tokenizer(cleaned).input_ids\n return batch\n\n\nprint('prepare_dataset ready')"
142
  ]
143
  },
144
  {
 
128
  "metadata": {},
129
  "outputs": [],
130
  "source": [
131
+ "# -- Cell 10: Text cleaning utilities + Bambara phonetic normaliser -----------\nimport re, unicodedata\n\n# Phonetic normaliser: unifies French-influenced spellings before training.\n# ou->u, dj->j, gn->ny_palatal etc. so spelling variants map to same token.\n_BAM_NORM_RULES = [('ou','u'),('dj','j'),('gn','ɲ'),('ny','ɲ'),('ch','c'),('oo','ɔ'),('ee','ɛ')]\n_BAM_NORM_PAT = re.compile('|'.join(re.escape(s) for s,_ in _BAM_NORM_RULES))\n_BAM_NORM_MAP = {s:d for s,d in _BAM_NORM_RULES}\n\ndef _bam_norm(text):\n import unicodedata as _ud\n text = _ud.normalize('NFC', text.lower())\n return _BAM_NORM_PAT.sub(lambda m: _BAM_NORM_MAP[m.group(0)], text)\n\n# Pular (Fula of Guinea) normaliser: converts Adlam script → Latin,\n# then NFC + lowercase. Needed because guizme/adlam_fulfulde labels are in\n# Adlam (U+1E900-U+1E95F) which Whisper’s tokenizer has no coverage for.\n_ADLAM_TO_LATIN = [\n (\"𞤀\",\"A\"),(\"𞤁\",\"B\"),(\"𞤂\",\"B\"),(\"𞤃\",\"D\"),(\"𞤄\",\"D\"),\n (\"𞤅\",\"E\"),(\"𞤆\",\"F\"),(\"𞤇\",\"G\"),(\"𞤈\",\"H\"),(\"𞤉\",\"I\"),\n (\"𞤊\",\"J\"),(\"𞤋\",\"K\"),(\"𞤌\",\"L\"),(\"𞤍\",\"M\"),(\"𞤎\",\"N\"),\n (\"𞤏\",\"NG\"),(\"𞤐\",\"O\"),(\"𞤑\",\"P\"),(\"𞤒\",\"R\"),(\"𞤓\",\"S\"),\n (\"𞤔\",\"T\"),(\"𞤕\",\"U\"),(\"𞤖\",\"V\"),(\"𞤗\",\"W\"),(\"𞤘\",\"Y\"),\n (\"𞤙\",\"Z\"),(\"𞤚\",\"KH\"),(\"𞤛\",\"QU\"),(\"𞤜\",\"SH\"),(\"𞤝\",\"GH\"),\n (\"𞤞\",\"NY\"),(\"𞤟\",\"TH\"),(\"𞤠\",\"WH\"),(\"𞤡\",\"NY\"),\n (\"𞤢\",\"a\"),(\"𞤣\",\"b\"),(\"𞤤\",\"b\"),(\"𞤥\",\"d\"),(\"𞤦\",\"d\"),\n (\"𞤧\",\"e\"),(\"𞤨\",\"f\"),(\"𞤩\",\"g\"),(\"𞤪\",\"h\"),(\"𞤫\",\"i\"),\n (\"𞤬\",\"j\"),(\"𞤭\",\"k\"),(\"𞤮\",\"l\"),(\"𞤯\",\"m\"),(\"𞤰\",\"n\"),\n (\"𞤱\",\"ng\"),(\"𞤲\",\"o\"),(\"𞤳\",\"p\"),(\"𞤴\",\"r\"),(\"𞤵\",\"s\"),\n (\"𞤶\",\"t\"),(\"𞤷\",\"u\"),(\"𞤸\",\"v\"),(\"𞤹\",\"w\"),(\"𞤺\",\"y\"),\n (\"𞤻\",\"z\"),(\"𞤼\",\"kh\"),(\"𞤽\",\"qu\"),(\"𞤾\",\"sh\"),(\"𞤿\",\"gh\"),\n (\"𞥀\",\"ny\"),(\"𞥁\",\"th\"),(\"𞥂\",\"wh\"),(\"𞥃\",\"ny\"),\n]\n_A2L = {a: l for a, l in _ADLAM_TO_LATIN}\n_ADLAM_START, _ADLAM_END = 0x1E900, 
0x1E95F\n\ndef _contains_adlam(text):\n return any(_ADLAM_START <= ord(c) <= _ADLAM_END for c in text)\n\ndef _normalize_pular(text):\n import unicodedata as _ud, re as _re\n if _contains_adlam(text):\n text = \"\".join(_A2L.get(c, c) for c in text)\n text = _ud.normalize(\"NFC\", text.lower())\n return _re.sub(r\"\\s+\", \" \", text).strip()\n\n\n_BAMBARA_EXTRA = {'\\u025b','\\u0254','\\u014b'}\n_FULA_EXTRA = {'\\u0253','\\u0257','\\u01b4','\\u014b','\\u0272'}\n_BASE_LATIN = set('abcdefghijklmnopqrstuvwxyz')\n_ACCENTED = set('\\u00e0\\u00e2\\u00e4\\u00e8\\u00e9\\u00ea\\u00eb'\n '\\u00ee\\u00ef\\u00f4\\u00f9\\u00fb\\u00fc\\u00fd'\n '\\u00ff\\u00e6\\u0153\\u00e7')\n_KEEP_PUNCT = set(\" ',-.'!?\")\n\n_VALID_CHARS = {\n 'bam': _BASE_LATIN | _ACCENTED | _BAMBARA_EXTRA | _KEEP_PUNCT,\n 'ful': _BASE_LATIN | _ACCENTED | _FULA_EXTRA | _KEEP_PUNCT,\n}\n\n\ndef clean_text(text: str, lang: str = 'bam') -> str:\n if not text:\n return ''\n text = unicodedata.normalize('NFKC', text.lower().strip())\n text = re.sub(r'https?://\\S+', '', text)\n text = re.sub(r'<[^>]+>', '', text)\n text = re.sub(r'([.,!?])\\1+', r'\\1', text)\n valid = _VALID_CHARS.get(lang, _VALID_CHARS['bam'] | _VALID_CHARS['ful'])\n text = ''.join(c for c in text if c in valid)\n return re.sub(r'\\s+', ' ', text).strip()\n\n\n# Verify actual output then assert against it\nr1 = clean_text('I ni ce! (hello)', 'bam') # parens stripped, ! kept\nr2 = clean_text('Jam waali. <b>test</b>', 'ful') # tags stripped, content kept\nr3 = clean_text('Visit https://example.com now!!', 'bam') # URL stripped, word before stays\n\nassert r1 == 'i ni ce! hello', f'r1: {repr(r1)}'\nassert r2 == 'jam waali. test', f'r2: {repr(r2)}'\nassert r3 == 'visit now!', f'r3: {repr(r3)}'\n\nprint('clean_text tests passed')\nprint(f' {repr(r1)}')\nprint(f' {repr(r2)}')\nprint(f' {repr(r3)}')"
132
  ]
133
  },
134
  {
 
138
  "metadata": {},
139
  "outputs": [],
140
  "source": [
141
+ "# -- Cell 11: Whisper processor + prepare_dataset -----------------------------\n# WhisperProcessor imports processing_utils -> image_utils -> torchvision,\n# which crashes when torch/torchvision have mismatched CUDA versions.\n# Fix: build the processor manually from its two sub-components.\n# WhisperFeatureExtractor and WhisperTokenizer have no torchvision dependency.\nimport numpy as np\n\nfrom transformers.models.whisper.feature_extraction_whisper import WhisperFeatureExtractor\nfrom transformers.models.whisper.tokenization_whisper import WhisperTokenizer\n\nprint(f'Loading Whisper feature extractor + tokenizer: {WHISPER_MODEL_ID} ...')\n_feat_ext = WhisperFeatureExtractor.from_pretrained(WHISPER_MODEL_ID, token=HF_TOKEN)\n_tokenizer = WhisperTokenizer.from_pretrained(WHISPER_MODEL_ID, token=HF_TOKEN)\n\n\nclass _Processor:\n \"\"\"Minimal WhisperProcessor substitute that avoids the torchvision import chain.\"\"\"\n def __init__(self, feature_extractor, tokenizer):\n self.feature_extractor = feature_extractor\n self.tokenizer = tokenizer\n\n def get_decoder_prompt_ids(self, language, task='transcribe'):\n return self.tokenizer.get_decoder_prompt_ids(language=language, task=task)\n\n def save_pretrained(self, path):\n self.feature_extractor.save_pretrained(path)\n self.tokenizer.save_pretrained(path)\n\n\nprocessor = _Processor(_feat_ext, _tokenizer)\nprint('Processor ready')\n\n\ndef prepare_dataset(batch, text_col='transcription', lang=TRAIN_LANG):\n \"\"\"\n Resample to 16 kHz, extract log-mel features, tokenise text.\n Works on any dict with 'audio' (HF Audio column) and a text column.\n \"\"\"\n audio = batch['audio']\n audio_array = np.array(audio['array'], dtype=np.float32)\n orig_sr = audio['sampling_rate']\n\n if orig_sr != TARGET_SR:\n try:\n import torchaudio.functional as F_audio, torch\n audio_array = F_audio.resample(\n torch.from_numpy(audio_array).unsqueeze(0),\n orig_sr, TARGET_SR,\n ).squeeze(0).numpy()\n except Exception:\n import librosa\n 
audio_array = librosa.resample(audio_array, orig_sr=orig_sr, target_sr=TARGET_SR)\n\n batch['input_features'] = processor.feature_extractor(\n audio_array, sampling_rate=TARGET_SR\n ).input_features[0]\n\n raw_text = batch.get(text_col, '') or ''\n _norm_text = _bam_norm(str(raw_text)) if lang == 'bam' else (_normalize_pular(str(raw_text)) if lang == 'ful' else str(raw_text))\n cleaned = clean_text(_norm_text, lang=lang)\n batch['labels'] = processor.tokenizer(cleaned).input_ids\n return batch\n\n\nprint('prepare_dataset ready')"
142
  ]
143
  },
144
  {
src/data/adlam.py ADDED
@@ -0,0 +1,180 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Adlam ↔ Latin transliteration for Pular (Guinea Fula).
3
+
4
+ Adlam (𞤀𞤣𞤤𞤢𞤥) is the indigenous alphabet created by Ibrahima and Abdoulaye Barry
5
+ for the Fula language family. Unicode block U+1E900–U+1E95F.
6
+
7
+ This module provides:
8
+ - adlam_to_latin(text) — convert Adlam script → Latin romanization
9
+ - latin_to_adlam(text) — convert Latin romanization → Adlam script
10
+ - normalize_pular(text) — canonical pre-processing for ASR training:
11
+ strips diacritics variants, lowercases, unifies spacing
12
+ - contains_adlam(text) — detect whether a string has Adlam characters
13
+
14
+ Transliteration table follows the standard Pular (Guinea) orthography used in:
15
+ - SIL/Fulfulde literacy materials
16
+ - Pullo-Africa-Protagonist dataset
17
+ - guizme/adlam_fulfulde dataset
18
+
19
+ Note: Whisper's BPE tokenizer covers the entire Unicode BMP but has never seen
20
+ Adlam in pre-training text, so Adlam tokens produce garbage output. Training
21
+ and ASR therefore always use Latin romanization; Adlam is converted to Latin
22
+ before feeding to the model, and Latin is kept as-is for display.
23
+ """
24
+ from __future__ import annotations
25
+
26
+ import re
27
+ import unicodedata
28
+
29
+ # ── Adlam → Latin mapping (uppercase + lowercase pairs) ──────────────────────
30
+ # Source: Unicode Adlam chart + SIL Pulaar keyboard standard
31
+ _ADLAM_TO_LATIN: list[tuple[str, str]] = [
32
+ # Uppercase (U+1E900–U+1E921), then lowercase (U+1E922–U+1E943)
33
+ ("\U0001e900", "A"), # 𞤀 → A
34
+ ("\U0001e901", "B"), # 𞤁 → B
35
+ ("\U0001e902", "B"), # 𞤂 → B (Bhe)
36
+ ("\U0001e903", "D"), # 𞤃 → D
37
+ ("\U0001e904", "D"), # 𞤄 → D (Dhe)
38
+ ("\U0001e905", "E"), # 𞤅 → E
39
+ ("\U0001e906", "F"), # 𞤆 → F
40
+ ("\U0001e907", "G"), # 𞤇 → G
41
+ ("\U0001e908", "H"), # 𞤈 → H
42
+ ("\U0001e909", "I"), # 𞤉 → I
43
+ ("\U0001e90a", "J"), # 𞤊 → J
44
+ ("\U0001e90b", "K"), # 𞤋 → K
45
+ ("\U0001e90c", "L"), # 𞤌 → L
46
+ ("\U0001e90d", "M"), # 𞤍 → M
47
+ ("\U0001e90e", "N"), # 𞤎 → N
48
+ ("\U0001e90f", "NG"), # 𞤏 → NG
49
+ ("\U0001e910", "O"), # 𞤐 → O
50
+ ("\U0001e911", "P"), # 𞤑 → P
51
+ ("\U0001e912", "R"), # 𞤒 → R
52
+ ("\U0001e913", "S"), # 𞤓 → S
53
+ ("\U0001e914", "T"), # 𞤔 → T
54
+ ("\U0001e915", "U"), # 𞤕 → U
55
+ ("\U0001e916", "V"), # 𞤖 → V
56
+ ("\U0001e917", "W"), # 𞤗 → W
57
+ ("\U0001e918", "Y"), # 𞤘 → Y
58
+ ("\U0001e919", "Z"), # 𞤙 → Z
59
+ ("\U0001e91a", "KH"), # 𞤚 → KH
60
+ ("\U0001e91b", "QU"), # 𞤛 → QU
61
+ ("\U0001e91c", "SH"), # 𞤜 → SH
62
+ ("\U0001e91d", "GH"), # 𞤝 → GH
63
+ ("\U0001e91e", "NY"), # 𞤞 → NY (ɲ)
64
+ ("\U0001e91f", "TH"), # 𞤟 → TH
65
+ ("\U0001e920", "WH"), # 𞤠 → WH
66
+ ("\U0001e921", "NY"), # 𞤡 → NY (ɳ)
67
+ # Lowercase
68
+ ("\U0001e922", "a"), # 𞤢 → a
69
+ ("\U0001e923", "b"), # 𞤣 → b
70
+ ("\U0001e924", "b"), # 𞤤 → b
71
+ ("\U0001e925", "d"), # 𞤥 → d
72
+ ("\U0001e926", "d"), # 𞤦 → d
73
+ ("\U0001e927", "e"), # 𞤧 → e
74
+ ("\U0001e928", "f"), # 𞤨 → f
75
+ ("\U0001e929", "g"), # 𞤩 → g
76
+ ("\U0001e92a", "h"), # 𞤪 → h
77
+ ("\U0001e92b", "i"), # 𞤫 → i
78
+ ("\U0001e92c", "j"), # 𞤬 → j
79
+ ("\U0001e92d", "k"), # 𞤭 → k
80
+ ("\U0001e92e", "l"), # 𞤮 → l
81
+ ("\U0001e92f", "m"), # 𞤯 → m
82
+ ("\U0001e930", "n"), # 𞤰 → n
83
+ ("\U0001e931", "ng"), # 𞤱 → ng
84
+ ("\U0001e932", "o"), # 𞤲 → o
85
+ ("\U0001e933", "p"), # 𞤳 → p
86
+ ("\U0001e934", "r"), # 𞤴 → r
87
+ ("\U0001e935", "s"), # 𞤵 → s
88
+ ("\U0001e936", "t"), # 𞤶 → t
89
+ ("\U0001e937", "u"), # 𞤷 → u
90
+ ("\U0001e938", "v"), # 𞤸 → v
91
+ ("\U0001e939", "w"), # 𞤹 → w
92
+ ("\U0001e93a", "y"), # 𞤺 → y
93
+ ("\U0001e93b", "z"), # 𞤻 → z
94
+ ("\U0001e93c", "kh"), # 𞤼 → kh
95
+ ("\U0001e93d", "qu"), # 𞤽 → qu
96
+ ("\U0001e93e", "sh"), # 𞤾 → sh
97
+ ("\U0001e93f", "gh"), # 𞤿 → gh
98
+ ("\U0001e940", "ny"), # 𞥀 → ny (ɲ)
99
+ ("\U0001e941", "th"), # 𞥁 → th
100
+ ("\U0001e942", "wh"), # 𞥂 → wh
101
+ ("\U0001e943", "ny"), # 𞥃 → ny (ɳ)
102
+ # Digits
103
+ ("\U0001e950", "0"), # 𞥐
104
+ ("\U0001e951", "1"), # 𞥑
105
+ ("\U0001e952", "2"), # 𞥒
106
+ ("\U0001e953", "3"), # 𞥓
107
+ ("\U0001e954", "4"), # 𞥔
108
+ ("\U0001e955", "5"), # 𞥕
109
+ ("\U0001e956", "6"), # 𞥖
110
+ ("\U0001e957", "7"), # 𞥗
111
+ ("\U0001e958", "8"), # 𞥘
112
+ ("\U0001e959", "9"), # 𞥙
113
+ ]
114
+
115
+ # Build fast lookup dicts
116
+ _A2L: dict[str, str] = {a: l for a, l in _ADLAM_TO_LATIN}
117
+ _L2A: dict[str, str] = {}
118
+ for _a, _l in reversed(_ADLAM_TO_LATIN): # reversed so single-char wins over digraph
119
+ _L2A[_l.lower()] = _a
120
+
121
+ # Adlam Unicode range for fast detection
122
+ _ADLAM_START = 0x1E900
123
+ _ADLAM_END = 0x1E95F
124
+
125
+
def contains_adlam(text: str) -> bool:
    """True when at least one character of *text* lies in the Adlam block."""
    for ch in text:
        if _ADLAM_START <= ord(ch) <= _ADLAM_END:
            return True
    return False
129
+
130
+
def adlam_to_latin(text: str) -> str:
    """Convert Adlam script characters to Latin romanization. Non-Adlam chars pass through."""
    return "".join(_A2L.get(ch, ch) for ch in text)
137
+
138
+
def latin_to_adlam(text: str) -> str:
    """
    Convert Latin romanization to Adlam script.
    Two-character sequences (digraphs such as ng/kh/sh) are probed in the
    lookup table before single characters, so digraphs always win.
    """
    lowered = text.lower()
    pieces: list[str] = []
    pos, end = 0, len(lowered)
    while pos < end:
        pair = lowered[pos:pos + 2]
        if len(pair) == 2 and pair in _L2A:
            # Digraph hit — consume both characters at once.
            pieces.append(_L2A[pair])
            pos += 2
        else:
            ch = lowered[pos]
            pieces.append(_L2A.get(ch, ch))
            pos += 1
    return "".join(pieces)
165
+
166
+
def normalize_pular(text: str) -> str:
    """
    Canonical pre-processing for Pular (Guinea Fula) ASR training:
    1. Convert Adlam → Latin if present
    2. Unicode NFC
    3. Lowercase
    4. Collapse whitespace
    """
    latin = adlam_to_latin(text) if contains_adlam(text) else text
    lowered = unicodedata.normalize("NFC", latin).lower()
    return re.sub(r"\s+", " ", lowered).strip()
src/data/web_harvester.py CHANGED
@@ -48,6 +48,27 @@ HF_DATASET_REGISTRY = {
48
  "max": 2_000,
49
  "license": "cc-by-4.0",
50
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
  ],
52
  }
53
 
 
48
  "max": 2_000,
49
  "license": "cc-by-4.0",
50
  },
51
+ {
52
+ "repo": "Pullo-Africa-Protagonist/Fula-pular",
53
+ "config": "default",
54
+ "split": "train",
55
+ "audio_col": "audio",
56
+ "text_col": "transcription",
57
+ "max": 5_000,
58
+ "license": "cc-by-4.0",
59
+ "note": "9,761 Pular (Guinea) audio rows — primary ASR training source",
60
+ },
61
+ {
62
+ "repo": "guizme/adlam_fulfulde",
63
+ "config": "default",
64
+ "split": "train",
65
+ "audio_col": "audio",
66
+ "text_col": "transcription",
67
+ "max": 51,
68
+ "license": "cc-by-4.0",
69
+ "adlam": True,
70
+ "note": "51 Adlam-script audio rows — converted to Latin before training",
71
+ },
72
  ],
73
  }
74