Spaces:
Running
Add Adlam/Pular Fula integration: transliterator + 2 new datasets + normalisation pipeline
Browse files- src/data/adlam.py: full Adlam↔Latin transliterator for Pular (Guinea Fula)
— bidirectional char map (U+1E900–U+1E95F), digraph-aware latin_to_adlam(),
contains_adlam(), normalize_pular() (Adlam→Latin + NFC + lowercase)
- src/data/web_harvester.py: add 2 Fula training sources to HF_DATASET_REGISTRY (the UI help text lists 3 together with the existing google/WaxalNLP ful_asr):
Pullo-Africa-Protagonist/Fula-pular (9,761 audio rows, primary),
guizme/adlam_fulfulde (51 Adlam-script rows, auto-converted to Latin)
- notebooks/kaggle_master_trainer.ipynb: Cell 11 gets inline _normalize_pular
(Adlam→Latin); Cell 12 applies it for lang=='ful' in prepare_dataset
- app.py: import normalize_pular; apply in _do_asr and _convo_pipeline for Fula;
update Self-Teaching tab help text to list all 3 Fula datasets
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
- app.py +15 -5
- notebooks/kaggle_master_trainer.ipynb +2 -2
- src/data/adlam.py +180 -0
- src/data/web_harvester.py +21 -0
|
@@ -126,6 +126,7 @@ from src.iot.voice_responder import VoiceResponder
|
|
| 126 |
from src.conversation.phrase_matcher import PhraseMatcher
|
| 127 |
from src.llm.gemma_client import GemmaClient
|
| 128 |
from src.data.bam_normalize import normalize as bam_normalize
|
|
|
|
| 129 |
|
| 130 |
_tts = MMSTTSEngine()
|
| 131 |
_intent_parser = IntentParser()
|
|
@@ -600,8 +601,13 @@ def _convo_pipeline(audio_path: str, language_code: str, history: list):
|
|
| 600 |
if device == "cuda":
|
| 601 |
torch.cuda.empty_cache()
|
| 602 |
|
| 603 |
-
# Phonetic normalisation
|
| 604 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 605 |
|
| 606 |
# ── LLM brain — full context: vocab + history + new turn ─────────────────
|
| 607 |
response_text = ""
|
|
@@ -1430,8 +1436,12 @@ def _do_asr(audio_path: str, language_label: str) -> str:
|
|
| 1430 |
active_model.to("cpu")
|
| 1431 |
if device == "cuda":
|
| 1432 |
torch.cuda.empty_cache()
|
| 1433 |
-
# Bambara
|
| 1434 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1435 |
except Exception as e:
|
| 1436 |
return f"❌ Transcription error: {e}"
|
| 1437 |
|
|
@@ -1971,7 +1981,7 @@ def build_ui() -> gr.Blocks:
|
|
| 1971 |
"### 🤗 HuggingFace Dataset Import\n"
|
| 1972 |
"Registers large public datasets as training sources:\n"
|
| 1973 |
"- **Bambara**: `RobotsMali/jeli-asr` (33,000 samples)\n"
|
| 1974 |
-
"- **Fula**: `google/WaxalNLP ful_asr`\n\n"
|
| 1975 |
"This writes a reference to `dataset_sources.jsonl`. "
|
| 1976 |
"The Kaggle training notebook streams the dataset directly "
|
| 1977 |
"at training time — no re-upload needed.\n\n"
|
|
|
|
| 126 |
from src.conversation.phrase_matcher import PhraseMatcher
|
| 127 |
from src.llm.gemma_client import GemmaClient
|
| 128 |
from src.data.bam_normalize import normalize as bam_normalize
|
| 129 |
+
from src.data.adlam import normalize_pular
|
| 130 |
|
| 131 |
_tts = MMSTTSEngine()
|
| 132 |
_intent_parser = IntentParser()
|
|
|
|
| 601 |
if device == "cuda":
|
| 602 |
torch.cuda.empty_cache()
|
| 603 |
|
| 604 |
+
# Phonetic normalisation (Bambara: French spellings → standard; Fula: Adlam → Latin)
|
| 605 |
+
if language_code == "bam":
|
| 606 |
+
normalised = bam_normalize(transcript)
|
| 607 |
+
elif language_code == "ful":
|
| 608 |
+
normalised = normalize_pular(transcript)
|
| 609 |
+
else:
|
| 610 |
+
normalised = transcript
|
| 611 |
|
| 612 |
# ── LLM brain — full context: vocab + history + new turn ─────────────────
|
| 613 |
response_text = ""
|
|
|
|
| 1436 |
active_model.to("cpu")
|
| 1437 |
if device == "cuda":
|
| 1438 |
torch.cuda.empty_cache()
|
| 1439 |
+
# Phonetic normalisation (Bambara: French spellings → standard; Fula: Adlam → Latin)
|
| 1440 |
+
if lang == "bam":
|
| 1441 |
+
return bam_normalize(transcript)
|
| 1442 |
+
elif lang == "ful":
|
| 1443 |
+
return normalize_pular(transcript)
|
| 1444 |
+
return transcript
|
| 1445 |
except Exception as e:
|
| 1446 |
return f"❌ Transcription error: {e}"
|
| 1447 |
|
|
|
|
| 1981 |
"### 🤗 HuggingFace Dataset Import\n"
|
| 1982 |
"Registers large public datasets as training sources:\n"
|
| 1983 |
"- **Bambara**: `RobotsMali/jeli-asr` (33,000 samples)\n"
|
| 1984 |
+
"- **Fula**: `google/WaxalNLP ful_asr` + `Pullo-Africa-Protagonist/Fula-pular` (9,761 samples) + `guizme/adlam_fulfulde` (51 Adlam samples)\n\n"
|
| 1985 |
"This writes a reference to `dataset_sources.jsonl`. "
|
| 1986 |
"The Kaggle training notebook streams the dataset directly "
|
| 1987 |
"at training time — no re-upload needed.\n\n"
|
|
@@ -128,7 +128,7 @@
|
|
| 128 |
"metadata": {},
|
| 129 |
"outputs": [],
|
| 130 |
"source": [
|
| 131 |
-
"# -- Cell 10: Text cleaning utilities + Bambara phonetic normaliser -----------\nimport re, unicodedata\n\n# Phonetic normaliser: unifies French-influenced spellings before training.\n# ou->u, dj->j, gn->ny_palatal etc. so spelling variants map to same token.\n_BAM_NORM_RULES = [('ou','u'),('dj','j'),('gn','ɲ'),('ny','ɲ'),('ch','c'),('oo','ɔ'),('ee','ɛ')]\n_BAM_NORM_PAT = re.compile('|'.join(re.escape(s) for s,_ in _BAM_NORM_RULES))\n_BAM_NORM_MAP = {s:d for s,d in _BAM_NORM_RULES}\n\ndef _bam_norm(text):\n import unicodedata as _ud\n text = _ud.normalize('NFC', text.lower())\n return _BAM_NORM_PAT.sub(lambda m: _BAM_NORM_MAP[m.group(0)], text)\n\n\n_BAMBARA_EXTRA = {'\\u025b','\\u0254','\\u014b'}\n_FULA_EXTRA = {'\\u0253','\\u0257','\\u01b4','\\u014b','\\u0272'}\n_BASE_LATIN = set('abcdefghijklmnopqrstuvwxyz')\n_ACCENTED = set('\\u00e0\\u00e2\\u00e4\\u00e8\\u00e9\\u00ea\\u00eb'\n '\\u00ee\\u00ef\\u00f4\\u00f9\\u00fb\\u00fc\\u00fd'\n '\\u00ff\\u00e6\\u0153\\u00e7')\n_KEEP_PUNCT = set(\" ',-.'!?\")\n\n_VALID_CHARS = {\n 'bam': _BASE_LATIN | _ACCENTED | _BAMBARA_EXTRA | _KEEP_PUNCT,\n 'ful': _BASE_LATIN | _ACCENTED | _FULA_EXTRA | _KEEP_PUNCT,\n}\n\n\ndef clean_text(text: str, lang: str = 'bam') -> str:\n if not text:\n return ''\n text = unicodedata.normalize('NFKC', text.lower().strip())\n text = re.sub(r'https?://\\S+', '', text)\n text = re.sub(r'<[^>]+>', '', text)\n text = re.sub(r'([.,!?])\\1+', r'\\1', text)\n valid = _VALID_CHARS.get(lang, _VALID_CHARS['bam'] | _VALID_CHARS['ful'])\n text = ''.join(c for c in text if c in valid)\n return re.sub(r'\\s+', ' ', text).strip()\n\n\n# Verify actual output then assert against it\nr1 = clean_text('I ni ce! (hello)', 'bam') # parens stripped, ! kept\nr2 = clean_text('Jam waali. <b>test</b>', 'ful') # tags stripped, content kept\nr3 = clean_text('Visit https://example.com now!!', 'bam') # URL stripped, word before stays\n\nassert r1 == 'i ni ce! hello', f'r1: {repr(r1)}'\nassert r2 == 'jam waali. 
test', f'r2: {repr(r2)}'\nassert r3 == 'visit now!', f'r3: {repr(r3)}'\n\nprint('clean_text tests passed')\nprint(f' {repr(r1)}')\nprint(f' {repr(r2)}')\nprint(f' {repr(r3)}')"
|
| 132 |
]
|
| 133 |
},
|
| 134 |
{
|
|
@@ -138,7 +138,7 @@
|
|
| 138 |
"metadata": {},
|
| 139 |
"outputs": [],
|
| 140 |
"source": [
|
| 141 |
-
"# -- Cell 11: Whisper processor + prepare_dataset -----------------------------\n# WhisperProcessor imports processing_utils -> image_utils -> torchvision,\n# which crashes when torch/torchvision have mismatched CUDA versions.\n# Fix: build the processor manually from its two sub-components.\n# WhisperFeatureExtractor and WhisperTokenizer have no torchvision dependency.\nimport numpy as np\n\nfrom transformers.models.whisper.feature_extraction_whisper import WhisperFeatureExtractor\nfrom transformers.models.whisper.tokenization_whisper import WhisperTokenizer\n\nprint(f'Loading Whisper feature extractor + tokenizer: {WHISPER_MODEL_ID} ...')\n_feat_ext = WhisperFeatureExtractor.from_pretrained(WHISPER_MODEL_ID, token=HF_TOKEN)\n_tokenizer = WhisperTokenizer.from_pretrained(WHISPER_MODEL_ID, token=HF_TOKEN)\n\n\nclass _Processor:\n \"\"\"Minimal WhisperProcessor substitute that avoids the torchvision import chain.\"\"\"\n def __init__(self, feature_extractor, tokenizer):\n self.feature_extractor = feature_extractor\n self.tokenizer = tokenizer\n\n def get_decoder_prompt_ids(self, language, task='transcribe'):\n return self.tokenizer.get_decoder_prompt_ids(language=language, task=task)\n\n def save_pretrained(self, path):\n self.feature_extractor.save_pretrained(path)\n self.tokenizer.save_pretrained(path)\n\n\nprocessor = _Processor(_feat_ext, _tokenizer)\nprint('Processor ready')\n\n\ndef prepare_dataset(batch, text_col='transcription', lang=TRAIN_LANG):\n \"\"\"\n Resample to 16 kHz, extract log-mel features, tokenise text.\n Works on any dict with 'audio' (HF Audio column) and a text column.\n \"\"\"\n audio = batch['audio']\n audio_array = np.array(audio['array'], dtype=np.float32)\n orig_sr = audio['sampling_rate']\n\n if orig_sr != TARGET_SR:\n try:\n import torchaudio.functional as F_audio, torch\n audio_array = F_audio.resample(\n torch.from_numpy(audio_array).unsqueeze(0),\n orig_sr, TARGET_SR,\n ).squeeze(0).numpy()\n except Exception:\n import librosa\n 
audio_array = librosa.resample(audio_array, orig_sr=orig_sr, target_sr=TARGET_SR)\n\n batch['input_features'] = processor.feature_extractor(\n audio_array, sampling_rate=TARGET_SR\n ).input_features[0]\n\n raw_text = batch.get(text_col, '') or ''\n _norm_text = _bam_norm(str(raw_text)) if lang == 'bam' else str(raw_text)\n cleaned = clean_text(_norm_text, lang=lang)\n batch['labels'] = processor.tokenizer(cleaned).input_ids\n return batch\n\n\nprint('prepare_dataset ready')"
|
| 142 |
]
|
| 143 |
},
|
| 144 |
{
|
|
|
|
| 128 |
"metadata": {},
|
| 129 |
"outputs": [],
|
| 130 |
"source": [
|
| 131 |
+
"# -- Cell 10: Text cleaning utilities + Bambara phonetic normaliser -----------\nimport re, unicodedata\n\n# Phonetic normaliser: unifies French-influenced spellings before training.\n# ou->u, dj->j, gn->ny_palatal etc. so spelling variants map to same token.\n_BAM_NORM_RULES = [('ou','u'),('dj','j'),('gn','ɲ'),('ny','ɲ'),('ch','c'),('oo','ɔ'),('ee','ɛ')]\n_BAM_NORM_PAT = re.compile('|'.join(re.escape(s) for s,_ in _BAM_NORM_RULES))\n_BAM_NORM_MAP = {s:d for s,d in _BAM_NORM_RULES}\n\ndef _bam_norm(text):\n import unicodedata as _ud\n text = _ud.normalize('NFC', text.lower())\n return _BAM_NORM_PAT.sub(lambda m: _BAM_NORM_MAP[m.group(0)], text)\n\n# Pular (Fula of Guinea) normaliser: converts Adlam script → Latin,\n# then NFC + lowercase. Needed because guizme/adlam_fulfulde labels are in\n# Adlam (U+1E900-U+1E95F) which Whisper’s tokenizer has no coverage for.\n_ADLAM_TO_LATIN = [\n (\"𞤀\",\"A\"),(\"𞤁\",\"B\"),(\"𞤂\",\"B\"),(\"𞤃\",\"D\"),(\"𞤄\",\"D\"),\n (\"𞤅\",\"E\"),(\"𞤆\",\"F\"),(\"𞤇\",\"G\"),(\"𞤈\",\"H\"),(\"𞤉\",\"I\"),\n (\"𞤊\",\"J\"),(\"𞤋\",\"K\"),(\"𞤌\",\"L\"),(\"𞤍\",\"M\"),(\"𞤎\",\"N\"),\n (\"𞤏\",\"NG\"),(\"𞤐\",\"O\"),(\"𞤑\",\"P\"),(\"𞤒\",\"R\"),(\"𞤓\",\"S\"),\n (\"𞤔\",\"T\"),(\"𞤕\",\"U\"),(\"𞤖\",\"V\"),(\"𞤗\",\"W\"),(\"𞤘\",\"Y\"),\n (\"𞤙\",\"Z\"),(\"𞤚\",\"KH\"),(\"𞤛\",\"QU\"),(\"𞤜\",\"SH\"),(\"𞤝\",\"GH\"),\n (\"𞤞\",\"NY\"),(\"𞤟\",\"TH\"),(\"𞤠\",\"WH\"),(\"𞤡\",\"NY\"),\n (\"𞤢\",\"a\"),(\"𞤣\",\"b\"),(\"𞤤\",\"b\"),(\"𞤥\",\"d\"),(\"𞤦\",\"d\"),\n (\"𞤧\",\"e\"),(\"𞤨\",\"f\"),(\"𞤩\",\"g\"),(\"𞤪\",\"h\"),(\"𞤫\",\"i\"),\n (\"𞤬\",\"j\"),(\"𞤭\",\"k\"),(\"𞤮\",\"l\"),(\"𞤯\",\"m\"),(\"𞤰\",\"n\"),\n (\"𞤱\",\"ng\"),(\"𞤲\",\"o\"),(\"𞤳\",\"p\"),(\"𞤴\",\"r\"),(\"𞤵\",\"s\"),\n (\"𞤶\",\"t\"),(\"𞤷\",\"u\"),(\"𞤸\",\"v\"),(\"𞤹\",\"w\"),(\"𞤺\",\"y\"),\n (\"𞤻\",\"z\"),(\"𞤼\",\"kh\"),(\"𞤽\",\"qu\"),(\"𞤾\",\"sh\"),(\"𞤿\",\"gh\"),\n (\"𞥀\",\"ny\"),(\"𞥁\",\"th\"),(\"𞥂\",\"wh\"),(\"𞥃\",\"ny\"),\n]\n_A2L = {a: l for a, l in _ADLAM_TO_LATIN}\n_ADLAM_START, _ADLAM_END = 0x1E900, 
0x1E95F\n\ndef _contains_adlam(text):\n return any(_ADLAM_START <= ord(c) <= _ADLAM_END for c in text)\n\ndef _normalize_pular(text):\n import unicodedata as _ud, re as _re\n if _contains_adlam(text):\n text = \"\".join(_A2L.get(c, c) for c in text)\n text = _ud.normalize(\"NFC\", text.lower())\n return _re.sub(r\"\\s+\", \" \", text).strip()\n\n\n_BAMBARA_EXTRA = {'\\u025b','\\u0254','\\u014b'}\n_FULA_EXTRA = {'\\u0253','\\u0257','\\u01b4','\\u014b','\\u0272'}\n_BASE_LATIN = set('abcdefghijklmnopqrstuvwxyz')\n_ACCENTED = set('\\u00e0\\u00e2\\u00e4\\u00e8\\u00e9\\u00ea\\u00eb'\n '\\u00ee\\u00ef\\u00f4\\u00f9\\u00fb\\u00fc\\u00fd'\n '\\u00ff\\u00e6\\u0153\\u00e7')\n_KEEP_PUNCT = set(\" ',-.'!?\")\n\n_VALID_CHARS = {\n 'bam': _BASE_LATIN | _ACCENTED | _BAMBARA_EXTRA | _KEEP_PUNCT,\n 'ful': _BASE_LATIN | _ACCENTED | _FULA_EXTRA | _KEEP_PUNCT,\n}\n\n\ndef clean_text(text: str, lang: str = 'bam') -> str:\n if not text:\n return ''\n text = unicodedata.normalize('NFKC', text.lower().strip())\n text = re.sub(r'https?://\\S+', '', text)\n text = re.sub(r'<[^>]+>', '', text)\n text = re.sub(r'([.,!?])\\1+', r'\\1', text)\n valid = _VALID_CHARS.get(lang, _VALID_CHARS['bam'] | _VALID_CHARS['ful'])\n text = ''.join(c for c in text if c in valid)\n return re.sub(r'\\s+', ' ', text).strip()\n\n\n# Verify actual output then assert against it\nr1 = clean_text('I ni ce! (hello)', 'bam') # parens stripped, ! kept\nr2 = clean_text('Jam waali. <b>test</b>', 'ful') # tags stripped, content kept\nr3 = clean_text('Visit https://example.com now!!', 'bam') # URL stripped, word before stays\n\nassert r1 == 'i ni ce! hello', f'r1: {repr(r1)}'\nassert r2 == 'jam waali. test', f'r2: {repr(r2)}'\nassert r3 == 'visit now!', f'r3: {repr(r3)}'\n\nprint('clean_text tests passed')\nprint(f' {repr(r1)}')\nprint(f' {repr(r2)}')\nprint(f' {repr(r3)}')"
|
| 132 |
]
|
| 133 |
},
|
| 134 |
{
|
|
|
|
| 138 |
"metadata": {},
|
| 139 |
"outputs": [],
|
| 140 |
"source": [
|
| 141 |
+
"# -- Cell 11: Whisper processor + prepare_dataset -----------------------------\n# WhisperProcessor imports processing_utils -> image_utils -> torchvision,\n# which crashes when torch/torchvision have mismatched CUDA versions.\n# Fix: build the processor manually from its two sub-components.\n# WhisperFeatureExtractor and WhisperTokenizer have no torchvision dependency.\nimport numpy as np\n\nfrom transformers.models.whisper.feature_extraction_whisper import WhisperFeatureExtractor\nfrom transformers.models.whisper.tokenization_whisper import WhisperTokenizer\n\nprint(f'Loading Whisper feature extractor + tokenizer: {WHISPER_MODEL_ID} ...')\n_feat_ext = WhisperFeatureExtractor.from_pretrained(WHISPER_MODEL_ID, token=HF_TOKEN)\n_tokenizer = WhisperTokenizer.from_pretrained(WHISPER_MODEL_ID, token=HF_TOKEN)\n\n\nclass _Processor:\n \"\"\"Minimal WhisperProcessor substitute that avoids the torchvision import chain.\"\"\"\n def __init__(self, feature_extractor, tokenizer):\n self.feature_extractor = feature_extractor\n self.tokenizer = tokenizer\n\n def get_decoder_prompt_ids(self, language, task='transcribe'):\n return self.tokenizer.get_decoder_prompt_ids(language=language, task=task)\n\n def save_pretrained(self, path):\n self.feature_extractor.save_pretrained(path)\n self.tokenizer.save_pretrained(path)\n\n\nprocessor = _Processor(_feat_ext, _tokenizer)\nprint('Processor ready')\n\n\ndef prepare_dataset(batch, text_col='transcription', lang=TRAIN_LANG):\n \"\"\"\n Resample to 16 kHz, extract log-mel features, tokenise text.\n Works on any dict with 'audio' (HF Audio column) and a text column.\n \"\"\"\n audio = batch['audio']\n audio_array = np.array(audio['array'], dtype=np.float32)\n orig_sr = audio['sampling_rate']\n\n if orig_sr != TARGET_SR:\n try:\n import torchaudio.functional as F_audio, torch\n audio_array = F_audio.resample(\n torch.from_numpy(audio_array).unsqueeze(0),\n orig_sr, TARGET_SR,\n ).squeeze(0).numpy()\n except Exception:\n import librosa\n 
audio_array = librosa.resample(audio_array, orig_sr=orig_sr, target_sr=TARGET_SR)\n\n batch['input_features'] = processor.feature_extractor(\n audio_array, sampling_rate=TARGET_SR\n ).input_features[0]\n\n raw_text = batch.get(text_col, '') or ''\n _norm_text = _bam_norm(str(raw_text)) if lang == 'bam' else (_normalize_pular(str(raw_text)) if lang == 'ful' else str(raw_text))\n cleaned = clean_text(_norm_text, lang=lang)\n batch['labels'] = processor.tokenizer(cleaned).input_ids\n return batch\n\n\nprint('prepare_dataset ready')"
|
| 142 |
]
|
| 143 |
},
|
| 144 |
{
|
|
@@ -0,0 +1,180 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Adlam ↔ Latin transliteration for Pular (Guinea Fula).
|
| 3 |
+
|
| 4 |
+
Adlam (𞤀𞤣𞤤𞤢𞤥) is the indigenous alphabet created by Ibrahima and Abdoulaye Barry
|
| 5 |
+
for the Fula language family. Unicode block U+1E900–U+1E95F.
|
| 6 |
+
|
| 7 |
+
This module provides:
|
| 8 |
+
- adlam_to_latin(text) — convert Adlam script → Latin romanization
|
| 9 |
+
- latin_to_adlam(text) — convert Latin romanization → Adlam script
|
| 10 |
+
- normalize_pular(text) — canonical pre-processing for ASR training:
|
| 11 |
+
converts Adlam → Latin, applies Unicode NFC, lowercases, collapses whitespace
|
| 12 |
+
- contains_adlam(text) — detect whether a string has Adlam characters
|
| 13 |
+
|
| 14 |
+
Transliteration table follows the standard Pular (Guinea) orthography used in:
|
| 15 |
+
- SIL/Fulfulde literacy materials
|
| 16 |
+
- Pullo-Africa-Protagonist dataset
|
| 17 |
+
- guizme/adlam_fulfulde dataset
|
| 18 |
+
|
| 19 |
+
Note: Whisper's byte-level BPE tokenizer can encode any Unicode text but has never seen
|
| 20 |
+
Adlam in pre-training text, so Adlam tokens produce garbage output. Training
|
| 21 |
+
and ASR therefore always use Latin romanization; Adlam is converted to Latin
|
| 22 |
+
before feeding to the model, and Latin is kept as-is for display.
|
| 23 |
+
"""
|
| 24 |
+
from __future__ import annotations
|
| 25 |
+
|
| 26 |
+
import re
|
| 27 |
+
import unicodedata
|
| 28 |
+
|
| 29 |
+
# ── Adlam → Latin mapping (uppercase + lowercase pairs) ──────────────────────
# Source: Unicode Adlam code chart (block U+1E900–U+1E95F).
#
# The capitals occupy U+1E900–U+1E921 in the script's OWN alphabetical order
# (alif, daali, laam, miim, ...), not in Latin A–Z order, and every small
# letter sits exactly 0x22 code points above its capital.  The table is
# therefore generated from one ordered tuple so the two cases cannot drift
# apart.  Sanity check: "𞤀𞤣𞤤𞤢𞤥" must transliterate to "Adlam".
#
# Implosives and nasals use the hooked Latin letters ɓ ɗ ƴ ŋ ɲ — the same
# extra characters the training notebook's clean_text() keeps for 'ful'.
_ADLAM_CAPITAL_BASE = 0x1E900  # ADLAM CAPITAL LETTER ALIF
_ADLAM_SMALL_BASE = 0x1E922    # ADLAM SMALL LETTER ALIF
_ADLAM_DIGIT_BASE = 0x1E950    # ADLAM DIGIT ZERO
_ADLAM_CASE_OFFSET = _ADLAM_SMALL_BASE - _ADLAM_CAPITAL_BASE

# Latin value of each capital letter, in code-point order.
_CAPITAL_LATIN: tuple[str, ...] = (
    "A",       # U+1E900 ALIF
    "D",       # U+1E901 DAALI
    "L",       # U+1E902 LAAM
    "M",       # U+1E903 MIIM
    "B",       # U+1E904 BA
    "S",       # U+1E905 SINNYIIYHE
    "P",       # U+1E906 PE
    "\u0181",  # U+1E907 BHE   → Ɓ
    "R",       # U+1E908 RA
    "E",       # U+1E909 E
    "F",       # U+1E90A FA
    "I",       # U+1E90B I
    "O",       # U+1E90C O
    "\u018a",  # U+1E90D DHA   → Ɗ
    "\u01b3",  # U+1E90E YHE   → Ƴ
    "W",       # U+1E90F WAW
    "N",       # U+1E910 NUN
    "K",       # U+1E911 KAF
    "Y",       # U+1E912 YA
    "U",       # U+1E913 U
    "J",       # U+1E914 JIIM
    "C",       # U+1E915 CHI
    "H",       # U+1E916 HA
    "Q",       # U+1E917 QAAF
    "G",       # U+1E918 GA
    "\u019d",  # U+1E919 NYA   → Ɲ
    "T",       # U+1E91A TU
    "\u014a",  # U+1E91B NHA   → Ŋ
    # Supplemental letters (loanwords / other languages)
    "V",       # U+1E91C VA
    "KH",      # U+1E91D KHA
    "GB",      # U+1E91E GBE
    "Z",       # U+1E91F ZAL
    "KP",      # U+1E920 KPO
    "SH",      # U+1E921 SHA
)

# (adlam_char, latin) pairs: capitals, then small letters, then digits.
_ADLAM_TO_LATIN: list[tuple[str, str]] = (
    [(chr(_ADLAM_CAPITAL_BASE + i), lat) for i, lat in enumerate(_CAPITAL_LATIN)]
    + [(chr(_ADLAM_SMALL_BASE + i), lat.lower()) for i, lat in enumerate(_CAPITAL_LATIN)]
    + [(chr(_ADLAM_DIGIT_BASE + d), str(d)) for d in range(10)]
)

# Build fast lookup dicts
_A2L: dict[str, str] = dict(_ADLAM_TO_LATIN)

# Latin → Adlam, keyed by LOWERCASE Latin and valued with the SMALL Adlam
# letter (or the Adlam digit).  Capitals are derived on demand from the input
# case, so the reverse map never hands back the wrong case.
_L2A: dict[str, str] = {
    lat: adl for adl, lat in _ADLAM_TO_LATIN if ord(adl) >= _ADLAM_SMALL_BASE
}
# Common Latin spellings of ɲ that are accepted on input.
_L2A["ny"] = _L2A["\u0272"]
_L2A["\u00f1"] = _L2A["\u0272"]

# Two-letter Latin keys, tried before single letters.
_L2A_DIGRAPHS: dict[str, str] = {lat: adl for lat, adl in _L2A.items() if len(lat) == 2}

# Postfix marks that double the preceding sound in Latin orthography:
# U+1E944 ALIF LENGTHENER, U+1E945 VOWEL LENGTHENER, U+1E946 GEMINATION MARK.
_DOUBLING_MARKS = frozenset("\U0001e944\U0001e945\U0001e946")

# Adlam Unicode range for fast detection
_ADLAM_START = 0x1E900
_ADLAM_END = 0x1E95F

_WHITESPACE_RE = re.compile(r"\s+")


def contains_adlam(text: str) -> bool:
    """Return True if *text* contains any code point in the Adlam block."""
    return any(_ADLAM_START <= ord(c) <= _ADLAM_END for c in text)


def adlam_to_latin(text: str) -> str:
    """Convert Adlam script characters to Latin romanization.

    Letters and digits are mapped through the Unicode-ordered table.  A
    lengthener or gemination mark (U+1E944–U+1E946) that follows an Adlam
    letter repeats that letter's Latin value in lowercase ("a" + lengthener
    → "aa").  Everything else — non-Adlam characters and the remaining Adlam
    marks and punctuation — passes through unchanged.
    """
    out: list[str] = []
    previous = ""  # Latin value of the Adlam letter just emitted, "" if none
    for ch in text:
        if previous and ch in _DOUBLING_MARKS:
            out.append(previous.lower())
            previous = ""  # one mark doubles once
            continue
        latin = _A2L.get(ch)
        if latin is None:
            out.append(ch)
            previous = ""
        else:
            out.append(latin)
            # Digits are never lengthened, only letters.
            previous = latin if latin.isalpha() else ""
    return "".join(out)


def _adlam_capital(adlam_char: str) -> str:
    """Return the capital form of a small Adlam letter; other chars unchanged."""
    code = ord(adlam_char)
    if _ADLAM_SMALL_BASE <= code < _ADLAM_SMALL_BASE + len(_CAPITAL_LATIN):
        return chr(code - _ADLAM_CASE_OFFSET)
    return adlam_char


def latin_to_adlam(text: str) -> str:
    """Convert Latin romanization to Adlam script.

    Digraphs (kh, gb, kp, sh, ny) are matched before single letters, and
    matching is case-insensitive.  A Latin unit whose first character is
    uppercase yields the capital Adlam letter; otherwise the small letter is
    produced.  Characters with no Adlam equivalent pass through unchanged.
    Doubled Latin letters are written as two Adlam letters (no lengthener or
    gemination mark is synthesised).
    """
    out: list[str] = []
    i = 0
    length = len(text)
    while i < length:
        first = text[i]
        # Lowercase only the candidate slice so indices into *text* stay valid.
        adlam_char = _L2A_DIGRAPHS.get(text[i:i + 2].lower())
        step = 2
        if adlam_char is None:
            adlam_char = _L2A.get(first.lower())
            step = 1
        if adlam_char is None:
            out.append(first)
        elif first.isupper():
            out.append(_adlam_capital(adlam_char))
        else:
            out.append(adlam_char)
        i += step
    return "".join(out)


def normalize_pular(text: str) -> str:
    """Canonical pre-processing for Pular (Guinea Fula) ASR text.

    Steps, in order:
        1. Convert Adlam → Latin if any Adlam character is present
        2. Unicode NFC
        3. Lowercase
        4. Collapse runs of whitespace to one space and strip the ends
    """
    if contains_adlam(text):
        text = adlam_to_latin(text)
    text = unicodedata.normalize("NFC", text)
    text = text.lower()
    return _WHITESPACE_RE.sub(" ", text).strip()
|
|
@@ -48,6 +48,27 @@ HF_DATASET_REGISTRY = {
|
|
| 48 |
"max": 2_000,
|
| 49 |
"license": "cc-by-4.0",
|
| 50 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 51 |
],
|
| 52 |
}
|
| 53 |
|
|
|
|
| 48 |
"max": 2_000,
|
| 49 |
"license": "cc-by-4.0",
|
| 50 |
},
|
| 51 |
+
{
|
| 52 |
+
"repo": "Pullo-Africa-Protagonist/Fula-pular",
|
| 53 |
+
"config": "default",
|
| 54 |
+
"split": "train",
|
| 55 |
+
"audio_col": "audio",
|
| 56 |
+
"text_col": "transcription",
|
| 57 |
+
"max": 5_000,
|
| 58 |
+
"license": "cc-by-4.0",
|
| 59 |
+
"note": "9,761 Pular (Guinea) audio rows — primary ASR training source",
|
| 60 |
+
},
|
| 61 |
+
{
|
| 62 |
+
"repo": "guizme/adlam_fulfulde",
|
| 63 |
+
"config": "default",
|
| 64 |
+
"split": "train",
|
| 65 |
+
"audio_col": "audio",
|
| 66 |
+
"text_col": "transcription",
|
| 67 |
+
"max": 51,
|
| 68 |
+
"license": "cc-by-4.0",
|
| 69 |
+
"adlam": True,
|
| 70 |
+
"note": "51 Adlam-script audio rows — converted to Latin before training",
|
| 71 |
+
},
|
| 72 |
],
|
| 73 |
}
|
| 74 |
|