Spaces:
Running
Running
jefffffff9
Add Adlam/Pular Fula integration: transliterator + 3 new datasets + normalisation pipeline
ced078c | """ | |
| Adlam ↔ Latin transliteration for Pular (Guinea Fula). | |
| Adlam (𞤀𞤣𞤤𞤢𞤥) is the indigenous alphabet created by Ibrahima and Abdoulaye Barry | |
| for the Fula language family. Unicode block U+1E900–U+1E95F. | |
| This module provides: | |
| - adlam_to_latin(text) — convert Adlam script → Latin romanization | |
| - latin_to_adlam(text) — convert Latin romanization → Adlam script | |
| - normalize_pular(text) — canonical pre-processing for ASR training: | |
| converts Adlam to Latin, applies Unicode NFC, lowercases, collapses whitespace | |
| - contains_adlam(text) — detect whether a string has Adlam characters | |
| Transliteration table follows the standard Pular (Guinea) orthography used in: | |
| - SIL/Fulfulde literacy materials | |
| - Pullo-Africa-Protagonist dataset | |
| - guizme/adlam_fulfulde dataset | |
| Note: Whisper's byte-level BPE tokenizer can encode any Unicode text (Adlam | |
| included, even though it lies outside the BMP), but the model has never seen | |
| Adlam in pre-training text, so Adlam tokens produce garbage output. Training | |
| and ASR therefore always use Latin romanization; Adlam is converted to Latin | |
| before feeding to the model, and Latin is kept as-is for display. | |
| """ | |
| from __future__ import annotations | |
| import re | |
| import unicodedata | |
| # ── Adlam → Latin mapping (uppercase + lowercase pairs) ────────────────────── | |
| # Source: Unicode Adlam chart + SIL Pulaar keyboard standard | |
# Adlam code points are NOT laid out in Latin alphabetical order: the block
# starts ALIF, DAALI, LAAM, MIIM, BA, ... (the first four letters spell
# "ADLaM").  Each entry below is keyed by code point and annotated with the
# letter name from the Unicode character database so it can be checked with
# ``unicodedata.name``.
#
# Romanization: core Fula letters use the Guinea Pular Latin letters
# (hooked b/d/y, palatal and velar nasals); the supplementary letters used for
# loanwords and other languages follow their Unicode names (KH, GB, KP, SH).
# NOTE(review): confirm the hooked-letter / nasal spellings against the
# transcripts of the training datasets before retraining.
_ADLAM_TO_LATIN: list[tuple[str, str]] = [
    # Capital letters (U+1E900-U+1E921)
    ("\U0001e900", "A"),   # ALIF
    ("\U0001e901", "D"),   # DAALI
    ("\U0001e902", "L"),   # LAAM
    ("\U0001e903", "M"),   # MIIM
    ("\U0001e904", "B"),   # BA
    ("\U0001e905", "S"),   # SINNYIIYHE
    ("\U0001e906", "P"),   # PE
    ("\U0001e907", "Ɓ"),   # BHE (implosive b)
    ("\U0001e908", "R"),   # RA
    ("\U0001e909", "E"),   # E
    ("\U0001e90a", "F"),   # FA
    ("\U0001e90b", "I"),   # I
    ("\U0001e90c", "O"),   # O
    ("\U0001e90d", "Ɗ"),   # DHA (implosive d)
    ("\U0001e90e", "Ƴ"),   # YHE (glottalized y)
    ("\U0001e90f", "W"),   # WAW
    ("\U0001e910", "N"),   # NUN
    ("\U0001e911", "K"),   # KAF
    ("\U0001e912", "Y"),   # YA
    ("\U0001e913", "U"),   # U
    ("\U0001e914", "J"),   # JIIM
    ("\U0001e915", "C"),   # CHI
    ("\U0001e916", "H"),   # HA
    ("\U0001e917", "Q"),   # QAAF
    ("\U0001e918", "G"),   # GA
    ("\U0001e919", "Ɲ"),   # NYA (palatal nasal)
    ("\U0001e91a", "T"),   # TU
    ("\U0001e91b", "Ŋ"),   # NHA (velar nasal)
    ("\U0001e91c", "V"),   # VA
    ("\U0001e91d", "KH"),  # KHA
    ("\U0001e91e", "GB"),  # GBE
    ("\U0001e91f", "Z"),   # ZAL
    ("\U0001e920", "KP"),  # KPO
    ("\U0001e921", "SH"),  # SHA
]
# Small letters (U+1E922-U+1E943) sit exactly 0x22 code points after their
# capitals; deriving them keeps every case pair consistent by construction.
_ADLAM_TO_LATIN += [
    (chr(ord(adl) + 0x22), lat.lower()) for adl, lat in _ADLAM_TO_LATIN
]
# Digits zero to nine (U+1E950-U+1E959).
_ADLAM_TO_LATIN += [(chr(0x1E950 + digit), str(digit)) for digit in range(10)]
# Build fast lookup dicts.
# Adlam -> Latin: one romanization per Adlam code point.
_A2L: dict[str, str] = dict(_ADLAM_TO_LATIN)
# Latin -> Adlam, keyed by the exact (case-sensitive) romanization.  Keys used
# to be lowercased, which made every capital Adlam letter overwrite its
# lowercase twin, so lowercase Latin input could only ever produce capitals.
# Keeping the case lets "a" resolve to the small letter and "A" to the capital.
# When several Adlam letters share one romanization, the first table entry
# wins.
_L2A: dict[str, str] = {}
for _a, _l in _ADLAM_TO_LATIN:
    _L2A.setdefault(_l, _a)
# Inclusive bounds of the Adlam Unicode block, used for cheap detection.
_ADLAM_START = 0x1E900
_ADLAM_END = 0x1E95F


def contains_adlam(text: str) -> bool:
    """Tell whether *text* holds at least one code point from the Adlam block.

    Returns False for the empty string.
    """
    low, high = _ADLAM_START, _ADLAM_END
    for codepoint in map(ord, text):
        if low <= codepoint <= high:
            return True
    return False
def adlam_to_latin(text: str) -> str:
    """Romanize the Adlam characters found in *text*.

    Each character is looked up in the Adlam -> Latin table; any character
    without an entry (Latin letters, punctuation, whitespace, ...) is copied
    through unchanged.
    """
    return "".join(_A2L.get(symbol, symbol) for symbol in text)
def latin_to_adlam(text: str) -> str:
    """Turn a Latin romanization into Adlam script.

    The input is lowercased first.  At every position a two-character
    romanization from the Latin -> Adlam table is tried before falling back
    to a single character; characters with no table entry are kept as they
    are (in their lowercased form).
    """
    lowered = text.lower()
    # Two-letter romanizations have to be consumed as a unit, otherwise their
    # letters would be converted one by one.
    pairs = {lat: adl for lat, adl in _L2A.items() if len(lat) == 2}
    pieces = []
    pos = 0
    end = len(lowered)
    while pos < end:
        candidate = lowered[pos:pos + 2]
        if candidate in pairs:
            pieces.append(pairs[candidate])
            pos += 2
        else:
            single = lowered[pos]
            pieces.append(_L2A.get(single, single))
            pos += 1
    return "".join(pieces)
def normalize_pular(text: str) -> str:
    """Bring Pular (Guinea Fula) text into the canonical form used for ASR.

    Steps, in order:
        1. romanize Adlam characters, if the text contains any
        2. apply Unicode NFC normalization
        3. lowercase
        4. squeeze each whitespace run to one space and trim both ends
    """
    latin = adlam_to_latin(text) if contains_adlam(text) else text
    folded = unicodedata.normalize("NFC", latin).lower()
    return re.sub(r"\s+", " ", folded).strip()