ground-zero / src /data /adlam.py
jefffffff9
Add Adlam/Pular Fula integration: transliterator + 3 new datasets + normalisation pipeline
ced078c
"""
Adlam ↔ Latin transliteration for Pular (Guinea Fula).
Adlam (𞤀𞤣𞤤𞤢𞤥) is the indigenous alphabet created by Ibrahima and Abdoulaye Barry
for the Fula language family. Unicode block U+1E900–U+1E95F.
This module provides:
- adlam_to_latin(text) — convert Adlam script → Latin romanization
- latin_to_adlam(text) — convert Latin romanization → Adlam script
- normalize_pular(text) — canonical pre-processing for ASR training:
converts Adlam to Latin, applies Unicode NFC, lowercases, collapses whitespace
- contains_adlam(text) — detect whether a string has Adlam characters
Transliteration table follows the standard Pular (Guinea) orthography used in:
- SIL/Fulfulde literacy materials
- Pullo-Africa-Protagonist dataset
- guizme/adlam_fulfulde dataset
Note: Adlam lies outside the Unicode BMP (it is in the Supplementary
Multilingual Plane). Whisper's byte-level BPE tokenizer can still encode it,
but has never seen Adlam in pre-training text, so Adlam tokens produce garbage
output. Training and ASR therefore always use Latin romanization; Adlam is
converted to Latin before feeding to the model, and Latin is kept as-is for
display.
"""
from __future__ import annotations
import re
import unicodedata
# ── Adlam → Latin mapping (uppercase + lowercase pairs) ──────────────────────
# Source: Unicode code chart for the Adlam block (U+1E900–U+1E95F).
#
# Code points follow the *Adlam* alphabet order (alif, daali, laam, miim, ba, …),
# NOT the Latin A–Z order, so each entry below is keyed on the character's
# Unicode name. Sanity check: 𞤀𞤣𞤤𞤢𞤥 must romanise to "Adlam".
#
# Letters with no ASCII equivalent use the Pular Latin-orthography letters
# ɓ, ɗ, ƴ, ɲ and ŋ; KHA, GBE, KPO and SHA are written as digraphs.
#
# Not mapped: the combining marks U+1E944–U+1E94B (lengtheners, gemination,
# hamza, nukta, nasalization) and the initial punctuation U+1E95E/U+1E95F.
# They are context-dependent and are absent from the lookup tables.
_ADLAM_CAPITALS: list[tuple[str, str]] = [
    ("\U0001e900", "A"),       # 𞤀 ALIF
    ("\U0001e901", "D"),       # 𞤁 DAALI
    ("\U0001e902", "L"),       # 𞤂 LAAM
    ("\U0001e903", "M"),       # 𞤃 MIIM
    ("\U0001e904", "B"),       # 𞤄 BA
    ("\U0001e905", "S"),       # 𞤅 SINNYIIYHE
    ("\U0001e906", "P"),       # 𞤆 PE
    ("\U0001e907", "\u0181"),  # 𞤇 BHE  → Ɓ
    ("\U0001e908", "R"),       # 𞤈 RA
    ("\U0001e909", "E"),       # 𞤉 E
    ("\U0001e90a", "F"),       # 𞤊 FA
    ("\U0001e90b", "I"),       # 𞤋 I
    ("\U0001e90c", "O"),       # 𞤌 O
    ("\U0001e90d", "\u018a"),  # 𞤍 DHA  → Ɗ
    ("\U0001e90e", "\u01b3"),  # 𞤎 YHE  → Ƴ
    ("\U0001e90f", "W"),       # 𞤏 WAW
    ("\U0001e910", "N"),       # 𞤐 NUN
    ("\U0001e911", "K"),       # 𞤑 KAF
    ("\U0001e912", "Y"),       # 𞤒 YA
    ("\U0001e913", "U"),       # 𞤓 U
    ("\U0001e914", "J"),       # 𞤔 JIIM
    ("\U0001e915", "C"),       # 𞤕 CHI
    ("\U0001e916", "H"),       # 𞤖 HA
    ("\U0001e917", "Q"),       # 𞤗 QAAF
    ("\U0001e918", "G"),       # 𞤘 GA
    ("\U0001e919", "\u019d"),  # 𞤙 NYA  → Ɲ
    ("\U0001e91a", "T"),       # 𞤚 TU
    ("\U0001e91b", "\u014a"),  # 𞤛 NHA  → Ŋ
    ("\U0001e91c", "V"),       # 𞤜 VA
    ("\U0001e91d", "KH"),      # 𞤝 KHA
    ("\U0001e91e", "GB"),      # 𞤞 GBE
    ("\U0001e91f", "Z"),       # 𞤟 ZAL
    ("\U0001e920", "KP"),      # 𞤠 KPO
    ("\U0001e921", "SH"),      # 𞤡 SHA
]

# Each small letter sits exactly 0x22 code points above its capital
# (U+1E900–U+1E921 → U+1E922–U+1E943), so the lowercase half is derived
# rather than typed out a second time.
_ADLAM_CASE_OFFSET = 0x22
# Adlam digits 𞥐–𞥙 occupy U+1E950–U+1E959 in ascending order.
_ADLAM_DIGIT_ZERO = 0x1E950

# Full table: capitals, then small letters, then digits.
_ADLAM_TO_LATIN: list[tuple[str, str]] = (
    _ADLAM_CAPITALS
    + [
        (chr(ord(adlam) + _ADLAM_CASE_OFFSET), latin.lower())
        for adlam, latin in _ADLAM_CAPITALS
    ]
    + [(chr(_ADLAM_DIGIT_ZERO + digit), str(digit)) for digit in range(10)]
)

# Build fast lookup dicts
_A2L: dict[str, str] = dict(_ADLAM_TO_LATIN)

# Latin → Adlam, keyed by lowercase Latin. Only entries whose Latin side is
# already lowercase (small letters and digits) are used, so lowercased input
# maps to *small* Adlam letters rather than capitals.
_L2A: dict[str, str] = {
    latin: adlam for adlam, latin in _ADLAM_TO_LATIN if latin == latin.lower()
}

# Adlam Unicode range for fast detection
_ADLAM_START = 0x1E900
_ADLAM_END = 0x1E95F
def contains_adlam(text: str) -> bool:
    """Report whether *text* holds at least one code point in the Adlam block.

    A character counts as Adlam when its code point lies between
    ``_ADLAM_START`` and ``_ADLAM_END`` inclusive. An empty string yields False.
    """
    for char in text:
        if _ADLAM_START <= ord(char) <= _ADLAM_END:
            return True
    return False
def adlam_to_latin(text: str) -> str:
    """Romanise *text* by replacing every Adlam character found in ``_A2L``.

    Each character is looked up individually; characters that have no entry
    in the table are copied to the output unchanged.
    """
    return "".join(_A2L.get(char, char) for char in text)
def latin_to_adlam(text: str) -> str:
    """
    Convert a Latin romanization into Adlam script.

    The input is lowercased first, then scanned left to right. At each
    position a two-character key of the Latin→Adlam table (a digraph) is tried
    before falling back to a single character. Characters with no table entry
    are copied through in their lowercased form.
    """
    lowered = text.lower()
    # Two-character keys only; every key has the same length, so a direct
    # lookup of the next two characters finds the one possible match.
    two_char = {latin: adlam for latin, adlam in _L2A.items() if len(latin) == 2}

    pieces = []
    pos = 0
    end = len(lowered)
    while pos < end:
        pair = lowered[pos:pos + 2]
        if pair in two_char:
            pieces.append(two_char[pair])
            pos += 2
            continue
        single = lowered[pos]
        pieces.append(_L2A.get(single, single))
        pos += 1
    return "".join(pieces)
def normalize_pular(text: str) -> str:
    """
    Produce the canonical form of a Pular (Guinea Fula) string for ASR training.

    Steps, in order:
    1. Romanise the text when it contains any Adlam character
    2. Apply Unicode NFC composition
    3. Lowercase
    4. Replace each whitespace run with one space and trim both ends
    """
    romanised = adlam_to_latin(text) if contains_adlam(text) else text
    composed = unicodedata.normalize("NFC", romanised)
    return re.sub(r"\s+", " ", composed.lower()).strip()