Spaces:

MataStrategy
/

ground-zero

Running

jefffffff9

Add Adlam/Pular Fula integration: transliterator + 3 new datasets + normalisation pipeline

ced078c 19 days ago

6.47 kB

	"""
	Adlam ↔ Latin transliteration for Pular (Guinea Fula).

	Adlam (𞤀𞤣𞤤𞤢𞤥) is the indigenous alphabet created by Ibrahima and Abdoulaye Barry
	for the Fula language family. Unicode block U+1E900–U+1E95F.

	This module provides:
	- adlam_to_latin(text) — convert Adlam script → Latin romanization
	- latin_to_adlam(text) — convert Latin romanization → Adlam script
	- normalize_pular(text) — canonical pre-processing for ASR training:
	  converts Adlam to Latin, applies Unicode NFC, lowercases, collapses whitespace
	- contains_adlam(text) — detect whether a string has Adlam characters

	Transliteration table follows the standard Pular (Guinea) orthography used in:
	- SIL/Fulfulde literacy materials
	- Pullo-Africa-Protagonist dataset
	- guizme/adlam_fulfulde dataset

	Note: Whisper's byte-level BPE tokenizer can encode any Unicode text, including
	Adlam (which lies outside the BMP, in the Supplementary Multilingual Plane), but
	it has never seen Adlam in pre-training text, so Adlam tokens produce garbage
	output. Training and ASR therefore always use Latin romanization; Adlam is
	converted to Latin before feeding to the model, and Latin is kept as-is for
	display.
	"""
	from __future__ import annotations

	import re
	import unicodedata

	# ── Adlam → Latin mapping (uppercase + lowercase pairs) ──────────────────────
	# Letter order follows the Unicode Adlam block (U+1E900–U+1E95F), which is the
	# Adlam alphabet order (alif, daali, laam, miim, ba, ...) and NOT Latin A–Z
	# order: 34 capitals at U+1E900–U+1E921, the matching small letters 0x22 higher
	# at U+1E922–U+1E943, digits at U+1E950–U+1E959. Each row's comment shows the
	# capital glyph and its Unicode letter name.
	#
	# Latin values use the letters of the Latin Fula alphabet; the hooked letters
	# and eng are written as escapes so the table survives re-encoding.
	# NOTE(review): NYA is romanized as ɲ/Ɲ here; some Pulaar corpora spell the
	# same sound ñ — adjust this one row if the training data differs.
	_ADLAM_CAPITALS: list[tuple[str, str]] = [
	    ("\U0001e900", "A"),       # 𞤀 ALIF
	    ("\U0001e901", "D"),       # 𞤁 DAALI
	    ("\U0001e902", "L"),       # 𞤂 LAAM
	    ("\U0001e903", "M"),       # 𞤃 MIIM
	    ("\U0001e904", "B"),       # 𞤄 BA
	    ("\U0001e905", "S"),       # 𞤅 SINNYIIYHE
	    ("\U0001e906", "P"),       # 𞤆 PE
	    ("\U0001e907", "\u0181"),  # 𞤇 BHE  → Ɓ
	    ("\U0001e908", "R"),       # 𞤈 RA
	    ("\U0001e909", "E"),       # 𞤉 E
	    ("\U0001e90a", "F"),       # 𞤊 FA
	    ("\U0001e90b", "I"),       # 𞤋 I
	    ("\U0001e90c", "O"),       # 𞤌 O
	    ("\U0001e90d", "\u018a"),  # 𞤍 DHA  → Ɗ
	    ("\U0001e90e", "\u01b3"),  # 𞤎 YHE  → Ƴ
	    ("\U0001e90f", "W"),       # 𞤏 WAW
	    ("\U0001e910", "N"),       # 𞤐 NUN
	    ("\U0001e911", "K"),       # 𞤑 KAF
	    ("\U0001e912", "Y"),       # 𞤒 YA
	    ("\U0001e913", "U"),       # 𞤓 U
	    ("\U0001e914", "J"),       # 𞤔 JIIM
	    ("\U0001e915", "C"),       # 𞤕 CHI
	    ("\U0001e916", "H"),       # 𞤖 HA
	    ("\U0001e917", "Q"),       # 𞤗 QAAF
	    ("\U0001e918", "G"),       # 𞤘 GA
	    ("\U0001e919", "\u019d"),  # 𞤙 NYA  → Ɲ
	    ("\U0001e91a", "T"),       # 𞤚 TU
	    ("\U0001e91b", "\u014a"),  # 𞤛 NHA  → Ŋ
	    # Supplemental letters (used mainly for loanwords / other languages)
	    ("\U0001e91c", "V"),       # 𞤜 VA
	    ("\U0001e91d", "KH"),      # 𞤝 KHA
	    ("\U0001e91e", "GB"),      # 𞤞 GBE
	    ("\U0001e91f", "Z"),       # 𞤟 ZAL
	    ("\U0001e920", "KP"),      # 𞤠 KPO
	    ("\U0001e921", "SH"),      # 𞤡 SHA
	]

	# Distance from a capital Adlam letter to its small form (U+1E900 → U+1E922).
	_ADLAM_CASE_OFFSET = 0x22
	# ADLAM DIGIT ZERO; digits one to nine follow contiguously.
	_ADLAM_DIGIT_ZERO = 0x1E950

	# Full table in the order capitals, small letters, digits. The small-letter
	# rows are derived from the capitals so the two halves cannot drift apart.
	_ADLAM_TO_LATIN: list[tuple[str, str]] = (
	    _ADLAM_CAPITALS
	    + [
	        (chr(ord(cap) + _ADLAM_CASE_OFFSET), lat.lower())
	        for cap, lat in _ADLAM_CAPITALS
	    ]
	    + [(chr(_ADLAM_DIGIT_ZERO + d), str(d)) for d in range(10)]
	)

	# Build fast lookup dicts
	# _A2L: Adlam character → Latin romanization (case-preserving).
	_A2L: dict[str, str] = dict(_ADLAM_TO_LATIN)
	# _L2A: lowercase Latin key → small Adlam letter (or Adlam digit). Only the
	# small-letter and digit rows are used, so a lowercase Latin key never resolves
	# to a capital Adlam letter.
	_L2A: dict[str, str] = {
	    lat: adl for adl, lat in _ADLAM_TO_LATIN[len(_ADLAM_CAPITALS):]
	}

	# Inclusive code-point bounds of the Unicode Adlam block, used for detection.
	_ADLAM_START = 0x1E900
	_ADLAM_END = 0x1E95F


	def contains_adlam(text: str) -> bool:
	    """Report whether *text* has at least one code point in the Adlam block.

	    Args:
	        text: Any string; may be empty.

	    Returns:
	        True as soon as a character in ``_ADLAM_START``..``_ADLAM_END``
	        (inclusive) is found, otherwise False.
	    """
	    for char in text:
	        if _ADLAM_START <= ord(char) <= _ADLAM_END:
	            return True
	    return False


	def adlam_to_latin(text: str) -> str:
	    """Convert Adlam script characters to Latin romanization.

	    Every character that has an entry in ``_A2L`` is replaced by its Latin
	    value; everything else passes through unchanged.

	    Adlam writes length with combining marks, whereas Latin Fula orthography
	    writes it by doubling the letter. The three length marks are therefore
	    romanized as a lowercase repeat of the preceding letter's romanization:

	    - U+1E944 ADLAM ALIF LENGTHENER and U+1E945 ADLAM VOWEL LENGTHENER
	      (long vowel),
	    - U+1E946 ADLAM GEMINATION MARK (long consonant).

	    A length mark that does not directly follow a romanized Adlam letter is
	    left in place rather than guessed at.

	    NOTE(review): the remaining Adlam marks (hamza, consonant modifiers,
	    nukta, nasalization mark) and the two initial punctuation marks have no
	    table entry and still pass through unchanged — decide on a romanization
	    before relying on the output being Adlam-free.

	    Args:
	        text: Input string, possibly mixing Adlam and other scripts.

	    Returns:
	        The romanized string.
	    """
	    # Alif lengthener, vowel lengthener, gemination mark.
	    doubling_marks = "\U0001e944\U0001e945\U0001e946"
	    pieces = []
	    # Romanization of the Adlam letter immediately before the current
	    # character, or None when the previous character was not such a letter.
	    previous = None
	    for ch in text:
	        if previous is not None and ch in doubling_marks:
	            # Lowercase so a capital followed by a lengthener gives "Aa".
	            pieces.append(previous.lower())
	            previous = None
	            continue
	        latin = _A2L.get(ch)
	        if latin is None:
	            pieces.append(ch)
	            previous = None
	        else:
	            pieces.append(latin)
	            # Digits are in the table too but cannot be lengthened.
	            previous = latin if latin.isalpha() else None
	    return "".join(pieces)


	def latin_to_adlam(text: str) -> str:
	    """Convert Latin romanization to Adlam script, preserving letter case.

	    The text is scanned left to right. At each position a two-character
	    digraph key of ``_L2A`` is tried first, then the single character; keys
	    are matched case-insensitively. The Adlam result takes its case from the
	    first Latin character of the match (``"Kh"`` and ``"KH"`` both give the
	    capital letter), using the Unicode case mapping of the Adlam letters, so
	    the outcome does not depend on which case ``_L2A`` happens to store.

	    Characters with no mapping are copied through unchanged, including their
	    original case.

	    Args:
	        text: Latin-script input.

	    Returns:
	        The text with every mappable letter, digraph and digit replaced by
	        its Adlam counterpart.
	    """
	    out = []
	    i = 0
	    n = len(text)
	    while i < n:
	        # Longest match first: a digraph wins over its first letter alone.
	        chunk = text[i:i + 2]
	        adlam = _L2A.get(chunk.lower()) if len(chunk) == 2 else None
	        if adlam is None:
	            chunk = text[i]
	            adlam = _L2A.get(chunk.lower())
	        i += len(chunk)

	        if adlam is None:
	            out.append(chunk)
	        elif chunk[0].isupper():
	            out.append(adlam.upper())
	        else:
	            # Also the path for digits, which have no case.
	            out.append(adlam.lower())
	    return "".join(out)


	def normalize_pular(text: str) -> str:
	    """Bring Pular (Guinea Fula) text into the canonical form used for ASR training.

	    Steps, in order:

	    1. If the text contains Adlam, romanize it with ``adlam_to_latin``.
	    2. Apply Unicode NFC normalization.
	    3. Lowercase.
	    4. Replace every whitespace run with one space and trim both ends.

	    Args:
	        text: Raw transcript text in Latin and/or Adlam script.

	    Returns:
	        The normalized string.
	    """
	    romanized = adlam_to_latin(text) if contains_adlam(text) else text
	    folded = unicodedata.normalize("NFC", romanized).lower()
	    return re.sub(r"\s+", " ", folded).strip()