sanatan_ai / modules /languages /transliterator.py
vikramvasudevan's picture
Upload folder using huggingface_hub
7217024 verified
from modules.languages.constants import LANGUAGES
import re
import unicodedata
def clean_text(text: str) -> str:
    """Sanitize text before it is handed to the transliterator.

    Steps, in order:

    1. Normalize to Unicode NFC.
    2. Delete zero-width characters (U+200B-U+200D, U+FEFF, U+2060).
    3. Turn tabs and every Unicode space separator (category ``Zs``, which
       includes the non-breaking space) into a plain ASCII space, so that
       words separated by such characters are not glued together.
    4. Drop any remaining non-printable character except ``\\n``.
    5. Collapse runs of spaces into one and strip both ends.

    Args:
        text: The raw input string.

    Returns:
        The cleaned string (possibly empty).
    """
    # Normalize Unicode to the composed canonical form.
    text = unicodedata.normalize("NFC", text)

    # Remove zero-width characters.
    text = re.sub(r'[\u200B-\u200D\uFEFF\u2060]', '', text)

    # Map horizontal whitespace to a regular space. str.isprintable() is
    # False for tabs and for every space separator other than ' ', so
    # without this step the filter below would delete them and merge the
    # neighbouring words.
    text = ''.join(
        ' ' if ch == '\t' or unicodedata.category(ch) == 'Zs' else ch
        for ch in text
    )

    # Remove stray control / non-printable characters, keeping newlines.
    text = ''.join(ch for ch in text if ch.isprintable() or ch == '\n')

    # Collapse runs of spaces.
    text = re.sub(r' {2,}', ' ', text)
    return text.strip()
def fn_transliterate(input_text: str, input_language: str = 'autodetect') -> dict:
    """Transliterate ``input_text`` into every script listed in ``LANGUAGES``.

    The text is sanitized with ``clean_text`` and then passed to
    ``aksharamukha.transliterate.process`` once per entry of ``LANGUAGES``,
    using the entry's ``"aksharamukha_name"`` as the target script.

    Args:
        input_text: Text to transliterate. ``None`` or blank text is not
            sent to the transliterator.
        input_language: Source script name passed as the first argument to
            ``transliterate.process``; defaults to ``'autodetect'``.

    Returns:
        A dict keyed by each language's ``"code"``. Values are the
        transliterated strings; for ``None``/blank input every value is the
        (cleaned) input itself; if anything fails every value is ``'-'``.
    """
    try:
        # Imported lazily so a missing/broken package is reported through the
        # same fallback path as any other transliteration failure.
        from aksharamukha import transliterate

        # clean_text() cannot handle None, so only sanitize real input.
        if input_text is not None:
            input_text = clean_text(input_text)  # <-- sanitize here

        target_scripts = {lang["code"]: lang["aksharamukha_name"] for lang in LANGUAGES}

        # clean_text() already strips, so an empty string here means the
        # input was blank; check once instead of on every loop iteration.
        if not input_text:
            return {code: input_text for code in target_scripts}

        return {
            code: transliterate.process(input_language, script_name, input_text)
            for code, script_name in target_scripts.items()
        }
    except Exception as e:
        # Best-effort boundary: report and fall back to placeholders. The
        # preview must not assume a sliceable string, otherwise the handler
        # itself would raise for None or other non-string input.
        preview = input_text[:30] if isinstance(input_text, str) else repr(input_text)
        print(f"Error transliterating '{preview}...': {e}")
        return {lang["code"]: '-' for lang in LANGUAGES}