Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
| from modules.languages.constants import LANGUAGES | |
| import re | |
| import unicodedata | |
def clean_text(text: str) -> str:
    """Sanitize *text* before it is handed to the transliterator.

    Steps, in order:

    1. Normalize to Unicode NFC.
    2. Delete zero-width characters (U+200B-U+200D, U+2060, U+FEFF).
    3. Turn tabs and every Unicode space separator (category ``Zs``, which
       includes the non-breaking space) into a plain ASCII space.
    4. Drop every remaining non-printable character except ``\\n``.
    5. Collapse runs of spaces and strip leading/trailing whitespace.

    Args:
        text: The raw input string.

    Returns:
        The cleaned string; may be empty.
    """
    text = unicodedata.normalize("NFC", text)

    # NOTE(review): ZWNJ/ZWJ (U+200C/U+200D) can be meaningful in some scripts;
    # confirm that stripping them is intended for every supported language.
    text = re.sub(r'[\u200B-\u200D\uFEFF\u2060]', '', text)

    kept = []
    for ch in text:
        if ch == '\t' or unicodedata.category(ch) == 'Zs':
            # str.isprintable() is False for tabs and for all space separators
            # other than ' ', so they must become spaces *before* the filter
            # below -- otherwise the words on either side would be fused.
            kept.append(' ')
        elif ch == '\n' or ch.isprintable():
            kept.append(ch)
        # Anything else (control/format/unassigned characters) is dropped.
    text = ''.join(kept)

    text = re.sub(r'[ ]{2,}', ' ', text)
    return text.strip()
def fn_transliterate(input_text: str, input_language: str = 'autodetect') -> dict:
    """Transliterate *input_text* into every script listed in ``LANGUAGES``.

    The text is sanitized with ``clean_text`` and then passed to
    ``aksharamukha.transliterate.process`` once per target script.

    Args:
        input_text: Text to transliterate. ``None`` or blank text is not sent
            to the transliterator; it is echoed back for every language.
        input_language: Source script name passed as the first argument to
            ``transliterate.process``. Defaults to ``'autodetect'``.

    Returns:
        A dict keyed by each language's ``"code"``. Values are the
        transliterated text, or the (cleaned) input itself when there is
        nothing to transliterate. If anything fails -- including the
        ``aksharamukha`` import -- every value is ``'-'``.
    """
    try:
        # Imported lazily so that a missing/broken package is reported through
        # the same fallback path as a transliteration failure.
        from aksharamukha import transliterate

        # clean_text() cannot handle None, so only sanitize real input.
        if input_text is not None:
            input_text = clean_text(input_text)  # <-- sanitize here

        target_scripts = {lang["code"]: lang["aksharamukha_name"] for lang in LANGUAGES}

        # Nothing to transliterate: decide once, not once per target script.
        if input_text is None or input_text.strip() == "":
            return {code: input_text for code in target_scripts}

        return {
            code: transliterate.process(input_language, script_name, input_text)
            for code, script_name in target_scripts.items()
        }
    except Exception as e:
        # Slicing a non-string here would raise again and escape the handler,
        # defeating the fallback below.
        preview = input_text[:30] if isinstance(input_text, str) else input_text
        print(f"Error transliterating '{preview}...': {e}")
        return {lang["code"]: '-' for lang in LANGUAGES}