Spaces:

vikramvasudevan
/

sanatan_ai

Running on CPU Upgrade

sanatan_ai / modules /languages /transliterator.py

Upload folder using huggingface_hub

7217024 verified 25 days ago

1.59 kB

	from modules.languages.constants import LANGUAGES
	import re
	import unicodedata

	def clean_text(text: str) -> str:
	    """Sanitize *text* before it is handed to the transliterator.

	    Steps, in order:

	    1. Normalize to Unicode NFC.
	    2. Delete zero-width characters (U+200B..U+200D, U+2060, U+FEFF).
	    3. Normalize CRLF / lone CR line endings to LF.
	    4. Turn every other whitespace character (tab, no-break space,
	       ideographic space, ...) into a plain space, so that words separated
	       by such characters are not fused together.
	    5. Delete any remaining non-printable character.
	    6. Collapse runs of spaces and strip leading/trailing whitespace.

	    Args:
	        text: The raw input string.

	    Returns:
	        The cleaned string; line feeds inside the text are preserved.
	    """
	    # Compose to NFC so equivalent sequences share one canonical form.
	    text = unicodedata.normalize("NFC", text)

	    # Drop zero-width characters.
	    # NOTE(review): U+200C/U+200D (ZWNJ/ZWJ) can influence conjunct shaping in
	    # some Indic scripts; they are stripped here as before - confirm intended.
	    text = re.sub(r'[\u200B-\u200D\uFEFF\u2060]', '', text)

	    # Unify line endings first, so a CR is never treated as inline whitespace.
	    text = re.sub(r'\r\n?', '\n', text)

	    # Map every whitespace character except the line feed to a plain space.
	    # Without this, the printable filter below would delete tabs and Unicode
	    # space separators outright and glue the neighbouring words together.
	    text = re.sub(r'[^\S\n]', ' ', text)

	    # Remove whatever non-printable characters are left (control chars etc.).
	    text = ''.join(ch for ch in text if ch.isprintable() or ch == '\n')

	    # Collapse runs of spaces into one.
	    text = re.sub(r' {2,}', ' ', text)

	    return text.strip()


	def fn_transliterate(input_text: str, input_language: str = 'autodetect') -> dict:
	    """Transliterate *input_text* into every script listed in ``LANGUAGES``.

	    The text is sanitized with ``clean_text`` and then passed to
	    ``aksharamukha.transliterate.process`` once per target script.

	    Args:
	        input_text: Text to transliterate. ``None`` or blank text is not
	            sent to the transliterator; it is returned as-is (after
	            cleaning, for strings) for every language code.
	        input_language: Source script name passed as the first argument to
	            ``transliterate.process``. Defaults to ``'autodetect'``
	            (presumably handled by aksharamukha itself - verify).

	    Returns:
	        A dict mapping each ``lang["code"]`` in ``LANGUAGES`` to the
	        transliterated text. If anything fails (including the aksharamukha
	        import), the error is printed and every code maps to ``'-'``.
	    """
	    try:
	        # Imported lazily so a missing package degrades to the '-' fallback.
	        from aksharamukha import transliterate

	        # clean_text() cannot handle None; previously this raised and the
	        # error handler then crashed as well while slicing None.
	        if input_text is not None:
	            input_text = clean_text(input_text)

	        target_scripts = {lang["code"]: lang["aksharamukha_name"] for lang in LANGUAGES}

	        # Nothing to transliterate: echo the (cleaned) input for every code.
	        # Checked once here instead of once per target script.
	        if input_text is None or input_text.strip() == "":
	            return dict.fromkeys(target_scripts, input_text)

	        return {
	            code: transliterate.process(input_language, script_name, input_text)
	            for code, script_name in target_scripts.items()
	        }

	    except Exception as e:
	        # Best-effort boundary: report and fall back to placeholders.
	        # str() keeps the message safe when input_text is not sliceable.
	        print(f"Error transliterating '{str(input_text)[:30]}...': {e}")
	        return {lang["code"]: '-' for lang in LANGUAGES}