Spaces:

Dionyssos
/

SHIFT

Running

App Files Files

xet

Community

SHIFT / textual.py

Dionyssos

txt ruls

2049895 20 days ago

raw

history blame contribute delete

16.1 kB

	import re
	import unicodedata
	from num2words import num2words
	from num2word_greek.numbers2words import convert_numbers

	def only_greek_or_only_latin(text, lang='grc'):
	    """Transliterate ``text`` so that it uses a single script.

	    The input is lowercased with ``str.lower()`` and scanned left to right.
	    At every position the longest key of the active table that matches is
	    replaced by its mapped value, so multi-character keys such as ``'ch'``
	    or ``'ου'`` win over their single-character prefixes. Characters that
	    have no mapping (digits, punctuation, letters already in the target
	    script, accented letters) are copied through unchanged.

	    Args:
	        text: String to transliterate.
	        lang: ``'grc'`` maps Latin and Cyrillic letters to Greek. Any other
	            value maps Greek and Cyrillic letters to Latin.

	    Returns:
	        The lowercased, transliterated string.
	    """
	    latin_to_greek_map = {
	        'a': 'α', 'b': 'β', 'g': 'γ', 'd': 'δ', 'e': 'ε',
	        'ch': 'τσο',  # Example of a multi-character Latin sequence
	        'z': 'ζ', 'h': 'χ', 'i': 'ι', 'j': 'ζ', 'k': 'κ', 'l': 'λ',
	        'm': 'μ', 'n': 'ν', 'x': 'ξ', 'o': 'ο', 'p': 'π', 'q': 'κ',
	        'v': 'β', 'sc': 'σκ', 'r': 'ρ', 's': 'σ', 't': 'τ',
	        'u': 'ου', 'f': 'φ', 'c': 'σ', 'w': 'β', 'y': 'γ',
	    }

	    greek_to_latin_map = {
	        'ου': 'ou',  # Digraph: must be matched before the single 'ο' / 'υ'
	        'α': 'a', 'β': 'v', 'γ': 'g', 'δ': 'd', 'ε': 'e',
	        'ζ': 'z', 'η': 'i', 'θ': 'th', 'ι': 'i', 'κ': 'k',
	        'λ': 'l', 'μ': 'm', 'ν': 'n', 'ξ': 'x', 'ο': 'o',
	        'π': 'p', 'ρ': 'r', 'σ': 's', 'τ': 't', 'υ': 'y',  # 'y' is a common transliteration for upsilon
	        'φ': 'f', 'χ': 'ch', 'ψ': 'ps', 'ω': 'o',
	        'ς': 's',  # Final sigma
	    }

	    cyrillic_to_latin_map = {
	        'а': 'a', 'б': 'b', 'в': 'v', 'г': 'g', 'д': 'd', 'е': 'e', 'ж': 'z',
	        'з': 'z', 'и': 'i', 'ј': 'j', 'к': 'k', 'л': 'l', 'м': 'm', 'н': 'n',
	        'о': 'o', 'п': 'p', 'р': 'r', 'с': 's', 'т': 't', 'у': 'u', 'ф': 'f',
	        'х': 'h', 'ц': 'c', 'ч': 'c', 'ш': 's', "ž": "z",
	        'ђ': 'dzi', 'љ': 'li', 'њ': 'ni', 'ћ': 'c', 'џ': 'dz',
	        'ё': 'e', 'й': 'i', 'щ': 's', 'ъ': '', 'ы': 'y', 'ь': '',
	        'э': 'e', 'ю': 'io', 'я': 'a',
	        'ѓ': 'y', 'ѕ': 's', 'ќ': 'k',
	    }

	    # Cyrillic to Greek on phonetic similarity.
	    cyrillic_to_greek_map = {
	        'а': 'α', 'б': 'μπ', 'в': 'β', 'г': 'γ', 'д': 'δ', 'е': 'ε',
	        'ж': 'ζ', 'з': 'ζ', 'и': 'ι', 'й': 'ι', 'к': 'κ',
	        'л': 'λ', 'м': 'μ', 'н': 'ν', 'о': 'ο', 'п': 'π', 'р': 'ρ',
	        'с': 'τσ', 'т': 'τ', 'у': 'ού', 'ф': 'φ', 'х': 'χ', 'ц': 'τσ',
	        'ч': 'τσ', 'ш': 'σ', 'щ': 'σ',
	        'ђ': 'ντζι', 'љ': 'λι', 'њ': 'νι', 'ћ': 'τσ', 'џ': 'ντζ',
	        'ы': 'ι', 'ь': '',
	        'э': 'ε', 'ю': 'ιο', 'я': 'ια',
	        'ѓ': 'γ', 'ѕ': 'σ',
	    }

	    if lang == 'grc':
	        # Merge the tables with dict unpacking; a bare {a, b} would build a
	        # set of dicts and fail with "unhashable type: 'dict'".
	        conversion_map = {**latin_to_greek_map, **cyrillic_to_greek_map}
	    else:
	        # Any other language code selects the Latin target. On duplicate
	        # keys the Cyrillic table (merged last) wins.
	        conversion_map = {**greek_to_latin_map, **cyrillic_to_latin_map}
	        # Strip combining marks only from values reached through Greek keys;
	        # accents on Latin characters of the input are never touched because
	        # those characters are not keys of this table.
	        for greek_key in greek_to_latin_map:
	            decomposed = unicodedata.normalize('NFD', conversion_map[greek_key])
	            conversion_map[greek_key] = ''.join(
	                ch for ch in decomposed if not unicodedata.combining(ch)
	            )

	    # Lowercase with str.lower(), which leaves accents in place.
	    lowercased_text = text.lower()
	    longest_key = max(map(len, conversion_map))

	    output_chars = []
	    current_index = 0
	    while current_index < len(lowercased_text):
	        # Try the longest candidate slice first so digraphs beat single letters.
	        for size in range(longest_key, 0, -1):
	            chunk = lowercased_text[current_index:current_index + size]
	            if chunk in conversion_map:
	                output_chars.append(conversion_map[chunk])
	                current_index += len(chunk)
	                break
	        else:
	            # No mapping starts here: keep the character as it is.
	            output_chars.append(lowercased_text[current_index])
	            current_index += 1

	    return ''.join(output_chars)


	def fix_vocals(text, lang='ron'):
	    """Apply the per-language substitution table for ``lang`` to ``text``.

	    Each table mixes phonetic respellings (e.g. ``'sch' -> 'sh'``) with
	    spoken forms of math and currency symbols. Substitutions are applied one
	    after another with ``str.replace``, longest key first. Keys of equal
	    length keep their order in the table (``sorted`` is stable), which is
	    what makes chained entries such as ``'ć' -> 'č' -> 'ts'`` work; a later
	    substitution also sees text inserted by an earlier one.

	    Args:
	        text: String to rewrite.
	        lang: One of ``'grc'``, ``'ron'``, ``'eng'``, ``'deu'``, ``'fra'``,
	            ``'hun'`` or ``'rmc-script_latin'``.

	    Returns:
	        The rewritten string, or ``text`` unchanged (after printing a
	        warning) when ``lang`` has no table.
	    """
	    ron_replacements = {
	        'ţ': 'ț',
	        'ț': 'ts',
	        'î': 'u',
	        'â': 'a',
	        'ş': 's',
	        'w': 'oui',
	        'k': 'c',
	        'l': 'll',
	        # Math symbols
	        'sqrt': ' rădăcina pătrată din ',
	        '^': ' la puterea ',
	        '+': ' plus ',
	        ' - ': ' minus ',  # only when standalone, so "a-b-c" is not read as minus
	        '/': ' împărțit la ',  # divided by
	        '=': ' egal cu ',  # equals
	        'pi': ' pi ',
	        '<': ' mai mic decât ',
	        '>': ' mai mare decât ',  # trailing space keeps the next token separate
	        '%': ' la sută ',  # percent
	        '€': ' euro ',
	        '$': ' dolar ',
	        '£': ' liră ',
	        '&': ' și ',  # and
	        '∑': ' sumă ',
	        '∫': ' integrală ',
	    }

	    eng_replacements = {
	        'wik': 'weaky',
	        'sh': 'ss',
	        'ch': 'ttss',
	        'oo': 'oeo',
	        # Math symbols for English
	        'sqrt': ' square root of ',
	        '^': ' to the power of ',
	        '+': ' plus ',
	        ' - ': ' minus ',
	        ' / ': ' divided by ',
	        '=': ' equals ',
	        'pi': ' pi ',
	        '<': ' less than ',
	        '>': ' greater than ',
	        '%': ' percent ',
	        '€': ' euro ',
	        '$': ' dollar ',
	        '£': ' pound ',
	        '&': ' and ',
	        '@': ' at ',
	        '#': ' hash ',
	    }

	    serbian_replacements = {
	        'rn': 'rrn',
	        'ć': 'č',
	        'c': 'č',
	        'č': 'ts',
	        'đ': 'dz',
	        'j': 'i',
	        'l': 'lll',
	        'w': 'v',
	        'h': 'hh',
	        # https://huggingface.co/facebook/mms-tts-rmc-script_latin
	        'sqrt': ' kvadratni koren iz ',  # padded like every other language
	        '^': ' na stepen ',
	        '+': ' plus ',
	        ' - ': ' minus ',
	        '*': ' puta ',
	        ' / ': ' podeljeno sa ',
	        '=': ' jednako ',
	        'pi': ' pi ',
	        '<': ' manje od ',
	        '>': ' veće od ',
	        '%': ' procenat ',
	        '€': ' evro ',
	        '$': ' dolar ',
	        '£': ' funta ',
	    }

	    deu_replacements = {
	        'sch': 'sh',
	        'ch': 'kh',
	        'ie': 'ee',
	        'ei': 'ai',
	        'ä': 'ae',
	        'ö': 'oe',
	        'ü': 'ue',
	        'ß': 'ss',
	        # Math symbols for German
	        'sqrt': ' Quadratwurzel aus ',
	        '^': ' hoch ',
	        '+': ' plus ',
	        ' - ': ' minus ',
	        '*': ' mal ',
	        ' / ': ' geteilt durch ',
	        '=': ' gleich ',
	        'pi': ' pi ',
	        '<': ' kleiner als ',
	        '>': ' größer als ',  # trailing space keeps the next token separate
	        '%': ' prozent ',
	        '€': ' euro ',
	        '$': ' dollar ',
	        '£': ' pfund ',
	        '&': ' und ',
	        '@': ' at ',  # 'Klammeraffe' is also common but 'at' is simpler
	        '#': ' raute ',
	    }

	    fra_replacements = {
	        'w': 'v',
	        # Math symbols for French
	        'sqrt': ' racine carrée de ',
	        '^': ' à la puissance ',
	        '+': ' plus ',
	        ' - ': ' moins ',
	        '*': ' fois ',
	        ' / ': ' divisé par ',
	        '=': ' égale ',
	        'pi': ' pi ',
	        '<': ' inférieur à ',
	        '>': ' supérieur à ',
	        '%': ' pour cent ',
	        '€': ' euro ',
	        '$': ' dollar ',
	        '£': ' livre ',
	        '&': ' et ',
	        '@': ' arobase ',
	    }

	    hun_replacements = {
	        'ch': 'ts',
	        'cs': 'tz',
	        'g': 'gk',
	        'w': 'v',
	        'z': 'zz',
	        # Math symbols for Hungarian
	        'sqrt': ' négyzetgyök ',
	        '^': ' hatvány ',
	        '+': ' plusz ',
	        ' - ': ' mínusz ',
	        '*': ' szorozva ',
	        ' / ': ' osztva ',
	        '=': ' egyenlő ',
	        'pi': ' pi ',
	        '<': ' kisebb mint ',
	        '>': ' nagyobb mint ',
	        '%': ' százalék ',
	        '€': ' euró ',
	        '$': ' dollár ',
	        '£': ' font ',
	        '&': ' és ',
	        '@': ' kukac ',
	        '#': ' kettőskereszt ',
	    }

	    grc_replacements = {
	        # Math symbols for Ancient Greek (literal translations)
	        'sqrt': ' τετραγωνικὴ ῥίζα ',
	        '^': ' εἰς τὴν δύναμιν ',
	        '+': ' σὺν ',
	        ' - ': ' χωρὶς ',
	        ' * ': ' πολλάκις ',
	        ' / ': ' διαιρέω ',
	        '=': ' ἴσον ',
	        'pi': ' πῖ ',
	        '<': ' ἔλαττον ',
	        '>': ' μεῖζον ',
	        '%': ' τοῖς ἑκατόν ',  # tois hekaton - 'of the hundred'
	        '€': ' εὐρώ ',
	        '$': ' δολάριον ',
	        '£': ' λίρα ',
	        '&': ' καὶ ',
	        '@': ' ἀτ ',  # at
	        '#': ' δίεση ',  # hash
	    }

	    # Language code -> substitution table.
	    replacements_map = {
	        'grc': grc_replacements,
	        'ron': ron_replacements,
	        'eng': eng_replacements,
	        'deu': deu_replacements,
	        'fra': fra_replacements,
	        'hun': hun_replacements,
	        'rmc-script_latin': serbian_replacements,
	    }

	    current_replacements = replacements_map.get(lang)
	    if current_replacements is None:
	        # Unsupported language: warn and hand the text back untouched.
	        print(f"Warning: Language '{lang}' not supported for text replacement. Returning original text.")
	        return text

	    # Longest keys first, so 'sqrt' or 'sch' are consumed before their
	    # shorter substrings ('s', 'ch', ...). The sort is stable, so keys of the
	    # same length are applied in table order.
	    sorted_replacements = sorted(
	        current_replacements.items(),
	        key=lambda item: len(item[0]),
	        reverse=True,
	    )
	    for old, new in sorted_replacements:
	        text = text.replace(old, new)
	    return text


	def _num2words(text='01234', lang=None):
	    """Spell out the number ``text`` in words for language ``lang``.

	    ``'grc'`` is routed to ``convert_numbers``; every other value is handed
	    to ``num2words``.
	    """
	    use_greek_converter = lang == 'grc'
	    if not use_greek_converter:
	        # lang must be passed by keyword (HAS TO BE kwarg lang=lang).
	        return num2words(text, lang=lang)
	    return convert_numbers(text)


	def transliterate_number(number_string, lang=None):
	    """Replace every number in ``number_string`` with its spelled-out words.

	    Three forms are recognised: integers (``42``), decimals (``3.14``: the
	    integer part, a decimal-separator word, then each fractional digit on
	    its own) and scientific notation (``1e5``, ``2.5E-3``: base, an exponent
	    phrase, then the exponent). The words come from ``_num2words``.

	    Args:
	        number_string: Text that may contain numbers.
	        lang: Language code. ``'rmc-script_latin'``, ``'ron'``, ``'hun'``,
	            ``'deu'``, ``'fra'`` and ``'grc'`` have their own connective
	            words. Any other value is cut to its first two characters and
	            uses English connectives; ``None`` is treated as English.

	    Returns:
	        ``number_string`` with the numbers replaced. A number whose
	        conversion raises ``ValueError`` is left as it was.
	    """
	    # lang -> (code handed to _num2words, exponent phrase, decimal word).
	    # Every connective is padded with spaces so it never fuses with the
	    # number words around it.
	    connectives = {
	        'rmc-script_latin': ('sr', ' puta deset na stepen od ', ' tačka '),
	        'ron': ('ro', ' ori zece la puterea ', ' virgulă '),
	        'hun': ('hu', ' tízszer a erejéig ', ' virgula '),
	        'deu': ('de', ' mal zehn hoch ', ' komma '),
	        'fra': ('fr', ' puissance ', ' virgule '),
	        'grc': ('grc', ' εις την δυναμην του ', ' κομμα '),
	    }
	    if lang in connectives:
	        lang, exponential_pronoun, comma = connectives[lang]
	    else:
	        # Fall back to English wording; tolerate lang=None (the default).
	        lang = (lang or 'en')[:2]
	        exponential_pronoun = ' times ten to the power of '
	        comma = ' point '

	    def replace_number(match):
	        """Return the spoken form of one matched number."""
	        original = match.group(0)
	        number_part = original.lower()
	        try:
	            if 'e' in number_part:
	                base, exponent = number_part.split('e')
	                return (
	                    _num2words(base, lang=lang)
	                    + exponential_pronoun
	                    + _num2words(exponent, lang=lang)
	                )
	            if '.' in number_part:
	                integer_part, decimal_part = number_part.split('.')
	                spoken_digits = " ".join(
	                    _num2words(digit, lang=lang) for digit in decimal_part
	                )
	                return _num2words(integer_part, lang=lang) + comma + spoken_digits
	            return _num2words(number_part, lang=lang)
	        except ValueError:
	            return original  # Return original if conversion fails

	    # Lookarounds delimit the number without consuming its neighbours, so
	    # numbers at the very start or end of the string, and numbers separated
	    # by a single character ("1 2 3"), are all converted.
	    pattern = r'(?<!\d)\d+(?:\.\d+)?(?:[Ee][+-]?\d+)?(?!\d)'
	    return re.sub(pattern, replace_number, number_string)