# SHIFT / textual.py
# Dionyssos's picture
# txt ruls
# 2049895
import re
import unicodedata
from num2words import num2words
from num2word_greek.numbers2words import convert_numbers
def only_greek_or_only_latin(text, lang='grc'):
    """Transliterate mixed-script text into a single target script.

    The input is lowercased with ``str.lower()`` and then rewritten with a
    greedy longest-match substitution:

    * ``lang == 'grc'``: Latin and Cyrillic letters are mapped to Greek.
      Characters without a mapping (Greek letters, digits, punctuation,
      accented Latin letters, ...) are kept unchanged.
    * any other ``lang``: Greek and Cyrillic letters are mapped to Latin.
      Diacritics are stripped from Greek letters first, so that accented
      forms such as 'ά' or 'ύ' are transliterated as well. Characters without
      a mapping (including accented Latin letters such as 'é') are kept
      unchanged.

    Args:
        text: Input string, possibly mixing Latin, Greek and Cyrillic.
        lang: ``'grc'`` to produce Greek script; any other value produces
            Latin script.

    Returns:
        The lowercased, transliterated string.
    """
    latin_to_greek_map = {
        'a': 'α', 'b': 'β', 'g': 'γ', 'd': 'δ', 'e': 'ε',
        'ch': 'τσο',  # Example of a multi-character Latin sequence
        'z': 'ζ', 'h': 'χ', 'i': 'ι', 'j': 'ζ', 'k': 'κ', 'l': 'λ',
        'm': 'μ', 'n': 'ν', 'x': 'ξ', 'o': 'ο', 'p': 'π', 'q': 'κ',
        'v': 'β', 'sc': 'σκ', 'r': 'ρ', 's': 'σ', 't': 'τ',
        'u': 'ου', 'f': 'φ', 'c': 'σ', 'w': 'β', 'y': 'γ',
    }
    greek_to_latin_map = {
        'ου': 'ou',  # Digraph; matched before the single letters
        'α': 'a', 'β': 'v', 'γ': 'g', 'δ': 'd', 'ε': 'e',
        'ζ': 'z', 'η': 'i', 'θ': 'th', 'ι': 'i', 'κ': 'k',
        'λ': 'l', 'μ': 'm', 'ν': 'n', 'ξ': 'x', 'ο': 'o',
        'π': 'p', 'ρ': 'r', 'σ': 's', 'τ': 't', 'υ': 'y',
        'φ': 'f', 'χ': 'ch', 'ψ': 'ps', 'ω': 'o',
        'ς': 's',  # Final sigma
    }
    cyrillic_to_latin_map = {
        'а': 'a', 'б': 'b', 'в': 'v', 'г': 'g', 'д': 'd', 'е': 'e', 'ж': 'z',
        'з': 'z', 'и': 'i', 'ј': 'j', 'к': 'k', 'л': 'l', 'м': 'm', 'н': 'n',
        'о': 'o', 'п': 'p', 'р': 'r', 'с': 's', 'т': 't', 'у': 'u', 'ф': 'f',
        'х': 'h', 'ц': 'c', 'ч': 'c', 'ш': 's', "ž": "z",
        'ђ': 'dzi', 'љ': 'li', 'њ': 'ni', 'ћ': 'c', 'џ': 'dz',
        'ё': 'e', 'й': 'i', 'щ': 's', 'ъ': '', 'ы': 'y', 'ь': '',
        'э': 'e', 'ю': 'io', 'я': 'a',
        'ѓ': 'y', 'ѕ': 's', 'ќ': 'k',
    }
    # Cyrillic to Greek on phonetic similarity.
    cyrillic_to_greek_map = {
        'а': 'α', 'б': 'μπ', 'в': 'β', 'г': 'γ', 'д': 'δ', 'е': 'ε',
        'ж': 'ζ', 'з': 'ζ', 'и': 'ι', 'й': 'ι', 'к': 'κ',
        'л': 'λ', 'м': 'μ', 'н': 'ν', 'о': 'ο', 'п': 'π', 'р': 'ρ',
        'с': 'τσ', 'т': 'τ', 'у': 'ού', 'ф': 'φ', 'х': 'χ', 'ц': 'τσ',
        'ч': 'τσ', 'ш': 'σ', 'щ': 'σ',
        'ђ': 'ντζι', 'љ': 'λι', 'њ': 'νι', 'ћ': 'τσ', 'џ': 'ντζ',
        'ы': 'ι', 'ь': '',
        'э': 'ε', 'ю': 'ιο', 'я': 'ια',
        'ѓ': 'γ', 'ѕ': 'σ',
    }

    lowered = text.lower()

    if lang == 'grc':
        to_greek = {**latin_to_greek_map, **cyrillic_to_greek_map}
        return _replace_longest_match(lowered, to_greek)

    # Any other language: produce Latin script.  Accented Greek letters are
    # not keys of the Greek map, so fold them to their base letter first;
    # otherwise they would pass through untransliterated.
    greek_letters = {key for key in greek_to_latin_map if len(key) == 1}
    lowered = _strip_greek_diacritics(lowered, greek_letters)
    to_latin = {**greek_to_latin_map, **cyrillic_to_latin_map}
    return _replace_longest_match(lowered, to_latin)


def _replace_longest_match(text, mapping):
    """Rewrite *text* left to right, always consuming the longest key of *mapping*.

    Characters at which no key matches are copied unchanged.
    """
    longest = max(map(len, mapping))
    pieces = []
    pos = 0
    while pos < len(text):
        for size in range(longest, 0, -1):
            chunk = text[pos:pos + size]
            # Membership test (not truthiness): some replacements are ''.
            if chunk in mapping:
                pieces.append(mapping[chunk])
                pos += len(chunk)
                break
        else:
            pieces.append(text[pos])
            pos += 1
    return ''.join(pieces)


def _strip_greek_diacritics(text, greek_letters):
    """Remove diacritics from Greek letters only, leaving other scripts intact.

    Handles both precomposed characters (e.g. 'ά') and decomposed input where
    combining marks follow a Greek base letter.
    """
    folded = []
    after_greek = False
    for ch in text:
        if after_greek and unicodedata.combining(ch):
            # Combining mark attached to the preceding Greek letter: drop it.
            continue
        base = unicodedata.normalize('NFD', ch)[0]
        after_greek = base in greek_letters
        folded.append(base if after_greek else ch)
    return ''.join(folded)
def fix_vocals(text, lang='ron'):
    """Apply per-language phonetic and symbol-to-word substitutions to *text*.

    Each supported language has a table of literal ``old -> new`` string
    replacements: phonetic respellings plus spoken forms of math and currency
    symbols. The table is applied with successive ``str.replace`` calls,
    longest key first; keys of equal length keep their order in the table.

    Because replacements are sequential, a later (shorter) rule also rewrites
    text produced by an earlier rule. Some tables rely on this, e.g.
    'ţ' -> 'ț' -> 'ts' for Romanian and 'ć' -> 'č' -> 'ts' for Serbian.

    NOTE(review): keys are matched anywhere in the text, so short alphabetic
    keys such as 'pi' are also replaced inside longer words.

    Args:
        text: Text to rewrite. Matching is case-sensitive.
        lang: One of 'grc', 'ron', 'eng', 'deu', 'fra', 'hun',
            'rmc-script_latin'.

    Returns:
        The rewritten text. For an unsupported ``lang`` a warning is printed
        and *text* is returned unchanged.
    """
    ron_replacements = {
        'ţ': 'ț',
        'ț': 'ts',
        'î': 'u',
        'â': 'a',
        'ş': 's',
        'w': 'oui',
        'k': 'c',
        'l': 'll',
        # Math symbols
        'sqrt': ' rădăcina pătrată din ',
        '^': ' la puterea ',
        '+': ' plus ',
        ' - ': ' minus ',  # only replace if standalone so to not say minus if is a-b-c
        # '*': ' ori ',  # times (disabled)
        '/': ' împărțit la ',  # divided by
        '=': ' egal cu ',  # equals
        'pi': ' pi ',
        '<': ' mai mic decât ',
        '>': ' mai mare decât ',  # trailing space keeps the next token separate
        '%': ' la sută ',  # percent
        '€': ' euro ',
        '$': ' dolar ',
        '£': ' liră ',
        '&': ' și ',  # and
        # '@': ' la ',  # at (disabled)
        # '#': ' diez ',  # hash (disabled)
        '∑': ' sumă ',
        '∫': ' integrală ',
        # '√': ' rădăcina pătrată a ',  # more generic square root (disabled)
    }
    eng_replacements = {
        'wik': 'weaky',
        'sh': 'ss',
        'ch': 'ttss',
        'oo': 'oeo',
        # Math symbols for English
        'sqrt': ' square root of ',
        '^': ' to the power of ',
        '+': ' plus ',
        ' - ': ' minus ',
        # '*': ' times ',  # disabled
        ' / ': ' divided by ',
        '=': ' equals ',
        'pi': ' pi ',
        '<': ' less than ',
        '>': ' greater than ',
        # Additional common symbols
        '%': ' percent ',
        '€': ' euro ',
        '$': ' dollar ',
        '£': ' pound ',
        '&': ' and ',
        '@': ' at ',
        '#': ' hash ',
    }
    serbian_replacements = {
        'rn': 'rrn',
        'ć': 'č',
        'c': 'č',
        'č': 'ts',
        'đ': 'dz',
        'j': 'i',
        'l': 'lll',
        'w': 'v',
        'h': 'hh',
        # https://huggingface.co/facebook/mms-tts-rmc-script_latin
        'sqrt': 'kvadratni koren iz',
        '^': ' na stepen ',
        '+': ' plus ',
        ' - ': ' minus ',
        '*': ' puta ',
        ' / ': ' podeljeno sa ',
        '=': ' jednako ',
        'pi': ' pi ',
        '<': ' manje od ',
        '>': ' veće od ',
        '%': ' procenat ',
        '€': ' evro ',
        '$': ' dolar ',
        '£': ' funta ',
    }
    deu_replacements = {
        'sch': 'sh',
        'ch': 'kh',
        'ie': 'ee',
        'ei': 'ai',
        'ä': 'ae',
        'ö': 'oe',
        'ü': 'ue',
        'ß': 'ss',
        # Math symbols for German
        'sqrt': ' Quadratwurzel aus ',
        '^': ' hoch ',
        '+': ' plus ',
        ' - ': ' minus ',
        '*': ' mal ',
        ' / ': ' geteilt durch ',
        '=': ' gleich ',
        'pi': ' pi ',
        '<': ' kleiner als ',
        '>': ' größer als ',  # trailing space keeps the next token separate
        # Additional common symbols
        '%': ' prozent ',
        '€': ' euro ',
        '$': ' dollar ',
        '£': ' pfund ',
        '&': ' und ',
        '@': ' at ',  # 'Klammeraffe' is also common but 'at' is simpler
        '#': ' raute ',
    }
    fra_replacements = {
        # French specific phonetic replacements (add as needed)
        'w': 'v',
        # Math symbols for French
        'sqrt': ' racine carrée de ',
        '^': ' à la puissance ',
        '+': ' plus ',
        ' - ': ' moins ',
        '*': ' fois ',
        ' / ': ' divisé par ',
        '=': ' égale ',
        'pi': ' pi ',
        '<': ' inférieur à ',
        '>': ' supérieur à ',
        '%': ' pour cent ',
        '€': ' euro ',
        '$': ' dollar ',
        '£': ' livre ',
        '&': ' et ',
        '@': ' arobase ',
    }
    hun_replacements = {
        # Hungarian specific phonetic replacements (add as needed)
        'ch': 'ts',
        'cs': 'tz',
        'g': 'gk',
        'w': 'v',
        'z': 'zz',
        # Math symbols for Hungarian
        'sqrt': ' négyzetgyök ',
        '^': ' hatvány ',
        '+': ' plusz ',
        ' - ': ' mínusz ',
        '*': ' szorozva ',
        ' / ': ' osztva ',
        '=': ' egyenlő ',
        'pi': ' pi ',
        '<': ' kisebb mint ',
        '>': ' nagyobb mint ',
        '%': ' százalék ',
        '€': ' euró ',
        '$': ' dollár ',
        '£': ' font ',
        '&': ' és ',
        '@': ' kukac ',
        '#': ' kettőskereszt ',
    }
    grc_replacements = {
        # Math symbols for Ancient Greek (literal translations)
        'sqrt': ' τετραγωνικὴ ῥίζα ',
        '^': ' εἰς τὴν δύναμιν ',
        '+': ' σὺν ',
        ' - ': ' χωρὶς ',
        ' * ': ' πολλάκις ',
        ' / ': ' διαιρέω ',
        '=': ' ἴσον ',
        'pi': ' πῖ ',
        '<': ' ἔλαττον ',
        '>': ' μεῖζον ',
        '%': ' τοῖς ἑκατόν ',  # tois hekaton - 'of the hundred'
        '€': ' εὐρώ ',
        '$': ' δολάριον ',
        '£': ' λίρα ',
        '&': ' καὶ ',
        '@': ' ἀτ ',  # at
        '#': ' δίεση ',  # hash
    }
    # Language code -> replacement table.
    replacements_map = {
        'grc': grc_replacements,
        'ron': ron_replacements,
        'eng': eng_replacements,
        'deu': deu_replacements,
        'fra': fra_replacements,
        'hun': hun_replacements,
        'rmc-script_latin': serbian_replacements,
    }
    table = replacements_map.get(lang)
    if not table:
        # Unsupported language: warn and leave the text untouched.
        print(f"Warning: Language '{lang}' not supported for text replacement. Returning original text.")
        return text

    # Longest keys first, so multi-character keys ('sqrt', 'sch') are replaced
    # before their shorter substrings ('s', 'ch').  sorted() is stable, so
    # equal-length keys keep table order, which the chained rules depend on.
    ordered = sorted(table.items(), key=lambda item: len(item[0]), reverse=True)
    for old, new in ordered:
        text = text.replace(old, new)
    return text
def _num2words(text='01234', lang=None):
    """Spell out the number *text* in words for language *lang*.

    ``'grc'`` is routed to ``convert_numbers`` (num2word_greek); every other
    language code is handed to ``num2words``.
    """
    if lang != 'grc':
        # The language HAS TO BE passed by keyword (lang=lang).
        return num2words(text, lang=lang)
    return convert_numbers(text)
def transliterate_number(number_string, lang=None):
    """Replace every number in *number_string* with its spoken form.

    Recognised numbers are runs of digits with an optional decimal fraction
    (``3.14``) and an optional exponent (``1e5``, ``2.5E-3``):

    * plain integers are spelled out with ``_num2words``;
    * decimals are read as ``<integer part> <decimal word> <digit> <digit> ...``;
    * exponent notation is read as ``<base> <exponent phrase> <exponent>``.

    All other text is left untouched.

    Args:
        number_string: Text that may contain numbers.
        lang: Pipeline language code ('rmc-script_latin', 'ron', 'hun',
            'deu', 'fra', 'grc'). Any other value is truncated to its first
            two letters and read with the English connecting phrases;
            ``None`` falls back to 'en'.

    Returns:
        The text with numbers spelled out. A number whose conversion raises
        ``ValueError`` is left as digits.
    """
    # lang -> (code handed to _num2words, exponent phrase, decimal separator word)
    phrases = {
        'rmc-script_latin': ('sr', ' puta deset na stepen od ', ' tačka '),
        # Romanian wording; previously a copy of the Hungarian phrase.
        'ron': ('ro', ' ori zece la puterea ', ' virgulă '),
        # NOTE(review): ' virgula ' does not look Hungarian - confirm wording.
        'hun': ('hu', ' tízszer a erejéig ', ' virgula '),
        'deu': ('deu', ' mal zehn hoch ', ' komma '),
        # Separator words are padded with spaces so they do not fuse with
        # the neighbouring number words.
        'fra': ('fr', ' puissance ', ' virgule '),
        'grc': ('grc', ' εις την δυναμην του ', ' κομμα '),
    }
    if lang in phrases:
        lang, exponent_phrase, decimal_word = phrases[lang]
    else:
        # Unknown code: use its two-letter prefix; None falls back to English.
        lang = (lang or 'en')[:2]
        exponent_phrase = ' times ten to the power of '
        decimal_word = ' point '

    def replace_number(match):
        number = match.group(0)
        try:
            lowered = number.lower()
            if 'e' in lowered:
                base, exponent = lowered.split('e')
                return (_num2words(base, lang=lang)
                        + exponent_phrase
                        + _num2words(exponent, lang=lang))
            if '.' in number:
                integer_part, decimal_part = number.split('.')
                digits = " ".join(
                    _num2words(digit, lang=lang) for digit in decimal_part)
                return _num2words(integer_part, lang=lang) + decimal_word + digits
            return _num2words(number, lang=lang)
        except ValueError:
            return number  # Keep the original digits if conversion fails

    # Match only the number itself.  Surrounding text is left in place by
    # re.sub, which avoids greedy non-digit groups that backtrack
    # quadratically on text without digits.
    pattern = r'\d+(?:\.\d+)?(?:[Ee][+-]?\d+)?'
    return re.sub(pattern, replace_number, number_string)