"""Text clean-up helpers: script transliteration, phonetic respelling and
spelling-out of numbers.

The language codes used throughout ('grc', 'ron', 'deu', 'rmc-script_latin',
...) look like the ISO 639-3 style codes of the MMS text-to-speech models.
"""
import re
import unicodedata

# The number libraries are only needed by the number-spelling helpers, so a
# missing package must not prevent importing the pure string helpers below.
# `_num2words` raises a clear ImportError when one is actually required.
try:
    from num2words import num2words
except ImportError:
    num2words = None

try:
    from num2word_greek.numbers2words import convert_numbers
except ImportError:
    convert_numbers = None


# --------------------------------------------------------------------------
# Script transliteration tables
# --------------------------------------------------------------------------

_LATIN_TO_GREEK = {
    'a': 'α', 'b': 'β', 'g': 'γ', 'd': 'δ', 'e': 'ε',
    'ch': 'τσο',  # multi-character Latin sequence
    'z': 'ζ', 'h': 'χ', 'i': 'ι', 'j': 'ζ', 'k': 'κ', 'l': 'λ', 'm': 'μ',
    'n': 'ν', 'x': 'ξ', 'o': 'ο', 'p': 'π', 'q': 'κ', 'v': 'β',
    'sc': 'σκ',
    'r': 'ρ', 's': 'σ', 't': 'τ', 'u': 'ου', 'f': 'φ', 'c': 'σ', 'w': 'β',
    'y': 'γ',
}

_GREEK_TO_LATIN = {
    'ου': 'ou',  # common digraph, must win over the single letters
    'α': 'a', 'β': 'v', 'γ': 'g', 'δ': 'd', 'ε': 'e', 'ζ': 'z', 'η': 'i',
    'θ': 'th', 'ι': 'i', 'κ': 'k', 'λ': 'l', 'μ': 'm', 'ν': 'n', 'ξ': 'x',
    'ο': 'o', 'π': 'p', 'ρ': 'r', 'σ': 's', 'τ': 't',
    'υ': 'y',  # 'y' is a common transliteration for upsilon
    'φ': 'f', 'χ': 'ch', 'ψ': 'ps', 'ω': 'o',
    'ς': 's',  # final sigma
}

_CYRILLIC_TO_LATIN = {
    'а': 'a', 'б': 'b', 'в': 'v', 'г': 'g', 'д': 'd', 'е': 'e', 'ж': 'z',
    'з': 'z', 'и': 'i', 'ј': 'j', 'к': 'k', 'л': 'l', 'м': 'm', 'н': 'n',
    'о': 'o', 'п': 'p', 'р': 'r', 'с': 's', 'т': 't', 'у': 'u', 'ф': 'f',
    'х': 'h', 'ц': 'c', 'ч': 'c', 'ш': 's',
    "ž": "z",
    'ђ': 'dzi', 'љ': 'li', 'њ': 'ni', 'ћ': 'c', 'џ': 'dz',
    'ё': 'e', 'й': 'i', 'щ': 's', 'ъ': '', 'ы': 'y', 'ь': '', 'э': 'e',
    'ю': 'io', 'я': 'a',
    'ѓ': 'y', 'ѕ': 's', 'ќ': 'k',
}

# Cyrillic to Greek, chosen by phonetic similarity.
_CYRILLIC_TO_GREEK = {
    'а': 'α', 'б': 'μπ', 'в': 'β', 'г': 'γ', 'д': 'δ', 'е': 'ε', 'ж': 'ζ',
    'з': 'ζ', 'и': 'ι', 'й': 'ι', 'к': 'κ', 'л': 'λ', 'м': 'μ', 'н': 'ν',
    'о': 'ο', 'п': 'π', 'р': 'ρ', 'с': 'τσ', 'т': 'τ', 'у': 'ού', 'ф': 'φ',
    'х': 'χ', 'ц': 'τσ', 'ч': 'τσ', 'ш': 'σ', 'щ': 'σ',
    # Disabled in the original table (NOTE(review): confirm this is intended):
    # 'ђ': 'ντζι', 'љ': 'λι', 'њ': 'νι', 'ћ': 'τσ', 'џ': 'ντζ',
    'ы': 'ι', 'ь': '', 'э': 'ε', 'ю': 'ιο', 'я': 'ια',
    'ѓ': 'γ', 'ѕ': 'σ',
}

# Merged lookup tables, built once at import time.
_TO_GREEK = {**_LATIN_TO_GREEK, **_CYRILLIC_TO_GREEK}
_TO_LATIN = {**_GREEK_TO_LATIN, **_CYRILLIC_TO_LATIN}
_TO_GREEK_MAX_KEY = max(map(len, _TO_GREEK))
_TO_LATIN_MAX_KEY = max(map(len, _TO_LATIN))


def _fold_greek_accents(text):
    """Drop diacritics from Greek letters only; leave every other char as is."""
    folded = []
    after_greek = False
    for char in text:
        if unicodedata.combining(char):
            # A free-standing combining mark (decomposed input): drop it only
            # when it decorates a Greek letter.
            if not after_greek:
                folded.append(char)
            continue
        base = unicodedata.normalize('NFD', char)[0]
        after_greek = base in _GREEK_TO_LATIN
        folded.append(base if after_greek else char)
    return ''.join(folded)


def _transliterate(text, mapping, max_key_len):
    """Replace the longest key of *mapping* found at each position of *text*.

    Characters that start no key are copied unchanged.
    """
    pieces = []
    pos = 0
    size = len(text)
    while pos < size:
        # Try the widest slice first so digraphs beat single letters.
        for width in range(min(max_key_len, size - pos), 0, -1):
            replacement = mapping.get(text[pos:pos + width])
            if replacement is not None:  # '' is a valid replacement
                pieces.append(replacement)
                pos += width
                break
        else:
            pieces.append(text[pos])
            pos += 1
    return ''.join(pieces)


def only_greek_or_only_latin(text, lang='grc'):
    """Transliterate *text* towards a single script.

    Args:
        text: Input string; it is lower-cased before conversion.
        lang: ``'grc'`` maps Latin and Cyrillic letters to Greek. Any other
            value maps Greek and Cyrillic letters to Latin.

    Returns:
        The lower-cased, transliterated string. Characters without a table
        entry (digits, punctuation, accented Latin letters, ...) are kept.
    """
    # lower() rather than casefold(): casefold would also rewrite letters
    # such as 'ß', which must reach later stages unchanged.
    lowered = text.lower()

    if lang == 'grc':
        return _transliterate(lowered, _TO_GREEK, _TO_GREEK_MAX_KEY)

    # Accents are removed from Greek letters *before* lookup so that e.g.
    # 'έ' is transliterated and 'ού' is recognised as the digraph 'ου'.
    # Accents on Latin/Cyrillic characters are deliberately preserved.
    return _transliterate(
        _fold_greek_accents(lowered), _TO_LATIN, _TO_LATIN_MAX_KEY
    )


# --------------------------------------------------------------------------
# Phonetic respelling / symbol verbalisation
# --------------------------------------------------------------------------

# Insertion order matters: replacements are applied one after another, longest
# key first and, for equal lengths, in the order written here. Later
# replacements therefore also see the output of earlier ones.
_REPLACEMENTS_BY_LANG = {
    'grc': {
        # Math symbols for Ancient Greek (literal translations)
        'sqrt': ' τετραγωνικὴ ῥίζα ',
        '^': ' εἰς τὴν δύναμιν ',
        '+': ' σὺν ',
        ' - ': ' χωρὶς ',
        ' * ': ' πολλάκις ',
        ' / ': ' διαιρέω ',
        '=': ' ἴσον ',
        'pi': ' πῖ ',
        '<': ' ἔλαττον ',
        '>': ' μεῖζον ',
        '%': ' τοῖς ἑκατόν ',  # tois hekaton - 'of the hundred'
        '€': ' εὐρώ ',
        '$': ' δολάριον ',
        '£': ' λίρα ',
        '&': ' καὶ ',
        '@': ' ἀτ ',  # at
        '#': ' δίεση ',  # hash
    },
    'ron': {
        'ţ': 'ț',  # t-cedilla -> t-comma, which the next entry then respells
        'ț': 'ts',
        'î': 'u',
        'â': 'a',
        'ş': 's',
        'w': 'oui',
        'k': 'c',
        'l': 'll',
        # Math symbols
        'sqrt': ' rădăcina pătrată din ',
        '^': ' la puterea ',
        '+': ' plus ',
        ' - ': ' minus ',  # standalone only, so "a-b-c" is not read as minus
        # '*': ' ori ',  # times (disabled)
        '/': ' împărțit la ',  # divided by
        '=': ' egal cu ',  # equals
        'pi': ' pi ',
        '<': ' mai mic decât ',
        '>': ' mai mare decât ',
        '%': ' la sută ',  # percent
        '€': ' euro ',
        '$': ' dolar ',
        '£': ' liră ',
        '&': ' și ',  # and
        # '@': ' la ',  # at (disabled)
        # '#': ' diez ',  # hash (disabled)
        '∑': ' sumă ',
        '∫': ' integrală ',
        # '√': ' rădăcina pătrată a ',  # generic square root (disabled)
    },
    'eng': {
        'wik': 'weaky',
        'sh': 'ss',
        'ch': 'ttss',
        'oo': 'oeo',
        # Math symbols for English
        'sqrt': ' square root of ',
        '^': ' to the power of ',
        '+': ' plus ',
        ' - ': ' minus ',
        # '*': ' times ',  (disabled)
        ' / ': ' divided by ',
        '=': ' equals ',
        'pi': ' pi ',
        '<': ' less than ',
        '>': ' greater than ',
        '%': ' percent ',
        '€': ' euro ',
        '$': ' dollar ',
        '£': ' pound ',
        '&': ' and ',
        '@': ' at ',
        '#': ' hash ',
    },
    'deu': {
        'sch': 'sh',
        'ch': 'kh',
        'ie': 'ee',
        'ei': 'ai',
        'ä': 'ae',
        'ö': 'oe',
        'ü': 'ue',
        'ß': 'ss',
        # Math symbols for German
        'sqrt': ' Quadratwurzel aus ',
        '^': ' hoch ',
        '+': ' plus ',
        ' - ': ' minus ',
        '*': ' mal ',
        ' / ': ' geteilt durch ',
        '=': ' gleich ',
        'pi': ' pi ',
        '<': ' kleiner als ',
        '>': ' größer als ',
        '%': ' prozent ',
        '€': ' euro ',
        '$': ' dollar ',
        '£': ' pfund ',
        '&': ' und ',
        '@': ' at ',  # 'Klammeraffe' is also common but 'at' is simpler
        '#': ' raute ',
    },
    'fra': {
        'w': 'v',
        # Math symbols for French
        'sqrt': ' racine carrée de ',
        '^': ' à la puissance ',
        '+': ' plus ',
        ' - ': ' moins ',
        '*': ' fois ',
        ' / ': ' divisé par ',
        '=': ' égale ',
        'pi': ' pi ',
        '<': ' inférieur à ',
        '>': ' supérieur à ',
        '%': ' pour cent ',
        '€': ' euro ',
        '$': ' dollar ',
        '£': ' livre ',
        '&': ' et ',
        '@': ' arobase ',
    },
    'hun': {
        'ch': 'ts',
        'cs': 'tz',
        'g': 'gk',
        'w': 'v',
        'z': 'zz',
        # Math symbols for Hungarian
        'sqrt': ' négyzetgyök ',
        '^': ' hatvány ',
        '+': ' plusz ',
        ' - ': ' mínusz ',
        '*': ' szorozva ',
        ' / ': ' osztva ',
        '=': ' egyenlő ',
        'pi': ' pi ',
        '<': ' kisebb mint ',
        '>': ' nagyobb mint ',
        '%': ' százalék ',
        '€': ' euró ',
        '$': ' dollár ',
        '£': ' font ',
        '&': ' és ',
        '@': ' kukac ',
        '#': ' kettőskereszt ',
    },
    # https://huggingface.co/facebook/mms-tts-rmc-script_latin
    'rmc-script_latin': {
        'rn': 'rrn',
        'ć': 'č',
        'c': 'č',
        'č': 'ts',
        'đ': 'dz',
        'j': 'i',
        'l': 'lll',
        'w': 'v',
        'h': 'hh',
        'sqrt': ' kvadratni koren iz ',
        '^': ' na stepen ',
        '+': ' plus ',
        ' - ': ' minus ',
        '*': ' puta ',
        ' / ': ' podeljeno sa ',
        '=': ' jednako ',
        'pi': ' pi ',
        '<': ' manje od ',
        '>': ' veće od ',
        '%': ' procenat ',
        '€': ' evro ',
        '$': ' dolar ',
        '£': ' funta ',
    },
}

# Longest keys first, so multi-character keys ('sqrt', 'sch') are consumed
# before their shorter substrings ('s', 'ch'). sorted() is stable, so
# equal-length keys keep their table order.
_SORTED_REPLACEMENTS = {
    code: sorted(table.items(), key=lambda item: len(item[0]), reverse=True)
    for code, table in _REPLACEMENTS_BY_LANG.items()
}

# 'pi' only counts as the constant when it is not part of a longer word;
# "2pi" and "pi/2" match, "spirit" does not.
_STANDALONE_PI = re.compile(r'(?<![^\W\d_])pi(?![^\W\d_])')


def fix_vocals(text, lang='ron'):
    """Respell *text* phonetically and verbalise math/currency symbols.

    Args:
        text: Input string. Matching is case-sensitive.
        lang: One of the keys of the replacement tables ('grc', 'ron', 'eng',
            'deu', 'fra', 'hun', 'rmc-script_latin').

    Returns:
        The rewritten string. For an unsupported *lang* a warning is printed
        and *text* is returned unchanged.
    """
    replacements = _SORTED_REPLACEMENTS.get(lang)
    if replacements is None:
        print(f"Warning: Language '{lang}' not supported for text replacement. Returning original text.")
        return text

    for old, new in replacements:
        if old == 'pi':
            # A plain substring replace would split ordinary words that
            # merely contain "pi".
            text = _STANDALONE_PI.sub(lambda _match, word=new: word, text)
        else:
            text = text.replace(old, new)
    return text


# --------------------------------------------------------------------------
# Numbers to words
# --------------------------------------------------------------------------

def _num2words(text='01234', lang=None):
    """Spell out the numeric string *text* in language *lang*.

    ``'grc'`` is delegated to ``convert_numbers``; every other language goes
    to ``num2words``. ``None`` falls back to English.

    Raises:
        ImportError: If the library needed for *lang* is not installed.
    """
    if lang == 'grc':
        if convert_numbers is None:
            raise ImportError("num2word_greek is required to spell out numbers for 'grc'")
        return convert_numbers(text)
    if num2words is None:
        raise ImportError("num2words is required to spell out numbers")
    # lang HAS TO BE passed as a keyword: the second positional parameter of
    # num2words is `ordinal`, not the language.
    return num2words(text, lang='en' if lang is None else lang)


# lang -> (code handed to _num2words, exponent phrase, decimal separator word)
_NUMBER_SPEECH = {
    'rmc-script_latin': ('sr', ' puta deset na stepen od ', ' tačka '),
    'ron': ('ro', ' ori zece la puterea ', ' virgulă '),
    'hun': ('hu', ' tízszer a erejéig ', ' virgula '),
    'deu': ('de', ' mal zehn hoch ', ' komma '),
    'fra': ('fr', ' fois dix puissance ', ' virgule '),
    'grc': ('grc', ' εις την δυναμην του ', ' κομμα '),
}

# Integer, optional decimal fraction, optional signed exponent.
_NUMBER_PATTERN = re.compile(r'\d+(?:\.\d+)?(?:[Ee][+-]?\d+)?')


def transliterate_number(number_string, lang=None):
    """Replace every number in *number_string* by its spoken form.

    Args:
        number_string: Arbitrary text that may contain numbers such as
            ``12``, ``3.14`` or ``1e-5``.
        lang: Language code. Codes without a dedicated entry are cut to their
            first two letters and use English connecting words; ``None``
            means English.

    Returns:
        The text with numbers spelled out. Decimals are read digit by digit
        after the separator word. A number that cannot be converted is left
        as it was.
    """
    if lang in _NUMBER_SPEECH:
        code, exponent_phrase, decimal_phrase = _NUMBER_SPEECH[lang]
    else:
        code = lang[:2] if lang else 'en'
        exponent_phrase = ' times ten to the power of '
        decimal_phrase = ' point '

    def spell(match):
        number = match.group(0)
        try:
            if 'e' in number.lower():
                mantissa, exponent = number.lower().split('e')
                return (_num2words(mantissa, lang=code)
                        + exponent_phrase
                        + _num2words(exponent, lang=code))
            if '.' in number:
                whole, fraction = number.split('.')
                digits = " ".join(_num2words(digit, lang=code) for digit in fraction)
                return _num2words(whole, lang=code) + decimal_phrase + digits
            return _num2words(number, lang=code)
        except (ValueError, OverflowError):
            # Best effort: num2words is expected to raise OverflowError for
            # values beyond its range; keep the digits in that case.
            return number

    return _NUMBER_PATTERN.sub(spell, number_string)