# Source: Hugging Face Space "Dionyssos/SHIFT" (status: Running).
# File size: 16,068 bytes.

import re
import unicodedata
from num2words import num2words
from num2word_greek.numbers2words import convert_numbers

def only_greek_or_only_latin(text,
                             lang='grc'):
    """Transliterate ``text`` so that it is written in a single script.

    The input is lower-cased and scanned left to right. At every position the
    longest key of the active mapping that matches is replaced by its value;
    characters without a mapping (digits, punctuation, letters already in the
    target script, accented Latin letters, ...) are copied through unchanged.

    Args:
        text: String that may mix Latin, Greek and Cyrillic letters.
        lang: ``'grc'`` maps Latin and Cyrillic letters to Greek. Any other
            value maps Greek and Cyrillic letters to Latin.

    Returns:
        The lower-cased, transliterated string.
    """

    latin_to_greek_map = {
        'a': 'α', 'b': 'β', 'g': 'γ', 'd': 'δ', 'e': 'ε',
        'ch': 'τσο', # Example of a multi-character Latin sequence
        'z': 'ζ', 'h': 'χ', 'i': 'ι', 'j': 'ζ', 'k': 'κ', 'l': 'λ',
        'm': 'μ', 'n': 'ν', 'x': 'ξ', 'o': 'ο', 'p': 'π', 'q': 'κ',
        'v': 'β', 'sc': 'σκ', 'r': 'ρ', 's': 'σ', 't': 'τ',
        'u': 'ου', 'f': 'φ', 'c': 'σ', 'w': 'β', 'y': 'γ',
    }

    greek_to_latin_map = {
        'ου': 'ou', # Prioritize common diphthongs/digraphs
        'α': 'a', 'β': 'v', 'γ': 'g', 'δ': 'd', 'ε': 'e',
        'ζ': 'z', 'η': 'i', 'θ': 'th', 'ι': 'i', 'κ': 'k',
        'λ': 'l', 'μ': 'm', 'ν': 'n', 'ξ': 'x', 'ο': 'o',
        'π': 'p', 'ρ': 'r', 'σ': 's', 'τ': 't', 'υ': 'y', # 'y' is a common transliteration for upsilon
        'φ': 'f', 'χ': 'ch', 'ψ': 'ps', 'ω': 'o',
        'ς': 's', # Final sigma
    }

    cyrillic_to_latin_map = {
        # 'а': 'a', 'б': 'b', 'в': 'v', 'г': 'g', 'д': 'd', 'е': 'e', 'ё': 'yo', 'ж': 'zh',
        # 'з': 'z', 'и': 'i', 'й': 'y', 'к': 'k', 'л': 'l', 'м': 'm', 'н': 'n', 'о': 'o',
        # 'п': 'p', 'р': 'r', 'с': 's', 'т': 't', 'у': 'u', 'ф': 'f', 'х': 'kh', 'ц': 'ts',
        # 'ч': 'ch', 'ш': 'sh', 'щ': 'shch', 'ъ': '', 'ы': 'y', 'ь': '', 'э': 'e', 'ю': 'yu',
        # 'я': 'ya',
        # ---------------- keys
        'а': 'a', 'б': 'b', 'в': 'v', 'г': 'g', 'д': 'd', 'е': 'e', 'ж': 'z',
        'з': 'z', 'и': 'i', 'ј': 'j', 'к': 'k', 'л': 'l', 'м': 'm', 'н': 'n',
        'о': 'o', 'п': 'p', 'р': 'r', 'с': 's', 'т': 't', 'у': 'u', 'ф': 'f',
        'х': 'h', 'ц': 'c', 'ч': 'c', 'ш': 's', "ž": "z",
        'ђ': 'dzi', 'љ': 'li', 'њ': 'ni', 'ћ': 'c', 'џ': 'dz',
        'ё': 'e', 'й': 'i', 'щ': 's', 'ъ': '', 'ы': 'y', 'ь': '',
        'э': 'e', 'ю': 'io', 'я': 'a',
        'ѓ': 'y', 'ѕ': 's', 'ќ': 'k',
    }

    # Cyrillic to Greek on phonetic similarity.
    cyrillic_to_greek_map = {
        # 'а': 'α', 'б': 'β', 'в': 'β', 'г': 'γ', 'д': 'δ', 'е': 'ε', 'ё': 'ιο', 'ж': 'ζ',
        # 'з': 'ζ', 'и': 'ι', 'й': 'ι', 'κ': 'κ', 'λ': 'λ', 'м': 'μ', 'н': 'ν', 'о': 'ο',
        # 'π': 'π', 'ρ': 'ρ', 'σ': 'σ', 'τ': 'τ', 'у': 'ου', 'ф': 'φ', 'х': 'χ', 'ц': 'τσ',
        # 'ч': 'τσ', # or τζ depending on desired sound
        # 'ш': 'σ', 'щ': 'σ', # approximations
        # 'ъ': '', 'ы': 'ι', 'ь': '', 'э': 'ε', 'ю': 'ιου',
        # 'я': 'ια',
        # --------------------
        'а': 'α', 'б': 'μπ', 'в': 'β', 'г': 'γ', 'д': 'δ', 'е': 'ε',
        'ж': 'ζ', 'з': 'ζ', 'и': 'ι', 'й': 'ι', 'к': 'κ',
        'л': 'λ', 'м': 'μ', 'н': 'ν', 'о': 'ο', 'п': 'π', 'р': 'ρ',
        'с': 'τσ', 'т': 'τ', 'у': 'ού', 'ф': 'φ', 'х': 'χ', 'ц': 'τσ',
        'ч': 'τσ', 'ш': 'σ', 'щ': 'σ',
        #
        'ђ': 'ντζι', 'љ': 'λι', 'њ': 'νι', 'ћ': 'τσ', 'џ': 'ντζ',
        'ы': 'ι', 'ь': '',
        'э': 'ε', 'ю': 'ιο', 'я': 'ια',
        'ѓ': 'γ', 'ѕ': 'σ',
    }

    # Matching is case-insensitive: everything is lower-cased up front
    # (str.lower(), not casefold()), so the result is always lower case.
    lowercased_text = text.lower()

    if lang == 'grc':
        # Latin and Cyrillic letters become Greek; Greek input passes through.
        conversion_map = {**latin_to_greek_map, **cyrillic_to_greek_map}
        lookup_text = lowercased_text
    else:
        # Greek and Cyrillic letters become Latin; Latin input passes through.
        conversion_map = {**greek_to_latin_map, **cyrillic_to_latin_map}

        def fold_greek(char):
            # Accented Greek letters ('έ', 'ύ', polytonic forms, ...) are not
            # keys of the table. Fold them to their base letter so they are
            # transliterated too, and so accented digraphs such as 'ού' still
            # match 'ου'. Non-Greek characters (e.g. Latin 'é') are returned
            # untouched, which keeps their accents in the output.
            base = unicodedata.normalize('NFD', char)[0]
            return base if base in greek_to_latin_map else char

        # One character in, one character out: indices into lookup_text and
        # lowercased_text stay aligned.
        lookup_text = ''.join(map(fold_greek, lowercased_text))

    max_key_length = max(map(len, conversion_map))
    text_length = len(lowercased_text)
    output_chars = []
    current_index = 0

    while current_index < text_length:
        # Try the longest possible key first so multi-character sequences
        # ('ch', 'sc', 'ου') win over their single-letter prefixes.
        longest = min(max_key_length, text_length - current_index)
        for key_length in range(longest, 0, -1):
            candidate = lookup_text[current_index:current_index + key_length]
            if candidate in conversion_map:
                output_chars.append(conversion_map[candidate])
                current_index += key_length
                break
        else:
            # No mapping: keep the original (unfolded) character as is.
            output_chars.append(lowercased_text[current_index])
            current_index += 1

    return ''.join(output_chars)


def fix_vocals(text, lang='ron'):
    """Apply language-specific respellings and spell out symbols in ``text``.

    Each supported language has a table of ``old -> new`` substring
    replacements: phonetic respellings of letters/digraphs plus spoken forms of
    math and currency symbols (presumably to steer a TTS model's
    pronunciation -- the Serbian table references an MMS-TTS checkpoint).

    Replacements are applied one after another, longest key first, so a later
    (shorter) rule also rewrites text inserted by an earlier (longer) rule.
    Among keys of equal length the order in which they are written in the
    table is kept, so do not reorder entries casually.

    Args:
        text: Input string. Matching is case-sensitive.
        lang: One of ``'grc'``, ``'ron'``, ``'eng'``, ``'deu'``, ``'fra'``,
            ``'hun'``, ``'rmc-script_latin'``.

    Returns:
        The rewritten string, or ``text`` unchanged (after printing a warning)
        when ``lang`` has no table.
    """

    # Longer phrases should come before shorter ones to prevent partial matches.

    ron_replacements = {
        'ţ': 'ț',
        'ț': 'ts',
        'î': 'u',
        'â': 'a',
        'ş': 's',
        'w': 'oui',
        'k': 'c',
        'l': 'll',
        # Math symbols
        'sqrt': ' rădăcina pătrată din ',
        '^': ' la puterea ',
        '+': ' plus ',
        ' - ': ' minus ',  # only replace if standalone so to not say minus if is a-b-c
        # '*': ' ori ',  # times
        '/': ' împărțit la ',  # divided by
        '=': ' egal cu ',  # equals
        'pi': ' pi ',
        '<': ' mai mic decât ',
        '>': ' mai mare decât ',  # trailing space keeps the next token separate
        '%': ' la sută ', # percent (from previous)
        '€': ' euro ',
        '$': ' dolar ',
        '£': ' liră ',
        '&': ' și ',  # and
        #'@': ' la ',  # at
        #'#': ' diez ',  # hash
        '∑': ' sumă ',
        '∫': ' integrală ',
        #'√': ' rădăcina pătrată a ', # more generic square root
    }

    eng_replacements = {
        'wik': 'weaky',
        'sh': 'ss',
        'ch': 'ttss',
        'oo': 'oeo',
        # Math symbols for English
        'sqrt': ' square root of ',
        '^': ' to the power of ',
        '+': ' plus ',
        ' - ': ' minus ',
        # '*': ' times ',
        ' / ': ' divided by ',
        '=': ' equals ',
        'pi': ' pi ',
        '<': ' less than ',
        '>': ' greater than ',
        # Additional common math symbols from previous list
        '%': ' percent ',
        '€': ' euro ',
        '$': ' dollar ',
        '£': ' pound ',
        '&': ' and ',
        '@': ' at ',
        '#': ' hash ',
    }

    serbian_replacements = {
        'rn': 'rrn',
        'ć': 'č',
        'c': 'č',
        'č': 'ts',
        'đ': 'dz',
        'j': 'i',
        'l': 'lll',
        'w': 'v',
        'h': 'hh',
        #  https://huggingface.co/facebook/mms-tts-rmc-script_latin
        'sqrt': ' kvadratni koren iz ',  # padded like every other language
        '^': ' na stepen ',
        '+': ' plus ',
        ' - ': ' minus ',
        '*': ' puta ',
        ' / ': ' podeljeno sa ',
        '=': ' jednako ',
        'pi': ' pi ',
        '<': ' manje od ',
        '>': ' veće od ',
        '%': ' procenat ',
        '€': ' evro ',
        '$': ' dolar ',
        '£': ' funta ',
        # Others
        #     'rn': 'rrn',
        # 'ć': 'č',
        # 'c': 'č',
        # 'đ': 'd',
        # 'l': 'le',
        # 'ij': 'i',
        # 'ji': 'i',
        # 'j': 'i',
        # 'služ': 'sloooozz',  # 'službeno'
        # 'suver': 'siuveeerra',  # 'suverena'
        # 'država': 'dirrezav',  # 'država'
        # 'iči': 'ici',  # 'Graniči'
        # 's ': 'se',  # a s with space
        # 'q': 'ku',
        # 'w': 'aou',
        # 'z': 's',
        # "š": "s",
        # 'th': 'ta',
        # 'v': 'vv',
        # "ć": "č",
        # "đ": "ď",
        # "lj": "ľ",
        # "nj": "ň",
        # "c": "č"
    }

    deu_replacements = {
        'sch': 'sh',
        'ch': 'kh',
        'ie': 'ee',
        'ei': 'ai',
        'ä': 'ae',
        'ö': 'oe',
        'ü': 'ue',
        'ß': 'ss',
        # Math symbols for German
        'sqrt': ' Quadratwurzel aus ',
        '^': ' hoch ',
        '+': ' plus ',
        ' - ': ' minus ',
        '*': ' mal ',
        ' / ': ' geteilt durch ',
        '=': ' gleich ',
        'pi': ' pi ',
        '<': ' kleiner als ',
        '>': ' größer als ',  # trailing space keeps the next token separate
        # Additional common math symbols from previous list
        '%': ' prozent ',
        '€': ' euro ',
        '$': ' dollar ',
        '£': ' pfund ',
        '&': ' und ',
        '@': ' at ', # 'Klammeraffe' is also common but 'at' is simpler
        '#': ' raute ',
    }

    fra_replacements = {
        # French specific phonetic replacements (add as needed)
        # e.g., 'ç': 's', 'é': 'e', etc.
        'w': 'v',
        # Math symbols for French
        'sqrt': ' racine carrée de ',
        '^': ' à la puissance ',
        '+': ' plus ',
        ' - ': ' moins ',  # tiré ;
        '*': ' fois ',
        ' / ': ' divisé par ',
        '=': ' égale ',
        'pi': ' pi ',
        '<': ' inférieur à ',
        '>': ' supérieur à ',
        # Add more common math symbols as needed for French
        '%': ' pour cent ',
        '€': ' euro ',
        '$': ' dollar ',
        '£': ' livre ',
        '&': ' et ',
        '@': ' arobase ',
    }

    hun_replacements = {
        # Hungarian specific phonetic replacements (add as needed)
        # e.g., 'á': 'a', 'é': 'e', etc.
        'ch': 'ts',
        'cs': 'tz',
        'g': 'gk',
        'w': 'v',
        'z': 'zz',
        # Math symbols for Hungarian
        'sqrt': ' négyzetgyök ',
        '^': ' hatvány ',
        '+': ' plusz ',
        ' - ': ' mínusz ',
        '*': ' szorozva ',
        ' / ': ' osztva ',
        '=': ' egyenlő ',
        'pi': ' pi ',
        '<': ' kisebb mint ',
        '>': ' nagyobb mint ',
        '%': ' százalék ',
        '€': ' euró ',
        '$': ' dollár ',
        '£': ' font ',
        '&': ' és ',
        '@': ' kukac ',
        '#': ' kettőskereszt ',
    }

    grc_replacements = {
        # Ancient Greek specific phonetic replacements (add as needed)
        # These are more about transliterating Greek letters if they are in the input text.
        # Math symbols for Ancient Greek (literal translations)
        'sqrt': ' τετραγωνικὴ ῥίζα ',
        '^': ' εἰς τὴν δύναμιν ',
        '+': ' σὺν ',
        ' - ': ' χωρὶς ',
        ' * ': ' πολλάκις ',
        ' / ': ' διαιρέω ',
        '=': ' ἴσον ',
        'pi': ' πῖ ',
        '<': ' ἔλαττον ',
        '>': ' μεῖζον ',
        '%': ' τοῖς ἑκατόν ', # tois hekaton - 'of the hundred'
        '€': ' εὐρώ ',
        '$': ' δολάριον ',
        '£': ' λίρα ',
        '&': ' καὶ ',
        '@': ' ἀτ ', # at
        '#': ' δίεση ', # hash
    }


    # Select the appropriate replacement dictionary based on the language
    replacements_map = {
        'grc': grc_replacements,
        'ron': ron_replacements,
        'eng': eng_replacements,
        'deu': deu_replacements,
        'fra': fra_replacements,
        'hun': hun_replacements,
        'rmc-script_latin': serbian_replacements,
    }

    current_replacements = replacements_map.get(lang)
    if not current_replacements:
        # If the language is not supported, return the original text
        print(f"Warning: Language '{lang}' not supported for text replacement. Returning original text.")
        return text

    # 'pi' is only the constant when it is not embedded in a word; a plain
    # substring replace would split ordinary words ("happiness" ->
    # "hap pi ness"). [^\W\d_] matches any letter, so "2pi" and "pi" still
    # match while "spider" does not.
    standalone_pi = re.compile(r'(?<![^\W\d_])pi(?![^\W\d_])')

    # Sort replacements by length of the key in descending order.
    # This is crucial for correctly replacing multi-character strings (like 'sqrt', 'sch')
    # before their shorter substrings ('s', 'ch', 'q', 'r', 't').
    # sorted() is stable, so equal-length keys keep their table order.
    sorted_replacements = sorted(current_replacements.items(), key=lambda item: len(item[0]), reverse=True)
    for old, new in sorted_replacements:
        if old == 'pi':
            # A callable replacement avoids backslash processing of `new`.
            text = standalone_pi.sub(lambda _match, spoken=new: spoken, text)
        else:
            text = text.replace(old, new)
    return text


def _num2words(text='01234', lang=None):
    """Spell out the number in ``text`` as words for language ``lang``.

    ``'grc'`` is handled by ``convert_numbers`` (from ``num2word_greek``);
    every other language code is forwarded to ``num2words``.
    """
    if lang != 'grc':
        # The language must be passed by keyword (lang=lang), not positionally.
        return num2words(text, lang=lang)
    return convert_numbers(text)


def transliterate_number(number_string,
                         lang=None):
    """Replace every number in ``number_string`` with its spoken form.

    Recognised numbers are runs of digits with an optional fractional part
    (``3.14``) and an optional exponent (``1e-5``). Integers are spelled out
    whole; for decimals the integer part is spelled out, followed by the
    language's decimal word and then each fractional digit individually; for
    exponent notation the base and exponent are spelled out separately and
    joined by the language's "times ten to the power of" phrase.

    Args:
        number_string: Text that may contain numbers.
        lang: Language code. ``'rmc-script_latin'``, ``'ron'``, ``'hun'``,
            ``'deu'``, ``'fra'`` and ``'grc'`` have dedicated phrases; any
            other code uses English phrases and is truncated to its first two
            letters before being passed on. ``None`` is treated as English.

    Returns:
        ``number_string`` with each number replaced by words. A number whose
        conversion raises ``ValueError`` is left as digits.
    """
    if lang is None:
        # The English fallback below slices `lang`; None would raise TypeError.
        lang = 'en'

    if lang == 'rmc-script_latin':
        lang = 'sr'
        exponential_pronoun = ' puta deset na stepen od '
        comma = ' tačka '
    elif lang == 'ron':
        lang = 'ro'
        # Romanian phrase (this branch previously carried the Hungarian one).
        exponential_pronoun = ' ori zece la puterea '
        comma = ' virgulă '
    elif lang == 'hun':
        lang = 'hu'
        # NOTE(review): Hungarian normally places the exponent before the
        # word for "power"; a prefix-only phrase may read unnaturally.
        exponential_pronoun = ' tízszer a erejéig '
        comma = ' vessző '
    elif lang == 'deu':
        exponential_pronoun = ' mal zehn hoch '
        comma = ' komma '
    elif lang == 'fra':
        lang = 'fr'
        exponential_pronoun = ' puissance '
        # Surrounding spaces are required: the word is concatenated directly
        # between the integer part and the fractional digits.
        comma = ' virgule '
    elif lang == 'grc':
        exponential_pronoun = ' εις την δυναμην του '
        comma = ' κομμα '
    else:
        lang = lang[:2]
        exponential_pronoun = ' times ten to the power of '
        comma = ' point '

    def replace_number(match):
        number_part = match.group(0)
        lowered = number_part.lower()

        try:
            if 'e' in lowered:
                # The pattern allows at most one exponent marker.
                base, exponent = lowered.split('e')
                return (_num2words(base, lang=lang)
                        + exponential_pronoun
                        + _num2words(exponent, lang=lang))
            if '.' in number_part:
                integer_part, decimal_part = number_part.split('.')
                spoken_digits = " ".join(
                    _num2words(digit, lang=lang) for digit in decimal_part)
                return _num2words(integer_part, lang=lang) + comma + spoken_digits
            return _num2words(number_part, lang=lang)
        except ValueError:
            return number_part  # Return original if conversion fails

    # Match only the number itself; text around it is left in place by
    # re.sub, so no leading/trailing capture groups are needed.
    pattern = r'\d+(?:\.\d+)?(?:[Ee][+-]?\d+)?'
    return re.sub(pattern, replace_number, number_string)