|
import re |
|
import unicodedata |
|
from num2words import num2words |
|
from num2word_greek.numbers2words import convert_numbers |
|
|
|
# Latin -> Greek. The digraphs 'ch' and 'sc' take precedence over their
# single-letter prefixes because lookup is longest-match-first.
_LATIN_TO_GREEK = {
    'a': 'α', 'b': 'β', 'g': 'γ', 'd': 'δ', 'e': 'ε',
    'ch': 'τσο',
    'z': 'ζ', 'h': 'χ', 'i': 'ι', 'j': 'ζ', 'k': 'κ', 'l': 'λ',
    'm': 'μ', 'n': 'ν', 'x': 'ξ', 'o': 'ο', 'p': 'π', 'q': 'κ',
    'v': 'β', 'sc': 'σκ', 'r': 'ρ', 's': 'σ', 't': 'τ',
    'u': 'ου', 'f': 'φ', 'c': 'σ', 'w': 'β', 'y': 'γ',
}

# Greek -> Latin. Only unaccented lowercase letters are listed; accented
# forms are folded onto these by _fold_greek_diacritics() before lookup.
_GREEK_TO_LATIN = {
    'ου': 'ou',
    'α': 'a', 'β': 'v', 'γ': 'g', 'δ': 'd', 'ε': 'e',
    'ζ': 'z', 'η': 'i', 'θ': 'th', 'ι': 'i', 'κ': 'k',
    'λ': 'l', 'μ': 'm', 'ν': 'n', 'ξ': 'x', 'ο': 'o',
    'π': 'p', 'ρ': 'r', 'σ': 's', 'τ': 't', 'υ': 'y',
    'φ': 'f', 'χ': 'ch', 'ψ': 'ps', 'ω': 'o',
    'ς': 's',
}

# Cyrillic -> Latin (plus a stray Latin 'ž' kept from the original table).
_CYRILLIC_TO_LATIN = {
    'а': 'a', 'б': 'b', 'в': 'v', 'г': 'g', 'д': 'd', 'е': 'e', 'ж': 'z',
    'з': 'z', 'и': 'i', 'ј': 'j', 'к': 'k', 'л': 'l', 'м': 'm', 'н': 'n',
    'о': 'o', 'п': 'p', 'р': 'r', 'с': 's', 'т': 't', 'у': 'u', 'ф': 'f',
    'х': 'h', 'ц': 'c', 'ч': 'c', 'ш': 's', "ž": "z",
    'ђ': 'dzi', 'љ': 'li', 'њ': 'ni', 'ћ': 'c', 'џ': 'dz',
    'ё': 'e', 'й': 'i', 'щ': 's', 'ъ': '', 'ы': 'y', 'ь': '',
    'э': 'e', 'ю': 'io', 'я': 'a',
    'ѓ': 'y', 'ѕ': 's', 'ќ': 'k',
}

# Cyrillic -> Greek.
_CYRILLIC_TO_GREEK = {
    'а': 'α', 'б': 'μπ', 'в': 'β', 'г': 'γ', 'д': 'δ', 'е': 'ε',
    'ж': 'ζ', 'з': 'ζ', 'и': 'ι', 'й': 'ι', 'к': 'κ',
    'л': 'λ', 'м': 'μ', 'н': 'ν', 'о': 'ο', 'п': 'π', 'р': 'ρ',
    'с': 'τσ', 'т': 'τ', 'у': 'ού', 'ф': 'φ', 'х': 'χ', 'ц': 'τσ',
    'ч': 'τσ', 'ш': 'σ', 'щ': 'σ',
    'ђ': 'ντζι', 'љ': 'λι', 'њ': 'νι', 'ћ': 'τσ', 'џ': 'ντζ',
    'ы': 'ι', 'ь': '',
    'э': 'ε', 'ю': 'ιο', 'я': 'ια',
    'ѓ': 'γ', 'ѕ': 'σ',
}

# Merged lookup tables, built once. On a duplicate key the Cyrillic entry wins.
_TO_GREEK = {**_LATIN_TO_GREEK, **_CYRILLIC_TO_GREEK}
_TO_LATIN = {**_GREEK_TO_LATIN, **_CYRILLIC_TO_LATIN}
_TO_GREEK_MAX_KEY = max(map(len, _TO_GREEK))
_TO_LATIN_MAX_KEY = max(map(len, _TO_LATIN))


def _fold_greek_diacritics(text):
    """Strip accents/breathings from Greek letters, leaving other scripts alone.

    Precomposed accented Greek letters are replaced by their base letter, and
    combining marks that directly follow a Greek letter are dropped. Characters
    of any other script (including accented Latin or Cyrillic) are returned
    unchanged.
    """
    folded = []
    after_greek = False
    for char in text:
        if unicodedata.combining(char):
            # A free-standing combining mark: drop it only when it decorates
            # a Greek letter we have just emitted.
            if not after_greek:
                folded.append(char)
            continue
        base = unicodedata.normalize('NFD', char)[0]
        if base in _GREEK_TO_LATIN:
            folded.append(base)
            after_greek = True
        else:
            folded.append(char)
            after_greek = False
    return ''.join(folded)


def _transliterate_longest_match(text, mapping, max_key_len):
    """Rewrite *text* left to right, always consuming the longest key of *mapping*.

    Characters that start no key are copied through unchanged.
    """
    pieces = []
    position = 0
    end = len(text)
    while position < end:
        for width in range(max_key_len, 0, -1):
            chunk = text[position:position + width]
            if chunk in mapping:
                pieces.append(mapping[chunk])
                # Advance by the matched length (the slice may be shorter
                # than `width` near the end of the text).
                position += len(chunk)
                break
        else:
            pieces.append(text[position])
            position += 1
    return ''.join(pieces)


def only_greek_or_only_latin(text, lang='grc'):
    """Transliterate *text* into a single script.

    The input is lowercased first. With ``lang == 'grc'`` Latin and Cyrillic
    letters are rewritten in Greek script; with any other ``lang`` Greek and
    Cyrillic letters are rewritten in Latin script. Characters that are not in
    the relevant table (digits, punctuation, letters already in the target
    script, ...) pass through unchanged.

    Args:
        text: The string to transliterate.
        lang: ``'grc'`` for Greek output, anything else for Latin output.

    Returns:
        The lowercased, transliterated string.
    """
    lowered = text.lower()

    if lang == 'grc':
        return _transliterate_longest_match(lowered, _TO_GREEK, _TO_GREEK_MAX_KEY)

    # The Greek table only lists unaccented letters, so accents have to be
    # removed from the *input*; stripping them from the ASCII output (as was
    # done before) had no effect and let letters such as 'ά' leak through.
    lowered = _fold_greek_diacritics(lowered)
    return _transliterate_longest_match(lowered, _TO_LATIN, _TO_LATIN_MAX_KEY)
|
|
|
|
|
# Per-language replacement tables used by fix_vocals(). Each table mixes
# phonetic respellings with spoken forms of mathematical / currency symbols.
# Entry order matters: replacements run longest key first and, among keys of
# equal length, in the order written here, so earlier results can feed later
# rules (e.g. 'ţ' -> 'ț' -> 'ts').
_RON_REPLACEMENTS = {
    'ţ': 'ț',
    'ț': 'ts',
    'î': 'u',
    'â': 'a',
    'ş': 's',
    'w': 'oui',
    'k': 'c',
    'l': 'll',

    'sqrt': ' rădăcina pătrată din ',
    '^': ' la puterea ',
    '+': ' plus ',
    ' - ': ' minus ',
    '/': ' împărțit la ',
    '=': ' egal cu ',
    'pi': ' pi ',
    '<': ' mai mic decât ',
    '>': ' mai mare decât ',  # trailing space so the next token is not glued on
    '%': ' la sută ',
    '€': ' euro ',
    '$': ' dolar ',
    '£': ' liră ',
    '&': ' și ',

    '∑': ' sumă ',
    '∫': ' integrală ',
}

_ENG_REPLACEMENTS = {
    'wik': 'weaky',
    'sh': 'ss',
    'ch': 'ttss',
    'oo': 'oeo',

    'sqrt': ' square root of ',
    '^': ' to the power of ',
    '+': ' plus ',
    ' - ': ' minus ',
    ' / ': ' divided by ',
    '=': ' equals ',
    'pi': ' pi ',
    '<': ' less than ',
    '>': ' greater than ',

    '%': ' percent ',
    '€': ' euro ',
    '$': ' dollar ',
    '£': ' pound ',
    '&': ' and ',
    '@': ' at ',
    '#': ' hash ',
}

_SERBIAN_REPLACEMENTS = {
    'rn': 'rrn',
    'ć': 'č',
    'c': 'č',
    'č': 'ts',
    'đ': 'dz',
    'j': 'i',
    'l': 'lll',
    'w': 'v',
    'h': 'hh',

    'sqrt': ' kvadratni koren iz ',  # padded like every other operator phrase
    '^': ' na stepen ',
    '+': ' plus ',
    ' - ': ' minus ',
    '*': ' puta ',
    ' / ': ' podeljeno sa ',
    '=': ' jednako ',
    'pi': ' pi ',
    '<': ' manje od ',
    '>': ' veće od ',
    '%': ' procenat ',
    '€': ' evro ',
    '$': ' dolar ',
    '£': ' funta ',
}

_DEU_REPLACEMENTS = {
    'sch': 'sh',
    'ch': 'kh',
    'ie': 'ee',
    'ei': 'ai',
    'ä': 'ae',
    'ö': 'oe',
    'ü': 'ue',
    'ß': 'ss',

    'sqrt': ' Quadratwurzel aus ',
    '^': ' hoch ',
    '+': ' plus ',
    ' - ': ' minus ',
    '*': ' mal ',
    ' / ': ' geteilt durch ',
    '=': ' gleich ',
    'pi': ' pi ',
    '<': ' kleiner als ',
    '>': ' größer als ',  # trailing space so the next token is not glued on

    '%': ' prozent ',
    '€': ' euro ',
    '$': ' dollar ',
    '£': ' pfund ',
    '&': ' und ',
    '@': ' at ',
    '#': ' raute ',
}

_FRA_REPLACEMENTS = {
    'w': 'v',

    'sqrt': ' racine carrée de ',
    '^': ' à la puissance ',
    '+': ' plus ',
    ' - ': ' moins ',
    '*': ' fois ',
    ' / ': ' divisé par ',
    '=': ' égale ',
    'pi': ' pi ',
    '<': ' inférieur à ',
    '>': ' supérieur à ',

    '%': ' pour cent ',
    '€': ' euro ',
    '$': ' dollar ',
    '£': ' livre ',
    '&': ' et ',
    '@': ' arobase ',
}

_HUN_REPLACEMENTS = {
    'ch': 'ts',
    'cs': 'tz',
    'g': 'gk',
    'w': 'v',
    'z': 'zz',

    'sqrt': ' négyzetgyök ',
    '^': ' hatvány ',
    '+': ' plusz ',
    ' - ': ' mínusz ',
    '*': ' szorozva ',
    ' / ': ' osztva ',
    '=': ' egyenlő ',
    'pi': ' pi ',
    '<': ' kisebb mint ',
    '>': ' nagyobb mint ',
    '%': ' százalék ',
    '€': ' euró ',
    '$': ' dollár ',
    '£': ' font ',
    '&': ' és ',
    '@': ' kukac ',
    '#': ' kettőskereszt ',
}

_GRC_REPLACEMENTS = {
    'sqrt': ' τετραγωνικὴ ῥίζα ',
    '^': ' εἰς τὴν δύναμιν ',
    '+': ' σὺν ',
    ' - ': ' χωρὶς ',
    ' * ': ' πολλάκις ',
    ' / ': ' διαιρέω ',
    '=': ' ἴσον ',
    'pi': ' πῖ ',
    '<': ' ἔλαττον ',
    '>': ' μεῖζον ',
    '%': ' τοῖς ἑκατόν ',
    '€': ' εὐρώ ',
    '$': ' δολάριον ',
    '£': ' λίρα ',
    '&': ' καὶ ',
    '@': ' ἀτ ',
    '#': ' δίεση ',
}


def _longest_key_first(table):
    """Return the (old, new) pairs of *table*, longest key first.

    ``sorted`` is stable, so keys of equal length keep their table order.
    """
    return sorted(table.items(), key=lambda item: len(item[0]), reverse=True)


# Language code -> replacement pairs, already in application order.
_ORDERED_REPLACEMENTS = {
    'grc': _longest_key_first(_GRC_REPLACEMENTS),
    'ron': _longest_key_first(_RON_REPLACEMENTS),
    'eng': _longest_key_first(_ENG_REPLACEMENTS),
    'deu': _longest_key_first(_DEU_REPLACEMENTS),
    'fra': _longest_key_first(_FRA_REPLACEMENTS),
    'hun': _longest_key_first(_HUN_REPLACEMENTS),
    'rmc-script_latin': _longest_key_first(_SERBIAN_REPLACEMENTS),
}


def fix_vocals(text, lang='ron'):
    """Apply the language-specific substring replacements to *text*.

    Replacements are plain, case-sensitive ``str.replace`` calls applied one
    after another, longest key first, so multi-character keys such as
    ``'sqrt'`` or ``'sch'`` are handled before their shorter substrings.
    Because they run sequentially, the output of one rule can be rewritten by
    a later one.

    Args:
        text: The string to rewrite.
        lang: One of ``'grc'``, ``'ron'``, ``'eng'``, ``'deu'``, ``'fra'``,
            ``'hun'`` or ``'rmc-script_latin'``.

    Returns:
        The rewritten string. For an unsupported ``lang`` a warning is printed
        and *text* is returned unchanged.
    """
    ordered_pairs = _ORDERED_REPLACEMENTS.get(lang)
    if not ordered_pairs:
        print(f"Warning: Language '{lang}' not supported for text replacement. Returning original text.")
        return text

    for old, new in ordered_pairs:
        text = text.replace(old, new)
    return text
|
|
|
|
|
def _num2words(text='01234', lang=None):
    """Spell out the number *text* in words for language *lang*.

    ``'grc'`` is delegated to ``convert_numbers``; every other language code
    is passed straight through to ``num2words``.
    """
    if lang != 'grc':
        return num2words(text, lang=lang)
    return convert_numbers(text)
|
|
|
|
|
# Project language code -> (code handed to _num2words, phrase joining mantissa
# and exponent, phrase spoken for the decimal separator). All phrases are
# padded with spaces because they are concatenated directly between words.
_NUMBER_LOCALES = {
    'rmc-script_latin': ('sr', ' puta deset na stepen od ', ' tačka '),
    'ron': ('ro', ' ori zece la puterea ', ' virgulă '),
    # NOTE(review): neither Hungarian phrase looks idiomatic (' virgula ' in
    # particular resembles the Romanian entry) — confirm with a speaker.
    'hun': ('hu', ' tízszer a erejéig ', ' virgula '),
    'deu': ('de', ' mal zehn hoch ', ' komma '),
    'fra': ('fr', ' puissance ', ' virgule '),
    'grc': ('grc', ' εις την δυναμην του ', ' κομμα '),
}

# An integer, optionally followed by a fractional part and/or an exponent.
_NUMBER_PATTERN = re.compile(r'\d+(?:\.\d+)?(?:[Ee][+-]?\d+)?')


def transliterate_number(number_string,
                         lang=None):
    """Replace every number in *number_string* with its spoken form.

    Three shapes are recognised:

    * scientific notation (``1.5e3``): mantissa and exponent are spelled out
      separately and joined by the language's exponent phrase;
    * decimals (``3.14``): the integer part is spelled out, followed by the
      decimal-separator word and then each fractional digit on its own;
    * plain integers.

    Text around the numbers is left untouched. If the converter raises
    ``ValueError`` for a number, that number is left as it was.

    Args:
        number_string: Text that may contain numbers.
        lang: Project language code (``'ron'``, ``'hun'``, ``'deu'``,
            ``'fra'``, ``'grc'``, ``'rmc-script_latin'``). Any other value is
            cut to its first two characters and read with English connective
            phrases; ``None`` is treated as English.

    Returns:
        The text with numbers spelled out.
    """
    if lang in _NUMBER_LOCALES:
        lang, exponential_pronoun, comma = _NUMBER_LOCALES[lang]
    else:
        # `lang` may be None (the default); slicing None would raise TypeError.
        lang = (lang or 'en')[:2]
        exponential_pronoun = ' times ten to the power of '
        comma = ' point '

    def replace_number(match):
        number_part = match.group(0)
        try:
            lowered = number_part.lower()
            if 'e' in lowered:
                base, _, exponent = lowered.partition('e')
                return (_num2words(base, lang=lang)
                        + exponential_pronoun
                        + _num2words(exponent, lang=lang))
            if '.' in number_part:
                integer_part, _, decimal_part = number_part.partition('.')
                spoken_digits = " ".join(
                    _num2words(digit, lang=lang) for digit in decimal_part)
                return _num2words(integer_part, lang=lang) + comma + spoken_digits
            return _num2words(number_part, lang=lang)
        except ValueError:
            # Best effort: keep the digits when they cannot be spelled out.
            return number_part

    # Matching only the number itself (no greedy non-digit prefix/suffix
    # groups) yields the same output and keeps digit-free text linear.
    return _NUMBER_PATTERN.sub(replace_number, number_string)
|
|