File size: 1,782 Bytes
b8e56e6
 
53bd534
 
 
 
 
648fedb
53bd534
 
 
 
 
 
 
 
 
 
 
 
 
c959ca0
 
53bd534
 
 
 
 
 
 
648fedb
 
 
53bd534
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7338704
53bd534
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
from nltk.tokenize import word_tokenize

special_combs = {
    "Dzs" : "Ј", "dzs" : "ј",
    "Dz" : "Ъ", "dz" : "ъ", #Actually the sound of ds in kids
    "Cs" : "Ч", "cs" : "ч",
    "Zs" : "Ж", "zs" : "ж",
    "Sz" : "С", "sz" : "с",
    "Ly" : "y", "ly" : "y"
}

hungarian_dict = {
    "á" : "a", "Á" : "A",
    "é" : "e", "É" : "E",
    "í" : "i", "Í" : "I",
    "ó" : "o", "Ó" : "O",
    "ö" : "a", "Ö" : "A",
    "ő" : "a", "Ő" : "A",
    "ú" : "u", "Ú" : "U",
    "ü" : "ю", "Ü" : "Ю",
    "ű" : "ю", "Ű" : "Ю",
    "j" : "y", "J" : "Y",
    "s" : "sh", "S": "Sh"
}

cyrillic_equiv_dict = {
    "ъ" : "ds", "ь" : "Ds",
    "ч" : "ch", "Ч" : "Ch",
    "ж" : "zh", "Ж" : "Zh",
    "ш" : "sh", "Ш" : "Sh",
    "ј" : "j" , "Ј" : "J",
    "ю" : "yu", "Ю" : "Yu",
    "с" : "s" , "С" : "S"
}

def check_special_comb(word):
    for comb in special_combs:
        if comb in word:
            word = word.replace(comb,special_combs[comb])
    return word

def hungarian_letter_to_eng(letter):
    if letter in hungarian_dict:
        return hungarian_dict[letter]
    else:
        return letter

def cyrillic_to_eng(word):
    for cyrillic in cyrillic_equiv_dict:
        if cyrillic in word:
            word = word.replace(cyrillic,cyrillic_equiv_dict[cyrillic])
    return word


def hungarian_sentence_to_latin(word):
    assert type(word)==str, "Input must be a string"
    # print("Original word: ", word)
    word = check_special_comb(word)
    # print("Just after special combination replacement: -",word)
    word = ''.join([hungarian_letter_to_eng(letter) for letter in word])
    # print("After regular word replacement: -",word)
    word = cyrillic_to_eng(word)
    # print("Simplified pronunciation: -",word)
    return word