File size: 3,091 Bytes
f9e5028
 
 
 
 
 
 
 
 
 
 
 
 
9e96240
 
 
 
f9e5028
 
1ce668d
 
f9e5028
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9e96240
 
 
 
 
 
 
 
f9e5028
 
 
 
 
 
 
9e96240
 
f9e5028
 
 
 
 
 
9e96240
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
from unittest import skip
from gruut import sentences
import torch

# Open the packaged accentor model. The path is relative, so it is resolved
# against the current working directory when this module is imported.
importer = torch.package.PackageImporter("ukrainian-accentor/accentor-lite.pt")
# Model object; this file only calls its `predict(words, mode='stress')` method.
accentor = importer.load_pickle("uk-accentor", "model")
# Callable applied to each prediction of the model; presumably rewrites the
# accent marks as "+" characters -- TODO confirm against the packaged code.
replace_accents = importer.load_pickle("uk-accentor", "replace_accents")

# Using GPU
# accentor.cuda()
# Back to CPU
# accentor.cpu()

# Lowercase letters counted as vowels when looking for stress candidates.
vowels = "аеєиіїоуюя"
# Lowercase consonants (the soft sign "ь" is listed here as well).
consonants = "бвгґджзйклмнпрстфхцчшщь"
# The ASCII apostrophe is treated as part of a word (e.g. "Кам'янець").
special = "'"
# Every character that may appear inside a token considered for stressing;
# any other character acts as a separator in sentence_to_stress.
alphabet = vowels + consonants + special

def accent_word(word):
    """Return *word* with its stress marked by the accentor model.

    The word is passed to ``accentor.predict`` as a one-element batch in
    ``'stress'`` mode, every prediction is run through ``replace_accents``,
    and the first converted prediction is returned.
    """
    # Inference only: gradient tracking is switched off for the model call.
    with torch.no_grad():
        predictions = accentor.predict([word], mode='stress')
    converted = list(map(replace_accents, predictions))
    return converted[0]

def _split_on_non_letters(word):
    """Split *word* into runs of alphabet letters and single separators.

    Every character whose lowercase form is not in ``alphabet`` becomes its
    own piece; the (possibly empty) runs of letters between such characters
    are kept in order, so ``"".join(pieces) == word``.
    """
    pieces = []
    run_start = 0
    for index, char in enumerate(word):
        # Lowercase per character rather than the whole word: the positions
        # then always line up with ``word``, even for characters whose
        # lowercase form has a different length.
        if char.lower() not in alphabet:
            pieces.append(word[run_start:index])
            pieces.append(char)
            run_start = index + 1
    pieces.append(word[run_start:])
    return pieces


def _stress_letter_run(piece):
    """Return *piece* with a stress mark added when it contains vowels.

    Pieces without vowels (separators, empty strings, consonant-only runs)
    are returned unchanged. A piece with exactly one vowel gets "+" inserted
    directly before that vowel. Anything with more vowels is delegated to
    ``accent_word``.
    """
    # Vowels are matched case-insensitively so that capitalised words such as
    # "Як" or "Олег" are counted correctly.
    vowel_positions = [
        index for index, char in enumerate(piece) if char.lower() in vowels
    ]
    if not vowel_positions:
        return piece
    if len(vowel_positions) == 1:
        position = vowel_positions[0]
        return piece[:position] + "+" + piece[position:]
    return accent_word(piece)


def sentence_to_stress(sentence):
    """Return *sentence* with "+" stress marks added to its words.

    The sentence is split on whitespace, and each chunk is further split on
    characters outside ``alphabet`` (punctuation, hyphens, digits, ...), which
    are copied to the output unchanged. Each remaining run of letters is
    stressed: one-vowel runs get "+" before the vowel, multi-vowel runs are
    sent to ``accent_word``, vowel-less runs are left alone.

    Args:
        sentence: Text to process.

    Returns:
        The stressed text. Whitespace-separated chunks are re-joined with a
        single space, and a space follows the last chunk as well (kept for
        compatibility with the previous output format). An empty or
        whitespace-only sentence yields an empty string.
    """
    output = []
    for chunk in sentence.split():
        output.extend(
            _stress_letter_run(piece) for piece in _split_on_non_letters(chunk)
        )
        output.append(" ")
    return "".join(output)


if __name__ == "__main__":
    # Manual smoke test: stress two sample sentences, then show the raw model
    # output for a single word next to its "+"-marked form.
    demo_sentences = (
        "Кам'янець-Подільський - місто в Хмельницькій області України, центр Кам'янець-Подільської міської об'єднаної територіальної громади і Кам'янець-Подільського району.",
        "Привіт, як тебе звати?",
    )
    for demo_sentence in demo_sentences:
        print(sentence_to_stress(demo_sentence))
    # Further words worth trying: "словотворення", "архаїчний", "програма", "а-ля-фуршет"

    raw_predictions = accentor.predict(["привіт"], mode='stress')
    plus_marked = list(map(replace_accents, raw_predictions))

    print('With stress:', raw_predictions)
    print('With pluses:', plus_marked)