Yarik committed on
Commit
acee69e
1 Parent(s): 61d74e2

Add application file

Browse files
accentor_lib/-setup.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from os import path, getenv
2
+
3
+ from setuptools import setup, find_packages
4
+
5
+
6
def get_requirements(requirements_filename: str):
    """Read a requirements file and return its entries as a list of strings.

    The file is looked up in the ``requirements/`` directory next to this
    setup.py. Blank lines and ``#`` comment lines are dropped. For VCS
    requirements the ``git+http...`` URL part is lower-cased, and when the
    ``GITHUB_TOKEN`` environment variable is set it is injected into
    ``github.com`` URLs so private repositories can be installed in CI.

    Args:
        requirements_filename: file name inside the ``requirements`` directory.

    Returns:
        List of cleaned requirement specifier strings.
    """
    requirements_file = path.join(path.abspath(path.dirname(__file__)),
                                  "requirements", requirements_filename)
    with open(requirements_file, 'r', encoding='utf-8') as requirements_io:
        lines = requirements_io.readlines()
    # Keep only non-empty, non-comment entries.
    requirements = [line.strip() for line in lines
                    if line.strip() and not line.strip().startswith("#")]

    # Read the token once instead of per-iteration (fix: the original also
    # reused the name `r` for the file handle, comprehension variable and
    # loop variable).
    github_token = getenv("GITHUB_TOKEN")
    for i, requirement in enumerate(requirements):
        if "@" in requirement:
            # Lower-case only the URL part of "git+http...@revision" specs;
            # the revision after "@" keeps its case.
            parts = [part.lower() if part.strip().startswith("git+http") else part
                     for part in requirement.split('@')]
            requirement = "@".join(parts)
        if github_token and "github.com" in requirement:
            # Inject the token for authenticated access to private repos.
            requirement = requirement.replace("github.com", f"{github_token}@github.com")
        requirements[i] = requirement
    return requirements
22
+
23
+
24
# Resolve data files relative to this setup.py so `pip install` works from
# any working directory (fix: the original opened "README.md" and
# "./version.py" relative to the CWD, unlike get_requirements above).
_here = path.abspath(path.dirname(__file__))

# Long description shown on PyPI.
with open(path.join(_here, "README.md"), "r", encoding="utf-8") as f:
    long_description = f.read()

# Parse __version__ out of version.py without importing the package
# (importing would pull in the heavy runtime dependencies).
with open(path.join(_here, "version.py"), "r", encoding="utf-8") as v:
    for line in v.readlines():
        if line.startswith("__version__"):
            # Support either quote style around the version string.
            quote = '"' if '"' in line else "'"
            version = line.split(quote)[1]

setup(
    name='ukrainian-accentor-transformer',
    version=version,
    description='Adds word stress for texts in Ukrainian',
    long_description=long_description,
    long_description_content_type='text/markdown',
    url='https://github.com/Theodotus1243/ukrainian-accentor-transformer',
    author='Theodotus1243',
    license='MIT',
    packages=find_packages(),
    install_requires=get_requirements("requirements.txt"),
    zip_safe=True,
    keywords='ukrainian accent stress nlp transformer linguistics',
)
accentor_lib/LICENSE.md ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2023 Bohdan Mykhailenko
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
accentor_lib/README.md ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Ukrainian Accentor Transformer
2
+
3
+ This repository contains a model that adds stress marks (accents) to Ukrainian words.
4
+
5
+ ## Installation
6
+
7
+ ```bash
8
+ pip install git+https://github.com/Theodotus1243/ukrainian-accentor-transformer.git
9
+ ```
10
+
11
+ ## Example
12
+
13
+ ```python
14
+ >>> from ukrainian_accentor_transformer import Accentor
15
+ >>> text = "Кам'янець-Подільський - місто в Хмельницькій області України, центр Кам'янець-Подільської міської об'єднаної територіальної громади і Кам'янець-Подільського району."
16
+ >>> accentor = Accentor()
17
+ >>> accentor(text)
18
+
19
+ "Кам'яне́ць-Поді́льський - мі́сто в Хмельни́цькій о́бласті Украї́ни, центр Кам'яне́ць-Поді́льської місько́ї об'є́днаної територіа́льної грома́ди і Кам'яне́ць-Поді́льського райо́ну."
20
+ ```
21
+
22
+ ## Attribution
23
+
24
+ Trained on dataset - [News corpus](https://lang.org.ua/en/corpora/#anchor5)
25
+ by [Dmytro Chaplynskyi](https://github.com/dchaplinsky) from [lang-uk](https://github.com/lang-uk)\
26
+ Stressed using [ukrainian-word-stress](https://github.com/lang-uk/ukrainian-word-stress)
27
+ by [Oleksiy Syvokon](https://github.com/asivokon)
accentor_lib/requirements/requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ ctranslate2
2
+ sentencepiece
3
+ # huggingface
4
+ huggingface-hub
accentor_lib/requirements/test_requirements.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ pytest
2
+ pytest-timeout
accentor_lib/tests/sentences.txt ADDED
The diff for this file is too large to render. See raw diff
 
accentor_lib/tests/test_accentor.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ import unittest
4
+
5
+ sys.path.append(os.path.dirname(os.path.dirname(__file__)))
6
+ from ukrainian_accentor_transformer import Accentor
7
+
8
+
9
class TestAccentor(unittest.TestCase):
    """End-to-end tests for the Accentor pipeline.

    One model instance is shared across all tests because loading the
    translation model is expensive.
    """

    @classmethod
    def setUpClass(cls):
        # Fix: the original named this parameter `TestAccentor`, shadowing
        # the class; use the conventional `cls`.
        cls.accentor = Accentor()

    def test_simple_accent(self):
        text = "Привіт хлопче, як справи."
        accented = self.accentor(text)
        # Removing the combining accent char must restore the input exactly.
        self.assertEqual(text, accented.replace("\u0301", ""))

    def test_batch_accent(self):
        # List input must return per-sentence results in order.
        text1 = "Привіт хлопче, як справи."
        text2 = "в мене все добре, дякую."
        accented1, accented2 = self.accentor([text1, text2])
        self.assertEqual(text1, accented1.replace("\u0301", ""))
        self.assertEqual(text2, accented2.replace("\u0301", ""))

    def test_long_sentence(self):
        text = "Адже як би не оцінював галичан один страшно інтелігентний виходець з радянсько єврейських середовищ київського Подолу самі галичани вважають свою культуру і традицію політичну і релігійну побутову й господарську на голову вищою від усього що за Збручем"
        accented = self.accentor(text)
        self.assertEqual(text, accented.replace("\u0301", ""))

    def test_very_long_sentence(self):
        # Fix: this method was also named `test_long_sentence`, which
        # silently replaced the previous definition so that test never ran.
        text = "Веселка також райдуга атмосферне оптичне явище що являє собою одну дві чи декілька спектральних дуг або кіл якщо дивитися з повітря що спостерігаються на тлі хмари якщо вона розташована проти Сонця Червоний колір спектру ми бачимо з зовнішнього боку первинної веселки а фіолетовий із внутрішнього"
        accented = self.accentor(text)
        self.assertEqual(text, accented.replace("\u0301", ""))

    def test_corpus(self):
        # NOTE(review): the path is relative to the CWD — run pytest from
        # the accentor_lib/ directory; confirm this matches CI.
        with open("tests/sentences.txt") as sentences_file:
            sentences = sentences_file.readlines()
        accented = self.accentor(sentences)
        clean_sentences = self.accentor._clean_accents(accented)
        for sentence, clean_sentence in zip(sentences, clean_sentences):
            self.assertEqual(sentence, clean_sentence)
44
+
45
+
46
# Allow running the suite directly: `python tests/test_accentor.py`.
if __name__ == '__main__':
    unittest.main()
accentor_lib/ukrainian_accentor_transformer/__init__.py ADDED
@@ -0,0 +1,195 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Union, Tuple
2
+
3
+ import ctranslate2
4
+ import sentencepiece as spm
5
+ from huggingface_hub import snapshot_download
6
+
7
+ from .sequence_utils import diff_fix
8
+
9
+
10
class Accentor:
    """Adds word-stress marks (combining acute, U+0301) to Ukrainian text.

    Pipeline: tokenize with SentencePiece, split at punctuation, halve
    over-long segments, translate each segment with a ctranslate2 model,
    re-join, detokenize, then diff-repair any text the model altered so
    only the accent marks differ from the input.
    """

    # Huggingface repo id, optionally suffixed with "@<revision>".
    _hf_repo = "theodotus/ukrainian-accentor-transformer@v0.1"

    # Segments longer than this many tokens are split in half before translation.
    max_len = 30
    # Punctuation tokens at which sentences are first split into segments.
    split_tokens = set([".", ",", "!", "?"])

    # ctranslate2.Translator construction options.
    _init_config = {
        "inter_threads": 2,
        "intra_threads": 4
    }

    # translate_batch runtime options.
    _run_config = {
        "repetition_penalty": 1.2,
        "max_batch_size": 8
    }

    def __init__(self, device: str = "cpu"):
        self._init_model(device=device)

    def __call__(self, sentence: Union[List[str], str],
                 symbol: str = "stress", mode: str = "reduced") -> Union[List[str], str]:
        """
        Add word stress to texts in Ukrainian
        Args:
            sentence: sentence (or list of sentences) to accent
            symbol: accent symbol (forwarded to _accent; not used there yet)
            mode: accent mode (forwarded to _accent; not used there yet)

        Returns:
            accented_sentence — same shape as the input (str in → str out,
            list in → list out)

        Raises:
            TypeError: if `sentence` is neither a str nor a list

        Examples:
            Simple usage.

            >>> from ukrainian_accentor_transformer import Accentor
            >>> accentor = Accentor()
            >>> accented_sentence = accentor("Привіт хлопче")
        """
        if isinstance(sentence, str):
            sentences = [sentence]
        elif isinstance(sentence, list):
            sentences = sentence
        else:
            # Fix: the original fell through for unsupported types and
            # crashed later with UnboundLocalError; fail fast instead.
            raise TypeError(f"Unsupported input type: {type(sentence)}")

        accented_sentences = self._accent(sentences=sentences, symbol=symbol, mode=mode)

        if isinstance(sentence, str):
            return accented_sentences[0]
        return accented_sentences

    def _accent(self, sentences: List[str], symbol: str, mode: str) -> List[str]:
        """
        Internal accent function
        Args:
            sentences: list of sentences to accent
            symbol, mode: forwarded from __call__ (currently unused here)

        Returns:
            accented_sentences
        """
        # Drop pre-existing accent marks so the model sees clean input.
        clean_sentences = self._clean_accents(sentences)

        # Tokenize, split at punctuation, then halve over-long segments.
        tokenized_sentences = self.sp.encode(clean_sentences, out_type=str)
        splitted_sentences = self._split_punctuation(tokenized_sentences)
        short_sentences = self._split_long(splitted_sentences)

        # Flatten all segments into one batch for the translator.
        translation_batch, join_list = self._to_translation_batch(short_sentences)
        results = self.model.translate_batch(translation_batch, **self._run_config)
        accented_tokens = [result.hypotheses[0] for result in results]

        # Re-join segments per sentence and detokenize.
        join_sentences = self._join_long(accented_tokens, join_list)
        accented_sentences = self.sp.decode(join_sentences)

        # Repair any text the model altered, keeping only added accents.
        fixed_sentences = self._diff_fix(clean_sentences, accented_sentences)

        return fixed_sentences

    def _clean_accents(self, sentences: List[str]) -> List[str]:
        # Strip the combining acute accent (U+0301) from every sentence.
        clean_sentences = [sentence.replace("\u0301", "") for sentence in sentences]
        return clean_sentences

    def _split_punctuation(self, tokenized_sentences: List[List[str]]) -> List[List[List[str]]]:
        # Apply the per-sentence punctuation split to each tokenized sentence.
        splitted_sentences = []
        for tokenized in tokenized_sentences:
            splitted = self._split_punctuation_sentence(tokenized)
            splitted_sentences.append(splitted)
        return splitted_sentences

    def _split_punctuation_sentence(self, tokenized: List[str]) -> List[List[str]]:
        # Split one token list into segments, each ending at a punctuation token.
        splitted = []
        start_idx = 0
        for idx, token in enumerate(tokenized, start=1):
            if token in self.split_tokens:
                splitted.append(tokenized[start_idx:idx])
                start_idx = idx
        # Trailing segment after the last punctuation token (if any).
        if (start_idx < len(tokenized)):
            splitted.append(tokenized[start_idx:])
        return splitted

    def _split_long(self, splitted_sentences: List[List[List[str]]]) -> List[List[List[str]]]:
        # Repeatedly halve over-long segments until a fixpoint is reached
        # (a single pass only splits each segment once).
        while True:
            short_sentences = []
            for tokenized in splitted_sentences:
                short = self._split_long_sentence(tokenized)
                short_sentences.append(short)
            if splitted_sentences == short_sentences:
                break
            else:
                splitted_sentences = short_sentences
        return short_sentences

    def _split_long_sentence(self, splitted: List[List[str]]) -> List[List[str]]:
        # Split each segment >= max_len tokens in half near a word boundary.
        short = []
        for sentence in splitted:
            if (len(sentence) < self.max_len):
                short.append(sentence)
            else:
                middle_idx = self._find_middle_space(sentence)
                short.append(sentence[:middle_idx])
                short.append(sentence[middle_idx:])
        return short

    @staticmethod
    def _find_middle_space(sentence: List[str]) -> int:
        """Return an index near the middle where a new SentencePiece word
        starts (token prefixed with "▁"); falls back to the exact middle
        when no word boundary is found within ±10% of the length."""
        middle_idx = len(sentence) // 2
        max_shift = len(sentence) // 10
        for i in range(max_shift):
            left_idx = middle_idx - i
            right_idx = middle_idx + i
            if (sentence[left_idx][0] == "▁"):
                return left_idx
            if (sentence[right_idx][0] == "▁"):
                return right_idx
        return middle_idx

    def _to_translation_batch(self, splitted_sentences: List[List[List[str]]]) -> Tuple[List[List[str]], List[int]]:
        # Flatten the per-sentence segment lists into one batch; join_list
        # remembers how many segments belong to each sentence.
        join_list = [len(sentence) for sentence in splitted_sentences]
        translation_batch = sum(splitted_sentences, [])
        return translation_batch, join_list

    def _join_long(self, splitted_sentences: List[List[str]], join_list: List[int]) -> List[List[str]]:
        # Inverse of _to_translation_batch: regroup the flat segment list
        # back into one token list per sentence.
        join_sentences = []
        sentence_idx = 0
        for join_len in join_list:
            sentence = sum(splitted_sentences[sentence_idx:sentence_idx + join_len], [])
            join_sentences.append(sentence)
            sentence_idx += join_len
        return join_sentences

    def _diff_fix(self, sentences: List[str], accented_sentences: List[str]):
        # Per-sentence diff repair: keep the input text, add only accents.
        fixed_sentences = [diff_fix(input=sentence, output=accented_sentence)
                           for sentence, accented_sentence in zip(sentences, accented_sentences)]
        return fixed_sentences

    def _init_model(self, device: str) -> None:
        """
        Initialize a model and tokenizer
        Args:
            device: device where to run model: "cpu" or "cuda"
        """
        repo_path = self._download_huggingface(self._hf_repo)

        self.model = ctranslate2.Translator(f"{repo_path}/ctranslate2/", device=device, **self._init_config)
        self.sp = spm.SentencePieceProcessor(model_file=f"{repo_path}/tokenizer.model")

    @staticmethod
    def _download_huggingface(repo_name: str) -> str:
        """
        Download a file from Huggingface
        Args:
            repo_name: name of repository to download, optionally with an
                "@<revision>" suffix

        Returns:
            repo_path
        """

        # Split off the optional "@revision" suffix; None means default branch.
        repo_name, *suffix = repo_name.split("@")
        revision = dict(enumerate(suffix)).get(0, None)

        repo_path = snapshot_download(repo_name, revision=revision)

        return repo_path
accentor_lib/ukrainian_accentor_transformer/sequence_utils.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from difflib import SequenceMatcher
2
+
3
+
4
def accent_flag(code: list, output: str):
    """Return True when `code` is a deletion opcode covering exactly the
    single combining-accent character (U+0301) in `output`."""
    operation, out_start, out_end = code[0], code[1], code[2]
    spans_one_char = (out_end - out_start == 1)
    is_accent_char = (output[out_start:out_end] == "\u0301")
    return operation == 'delete' and spans_one_char and is_accent_char
11
+
12
+
13
def get_opcodes(input: str, output: str):
    """Diff opcodes mapping `output` (a-side) onto `input` (b-side), with
    deletions of the combining accent re-tagged as 'equal' so diff_fix
    keeps the accent marks."""
    matcher = SequenceMatcher(a=output, b=input, autojunk=False)
    return [
        ("equal", *code[1:]) if accent_flag(code, output) else code
        for code in matcher.get_opcodes()
    ]
21
+
22
+
23
def diff_fix(input: str, output: str):
    """Rebuild `output` so its text matches `input` exactly, while keeping
    the accent marks the model inserted (re-tagged as 'equal' upstream)."""
    pieces = []
    for code in get_opcodes(input=input, output=output):
        operation = code[0]
        out_start, out_end, in_start, in_end = code[1:]
        if operation == "equal":
            # Matching text (including preserved accents): take the output side.
            pieces.append(output[out_start:out_end])
        elif operation in ("insert", "replace"):
            # Model dropped or changed text: restore it from the input side.
            pieces.append(input[in_start:in_end])
        # "delete": spurious characters the model added — skip them.
    return "".join(pieces)
accentor_lib/version.py ADDED
@@ -0,0 +1 @@
 
 
1
# Package version — parsed by setup.py and bumped in place by version_bump.py.
__version__ = "0.1.0"
accentor_lib/version_bump.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import fileinput
from os.path import join, dirname

# Version-bump helper: reads __version__ from version.py (next to this
# script), computes the next alpha pre-release, and rewrites version.py
# in place.

# Extract the version string, supporting either quote style.
# NOTE(review): `version` stays unbound if version.py contains no
# __version__ line, which would crash below — confirm the file always
# defines one.
with open(join(dirname(__file__), "version.py"), "r", encoding="utf-8") as v:
    for line in v.readlines():
        if line.startswith("__version__"):
            if '"' in line:
                version = line.split('"')[1]
            else:
                version = line.split("'")[1]

# "a" marks an alpha pre-release (e.g. "0.1.1a0").
if "a" not in version:
    # Release version: bump the last numeric part and start a new alpha series.
    parts = version.split('.')
    parts[-1] = str(int(parts[-1]) + 1)
    version = '.'.join(parts)
    version = f"{version}a0"
else:
    # Already an alpha: increment only the alpha counter.
    post = version.split("a")[1]
    new_post = int(post) + 1
    version = version.replace(f"a{post}", f"a{new_post}")

# Rewrite version.py in place, replacing only the __version__ line;
# all other lines are echoed back unchanged.
for line in fileinput.input(join(dirname(__file__), "version.py"), inplace=True):
    if line.startswith("__version__"):
        print(f"__version__ = \"{version}\"")
    else:
        print(line.rstrip('\n'))