Yarik committed on
Commit
acee69e
1 Parent(s): 61d74e2

Add application file

Browse files
accentor_lib/-setup.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from os import path, getenv
2
+
3
+ from setuptools import setup, find_packages
4
+
5
+
6
def get_requirements(requirements_filename: str):
    """Read a requirements file and return its entries as a list of strings.

    The file is looked up in the ``requirements/`` directory next to this
    setup.py. Blank lines and ``#`` comment lines are dropped. For VCS
    requirements the ``git+http...`` URL part is lower-cased, and when the
    ``GITHUB_TOKEN`` environment variable is set it is injected into
    ``github.com`` URLs so private repositories can be installed in CI.

    Args:
        requirements_filename: file name inside the ``requirements`` directory.

    Returns:
        List of cleaned requirement specifier strings.
    """
    requirements_file = path.join(path.abspath(path.dirname(__file__)),
                                  "requirements", requirements_filename)
    with open(requirements_file, 'r', encoding='utf-8') as requirements_io:
        lines = requirements_io.readlines()
    # Keep only non-empty, non-comment entries.
    requirements = [line.strip() for line in lines
                    if line.strip() and not line.strip().startswith("#")]

    # Read the token once instead of per-iteration (fix: the original also
    # reused the name `r` for the file handle, comprehension variable and
    # loop variable).
    github_token = getenv("GITHUB_TOKEN")
    for i, requirement in enumerate(requirements):
        if "@" in requirement:
            # Lower-case only the URL part of "git+http...@revision" specs;
            # the revision after "@" keeps its case.
            parts = [part.lower() if part.strip().startswith("git+http") else part
                     for part in requirement.split('@')]
            requirement = "@".join(parts)
        if github_token and "github.com" in requirement:
            # Inject the token for authenticated access to private repos.
            requirement = requirement.replace("github.com", f"{github_token}@github.com")
        requirements[i] = requirement
    return requirements
22
+
23
+
24
# Resolve data files relative to this setup.py so `pip install` works from
# any working directory (fix: the original opened "README.md" and
# "./version.py" relative to the CWD, unlike get_requirements above).
_here = path.abspath(path.dirname(__file__))

# Long description shown on PyPI.
with open(path.join(_here, "README.md"), "r", encoding="utf-8") as f:
    long_description = f.read()

# Parse __version__ out of version.py without importing the package
# (importing would pull in the heavy runtime dependencies).
with open(path.join(_here, "version.py"), "r", encoding="utf-8") as v:
    for line in v.readlines():
        if line.startswith("__version__"):
            # Support either quote style around the version string.
            quote = '"' if '"' in line else "'"
            version = line.split(quote)[1]

setup(
    name='ukrainian-accentor-transformer',
    version=version,
    description='Adds word stress for texts in Ukrainian',
    long_description=long_description,
    long_description_content_type='text/markdown',
    url='https://github.com/Theodotus1243/ukrainian-accentor-transformer',
    author='Theodotus1243',
    license='MIT',
    packages=find_packages(),
    install_requires=get_requirements("requirements.txt"),
    zip_safe=True,
    keywords='ukrainian accent stress nlp transformer linguistics',
)
accentor_lib/LICENSE.md ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2023 Bohdan Mykhailenko
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
accentor_lib/README.md ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Ukrainian Accentor Transformer
2
+
3
+ This repository contains a model that adds stress marks (accents) to Ukrainian words.
4
+
5
+ ## Installation
6
+
7
+ ```bash
8
+ pip install git+https://github.com/Theodotus1243/ukrainian-accentor-transformer.git
9
+ ```
10
+
11
+ ## Example
12
+
13
+ ```python
14
+ >>> from ukrainian_accentor_transformer import Accentor
15
+ >>> text = "Кам'янець-Подільський - місто в Хмельницькій області України, центр Кам'янець-Подільської міської об'єднаної територіальної громади і Кам'янець-Подільського району."
16
+ >>> accentor = Accentor()
17
+ >>> accentor(text)
18
+
19
+ "Кам'яне́ць-Поді́льський - мі́сто в Хмельни́цькій о́бласті Украї́ни, центр Кам'яне́ць-Поді́льської місько́ї об'є́днаної територіа́льної грома́ди і Кам'яне́ць-Поді́льського райо́ну."
20
+ ```
21
+
22
+ ## Attribution
23
+
24
+ Trained on dataset - [News corpus](https://lang.org.ua/en/corpora/#anchor5)
25
+ by [Dmytro Chaplynskyi](https://github.com/dchaplinsky) from [lang-uk](https://github.com/lang-uk)\
26
+ Stressed using [ukrainian-word-stress](https://github.com/lang-uk/ukrainian-word-stress)
27
+ by [Oleksiy Syvokon](https://github.com/asivokon)
accentor_lib/requirements/requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ ctranslate2
2
+ sentencepiece
3
+ # huggingface
4
+ huggingface-hub
accentor_lib/requirements/test_requirements.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ pytest
2
+ pytest-timeout
accentor_lib/tests/sentences.txt ADDED
The diff for this file is too large to render. See raw diff
 
accentor_lib/tests/test_accentor.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ import unittest
4
+
5
+ sys.path.append(os.path.dirname(os.path.dirname(__file__)))
6
+ from ukrainian_accentor_transformer import Accentor
7
+
8
+
9
class TestAccentor(unittest.TestCase):
    """End-to-end tests for the Accentor pipeline.

    One model instance is shared across all tests because loading the
    translation model is expensive.
    """

    @classmethod
    def setUpClass(cls):
        # Fix: the original named this parameter `TestAccentor`, shadowing
        # the class; use the conventional `cls`.
        cls.accentor = Accentor()

    def test_simple_accent(self):
        text = "Привіт хлопче, як справи."
        accented = self.accentor(text)
        # Removing the combining accent char must restore the input exactly.
        self.assertEqual(text, accented.replace("\u0301", ""))

    def test_batch_accent(self):
        # List input must return per-sentence results in order.
        text1 = "Привіт хлопче, як справи."
        text2 = "в мене все добре, дякую."
        accented1, accented2 = self.accentor([text1, text2])
        self.assertEqual(text1, accented1.replace("\u0301", ""))
        self.assertEqual(text2, accented2.replace("\u0301", ""))

    def test_long_sentence(self):
        text = "Адже як би не оцінював галичан один страшно інтелігентний виходець з радянсько єврейських середовищ київського Подолу самі галичани вважають свою культуру і традицію політичну і релігійну побутову й господарську на голову вищою від усього що за Збручем"
        accented = self.accentor(text)
        self.assertEqual(text, accented.replace("\u0301", ""))

    def test_very_long_sentence(self):
        # Fix: this method was also named `test_long_sentence`, which
        # silently replaced the previous definition so that test never ran.
        text = "Веселка також райдуга атмосферне оптичне явище що являє собою одну дві чи декілька спектральних дуг або кіл якщо дивитися з повітря що спостерігаються на тлі хмари якщо вона розташована проти Сонця Червоний колір спектру ми бачимо з зовнішнього боку первинної веселки а фіолетовий із внутрішнього"
        accented = self.accentor(text)
        self.assertEqual(text, accented.replace("\u0301", ""))

    def test_corpus(self):
        # NOTE(review): the path is relative to the CWD — run pytest from
        # the accentor_lib/ directory; confirm this matches CI.
        with open("tests/sentences.txt") as sentences_file:
            sentences = sentences_file.readlines()
        accented = self.accentor(sentences)
        clean_sentences = self.accentor._clean_accents(accented)
        for sentence, clean_sentence in zip(sentences, clean_sentences):
            self.assertEqual(sentence, clean_sentence)
44
+
45
+
46
# Allow running the suite directly: `python tests/test_accentor.py`.
if __name__ == '__main__':
    unittest.main()
accentor_lib/ukrainian_accentor_transformer/__init__.py ADDED
@@ -0,0 +1,195 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Union, Tuple
2
+
3
+ import ctranslate2
4
+ import sentencepiece as spm
5
+ from huggingface_hub import snapshot_download
6
+
7
+ from .sequence_utils import diff_fix
8
+
9
+
10
class Accentor:
    """Adds word-stress marks (combining acute, U+0301) to Ukrainian text.

    Pipeline: tokenize with SentencePiece, split at punctuation, halve
    over-long segments, translate each segment with a ctranslate2 model,
    re-join, detokenize, then diff-repair any text the model altered so
    only the accent marks differ from the input.
    """

    # Huggingface repo id, optionally suffixed with "@<revision>".
    _hf_repo = "theodotus/ukrainian-accentor-transformer@v0.1"

    # Segments longer than this many tokens are split in half before translation.
    max_len = 30
    # Punctuation tokens at which sentences are first split into segments.
    split_tokens = set([".", ",", "!", "?"])

    # ctranslate2.Translator construction options.
    _init_config = {
        "inter_threads": 2,
        "intra_threads": 4
    }

    # translate_batch runtime options.
    _run_config = {
        "repetition_penalty": 1.2,
        "max_batch_size": 8
    }

    def __init__(self, device: str = "cpu"):
        self._init_model(device=device)

    def __call__(self, sentence: Union[List[str], str],
                 symbol: str = "stress", mode: str = "reduced") -> Union[List[str], str]:
        """
        Add word stress to texts in Ukrainian
        Args:
            sentence: sentence (or list of sentences) to accent
            symbol: accent symbol (forwarded to _accent; not used there yet)
            mode: accent mode (forwarded to _accent; not used there yet)

        Returns:
            accented_sentence — same shape as the input (str in → str out,
            list in → list out)

        Raises:
            TypeError: if `sentence` is neither a str nor a list

        Examples:
            Simple usage.

            >>> from ukrainian_accentor_transformer import Accentor
            >>> accentor = Accentor()
            >>> accented_sentence = accentor("Привіт хлопче")
        """
        if isinstance(sentence, str):
            sentences = [sentence]
        elif isinstance(sentence, list):
            sentences = sentence
        else:
            # Fix: the original fell through for unsupported types and
            # crashed later with UnboundLocalError; fail fast instead.
            raise TypeError(f"Unsupported input type: {type(sentence)}")

        accented_sentences = self._accent(sentences=sentences, symbol=symbol, mode=mode)

        if isinstance(sentence, str):
            return accented_sentences[0]
        return accented_sentences

    def _accent(self, sentences: List[str], symbol: str, mode: str) -> List[str]:
        """
        Internal accent function
        Args:
            sentences: list of sentences to accent
            symbol, mode: forwarded from __call__ (currently unused here)

        Returns:
            accented_sentences
        """
        # Drop pre-existing accent marks so the model sees clean input.
        clean_sentences = self._clean_accents(sentences)

        # Tokenize, split at punctuation, then halve over-long segments.
        tokenized_sentences = self.sp.encode(clean_sentences, out_type=str)
        splitted_sentences = self._split_punctuation(tokenized_sentences)
        short_sentences = self._split_long(splitted_sentences)

        # Flatten all segments into one batch for the translator.
        translation_batch, join_list = self._to_translation_batch(short_sentences)
        results = self.model.translate_batch(translation_batch, **self._run_config)
        accented_tokens = [result.hypotheses[0] for result in results]

        # Re-join segments per sentence and detokenize.
        join_sentences = self._join_long(accented_tokens, join_list)
        accented_sentences = self.sp.decode(join_sentences)

        # Repair any text the model altered, keeping only added accents.
        fixed_sentences = self._diff_fix(clean_sentences, accented_sentences)

        return fixed_sentences

    def _clean_accents(self, sentences: List[str]) -> List[str]:
        # Strip the combining acute accent (U+0301) from every sentence.
        clean_sentences = [sentence.replace("\u0301", "") for sentence in sentences]
        return clean_sentences

    def _split_punctuation(self, tokenized_sentences: List[List[str]]) -> List[List[List[str]]]:
        # Apply the per-sentence punctuation split to each tokenized sentence.
        splitted_sentences = []
        for tokenized in tokenized_sentences:
            splitted = self._split_punctuation_sentence(tokenized)
            splitted_sentences.append(splitted)
        return splitted_sentences

    def _split_punctuation_sentence(self, tokenized: List[str]) -> List[List[str]]:
        # Split one token list into segments, each ending at a punctuation token.
        splitted = []
        start_idx = 0
        for idx, token in enumerate(tokenized, start=1):
            if token in self.split_tokens:
                splitted.append(tokenized[start_idx:idx])
                start_idx = idx
        # Trailing segment after the last punctuation token (if any).
        if (start_idx < len(tokenized)):
            splitted.append(tokenized[start_idx:])
        return splitted

    def _split_long(self, splitted_sentences: List[List[List[str]]]) -> List[List[List[str]]]:
        # Repeatedly halve over-long segments until a fixpoint is reached
        # (a single pass only splits each segment once).
        while True:
            short_sentences = []
            for tokenized in splitted_sentences:
                short = self._split_long_sentence(tokenized)
                short_sentences.append(short)
            if splitted_sentences == short_sentences:
                break
            else:
                splitted_sentences = short_sentences
        return short_sentences

    def _split_long_sentence(self, splitted: List[List[str]]) -> List[List[str]]:
        # Split each segment >= max_len tokens in half near a word boundary.
        short = []
        for sentence in splitted:
            if (len(sentence) < self.max_len):
                short.append(sentence)
            else:
                middle_idx = self._find_middle_space(sentence)
                short.append(sentence[:middle_idx])
                short.append(sentence[middle_idx:])
        return short

    @staticmethod
    def _find_middle_space(sentence: List[str]) -> int:
        """Return an index near the middle where a new SentencePiece word
        starts (token prefixed with "▁"); falls back to the exact middle
        when no word boundary is found within ±10% of the length."""
        middle_idx = len(sentence) // 2
        max_shift = len(sentence) // 10
        for i in range(max_shift):
            left_idx = middle_idx - i
            right_idx = middle_idx + i
            if (sentence[left_idx][0] == "▁"):
                return left_idx
            if (sentence[right_idx][0] == "▁"):
                return right_idx
        return middle_idx

    def _to_translation_batch(self, splitted_sentences: List[List[List[str]]]) -> Tuple[List[List[str]], List[int]]:
        # Flatten the per-sentence segment lists into one batch; join_list
        # remembers how many segments belong to each sentence.
        join_list = [len(sentence) for sentence in splitted_sentences]
        translation_batch = sum(splitted_sentences, [])
        return translation_batch, join_list

    def _join_long(self, splitted_sentences: List[List[str]], join_list: List[int]) -> List[List[str]]:
        # Inverse of _to_translation_batch: regroup the flat segment list
        # back into one token list per sentence.
        join_sentences = []
        sentence_idx = 0
        for join_len in join_list:
            sentence = sum(splitted_sentences[sentence_idx:sentence_idx + join_len], [])
            join_sentences.append(sentence)
            sentence_idx += join_len
        return join_sentences

    def _diff_fix(self, sentences: List[str], accented_sentences: List[str]):
        # Per-sentence diff repair: keep the input text, add only accents.
        fixed_sentences = [diff_fix(input=sentence, output=accented_sentence)
                           for sentence, accented_sentence in zip(sentences, accented_sentences)]
        return fixed_sentences

    def _init_model(self, device: str) -> None:
        """
        Initialize a model and tokenizer
        Args:
            device: device where to run model: "cpu" or "cuda"
        """
        repo_path = self._download_huggingface(self._hf_repo)

        self.model = ctranslate2.Translator(f"{repo_path}/ctranslate2/", device=device, **self._init_config)
        self.sp = spm.SentencePieceProcessor(model_file=f"{repo_path}/tokenizer.model")

    @staticmethod
    def _download_huggingface(repo_name: str) -> str:
        """
        Download a file from Huggingface
        Args:
            repo_name: name of repository to download, optionally with an
                "@<revision>" suffix

        Returns:
            repo_path
        """

        # Split off the optional "@revision" suffix; None means default branch.
        repo_name, *suffix = repo_name.split("@")
        revision = dict(enumerate(suffix)).get(0, None)

        repo_path = snapshot_download(repo_name, revision=revision)

        return repo_path
accentor_lib/ukrainian_accentor_transformer/sequence_utils.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from difflib import SequenceMatcher
2
+
3
+
4
def accent_flag(code: list, output: str):
    """Return True when `code` is a deletion opcode covering exactly the
    single combining-accent character (U+0301) in `output`."""
    operation, out_start, out_end = code[0], code[1], code[2]
    spans_one_char = (out_end - out_start == 1)
    is_accent_char = (output[out_start:out_end] == "\u0301")
    return operation == 'delete' and spans_one_char and is_accent_char
11
+
12
+
13
def get_opcodes(input: str, output: str):
    """Diff opcodes mapping `output` (a-side) onto `input` (b-side), with
    deletions of the combining accent re-tagged as 'equal' so diff_fix
    keeps the accent marks."""
    matcher = SequenceMatcher(a=output, b=input, autojunk=False)
    return [
        ("equal", *code[1:]) if accent_flag(code, output) else code
        for code in matcher.get_opcodes()
    ]
21
+
22
+
23
def diff_fix(input: str, output: str):
    """Rebuild `output` so its text matches `input` exactly, while keeping
    the accent marks the model inserted (re-tagged as 'equal' upstream)."""
    pieces = []
    for code in get_opcodes(input=input, output=output):
        operation = code[0]
        out_start, out_end, in_start, in_end = code[1:]
        if operation == "equal":
            # Matching text (including preserved accents): take the output side.
            pieces.append(output[out_start:out_end])
        elif operation in ("insert", "replace"):
            # Model dropped or changed text: restore it from the input side.
            pieces.append(input[in_start:in_end])
        # "delete": spurious characters the model added — skip them.
    return "".join(pieces)
accentor_lib/version.py ADDED
@@ -0,0 +1 @@
 
 
1
# Package version — parsed by setup.py and bumped in place by version_bump.py.
__version__ = "0.1.0"
accentor_lib/version_bump.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import fileinput
from os.path import join, dirname

# Version-bump helper: reads __version__ from version.py (next to this
# script), computes the next alpha pre-release, and rewrites version.py
# in place.

# Extract the version string, supporting either quote style.
# NOTE(review): `version` stays unbound if version.py contains no
# __version__ line, which would crash below — confirm the file always
# defines one.
with open(join(dirname(__file__), "version.py"), "r", encoding="utf-8") as v:
    for line in v.readlines():
        if line.startswith("__version__"):
            if '"' in line:
                version = line.split('"')[1]
            else:
                version = line.split("'")[1]

# "a" marks an alpha pre-release (e.g. "0.1.1a0").
if "a" not in version:
    # Release version: bump the last numeric part and start a new alpha series.
    parts = version.split('.')
    parts[-1] = str(int(parts[-1]) + 1)
    version = '.'.join(parts)
    version = f"{version}a0"
else:
    # Already an alpha: increment only the alpha counter.
    post = version.split("a")[1]
    new_post = int(post) + 1
    version = version.replace(f"a{post}", f"a{new_post}")

# Rewrite version.py in place, replacing only the __version__ line;
# all other lines are echoed back unchanged.
for line in fileinput.input(join(dirname(__file__), "version.py"), inplace=True):
    if line.startswith("__version__"):
        print(f"__version__ = \"{version}\"")
    else:
        print(line.rstrip('\n'))