Delete folder ruaccent with huggingface_hub
- ruaccent/__init__.py +0 -1
- ruaccent/accent_model.py +0 -27
- ruaccent/char_tokenizer.py +0 -112
- ruaccent/omograph_model.py +0 -21
- ruaccent/ruaccent.py +0 -153
ruaccent/__init__.py
DELETED
@@ -1 +0,0 @@
-from .ruaccent import RUAccent
ruaccent/accent_model.py
DELETED
@@ -1,27 +0,0 @@
-import torch
-from .char_tokenizer import CharTokenizer
-from transformers import AutoModelForTokenClassification
-
-class AccentModel:
-    def __init__(self, allow_cuda=True) -> None:
-        self.device = torch.device('cuda' if torch.cuda.is_available() and allow_cuda else 'cpu')
-    def load(self, path):
-        self.model = AutoModelForTokenClassification.from_pretrained(path).to(self.device)
-        self.tokenizer = CharTokenizer.from_pretrained(path)
-
-    def render_stress(self, word, token_classes):
-        if 'STRESS' in token_classes:
-            index = token_classes.index('STRESS')
-            word = list(word)
-            word[index-1] = '+' + word[index-1]
-            return ''.join(word)
-        else:
-            return word
-
-    def put_accent(self, word):
-        inputs = self.tokenizer(word, return_tensors="pt").to(self.device)
-        with torch.no_grad():
-            logits = self.model(**inputs).logits
-        predictions = torch.argmax(logits, dim=2)
-        predicted_token_class = [self.model.config.id2label[t.item()] for t in predictions[0]]
-        return self.render_stress(word, predicted_token_class)
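Note on render_stress: the predicted class index is shifted back by one because the char tokenizer prepends a [bos] token, so position i in token_classes maps to character i-1 of the word. A minimal worked sketch with a hypothetical prediction (the code only guarantees the 'STRESS' label; the other label names here are placeholders):

    word = "замок"
    # positions: [bos]  з    а    м    о    к   [eos]
    classes = ["NO", "NO", "NO", "NO", "STRESS", "NO", "NO"]
    # classes.index("STRESS") == 4, so '+' is inserted before word[3] ('о'):
    # AccentModel(allow_cuda=False).render_stress(word, classes) -> "зам+ок"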
ruaccent/char_tokenizer.py
DELETED
@@ -1,112 +0,0 @@
-import os
-from typing import Optional, Tuple, List
-from collections import OrderedDict
-
-from transformers import PreTrainedTokenizer
-
-
-def load_vocab(vocab_file):
-    vocab = OrderedDict()
-    with open(vocab_file, "r", encoding="utf-8") as reader:
-        tokens = reader.readlines()
-    for index, token in enumerate(tokens):
-        token = token.rstrip("\n")
-        vocab[token] = index
-    return vocab
-
-
-class CharTokenizer(PreTrainedTokenizer):
-    vocab_files_names = {"vocab_file": "vocab.txt"}
-
-    def __init__(
-        self,
-        vocab_file=None,
-        pad_token="[pad]",
-        unk_token="[unk]",
-        bos_token="[bos]",
-        eos_token="[eos]",
-        do_lower_case=False,
-        *args,
-        **kwargs
-    ):
-        super().__init__(
-            pad_token=pad_token,
-            unk_token=unk_token,
-            bos_token=bos_token,
-            eos_token=eos_token,
-            do_lower_case=do_lower_case,
-            **kwargs
-        )
-        self.do_lower_case = do_lower_case
-
-        if not vocab_file or not os.path.isfile(vocab_file):
-            self.vocab = OrderedDict()
-            self.ids_to_tokens = OrderedDict()
-        else:
-            self.vocab = load_vocab(vocab_file)
-            self.ids_to_tokens = OrderedDict([(ids, tok) for tok, ids in self.vocab.items()])
-
-    @property
-    def vocab_size(self):
-        return len(self.vocab)
-
-    def get_vocab(self):
-        return self.vocab
-
-    def _convert_token_to_id(self, token):
-        if self.do_lower_case:
-            token = token.lower()
-        return self.vocab.get(token, self.vocab[self.unk_token])
-
-    def _convert_id_to_token(self, index):
-        return self.ids_to_tokens[index]
-
-    def _tokenize(self, text):
-        if self.do_lower_case:
-            text = text.lower()
-        return list(text)
-
-    def convert_tokens_to_string(self, tokens):
-        return "".join(tokens)
-
-    def build_inputs_with_special_tokens(
-        self,
-        token_ids_0: List[int],
-        token_ids_1: Optional[List[int]] = None
-    ) -> List[int]:
-        bos = [self.bos_token_id]
-        eos = [self.eos_token_id]
-        return bos + token_ids_0 + eos
-
-    def get_special_tokens_mask(
-        self,
-        token_ids_0: List[int],
-        token_ids_1: Optional[List[int]] = None
-    ) -> List[int]:
-        return [1] + ([0] * len(token_ids_0)) + [1]
-
-    def create_token_type_ids_from_sequences(
-        self,
-        token_ids_0: List[int],
-        token_ids_1: Optional[List[int]] = None
-    ) -> List[int]:
-        return (len(token_ids_0) + 2) * [0]
-
-    def save_vocabulary(
-        self,
-        save_directory: str,
-        filename_prefix: Optional[str] = None
-    ) -> Tuple[str]:
-        assert os.path.isdir(save_directory)
-        vocab_file = os.path.join(
-            save_directory,
-            (filename_prefix + "-" if filename_prefix else "") +
-            self.vocab_files_names["vocab_file"]
-        )
-        index = 0
-        with open(vocab_file, "w", encoding="utf-8") as writer:
-            for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
-                assert index == token_index
-                writer.write(token + "\n")
-                index += 1
-        return (vocab_file,)
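For orientation, CharTokenizer is a plain character-level tokenizer: _tokenize splits text into individual characters and build_inputs_with_special_tokens wraps the resulting ids in [bos]/[eos]. A minimal sketch, assuming a vocab.txt that contains the special tokens and the characters being encoded:

    tok = CharTokenizer(vocab_file="vocab.txt")
    tok._tokenize("замок")         # ['з', 'а', 'м', 'о', 'к']
    tok("замок")["input_ids"]      # [bos id] + one id per character + [eos id]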
ruaccent/omograph_model.py
DELETED
@@ -1,21 +0,0 @@
-from transformers import AutoModelForSequenceClassification, AutoTokenizer
-import torch
-
-class OmographModel:
-    def __init__(self, allow_cuda=True) -> None:
-        self.device = torch.device('cuda' if torch.cuda.is_available() and allow_cuda else 'cpu')
-
-    def load(self, path):
-        self.nli_model = AutoModelForSequenceClassification.from_pretrained(path, torch_dtype=torch.bfloat16).to(self.device)
-        self.tokenizer = AutoTokenizer.from_pretrained(path)
-
-    def classify(self, text, hypotheses):
-        encodings = self.tokenizer.batch_encode_plus([(text, hyp) for hyp in hypotheses], return_tensors='pt', padding=True)
-        input_ids = encodings['input_ids'].to(self.device)
-        with torch.no_grad():
-            logits = self.nli_model(input_ids)[0]
-        entail_contradiction_logits = logits[:,[0,2]]
-        probs = entail_contradiction_logits.softmax(dim=1)
-        prob_label_is_true = [float(p[1]) for p in probs]
-
-        return hypotheses[prob_label_is_true.index(max(prob_label_is_true))]
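classify frames omograph resolution as zero-shot NLI: each candidate variant is paired with the sentence, two of the three NLI logits (columns 0 and 2) are kept and softmaxed, and the variant with the highest second probability is returned. A minimal sketch, assuming the model files from nn/nn_omograph/medium/ are present locally and that variants carry '+' stress marks (the example strings are hypothetical):

    om = OmographModel(allow_cuda=False)
    om.load("nn/nn_omograph/medium/")
    om.classify("я видел <w>замок</w> на горе", ["з+амок", "зам+ок"])
    # -> the variant the NLI model scores as best fitting the sentence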
ruaccent/ruaccent.py
DELETED
@@ -1,153 +0,0 @@
-import json
-import pathlib
-from huggingface_hub import snapshot_download
-import os
-from os.path import join as join_path
-from .omograph_model import OmographModel
-from .accent_model import AccentModel
-import re
-
-
-class RUAccent:
-    def __init__(self, workdir=None, allow_cuda=True):
-        self.omograph_model = OmographModel(allow_cuda=allow_cuda)
-        self.accent_model = AccentModel(allow_cuda=allow_cuda)
-        if not workdir:
-            self.workdir = str(pathlib.Path(__file__).resolve().parent)
-        else:
-            self.workdir = workdir
-
-    def load(
-        self,
-        omograph_model_size="medium",
-        dict_load_startup=False,
-        disable_accent_dict=False,
-        repo="TeraTTS/accentuator",
-    ):
-        if not os.path.exists(
-            join_path(self.workdir, "dictionary")
-        ) or not os.path.exists(join_path(self.workdir, "nn")):
-            snapshot_download(
-                repo_id=repo,
-                ignore_patterns=["*.md", "*.gitattributes"],
-                local_dir=self.workdir,
-                local_dir_use_symlinks=False,
-            )
-        self.omographs = json.load(
-            open(join_path(self.workdir, "dictionary/omographs.json"), encoding='utf-8')
-        )
-        self.yo_words = json.load(
-            open(join_path(self.workdir, "dictionary/yo_words.json"), encoding='utf-8')
-        )
-        self.dict_load_startup = dict_load_startup
-
-        if dict_load_startup:
-            self.accents = json.load(
-                open(join_path(self.workdir, "dictionary/accents.json"), encoding='utf-8')
-            )
-        if disable_accent_dict:
-            self.accents = {}
-            self.disable_accent_dict = True
-        else:
-            self.disable_accent_dict = False
-
-        if omograph_model_size not in ["small", "medium"]:
-            raise NotImplementedError
-
-        self.omograph_model.load(
-            join_path(self.workdir, f"nn/nn_omograph/{omograph_model_size}/")
-        )
-        self.accent_model.load(join_path(self.workdir, "nn/nn_accent/"))
-
-
-    def split_by_words(self, string):
-        result = re.findall(r"\w*(?:\+\w+)*|[^\w\s]+", string.lower())
-        return [res for res in result if res]
-
-    def extract_initial_letters(self, text):
-        words = text
-        initial_letters = []
-        for word in words:
-            if len(word) > 2 and '+' not in word and not bool(re.search('[a-zA-Z]', word)):
-                initial_letters.append(word[0])
-        return initial_letters
-
-    def load_dict(self, text):
-        chars = self.extract_initial_letters(text)
-        out_dict = {}
-        for char in chars:
-            out_dict.update(
-                json.load(
-                    open(
-                        join_path(self.workdir, f"dictionary/letter_accent/{char}.json"),
-                        encoding='utf-8'
-                    )
-                )
-            )
-        return out_dict
-
-    def count_vowels(self, text):
-        vowels = "аеёиоуыэюяАЕЁИОУЫЭЮЯ"
-        return sum(1 for char in text if char in vowels)
-
-    def has_punctuation(self, text):
-        for char in text:
-            if char in "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~":
-                return True
-        return False
-
-    def delete_spaces_before_punc(self, text):
-        punc = "!\"#$%&'()*,./:;<=>?@[\\]^_`{|}~"
-        for char in punc:
-            text = text.replace(" " + char, char)
-        return text
-
-    def process_yo(self, text):
-        splitted_text = text
-
-        for i, word in enumerate(splitted_text):
-            splitted_text[i] = self.yo_words.get(word, word)
-        return splitted_text
-
-    def process_omographs(self, text):
-        splitted_text = text
-
-        founded_omographs = []
-        for i, word in enumerate(splitted_text):
-            variants = self.omographs.get(word)
-            if variants:
-                founded_omographs.append(
-                    {"word": word, "variants": variants, "position": i}
-                )
-        for omograph in founded_omographs:
-            splitted_text[
-                omograph["position"]
-            ] = f"<w>{splitted_text[omograph['position']]}</w>"
-            cls = self.omograph_model.classify(
-                " ".join(splitted_text), omograph["variants"]
-            )
-            splitted_text[omograph["position"]] = cls
-        return splitted_text
-
-    def process_accent(self, text):
-        if not self.dict_load_startup and not self.disable_accent_dict:
-            self.accents = self.load_dict(text)
-
-        splitted_text = text
-
-        for i, word in enumerate(splitted_text):
-            stressed_word = self.accents.get(word, word)
-            if stressed_word == word and not self.has_punctuation(word) and self.count_vowels(word) > 1:
-                splitted_text[i] = self.accent_model.put_accent(word)
-            else:
-                splitted_text[i] = stressed_word
-        return splitted_text
-
-    def process_all(self, text):
-        text = self.split_by_words(text)
-        processed_text = self.process_yo(text)
-        processed_text = self.process_omographs(processed_text)
-        processed_text = self.process_accent(processed_text)
-        processed_text = " ".join(processed_text)
-        processed_text = self.delete_spaces_before_punc(processed_text)
-        return processed_text
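Taken together, the removed RUAccent class downloads its dictionaries and models from the TeraTTS/accentuator repo on first load and exposes process_all as the entry point: split into words, apply the yo_words dictionary, resolve omographs with the NLI model, then add '+' stress marks from the per-letter dictionaries or the accent model. A minimal usage sketch based on that code (the input sentence and the exact output are illustrative, not taken from the repo):

    accentizer = RUAccent()
    accentizer.load(omograph_model_size="medium", dict_load_startup=False)
    accentizer.process_all("на двери висит замок")
    # expected shape: lowercased text with '+' before the stressed vowel,
    # e.g. roughly "на двер+и вис+ит зам+ок"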