Delete folder ruaccent with huggingface_hub
- ruaccent/__init__.py +0 -1
- ruaccent/accent_model.py +0 -27
- ruaccent/char_tokenizer.py +0 -112
- ruaccent/omograph_model.py +0 -21
- ruaccent/ruaccent.py +0 -153
ruaccent/__init__.py
DELETED
@@ -1 +0,0 @@
-from .ruaccent import RUAccent
ruaccent/accent_model.py
DELETED
@@ -1,27 +0,0 @@
-import torch
-from .char_tokenizer import CharTokenizer
-from transformers import AutoModelForTokenClassification
-
-class AccentModel:
-    def __init__(self, allow_cuda=True) -> None:
-        self.device = torch.device('cuda' if torch.cuda.is_available() and allow_cuda else 'cpu')
-    def load(self, path):
-        self.model = AutoModelForTokenClassification.from_pretrained(path).to(self.device)
-        self.tokenizer = CharTokenizer.from_pretrained(path)
-
-    def render_stress(self, word, token_classes):
-        if 'STRESS' in token_classes:
-            index = token_classes.index('STRESS')
-            word = list(word)
-            word[index-1] = '+' + word[index-1]
-            return ''.join(word)
-        else:
-            return word
-
-    def put_accent(self, word):
-        inputs = self.tokenizer(word, return_tensors="pt").to(self.device)
-        with torch.no_grad():
-            logits = self.model(**inputs).logits
-        predictions = torch.argmax(logits, dim=2)
-        predicted_token_class = [self.model.config.id2label[t.item()] for t in predictions[0]]
-        return self.render_stress(word, predicted_token_class)
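Note on render_stress: the predicted class index is shifted back by one because the char tokenizer prepends a [bos] token, so position i in token_classes maps to character i-1 of the word. A minimal worked sketch with a hypothetical prediction (the code only guarantees the 'STRESS' label; the other label names here are placeholders):

    word = "замок"
    # positions: [bos]  з    а    м    о    к   [eos]
    classes = ["NO", "NO", "NO", "NO", "STRESS", "NO", "NO"]
    # classes.index("STRESS") == 4, so '+' is inserted before word[3] ('о'):
    # AccentModel(allow_cuda=False).render_stress(word, classes) -> "зам+ок"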
ruaccent/char_tokenizer.py
DELETED
@@ -1,112 +0,0 @@
-import os
-from typing import Optional, Tuple, List
-from collections import OrderedDict
-
-from transformers import PreTrainedTokenizer
-
-
-def load_vocab(vocab_file):
-    vocab = OrderedDict()
-    with open(vocab_file, "r", encoding="utf-8") as reader:
-        tokens = reader.readlines()
-    for index, token in enumerate(tokens):
-        token = token.rstrip("\n")
-        vocab[token] = index
-    return vocab
-
-
-class CharTokenizer(PreTrainedTokenizer):
-    vocab_files_names = {"vocab_file": "vocab.txt"}
-
-    def __init__(
-        self,
-        vocab_file=None,
-        pad_token="[pad]",
-        unk_token="[unk]",
-        bos_token="[bos]",
-        eos_token="[eos]",
-        do_lower_case=False,
-        *args,
-        **kwargs
-    ):
-        super().__init__(
-            pad_token=pad_token,
-            unk_token=unk_token,
-            bos_token=bos_token,
-            eos_token=eos_token,
-            do_lower_case=do_lower_case,
-            **kwargs
-        )
-        self.do_lower_case = do_lower_case
-
-        if not vocab_file or not os.path.isfile(vocab_file):
-            self.vocab = OrderedDict()
-            self.ids_to_tokens = OrderedDict()
-        else:
-            self.vocab = load_vocab(vocab_file)
-            self.ids_to_tokens = OrderedDict([(ids, tok) for tok, ids in self.vocab.items()])
-
-    @property
-    def vocab_size(self):
-        return len(self.vocab)
-
-    def get_vocab(self):
-        return self.vocab
-
-    def _convert_token_to_id(self, token):
-        if self.do_lower_case:
-            token = token.lower()
-        return self.vocab.get(token, self.vocab[self.unk_token])
-
-    def _convert_id_to_token(self, index):
-        return self.ids_to_tokens[index]
-
-    def _tokenize(self, text):
-        if self.do_lower_case:
-            text = text.lower()
-        return list(text)
-
-    def convert_tokens_to_string(self, tokens):
-        return "".join(tokens)
-
-    def build_inputs_with_special_tokens(
-        self,
-        token_ids_0: List[int],
-        token_ids_1: Optional[List[int]] = None
-    ) -> List[int]:
-        bos = [self.bos_token_id]
-        eos = [self.eos_token_id]
-        return bos + token_ids_0 + eos
-
-    def get_special_tokens_mask(
-        self,
-        token_ids_0: List[int],
-        token_ids_1: Optional[List[int]] = None
-    ) -> List[int]:
-        return [1] + ([0] * len(token_ids_0)) + [1]
-
-    def create_token_type_ids_from_sequences(
-        self,
-        token_ids_0: List[int],
-        token_ids_1: Optional[List[int]] = None
-    ) -> List[int]:
-        return (len(token_ids_0) + 2) * [0]
-
-    def save_vocabulary(
-        self,
-        save_directory: str,
-        filename_prefix: Optional[str] = None
-    ) -> Tuple[str]:
-        assert os.path.isdir(save_directory)
-        vocab_file = os.path.join(
-            save_directory,
-            (filename_prefix + "-" if filename_prefix else "") +
-            self.vocab_files_names["vocab_file"]
-        )
-        index = 0
-        with open(vocab_file, "w", encoding="utf-8") as writer:
-            for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
-                assert index == token_index
-                writer.write(token + "\n")
-                index += 1
-        return (vocab_file,)
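For orientation, CharTokenizer is a plain character-level tokenizer: _tokenize splits text into individual characters and build_inputs_with_special_tokens wraps the resulting ids in [bos]/[eos]. A minimal sketch, assuming a vocab.txt that contains the special tokens and the characters being encoded:

    tok = CharTokenizer(vocab_file="vocab.txt")
    tok._tokenize("замок")         # ['з', 'а', 'м', 'о', 'к']
    tok("замок")["input_ids"]      # [bos id] + one id per character + [eos id]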
ruaccent/omograph_model.py
DELETED
@@ -1,21 +0,0 @@
-from transformers import AutoModelForSequenceClassification, AutoTokenizer
-import torch
-
-class OmographModel:
-    def __init__(self, allow_cuda=True) -> None:
-        self.device = torch.device('cuda' if torch.cuda.is_available() and allow_cuda else 'cpu')
-
-    def load(self, path):
-        self.nli_model = AutoModelForSequenceClassification.from_pretrained(path, torch_dtype=torch.bfloat16).to(self.device)
-        self.tokenizer = AutoTokenizer.from_pretrained(path)
-
-    def classify(self, text, hypotheses):
-        encodings = self.tokenizer.batch_encode_plus([(text, hyp) for hyp in hypotheses], return_tensors='pt', padding=True)
-        input_ids = encodings['input_ids'].to(self.device)
-        with torch.no_grad():
-            logits = self.nli_model(input_ids)[0]
-        entail_contradiction_logits = logits[:,[0,2]]
-        probs = entail_contradiction_logits.softmax(dim=1)
-        prob_label_is_true = [float(p[1]) for p in probs]
-
-        return hypotheses[prob_label_is_true.index(max(prob_label_is_true))]
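classify frames omograph resolution as zero-shot NLI: each candidate variant is paired with the sentence, two of the three NLI logits (columns 0 and 2) are kept and softmaxed, and the variant with the highest second probability is returned. A minimal sketch, assuming the model files from nn/nn_omograph/medium/ are present locally and that variants carry '+' stress marks (the example strings are hypothetical):

    om = OmographModel(allow_cuda=False)
    om.load("nn/nn_omograph/medium/")
    om.classify("я видел <w>замок</w> на горе", ["з+амок", "зам+ок"])
    # -> the variant the NLI model scores as best fitting the sentence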
ruaccent/ruaccent.py
DELETED
@@ -1,153 +0,0 @@
-import json
-import pathlib
-from huggingface_hub import snapshot_download
-import os
-from os.path import join as join_path
-from .omograph_model import OmographModel
-from .accent_model import AccentModel
-import re
-
-
-class RUAccent:
-    def __init__(self, workdir=None, allow_cuda=True):
-        self.omograph_model = OmographModel(allow_cuda=allow_cuda)
-        self.accent_model = AccentModel(allow_cuda=allow_cuda)
-        if not workdir:
-            self.workdir = str(pathlib.Path(__file__).resolve().parent)
-        else:
-            self.workdir = workdir
-
-    def load(
-        self,
-        omograph_model_size="medium",
-        dict_load_startup=False,
-        disable_accent_dict=False,
-        repo="TeraTTS/accentuator",
-    ):
-        if not os.path.exists(
-            join_path(self.workdir, "dictionary")
-        ) or not os.path.exists(join_path(self.workdir, "nn")):
-            snapshot_download(
-                repo_id=repo,
-                ignore_patterns=["*.md", "*.gitattributes"],
-                local_dir=self.workdir,
-                local_dir_use_symlinks=False,
-            )
-        self.omographs = json.load(
-            open(join_path(self.workdir, "dictionary/omographs.json"), encoding='utf-8')
-        )
-        self.yo_words = json.load(
-            open(join_path(self.workdir, "dictionary/yo_words.json"), encoding='utf-8')
-        )
-        self.dict_load_startup = dict_load_startup
-
-        if dict_load_startup:
-            self.accents = json.load(
-                open(join_path(self.workdir, "dictionary/accents.json"), encoding='utf-8')
-            )
-        if disable_accent_dict:
-            self.accents = {}
-            self.disable_accent_dict = True
-        else:
-            self.disable_accent_dict = False
-
-        if omograph_model_size not in ["small", "medium"]:
-            raise NotImplementedError
-
-        self.omograph_model.load(
-            join_path(self.workdir, f"nn/nn_omograph/{omograph_model_size}/")
-        )
-        self.accent_model.load(join_path(self.workdir, "nn/nn_accent/"))
-
-
-    def split_by_words(self, string):
-        result = re.findall(r"\w*(?:\+\w+)*|[^\w\s]+", string.lower())
-        return [res for res in result if res]
-
-    def extract_initial_letters(self, text):
-        words = text
-        initial_letters = []
-        for word in words:
-            if len(word) > 2 and '+' not in word and not bool(re.search('[a-zA-Z]', word)):
-                initial_letters.append(word[0])
-        return initial_letters
-
-    def load_dict(self, text):
-        chars = self.extract_initial_letters(text)
-        out_dict = {}
-        for char in chars:
-            out_dict.update(
-                json.load(
-                    open(
-                        join_path(self.workdir, f"dictionary/letter_accent/{char}.json"),
-                        encoding='utf-8'
-                    )
-                )
-            )
-        return out_dict
-
-    def count_vowels(self, text):
-        vowels = "аеёиоуыэюяАЕЁИОУЫЭЮЯ"
-        return sum(1 for char in text if char in vowels)
-
-    def has_punctuation(self, text):
-        for char in text:
-            if char in "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~":
-                return True
-        return False
-
-    def delete_spaces_before_punc(self, text):
-        punc = "!\"#$%&'()*,./:;<=>?@[\\]^_`{|}~"
-        for char in punc:
-            text = text.replace(" " + char, char)
-        return text
-
-    def process_yo(self, text):
-        splitted_text = text
-
-        for i, word in enumerate(splitted_text):
-            splitted_text[i] = self.yo_words.get(word, word)
-        return splitted_text
-
-    def process_omographs(self, text):
-        splitted_text = text
-
-        founded_omographs = []
-        for i, word in enumerate(splitted_text):
-            variants = self.omographs.get(word)
-            if variants:
-                founded_omographs.append(
-                    {"word": word, "variants": variants, "position": i}
-                )
-        for omograph in founded_omographs:
-            splitted_text[
-                omograph["position"]
-            ] = f"<w>{splitted_text[omograph['position']]}</w>"
-            cls = self.omograph_model.classify(
-                " ".join(splitted_text), omograph["variants"]
-            )
-            splitted_text[omograph["position"]] = cls
-        return splitted_text
-
-    def process_accent(self, text):
-        if not self.dict_load_startup and not self.disable_accent_dict:
-            self.accents = self.load_dict(text)
-
-        splitted_text = text
-
-        for i, word in enumerate(splitted_text):
-            stressed_word = self.accents.get(word, word)
-            if stressed_word == word and not self.has_punctuation(word) and self.count_vowels(word) > 1:
-                splitted_text[i] = self.accent_model.put_accent(word)
-            else:
-                splitted_text[i] = stressed_word
-        return splitted_text
-
-    def process_all(self, text):
-        text = self.split_by_words(text)
-        processed_text = self.process_yo(text)
-        processed_text = self.process_omographs(processed_text)
-        processed_text = self.process_accent(processed_text)
-        processed_text = " ".join(processed_text)
-        processed_text = self.delete_spaces_before_punc(processed_text)
-        return processed_text
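Taken together, the removed RUAccent class downloads its dictionaries and models from the TeraTTS/accentuator repo on first load and exposes process_all as the entry point: split into words, apply the yo_words dictionary, resolve omographs with the NLI model, then add '+' stress marks from the per-letter dictionaries or the accent model. A minimal usage sketch based on that code (the input sentence and the exact output are illustrative, not taken from the repo):

    accentizer = RUAccent()
    accentizer.load(omograph_model_size="medium", dict_load_startup=False)
    accentizer.process_all("на двери висит замок")
    # expected shape: lowercased text with '+' before the stressed vowel,
    # e.g. roughly "на двер+и вис+ит зам+ок"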