Den4ikAI committed
Commit c80d52e
1 Parent(s): c5e6494

Delete folder ruaccent with huggingface_hub

ruaccent/__init__.py DELETED
@@ -1 +0,0 @@
- from .ruaccent import RUAccent
 
 
ruaccent/accent_model.py DELETED
@@ -1,27 +0,0 @@
- import torch
- from .char_tokenizer import CharTokenizer
- from transformers import AutoModelForTokenClassification
-
- class AccentModel:
-     def __init__(self, allow_cuda=True) -> None:
-         self.device = torch.device('cuda' if torch.cuda.is_available() and allow_cuda else 'cpu')
-     def load(self, path):
-         self.model = AutoModelForTokenClassification.from_pretrained(path).to(self.device)
-         self.tokenizer = CharTokenizer.from_pretrained(path)
-
-     def render_stress(self, word, token_classes):
-         if 'STRESS' in token_classes:
-             index = token_classes.index('STRESS')
-             word = list(word)
-             word[index-1] = '+' + word[index-1]
-             return ''.join(word)
-         else:
-             return word
-
-     def put_accent(self, word):
-         inputs = self.tokenizer(word, return_tensors="pt").to(self.device)
-         with torch.no_grad():
-             logits = self.model(**inputs).logits
-         predictions = torch.argmax(logits, dim=2)
-         predicted_token_class = [self.model.config.id2label[t.item()] for t in predictions[0]]
-         return self.render_stress(word, predicted_token_class)
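
A minimal usage sketch of the class deleted above, illustrative only: the import path reflects the package as it existed before this commit, and the model directory "nn/nn_accent/" follows the layout used by RUAccent.load() further below.

# Sketch: load the accent (token-classification) model and stress a single word.
# Assumes the weights have already been downloaded into ./nn/nn_accent/.
from ruaccent.accent_model import AccentModel

model = AccentModel(allow_cuda=False)   # force CPU for the example
model.load("nn/nn_accent/")

# put_accent() returns the word with '+' inserted before the predicted stressed
# vowel, e.g. "привет" -> "прив+ет" (the exact output depends on the weights).
print(model.put_accent("привет"))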
 
ruaccent/char_tokenizer.py DELETED
@@ -1,112 +0,0 @@
- import os
- from typing import Optional, Tuple, List
- from collections import OrderedDict
-
- from transformers import PreTrainedTokenizer
-
-
- def load_vocab(vocab_file):
-     vocab = OrderedDict()
-     with open(vocab_file, "r", encoding="utf-8") as reader:
-         tokens = reader.readlines()
-     for index, token in enumerate(tokens):
-         token = token.rstrip("\n")
-         vocab[token] = index
-     return vocab
-
-
- class CharTokenizer(PreTrainedTokenizer):
-     vocab_files_names = {"vocab_file": "vocab.txt"}
-
-     def __init__(
-         self,
-         vocab_file=None,
-         pad_token="[pad]",
-         unk_token="[unk]",
-         bos_token="[bos]",
-         eos_token="[eos]",
-         do_lower_case=False,
-         *args,
-         **kwargs
-     ):
-         super().__init__(
-             pad_token=pad_token,
-             unk_token=unk_token,
-             bos_token=bos_token,
-             eos_token=eos_token,
-             do_lower_case=do_lower_case,
-             **kwargs
-         )
-         self.do_lower_case = do_lower_case
-
-         if not vocab_file or not os.path.isfile(vocab_file):
-             self.vocab = OrderedDict()
-             self.ids_to_tokens = OrderedDict()
-         else:
-             self.vocab = load_vocab(vocab_file)
-             self.ids_to_tokens = OrderedDict([(ids, tok) for tok, ids in self.vocab.items()])
-
-     @property
-     def vocab_size(self):
-         return len(self.vocab)
-
-     def get_vocab(self):
-         return self.vocab
-
-     def _convert_token_to_id(self, token):
-         if self.do_lower_case:
-             token = token.lower()
-         return self.vocab.get(token, self.vocab[self.unk_token])
-
-     def _convert_id_to_token(self, index):
-         return self.ids_to_tokens[index]
-
-     def _tokenize(self, text):
-         if self.do_lower_case:
-             text = text.lower()
-         return list(text)
-
-     def convert_tokens_to_string(self, tokens):
-         return "".join(tokens)
-
-     def build_inputs_with_special_tokens(
-         self,
-         token_ids_0: List[int],
-         token_ids_1: Optional[List[int]] = None
-     ) -> List[int]:
-         bos = [self.bos_token_id]
-         eos = [self.eos_token_id]
-         return bos + token_ids_0 + eos
-
-     def get_special_tokens_mask(
-         self,
-         token_ids_0: List[int],
-         token_ids_1: Optional[List[int]] = None
-     ) -> List[int]:
-         return [1] + ([0] * len(token_ids_0)) + [1]
-
-     def create_token_type_ids_from_sequences(
-         self,
-         token_ids_0: List[int],
-         token_ids_1: Optional[List[int]] = None
-     ) -> List[int]:
-         return (len(token_ids_0) + 2) * [0]
-
-     def save_vocabulary(
-         self,
-         save_directory: str,
-         filename_prefix: Optional[str] = None
-     ) -> Tuple[str]:
-         assert os.path.isdir(save_directory)
-         vocab_file = os.path.join(
-             save_directory,
-             (filename_prefix + "-" if filename_prefix else "") +
-             self.vocab_files_names["vocab_file"]
-         )
-         index = 0
-         with open(vocab_file, "w", encoding="utf-8") as writer:
-             for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
-                 assert index == token_index
-                 writer.write(token + "\n")
-                 index += 1
-         return (vocab_file,)
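
A small sketch of what CharTokenizer does with a word: it splits text into single characters, maps them through vocab.txt, and wraps the ids in [bos]/[eos]. The vocab path is hypothetical (one token per line, including the special tokens), and the snippet assumes a transformers version contemporary with this code.

# Sketch: character-level tokenization as used by the accent model.
from ruaccent.char_tokenizer import CharTokenizer

tok = CharTokenizer(vocab_file="vocab.txt")          # hypothetical vocab file
enc = tok("привет")                                  # [bos] п р и в е т [eos]
print(enc["input_ids"])
print(tok.convert_ids_to_tokens(enc["input_ids"]))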
 
ruaccent/omograph_model.py DELETED
@@ -1,21 +0,0 @@
- from transformers import AutoModelForSequenceClassification, AutoTokenizer
- import torch
-
- class OmographModel:
-     def __init__(self, allow_cuda=True) -> None:
-         self.device = torch.device('cuda' if torch.cuda.is_available() and allow_cuda else 'cpu')
-
-     def load(self, path):
-         self.nli_model = AutoModelForSequenceClassification.from_pretrained(path, torch_dtype=torch.bfloat16).to(self.device)
-         self.tokenizer = AutoTokenizer.from_pretrained(path)
-
-     def classify(self, text, hypotheses):
-         encodings = self.tokenizer.batch_encode_plus([(text, hyp) for hyp in hypotheses], return_tensors='pt', padding=True)
-         input_ids = encodings['input_ids'].to(self.device)
-         with torch.no_grad():
-             logits = self.nli_model(input_ids)[0]
-         entail_contradiction_logits = logits[:,[0,2]]
-         probs = entail_contradiction_logits.softmax(dim=1)
-         prob_label_is_true = [float(p[1]) for p in probs]
-
-         return hypotheses[prob_label_is_true.index(max(prob_label_is_true))]
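
A sketch of how classify() was meant to be called: the sentence, with the ambiguous word wrapped in <w>...</w> as process_omographs() does below, is scored against each stressed variant as an NLI hypothesis, and the best-scoring variant is returned. The model directory and the example sentence are illustrative.

# Sketch: pick the stressed variant of an omograph with the NLI model.
from ruaccent.omograph_model import OmographModel

om = OmographModel(allow_cuda=False)
om.load("nn/nn_omograph/medium/")        # layout used by RUAccent.load()

sentence = "я куплю небольшой <w>замок</w>"
variants = ["зам+ок", "з+амок"]
print(om.classify(sentence, variants))   # returns one of the variants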
 
ruaccent/ruaccent.py DELETED
@@ -1,153 +0,0 @@
- import json
- import pathlib
- from huggingface_hub import snapshot_download
- import os
- from os.path import join as join_path
- from .omograph_model import OmographModel
- from .accent_model import AccentModel
- import re
-
-
- class RUAccent:
-     def __init__(self, workdir=None, allow_cuda=True):
-         self.omograph_model = OmographModel(allow_cuda=allow_cuda)
-         self.accent_model = AccentModel(allow_cuda=allow_cuda)
-         if not workdir:
-             self.workdir = str(pathlib.Path(__file__).resolve().parent)
-         else:
-             self.workdir = workdir
-
-     def load(
-         self,
-         omograph_model_size="medium",
-         dict_load_startup=False,
-         disable_accent_dict=False,
-         repo="TeraTTS/accentuator",
-     ):
-         if not os.path.exists(
-             join_path(self.workdir, "dictionary")
-         ) or not os.path.exists(join_path(self.workdir, "nn")):
-             snapshot_download(
-                 repo_id=repo,
-                 ignore_patterns=["*.md", "*.gitattributes"],
-                 local_dir=self.workdir,
-                 local_dir_use_symlinks=False,
-             )
-         self.omographs = json.load(
-             open(join_path(self.workdir, "dictionary/omographs.json"), encoding='utf-8')
-         )
-         self.yo_words = json.load(
-             open(join_path(self.workdir, "dictionary/yo_words.json"), encoding='utf-8')
-         )
-         self.dict_load_startup = dict_load_startup
-
-         if dict_load_startup:
-             self.accents = json.load(
-                 open(join_path(self.workdir, "dictionary/accents.json"), encoding='utf-8')
-             )
-         if disable_accent_dict:
-             self.accents = {}
-             self.disable_accent_dict = True
-         else:
-             self.disable_accent_dict = False
-
-         if omograph_model_size not in ["small", "medium"]:
-             raise NotImplementedError
-
-         self.omograph_model.load(
-             join_path(self.workdir, f"nn/nn_omograph/{omograph_model_size}/")
-         )
-         self.accent_model.load(join_path(self.workdir, "nn/nn_accent/"))
-
-
-     def split_by_words(self, string):
-         result = re.findall(r"\w*(?:\+\w+)*|[^\w\s]+", string.lower())
-         return [res for res in result if res]
-
-     def extract_initial_letters(self, text):
-         words = text
-         initial_letters = []
-         for word in words:
-             if len(word) > 2 and '+' not in word and not bool(re.search('[a-zA-Z]', word)):
-                 initial_letters.append(word[0])
-         return initial_letters
-
-     def load_dict(self, text):
-         chars = self.extract_initial_letters(text)
-         out_dict = {}
-         for char in chars:
-             out_dict.update(
-                 json.load(
-                     open(
-                         join_path(self.workdir, f"dictionary/letter_accent/{char}.json"),
-                         encoding='utf-8'
-                     )
-                 )
-             )
-         return out_dict
-
-     def count_vowels(self, text):
-         vowels = "аеёиоуыэюяАЕЁИОУЫЭЮЯ"
-         return sum(1 for char in text if char in vowels)
-
-     def has_punctuation(self, text):
-         for char in text:
-             if char in "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~":
-                 return True
-         return False
-
-     def delete_spaces_before_punc(self, text):
-         punc = "!\"#$%&'()*,./:;<=>?@[\\]^_`{|}~"
-         for char in punc:
-             text = text.replace(" " + char, char)
-         return text
-
-     def process_yo(self, text):
-         splitted_text = text
-
-         for i, word in enumerate(splitted_text):
-             splitted_text[i] = self.yo_words.get(word, word)
-         return splitted_text
-
-     def process_omographs(self, text):
-         splitted_text = text
-
-         founded_omographs = []
-         for i, word in enumerate(splitted_text):
-             variants = self.omographs.get(word)
-             if variants:
-                 founded_omographs.append(
-                     {"word": word, "variants": variants, "position": i}
-                 )
-         for omograph in founded_omographs:
-             splitted_text[
-                 omograph["position"]
-             ] = f"<w>{splitted_text[omograph['position']]}</w>"
-             cls = self.omograph_model.classify(
-                 " ".join(splitted_text), omograph["variants"]
-             )
-             splitted_text[omograph["position"]] = cls
-         return splitted_text
-
-     def process_accent(self, text):
-         if not self.dict_load_startup and not self.disable_accent_dict:
-             self.accents = self.load_dict(text)
-
-         splitted_text = text
-
-         for i, word in enumerate(splitted_text):
-             stressed_word = self.accents.get(word, word)
-             if stressed_word == word and not self.has_punctuation(word) and self.count_vowels(word) > 1:
-                 splitted_text[i] = self.accent_model.put_accent(word)
-             else:
-                 splitted_text[i] = stressed_word
-         return splitted_text
-
-     def process_all(self, text):
-         text = self.split_by_words(text)
-         processed_text = self.process_yo(text)
-         processed_text = self.process_omographs(processed_text)
-         processed_text = self.process_accent(processed_text)
-         processed_text = " ".join(processed_text)
-         processed_text = self.delete_spaces_before_punc(processed_text)
-         return processed_text
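
For reference, an end-to-end sketch of the public API removed by this commit: process_all() chains split_by_words, process_yo, process_omographs, and process_accent, then restores punctuation spacing. The workdir name and the example sentence are illustrative; on first use load() downloads the TeraTTS/accentuator snapshot (dictionaries and models) into the workdir.

# Sketch of the deleted RUAccent pipeline.
from ruaccent import RUAccent

accentizer = RUAccent(workdir="accentuator_data", allow_cuda=False)
accentizer.load(omograph_model_size="medium")

text = "на двери висит замок"
# Output is lowercased and stress-marked, e.g. "на двер+и вис+ит зам+ок"
# (the exact result depends on the downloaded dictionaries and models).
print(accentizer.process_all(text))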