a-v-bely committed · Commit 92aa5ff · Parent(s): 192a825
upd code
utilities_language_bert/rus_sentence_bert.py
CHANGED
@@ -21,25 +21,7 @@ class SENTENCE:
         self.sentence_lemma_pos.append((lemma_pos, token))
 
     def bind_phrases(self):
-
-        for i in range(len(self.sentence_lemma_pos) - 1):
-            phrase_candidate = f'{self.sentence_lemma_pos[i][0]}_{self.sentence_lemma_pos[i + 1][0]}'
-            if phrase_candidate in PHRASES and not previous_was_phrase:
-                # phrase is {phrase: {original_token1: spacy.token, original_token2: spacy.token}}
-                phrase = [
-                    f'{self.sentence_lemma_pos[i][0]}_{self.sentence_lemma_pos[i + 1][0]}',
-                    {
-                        'original_token1': self.sentence_lemma_pos[i][1],
-                        'original_token2': self.sentence_lemma_pos[i + 1][1]
-                    }
-                ]
-                self.sentence_phrases.append(phrase)
-                previous_was_phrase = True
-            else:
-                if not previous_was_phrase:
-                    self.sentence_phrases.append(self.sentence_lemma_pos[i][1])
-                previous_was_phrase = False
-        self.sentence_phrases.append(self.sentence_lemma_pos[-1][1])
+        self.sentence_phrases = self.parsed
 
     def search_target_words_automatically(self, target_minimum: set, frequency_dict: dict = None, summary: list=None):
         for token in self.sentence_phrases:
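Note: as shown, the removed bind_phrases body reads previous_was_phrase before ever assigning it (the initialization may have been lost in an earlier commit), so the loop would raise NameError on its first iteration; the commit sidesteps this by aliasing the already-parsed tokens. For reference, a self-contained sketch of the greedy bigram binding the loop was attempting, with the missing flag initialization added (this helper is hypothetical; PHRASES membership is modeled by the `phrases` argument):

    # Hypothetical, self-contained version of the removed greedy bigram binding.
    # `sentence_lemma_pos` is a list of (lemma_pos, token) pairs. Unlike the
    # removed method, `previous_was_phrase` is initialized before the loop.
    def bind_phrases(sentence_lemma_pos, phrases):
        sentence_phrases = []
        previous_was_phrase = False
        for i in range(len(sentence_lemma_pos) - 1):
            candidate = f'{sentence_lemma_pos[i][0]}_{sentence_lemma_pos[i + 1][0]}'
            if candidate in phrases and not previous_was_phrase:
                # Keep the joined lemma_pos key together with both original tokens.
                sentence_phrases.append([candidate, {
                    'original_token1': sentence_lemma_pos[i][1],
                    'original_token2': sentence_lemma_pos[i + 1][1],
                }])
                previous_was_phrase = True
            else:
                if not previous_was_phrase:
                    sentence_phrases.append(sentence_lemma_pos[i][1])
                previous_was_phrase = False
        # Mirrors the removed code: the final token is always appended.
        sentence_phrases.append(sentence_lemma_pos[-1][1])
        return sentence_phrases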
utilities_language_general/rus_constants.py
CHANGED
@@ -109,22 +109,22 @@ BAD_USER_TARGET_WORDS = []
 COMBINE_POS = {
     'simple':
         {
-            'A1': {'VERB': ['AUX']},
-            'A2': {'VERB': ['AUX']},
-            'B1': {'VERB': ['AUX']},
-            'B2': {'VERB': ['AUX']},
-            'C1': {'VERB': ['AUX']},
-            'C2': {'VERB': ['AUX']},
-            'Без уровня': {'VERB': ['AUX']}
+            'A1': {'VERB': ['AUX'], '': ['VERB'], 'ADV':['ADJ'], 'ADJ': ['ADV'], },
+            'A2': {'VERB': ['AUX'], '': ['VERB'], 'ADV':['ADJ'], 'ADJ': ['ADV'], },
+            'B1': {'VERB': ['AUX'], '': ['VERB'], 'ADV':['ADJ'], 'ADJ': ['ADV'], },
+            'B2': {'VERB': ['AUX'], '': ['VERB'], },
+            'C1': {'VERB': ['AUX'], '': ['VERB'], },
+            'C2': {'VERB': ['AUX'], '': ['VERB'], },
+            'Без уровня': {'VERB': ['AUX'], '': ['VERB'], 'ADV':['ADJ'], 'ADJ': ['ADV'], }
         },
     'phrase':
         {
-            'A1': {'VERB': ['AUX']},
-            'A2': {'VERB': ['AUX']},
-            'B1': {'VERB': ['AUX']},
-            'B2': {'VERB': ['AUX']},
-            'C1': {'VERB': ['AUX']},
-            'C2': {'VERB': ['AUX']},
-            'Без уровня': {'VERB': ['AUX']}
+            'A1': {'VERB': ['AUX'], '': ['VERB'], 'ADV':['ADJ'], 'ADJ': ['ADV'], },
+            'A2': {'VERB': ['AUX'], '': ['VERB'], 'ADV':['ADJ'], 'ADJ': ['ADV'], },
+            'B1': {'VERB': ['AUX'], '': ['VERB'], 'ADV':['ADJ'], 'ADJ': ['ADV'], },
+            'B2': {'VERB': ['AUX'], '': ['VERB'], },
+            'C1': {'VERB': ['AUX'], '': ['VERB'], },
+            'C2': {'VERB': ['AUX'], '': ['VERB'], },
+            'Без уровня': {'VERB': ['AUX'], '': ['VERB'], 'ADV':['ADJ'], 'ADJ': ['ADV'], }
         },
 }
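The update widens the POS-combination table: every level now lets an empty-POS token combine with VERB, and A1, A2, B1, and 'Без уровня' ("no level") additionally allow ADV/ADJ pairings. A minimal sketch of how such a table can be consulted (the may_combine helper is hypothetical, not part of the repo):

    # Hypothetical helper showing a typical lookup against a COMBINE_POS-style
    # table: may a token with `neighbor_pos` merge with a target of `target_pos`
    # for the given mode ('simple' or 'phrase') and CEFR level?
    def may_combine(combine_pos: dict, mode: str, level: str,
                    target_pos: str, neighbor_pos: str) -> bool:
        allowed = combine_pos.get(mode, {}).get(level, {})
        return neighbor_pos in allowed.get(target_pos, [])

    # With the updated table:
    # may_combine(COMBINE_POS, 'simple', 'A1', 'VERB', 'AUX') -> True
    # may_combine(COMBINE_POS, 'simple', 'B2', 'ADV', 'ADJ') -> False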
utilities_language_general/rus_utils.py
CHANGED
@@ -99,14 +99,13 @@ def make_inflection(text: str, pos: str or list, tags: set, level: str) -> str o
         return None
     else:
         word_form = morph.parse(text)[0].inflect(tags)
-
-        if pos == 'VERB' and 'impf' in tags and level in ('A1', 'A2'):
+        if word_form is None and pos == 'VERB' and 'impf' in tags and level in ('A1', 'A2'):
             tags.discard('impf')
             tags.add('perf')
             word_form = morph.parse(text)[0].inflect(tags)
             if word_form is not None:
                 return word_form.word
-        elif pos == 'VERB' and 'perf' in tags and level in ('A1', 'A2'):
+        elif word_form is None and pos == 'VERB' and 'perf' in tags and level in ('A1', 'A2'):
             tags.discard('perf')
             tags.add('impf')
             word_form = morph.parse(text)[0].inflect(tags)
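Both fallback branches are now guarded by word_form is None, so the verb-aspect swap for A1/A2 runs only when the first inflection attempt fails, instead of unconditionally overwriting a successful one. A minimal sketch of the same pattern with pymorphy2 (the helper name is hypothetical; morph mirrors the module's analyzer):

    import pymorphy2

    morph = pymorphy2.MorphAnalyzer()

    # Hypothetical helper mirroring the fixed control flow: retry with the
    # opposite verb aspect only when the first inflection attempt returns None.
    def inflect_with_aspect_fallback(text: str, tags: set, level: str):
        word_form = morph.parse(text)[0].inflect(tags)
        if word_form is None and 'impf' in tags and level in ('A1', 'A2'):
            word_form = morph.parse(text)[0].inflect((tags - {'impf'}) | {'perf'})
        elif word_form is None and 'perf' in tags and level in ('A1', 'A2'):
            word_form = morph.parse(text)[0].inflect((tags - {'perf'}) | {'impf'})
        return word_form.word if word_form is not None else None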
@@ -209,7 +208,7 @@ def check_token_bert(token, current_minimum: set = None, stop_words=stop_list,
 
 def get_distractors_from_model(doc, model, scaler, classifier, pos_dict:dict, target_text:str, lemma: str, pos: str, gender: str,
                                lemma_index:int, global_distractors: set, distractor_minimum: set, level_name: str, max_num_distractors: int,
-                               max_length_ratio=5, min_edit_distance_ratio=0.
+                               max_length_ratio=5, min_edit_distance_ratio=0.4):
     distractors = []
     query = lemma if '_' in lemma else f'{lemma}_{pos}'
     lemma = '_'.join(lemma.split('_')[::2])
@@ -240,8 +239,7 @@ def get_distractors_from_model(doc, model, scaler, classifier, pos_dict:dict, ta
                 and (candidate_gender == gender and level_name in ('B1', 'B2', 'C1', 'C2') or level_name in ('A1', 'A2'))
                 and length_ratio <= max_length_ratio
                 and distractor_lemma not in global_distractors
-                and edit_distance(lemma, distractor_lemma) / ((len(lemma) + len(distractor_lemma)) / 2)
-                    min_edit_distance_ratio)
+                and edit_distance(lemma, distractor_lemma) / ((len(lemma) + len(distractor_lemma)) / 2) >= min_edit_distance_ratio)
             if condition:
                 if distractor_minimum is not None:
                     if distractor_lemma in distractor_minimum:
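This hunk restores the truncated comparison: the length-normalized edit distance must now be at least min_edit_distance_ratio (0.4 after this commit). A small sketch of that metric, assuming edit_distance is NLTK's Levenshtein implementation (an assumption; the import is not shown in this diff):

    from nltk.metrics import edit_distance  # assumed to match the repo's edit_distance

    # Length-normalized edit distance used by the restored condition: Levenshtein
    # distance divided by the mean lemma length. Candidates below the threshold
    # are considered too similar to the target and dropped.
    def edit_distance_ratio(lemma: str, distractor_lemma: str) -> float:
        return edit_distance(lemma, distractor_lemma) / ((len(lemma) + len(distractor_lemma)) / 2)

    # edit_distance_ratio('дом', 'том') ~= 0.33 -> rejected at threshold 0.4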
@@ -290,7 +288,7 @@ def get_distractors_from_model(doc, model, scaler, classifier, pos_dict:dict, ta
 
 def get_distractors_from_model_bert(model, scaler, classifier, pos_dict:dict, level_name: str, lemma: str, pos: str, gender: str,
                                     text_with_masked_task: str, global_distractors: set, distractor_minimum: set,
-                                    max_num_distractors: int, max_length_ratio=5, min_edit_distance_ratio=0.
+                                    max_num_distractors: int, max_length_ratio=5, min_edit_distance_ratio=0.4):
     _distractors = []
     try:
         if distractor_minimum:
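get_distractors_from_model_bert receives text_with_masked_task, i.e. the exercise text with the target replaced by a mask token. Purely for illustration (this is not necessarily the Space's actual setup, and the model name below is an assumption), such a string can be scored with a transformers fill-mask pipeline before the candidates are filtered by conditions like the one in the next hunk:

    from transformers import pipeline

    # Generic illustration only: a masked Russian sentence is scored by a
    # fill-mask pipeline; each candidate token comes back with a probability.
    fill_mask = pipeline('fill-mask', model='DeepPavlov/rubert-base-cased')
    for candidate in fill_mask('Он [MASK] в школу.', top_k=10):
        print(candidate['token_str'], round(candidate['score'], 3))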
@@ -329,7 +327,7 @@ def get_distractors_from_model_bert(model, scaler, classifier, pos_dict:dict, le
                     and (length_ratio <= max_length_ratio)  # May be changed if case of phrases
                     and (distractor_lemma not in global_distractors)
                     and (edit_distance(lemma, distractor_lemma)  # May be changed if case of phrases
-                         / ((len(lemma) + len(distractor_lemma)) / 2)
+                         / ((len(lemma) + len(distractor_lemma)) / 2) >= min_edit_distance_ratio))
                 if condition:
                     if distractor_minimum is not None:
                         if distractor_lemma in distractor_minimum: