Commit ea7c789 by a-v-bely
Parent: 307a5f3

Towards distractor classification

Files changed:
- utilities_language_bert/esp_sentence_bert.py +4 -14
- utilities_language_general/esp_constants.py +45 -12
- utilities_language_general/esp_utils.py +29 -13
- utilities_language_general/similarity_measures.py +283 -0
- utilities_language_w2v/esp_main_workflow_w2v.py +24 -44
- utilities_language_w2v/esp_sentence_w2v.py +10 -16
utilities_language_bert/esp_sentence_bert.py CHANGED

```diff
@@ -1,13 +1,9 @@
 import string
-from random import random
-from random import sample
-from utilities_language_general.esp_constants import nlp
+from random import random, sample
 from utilities_language_general.morphology import inflect
-from utilities_language_general.esp_constants import PHRASES
-from utilities_language_general.esp_utils import check_token_bert
-
-from utilities_language_general.esp_constants import BAD_USER_TARGET_WORDS
-from utilities_language_general.esp_utils import get_distractors_from_model_bert
+from utilities_language_general.esp_constants import nlp, PHRASES, BAD_USER_TARGET_WORDS
+from utilities_language_general.esp_utils import check_token_bert, fix_irregular_lemma, get_distractors_from_model_bert
+
 
 
 class SENTENCE:
@@ -195,12 +191,6 @@ class TASK:
     def attach_distractors_to_target_word(self, model, global_distractors, distractor_minimum,
                                           level_name, max_frequency):
         pos = self.pos[0] if self.pos[0] == 'phrase' else self.pos[1]
-        # distractors_full_text = get_distractors_from_model_bert(model=model, lemma=self.lemma, pos=pos,
-        #                                                         gender=self.gender, level_name=level_name,
-        #                                                         text_with_masked_task=self.text_with_masked_task,
-        #                                                         global_distractors=global_distractors,
-        #                                                         distractor_minimum=distractor_minimum,
-        #                                                         max_num_distractors=self.max_num_distractors)
         distractors_sentence = get_distractors_from_model_bert(model=model, lemma=self.lemma, pos=pos,
                                                                gender=self.gender, level_name=level_name,
                                                                text_with_masked_task=self.masked_sentence,
```
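The BERT path builds candidates by masking the target inside the sentence and asking a fill-mask pipeline for substitutes. A minimal sketch of that interaction, using the pipeline built in esp_constants.load_bert(); the example sentence and printed fields are illustrative, not from the repo:

```python
# Sketch: what get_distractors_from_model_bert feeds the model. The masked
# sentence (self.masked_sentence after this commit) goes to a fill-mask
# pipeline; each candidate comes back with a token string and a score.
from transformers import pipeline

fill_mask = pipeline(task="fill-mask",
                     model="a-v-white/bert-base-spanish-wwm-cased-finetuned-literature-pro")
for candidate in fill_mask("El [MASK] duerme en el sofá.")[:3]:
    print(candidate["token_str"], round(candidate["score"], 3))
```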
utilities_language_general/esp_constants.py CHANGED

```diff
@@ -2,15 +2,20 @@ import json
 import spacy
 import gensim
 import streamlit as st
+
+from pickle import load
 from transformers import pipeline
 from summarizer import Summarizer
 
 
 @st.cache_resource
-def load_w2v(…
+def load_w2v(model):
     with st.spinner('Загружаю языковую модель'):
-        …
-        …
+        if model == 'model1':
+            model_path = r'language_data/model1.gz'
+        else:
+            model_path = r'language_data/model2.gz'
+        return gensim.models.KeyedVectors.load_word2vec_format(model_path, binary=True)
 
 
 @st.cache_resource
@@ -26,14 +31,31 @@ def load_bert():
     _pipeline = pipeline(task="fill-mask", model="a-v-white/bert-base-spanish-wwm-cased-finetuned-literature-pro")
     return _pipeline
 
+
 @st.cache_resource
 def load_summarizer():
     return Summarizer()
 
+
+@st.cache_resource
+def load_classifiers(model):
+    if model == 'model1':
+        scaler_path = 'language_data/model1_with_wn_minmaxscaler.pickle'
+        classifier_path = 'language_data/model1_with_wn_catboost_classifier.pickle'
+    elif model == 'model2':
+        scaler_path = 'language_data/model2_with_wn_minmaxscaler.pickle'
+        classifier_path = 'language_data/model2_with_wn_catboost_classifier.pickle'
+    else:
+        scaler_path = 'language_data/model3_with_wn_minmaxscaler.pickle'
+        classifier_path = 'language_data/model3_with_wn_catboost_classifier.pickle'
+    with (open(scaler_path, 'rb') as f1, open(classifier_path, 'rb') as f2, open('language_data/pos_dict.pickle', 'rb') as f3):
+        scaler = load(f1)
+        classifier = load(f2)
+        pos_dict = load(f3)
+    return pos_dict, scaler, classifier
+
 nlp = load_spacy()
 summarization = load_summarizer()
-w2v_model_1_path = r'model1.gz'
-w2v_model_2_path = r'model2.gz'
 
 # Upload minimums
 a1_path, a1_target_set = r'lexical_minimums/A1_MINIMUM.txt', set()
@@ -42,19 +64,25 @@ b1_path, b1_target_set = r'lexical_minimums/B1_MINIMUM.txt', set()
 b2_path, b2_target_set = r'lexical_minimums/B2_MINIMUM.txt', set()
 c1_path, c1_target_set = r'lexical_minimums/C1_MINIMUM.txt', set()
 c2_path, c2_target_set = r'lexical_minimums/C2_MINIMUM.txt', set()
-…
+
+minimums_paths = (a1_path, a2_path, b1_path, b2_path, c1_path, c2_path)
 minimums_sets = (a1_target_set, a2_target_set, b1_target_set, b2_target_set, c1_target_set, c2_target_set)
 for i in range(len(minimums_paths)):
     with open(minimums_paths[i], 'r', encoding='utf-8') as read_file:
         for line in read_file:
            minimums_sets[i].add(line.strip())
 
-…
-…
-…
-…
-…
-…
+MINIMUM_SETS = {
+    'A1': (a1_target_set, a1_target_set),
+    'A2': (a2_target_set, a2_target_set.union(a1_target_set)),
+    'B1': (b1_target_set, b1_target_set.union(a2_target_set)),
+    'B2': (b2_target_set, b2_target_set.union(b1_target_set)),
+    'C1': (c1_target_set, c1_target_set.union(b2_target_set)),
+    'C2': (c2_target_set, c2_target_set.union(c1_target_set)),
+    'Без уровня': (None, None)
+}
+
+LEVEL_NUMBERS = {'A1': 1, 'A2': 1, 'B1': 2, 'B2': 3, 'C1': 4, 'C2': 4}
 
 with open('language_data/phrases.json', 'r', encoding='utf-8') as f:
     PHRASES = set(json.load(f)['PHRASES'])
@@ -77,6 +105,8 @@ COMBINE_POS = {
                'ADJ':['NOUN'], 'NOUN': ['ADJ']},
         'C2': {'PRON': ['DET'], 'DET': ['PRON'], 'VERB': ['AUX', 'AUX_VERB'], 'AUX': ['VERB', 'AUX_VERB'], 'AUX_VERB': ['VERB', 'AUX'], 'AUX_AUX': ['AUX'],
                'ADJ':['NOUN'], 'NOUN': ['ADJ']},
+        'Без уровня': {'PRON': ['DET'], 'DET': ['PRON'], 'VERB': ['AUX', 'AUX_VERB'], 'AUX': ['VERB', 'AUX_VERB'],
+                       'AUX_VERB': ['VERB', 'AUX'], 'AUX_AUX': ['AUX'], 'AUX_ADJ': ['PRON_VERB'], 'PRON_VERB': ['AUX_ADJ'], 'ADP': ['SCONJ', 'ADV'], 'ADJ': ['NOUN'], 'NOUN': ['ADJ']}
         },
     'phrase':
         {
@@ -89,5 +119,8 @@ COMBINE_POS = {
                'ADJ':['NOUN'], 'NOUN': ['ADJ']},
         'C2': {'PRON': ['DET'], 'DET': ['PRON'], 'VERB': ['AUX', 'AUX_VERB'], 'AUX': ['VERB', 'AUX_VERB'], 'AUX_VERB': ['VERB', 'AUX'], 'AUX_AUX': ['AUX'],
                'ADJ':['NOUN'], 'NOUN': ['ADJ']},
+        'Без уровня': {'PRON': ['DET'], 'DET': ['PRON'], 'VERB': ['AUX', 'AUX_VERB'], 'AUX': ['VERB', 'AUX_VERB'],
+                       'AUX_VERB': ['VERB', 'AUX'], 'AUX_AUX': ['AUX'], 'AUX_ADJ': ['PRON_VERB'], 'PRON_VERB': ['AUX_ADJ'],
+                       'ADP': ['SCONJ', 'ADV'], 'ADJ': ['NOUN'], 'NOUN': ['ADJ']}
         },
     }
```

(Lines shown as `…` were removed by this commit but their text was lost in extraction.)
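Taken together, the new constants API hands callers everything the distractor classifier needs. A minimal consumption sketch; the caller code is hypothetical, the names come from the diff above:

```python
# Hypothetical caller, wiring up the pieces defined in esp_constants.py.
from utilities_language_general.esp_constants import (load_w2v, load_classifiers,
                                                      MINIMUM_SETS, LEVEL_NUMBERS)

w2v = load_w2v('model1')                                   # gensim KeyedVectors
pos_dict, scaler, classifier = load_classifiers('model1')  # POS codes, MinMaxScaler, CatBoost

# Each level maps to (target_minimum, distractor_minimum); the distractor set
# is the union with the level below, so e.g. B2 distractors may also be B1 words.
target_minimum, distractor_minimum = MINIMUM_SETS['B2']
assert LEVEL_NUMBERS['B2'] == 3
```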
utilities_language_general/esp_utils.py CHANGED

```diff
@@ -1,8 +1,8 @@
 from nltk import edit_distance
 from utilities.utils import answer_letter
-from utilities_language_general.…
-from utilities_language_general.esp_constants import FIX_LEMMA
-…
+from utilities_language_general.similarity_measures import make_decision
+from utilities_language_general.esp_constants import nlp, FIX_LEMMA, COMBINE_POS
+
 
 
 def prepare_target_words(target_words):
@@ -107,12 +107,13 @@ def check_token_bert(token, current_minimum: set = None, check_allowed_pos: set
         return False
 
 
-def get_distractors_from_model(model, lemma: str, pos: str, gender: str or None, …
-                               distractor_minimum: set, level_name: str, max_num_distractors: int,
+def get_distractors_from_model(doc, model, scaler, classifier, pos_dict:dict, target_text:str, lemma: str, pos: str, gender: str, lemma_index:int, global_distractors: set,
+                               distractor_minimum: set, level_name: str, max_num_distractors: int,
                                max_length_ratio=5, min_edit_distance_ratio=0.5):
 
     distractors = []
     query = lemma if '_' in lemma else f'{lemma}_{pos}'
+    raw_lemma = query
     lemma = '_'.join(lemma.split('_')[::2])
     if model.has_index_for(query):
         candidates = model.most_similar(query, topn=max_num_distractors + 100)
@@ -126,19 +127,23 @@ def get_distractors_from_model(model, lemma: str, pos: str, gender: str or None,
         for candidate in candidates:
             if candidate[0].count('_') == 1 and pos != 'phrase':
                 distractor_lemma, distractor_pos = candidate[0].split('_')
+                decision = make_decision(doc, model_type='w2v', model=model, scaler=scaler, classifier=classifier, pos_dict=pos_dict,
+                                         level=level_name, target_lemma=raw_lemma, target_text=target_text, target_pos=pos, target_position=lemma_index,
+                                         substitute_lemma=distractor_lemma, substitute_pos=distractor_pos)
                 distractor_similarity = candidate[1]
                 candidate_gender = get_tags(distractor_lemma).get('Gender')
                 length_ratio = abs(len(lemma) - len(distractor_lemma))
                 condition = ((distractor_pos == pos
                               or (COMBINE_POS['simple'][level_name].get(pos) is not None and COMBINE_POS['simple'][level_name].get(distractor_pos) is not None
-                                  and distractor_pos in COMBINE_POS['simple'][level_name][pos] and pos in COMBINE_POS['simple'][level_name][distractor_pos])
-                             …
+                                  and distractor_pos in COMBINE_POS['simple'][level_name][pos] and pos in COMBINE_POS['simple'][level_name][distractor_pos]))
+                             and decision
                              and distractor_lemma != lemma
+                             and distractor_lemma not in lemma
+                             and lemma not in distractor_lemma
                              and (candidate_gender == gender and level_name in ('B1', 'B2', 'C1', 'C2'))
                              and length_ratio <= max_length_ratio
                              and distractor_lemma not in global_distractors
-                             and edit_distance(lemma, distractor_lemma) / ((len(lemma) + len(distractor_lemma)) / 2) >
-                             min_edit_distance_ratio)
+                             and edit_distance(lemma, distractor_lemma) / ((len(lemma) + len(distractor_lemma)) / 2) > min_edit_distance_ratio)
                 if condition:
                     if distractor_minimum is not None:
                         if distractor_lemma in distractor_minimum:
@@ -146,14 +151,18 @@ def get_distractors_from_model(model, lemma: str, pos: str, gender: str or None,
                             global_distractors.add(distractor_lemma)
                     else:
                         distractors.append((distractor_lemma, distractor_similarity))
-                        global_distractors.add(distractor_lemma)
+                        global_distractors.add(distractor_lemma)
             else:
-                if candidate[0].count('_') …
+                if (candidate[0].count('_') == 1  # REMOVE HOTFIX
+                        or candidate[0].count('_') > 3 or pos in ('NOUN', 'ADJ', 'NUM')):
                     continue
                 d1_lemma, d1_pos, d2_lemma, d2_pos = candidate[0].split('_')
                 d_pos = f'{d1_pos}_{d2_pos}'
                 distractor_lemma = f'{d1_lemma}_{d2_lemma}'
                 distractor_similarity = candidate[1]
+                decision = make_decision(doc, model_type='w2v', model=model, scaler=scaler, classifier=classifier, pos_dict=pos_dict,
+                                         level=level_name, target_lemma=raw_lemma, target_text=target_text, target_pos=pos, target_position=lemma_index,
+                                         substitute_lemma=candidate[0], substitute_pos=d_pos)
                 condition = (((d1_pos == pos or d2_pos == pos)
                               or (COMBINE_POS['phrase'][level_name].get(d_pos) is not None and COMBINE_POS['phrase'][level_name].get(pos) is not None
                               and d_pos in COMBINE_POS['phrase'][level_name].get(d_pos) and pos in COMBINE_POS['phrase'][level_name].get(pos) )
@@ -161,7 +170,10 @@ def get_distractors_from_model(model, lemma: str, pos: str, gender: str or None,
                               and pos in ('phrase', 'VERB', 'AUX', 'SCONJ', 'ADP'))
                              or (d2_pos in ('VERB', 'AUX', 'SCONJ', 'ADP')
                                  and pos in ('phrase', 'VERB', 'AUX', 'SCONJ', 'ADP')))
+                             and decision
                              and candidate[0] != lemma
+                             and distractor_lemma not in lemma
+                             and lemma not in distractor_lemma
                              and distractor_lemma != lemma
                              and distractor_lemma not in global_distractors)
                 if condition:
@@ -180,8 +192,8 @@ def get_distractors_from_model(model, lemma: str, pos: str, gender: str or None,
     return distractors
 
 
-def get_distractors_from_model_bert(model, text_with_masked_task: str, lemma: str, pos: str, gender: str …
-                                    global_distractors: set, distractor_minimum: set, level_name: str,
+def get_distractors_from_model_bert(doc, model, scaler, classifier, text_with_masked_task: str, lemma: str, pos: str, gender: str, lemma_index:int,
+                                    global_distractors: set, distractor_minimum: set, level_name: str, pos_dict:dict,
                                     max_num_distractors: int, max_length_ratio=5, min_edit_distance_ratio=0.5):
     _distractors = []
     try:
@@ -205,9 +217,13 @@ def get_distractors_from_model_bert(model, text_with_masked_task: str, lemma: str,
                 distractor_similarity = candidate_distractor[1]
                 candidate_gender = get_tags(distractor_lemma).get('Gender')
                 length_ratio = abs(len(lemma) - len(distractor_lemma))
+                decision = make_decision(doc, model_type='bert', scaler=scaler, classifier=classifier, pos_dict=pos_dict,
+                                         level=level_name, target_text=lemma, target_pos=pos, target_position=lemma_index,
+                                         substitute_text=distractor_lemma, substitute_pos=distractor_pos)
                 if ((distractor_pos == pos
                      or (COMBINE_POS['simple'][level_name].get(pos) is not None and COMBINE_POS['simple'][level_name].get(distractor_pos) is not None
                          and distractor_pos in COMBINE_POS['simple'][level_name][pos] and pos in COMBINE_POS['simple'][level_name][distractor_pos]))
+                    and decision
                     and distractor_lemma != lemma
                     and (len(_distractors) < max_num_distractors+100)
                     and (candidate_gender == gender and level_name in ('B1', 'B2', 'C1', 'C2'))
```
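Both branches of the w2v generator now gate every candidate on make_decision, the classifier call defined in the new similarity_measures.py below, and on two substring guards that drop candidates sharing a stem with the target. (The BERT branch gains the same gate but passes substitute_text=, a keyword make_decision does not declare, so that branch appears to be mid-migration.) The guards in isolation, with toy values not from the repo:

```python
# The new guards reject a candidate whose lemma contains, or is contained in,
# the target lemma: cheap stem-overlap filtering before the classifier runs.
lemma, distractor_lemma = 'gato', 'gatos'
passes = distractor_lemma not in lemma and lemma not in distractor_lemma
assert passes is False  # 'gato' is a substring of 'gatos', so it is filtered out
```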
utilities_language_general/similarity_measures.py ADDED (new file, 283 lines)

```python
import numpy as np
from math import pow
from nltk.corpus import wordnet as wn
from utilities_language_general.esp_constants import nlp, PHRASES, LEVEL_NUMBERS


def eucledian_distance(x, y):
    return np.sqrt(np.sum((x - y) ** 2))

def cosine_similarity(x, y):
    out = np.dot(x, y) / (np.sqrt(np.dot(x, x)) * np.sqrt(np.dot(y, y)))
    if str(out) != 'nan':
        return out
    return None

def get_vector_for_token(model, token):
    vector = None

    splitted = token.split('_')
    token_list = [f'{splitted[i]}_{splitted[i+1]}' for i in range(len(splitted)-1)]

    if model.has_index_for(token):
        vector = model.get_vector(token)
    else:
        try:
            vector = model.get_mean_vector(token_list)
        except ValueError:
            return None
    return vector

def compute_metric(func, vector1, vector2):
    if vector1 is not None and vector2 is not None:
        return func(vector1, vector2)
    else:
        return None

def compute_positive_cos(x, y):
    cos_sim = cosine_similarity(x, y)
    if cos_sim:
        return (cos_sim + 1) / 2
    else:
        return None

def addition_metric(substitute, target, context):
    substitute_target_cos = compute_metric(cosine_similarity, substitute, target)
    if not substitute_target_cos:
        return None
    if not context:
        return None

    context_vectors = []
    for context_tk in context:
        substitute_context_cos = compute_metric(cosine_similarity, substitute, context_tk)
        if substitute_context_cos:
            context_vectors.append(substitute_context_cos)
    sum_of_context_vectors = np.sum(context_vectors)

    metric = (substitute_target_cos + sum_of_context_vectors) / (len(context) + 1)
    return metric

def balanced_addition_metric(substitute, target, context):
    substitute_target_cos = compute_metric(cosine_similarity, substitute, target)
    if not substitute_target_cos:
        return None
    if not context:
        return None

    context_vectors = []
    for context_tk in context:
        substitute_context_cos = compute_metric(cosine_similarity, substitute, context_tk)
        if substitute_context_cos:
            context_vectors.append(substitute_context_cos)
    sum_of_context_vectors = np.sum(context_vectors)

    context_len = len(context)
    metric = (context_len * substitute_target_cos + sum_of_context_vectors) / (2 * context_len)
    return metric

def multiplication_metric(substitute, target, context):
    substitute_target_cos = compute_metric(compute_positive_cos, substitute, target)
    if not substitute_target_cos:
        return None
    if not context:
        return None

    context_vectors = []
    for context_tk in context:
        substitute_context_positive_cos = compute_metric(compute_positive_cos, substitute, context_tk)
        if substitute_context_positive_cos:
            context_vectors.append(substitute_context_positive_cos)
    prod_of_context_vectors = np.prod(context_vectors)
    try:
        metric = pow((substitute_target_cos + prod_of_context_vectors), 1 / (len(context) + 1))
    except ValueError:
        return None
    return metric

def balanced_multiplication_metric(substitute, target, context):
    substitute_target_cos = compute_metric(compute_positive_cos, substitute, target)
    if not substitute_target_cos:
        return None
    if not context:
        return None

    context_vectors = []
    for context_tk in context:
        substitute_context_positive_cos = compute_metric(compute_positive_cos, substitute, context_tk)
        if substitute_context_positive_cos:
            context_vectors.append(substitute_context_positive_cos)
    prod_of_context_vectors = np.prod(context_vectors)

    context_len = len(context)
    try:
        metric = pow((pow(substitute_target_cos, context_len) + prod_of_context_vectors), 1 / (2 * context_len))
    except ValueError:
        return None
    return metric

def bind_phrases(context_list):
    context = []
    previous_was_phrase = False
    for i in range(len(context_list)-1):
        phrase_candidate = f'{context_list[i]}_{context_list[i+1]}'
        if phrase_candidate in PHRASES and not previous_was_phrase:
            context.append(phrase_candidate)
            previous_was_phrase = True
        else:
            if not previous_was_phrase:
                context.append(context_list[i])
            previous_was_phrase = False
    if context_list:
        if not context:
            context.append(context_list[-1])
        elif not context_list[-1] in context[-1]:
            context.append(context_list[-1])
    return context

def get_context_windows(doc, target_text, window_size):
    sentence_str = doc.text
    sentence_masked = sentence_str.lower().replace(target_text.lower().strip(), ' [MASK] ')
    alpha_tokens_lemma_pos = [f'{tk.lemma_.lower()}_{tk.pos_}' for tk in nlp(sentence_masked) if tk.text.isalpha()]
    alpha_tokens_lemma_pos_no_stop = [f'{tk.lemma_.lower()}_{tk.pos_}' for tk in nlp(sentence_masked) if tk.text.isalpha() and not tk.is_stop]
    try:
        mask_token_index = alpha_tokens_lemma_pos.index('mask_NUM')
        mask_token_index_no_stop = alpha_tokens_lemma_pos_no_stop.index('mask_NUM')
    except ValueError:
        return None

    left_border = max(mask_token_index-window_size, 0)
    right_border = min(mask_token_index+window_size, len(alpha_tokens_lemma_pos))
    l_context = alpha_tokens_lemma_pos[left_border:mask_token_index]
    r_context = alpha_tokens_lemma_pos[mask_token_index+1:right_border+1]

    left_border_no_stop = max(mask_token_index_no_stop-window_size, 0)
    right_border_no_stop = min(mask_token_index_no_stop+window_size, len(alpha_tokens_lemma_pos_no_stop))
    l_context_no_stop = alpha_tokens_lemma_pos_no_stop[left_border_no_stop:mask_token_index_no_stop]
    r_context_no_stop = alpha_tokens_lemma_pos_no_stop[mask_token_index_no_stop+1:right_border_no_stop+1]
    return (bind_phrases(l_context) + bind_phrases(r_context), bind_phrases(l_context_no_stop) + bind_phrases(r_context_no_stop))

def get_context_linked_words(doc, target_position, target_text):
    answer_list = target_text.split(' ')
    context_words = []
    for tk in doc:
        if tk.text.isalpha():
            if (tk.text in answer_list and abs(target_position - tk.idx) <= sum([len(t) for t in answer_list])):
                context_words.extend([t for t in tk.subtree if t.text.isalpha() and not t.is_stop])
                context_words.extend([t for t in tk.children if t.text.isalpha() and not t.is_stop])
                context_words.extend([t for t in tk.ancestors if t.text.isalpha() and not t.is_stop])
    context_words = [(tk, f'{tk.lemma_}_{tk.pos_}') for tk in sorted(set(context_words), key=lambda tk: tk.i) if tk.text not in answer_list]
    context = []
    previous_was_phrase = False
    for i in range(len(context_words)-1):
        phrase_candidate = f'{context_words[i][1]}_{context_words[i+1][1]}'
        if phrase_candidate in PHRASES and not previous_was_phrase and abs(context_words[i][0].i - context_words[i+1][0].i) <= 1:
            context.append(phrase_candidate)
            previous_was_phrase = True
        else:
            if not previous_was_phrase:
                context.append(context_words[i][1])
    if context and context_words:
        if not context_words[-1][1] in context[-1]:
            context.append(context_words[-1][1])
    elif context_words:
        context.append(context_words[-1][1])
    return context

def get_word_net_similarity(token1, token2, metric):
    token1_list = token1.split('_')[::2]
    token2_list = token2.split('_')[::2]
    data = []
    for token1_part in token1_list:
        for syn1 in wn.synsets(token1_part, lang='spa'):
            for token2_part in token2_list:
                for syn2 in wn.synsets(token2_part, lang='spa'):
                    if syn1.pos() == syn2.pos():
                        data.append(metric(syn1, syn2))
    if data:
        data = np.array(data)
        return data.min(), data.max(), data.mean(), data.std()
    else:
        return None, None, None, None

def compute_all_necessary_metrics(target_lemma, target_text, target_position, substitute_lemma, doc, model_type:str, model=None):

    path_similarity_min, path_similarity_max, path_similarity_mean, path_similarity_std = get_word_net_similarity(target_lemma, substitute_lemma, wn.path_similarity)
    wup_similarity_min, wup_similarity_max, wup_similarity_mean, wup_similarity_std = get_word_net_similarity(target_lemma, substitute_lemma, wn.wup_similarity)
    lch_similarity_min, lch_similarity_max, lch_similarity_mean, lch_similarity_std = get_word_net_similarity(target_lemma, substitute_lemma, wn.lch_similarity)

    if model_type == 'bert':
        return (path_similarity_min, path_similarity_max, path_similarity_mean, path_similarity_std,
                wup_similarity_min, wup_similarity_max, wup_similarity_mean, wup_similarity_std,
                lch_similarity_min, lch_similarity_max, lch_similarity_mean, lch_similarity_std)

    target_vector = get_vector_for_token(model, target_lemma)
    substitute_vector = get_vector_for_token(model, substitute_lemma)

    cosimilarity = compute_metric(cosine_similarity, substitute_vector, target_vector)
    eucledian_similarity = compute_metric(eucledian_distance, substitute_vector, target_vector)

    context_window3, context_window3_no_stop = get_context_windows(doc=doc, target_text=target_text, window_size=3)
    context_window5, context_window5_no_stop = get_context_windows(doc=doc, target_text=target_text, window_size=5)
    context_window_synt = get_context_linked_words(doc, target_position, target_text)

    context_window3 = [get_vector_for_token(model, token) for token in context_window3]
    context_window3_no_stop = [get_vector_for_token(model, token) for token in context_window3_no_stop]
    context_window5 = [get_vector_for_token(model, token) for token in context_window5]
    context_window5_no_stop = [get_vector_for_token(model, token) for token in context_window5_no_stop]
    context_window_synt = [get_vector_for_token(model, token) for token in context_window_synt]

    add_metric_window3 = addition_metric(target_vector, substitute_vector, context_window3)
    bal_add_metric_window3 = balanced_addition_metric(target_vector, substitute_vector, context_window3)
    add_metric_window3_no_stop = addition_metric(target_vector, substitute_vector, context_window3_no_stop)
    bal_add_metric_window3_no_stop = balanced_addition_metric(target_vector, substitute_vector, context_window3_no_stop)

    mult_metric_window3 = multiplication_metric(target_vector, substitute_vector, context_window3)
    bal_mult_metric_window3 = balanced_multiplication_metric(target_vector, substitute_vector, context_window3)
    mult_metric_window3_no_stop = multiplication_metric(target_vector, substitute_vector, context_window3_no_stop)
    bal_mult_metric_window3_no_stop = balanced_multiplication_metric(target_vector, substitute_vector, context_window3_no_stop)

    add_metric_window5 = addition_metric(target_vector, substitute_vector, context_window5)
    bal_add_metric_window5 = balanced_addition_metric(target_vector, substitute_vector, context_window5)
    add_metric_window5_no_stop = addition_metric(target_vector, substitute_vector, context_window5_no_stop)
    bal_add_metric_window5_no_stop = balanced_addition_metric(target_vector, substitute_vector, context_window5_no_stop)

    mult_metric_window5 = multiplication_metric(target_vector, substitute_vector, context_window5)
    bal_mult_metric_window5 = balanced_multiplication_metric(target_vector, substitute_vector, context_window5)
    mult_metric_window5_no_stop = multiplication_metric(target_vector, substitute_vector, context_window5_no_stop)
    bal_mult_metric_window5_no_stop = balanced_multiplication_metric(target_vector, substitute_vector, context_window5_no_stop)

    add_metric_synt = addition_metric(target_vector, substitute_vector, context_window_synt)
    bal_add_metric_synt = balanced_addition_metric(target_vector, substitute_vector, context_window_synt)

    mult_metric_synt = multiplication_metric(target_vector, substitute_vector, context_window_synt)
    bal_mult_metric_synt = balanced_multiplication_metric(target_vector, substitute_vector, context_window_synt)

    return (cosimilarity, eucledian_similarity,
            add_metric_window3, bal_add_metric_window3,
            mult_metric_window3, bal_mult_metric_window3,
            add_metric_window3_no_stop, bal_add_metric_window3_no_stop,
            mult_metric_window3_no_stop, bal_mult_metric_window3_no_stop,
            add_metric_window5, bal_add_metric_window5,
            mult_metric_window5, bal_mult_metric_window5,
            add_metric_window5_no_stop, bal_add_metric_window5_no_stop,
            mult_metric_window5_no_stop, bal_mult_metric_window5_no_stop,
            add_metric_synt, bal_add_metric_synt,
            mult_metric_synt, bal_mult_metric_synt,
            path_similarity_min, path_similarity_mean, path_similarity_std, path_similarity_max,
            wup_similarity_min, wup_similarity_mean, wup_similarity_std, wup_similarity_max,
            lch_similarity_min, lch_similarity_mean, lch_similarity_std, lch_similarity_max)

def make_decision(doc, model_type, scaler, classifier, pos_dict, level, target_lemma, target_text, target_pos, target_position,
                  substitute_lemma, substitute_pos, model=None, bert_score=None):
    # return True
    metrics = compute_all_necessary_metrics(target_lemma=target_lemma, target_text=target_text, target_position=target_position,
                                            substitute_lemma=substitute_lemma, doc=doc, model_type=model_type, model=model)
    target_multiword, substitute_multiword = target_lemma.count('_') > 2, substitute_lemma.count('_') > 2
    data = [LEVEL_NUMBERS.get(level), pos_dict.get(target_pos), target_multiword, pos_dict.get(substitute_pos), substitute_multiword] + scaler.transform([metrics]).tolist()[0]
    if model_type == 'bert':
        data = [LEVEL_NUMBERS.get(level), pos_dict.get(target_pos), target_multiword, pos_dict.get(substitute_pos), substitute_multiword, bert_score] + scaler.transform([metrics]).tolist()[0]
    predict = classifier.predict(data)
    return bool(predict)
```
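To make the context metrics concrete, here is a worked toy example with 2-d stand-ins for word vectors (values invented for illustration). It mirrors addition_metric and multiplication_metric above, including the latter's sum of the positive target cosine and the product of positive context cosines:

```python
import numpy as np
from math import pow

def cos(x, y):
    return float(np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y)))

target     = np.array([1.0, 0.0])
substitute = np.array([0.8, 0.6])    # unit length; cos(substitute, target) = 0.8
context    = [np.array([0.0, 1.0])]  # one context token; cos(substitute, ctx) = 0.6

# addition_metric: mean of the target cosine and all context cosines
add = (cos(substitute, target) + sum(cos(substitute, c) for c in context)) / (len(context) + 1)
assert abs(add - 0.7) < 1e-9

# multiplication_metric: positive cosines, product over context, (n+1)-th root
positive = lambda x, y: (cos(x, y) + 1) / 2  # as in compute_positive_cos
mult = pow(positive(substitute, target) + np.prod([positive(substitute, c) for c in context]),
           1 / (len(context) + 1))
assert abs(mult - pow(0.9 + 0.8, 0.5)) < 1e-9  # about 1.304
```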
utilities_language_w2v/esp_main_workflow_w2v.py CHANGED

```diff
@@ -1,30 +1,24 @@
 import datetime
 from io import StringIO
+from typing import Union
 from random import sample
 from collections import defaultdict
 from streamlit import progress as st_progress
 from streamlit.elements import WIDGETS as ST_WIDGETS
-from utilities_language_general.esp_constants import st
-from utilities_language_w2v.esp_sentence_w2v import TASK
-from utilities_language_w2v.esp_sentence_w2v import SENTENCE
-from utilities_language_general.esp_constants import load_w2v
-from utilities_language_general.esp_utils import prepare_tasks
 from streamlit.runtime.uploaded_file_manager import UploadedFile
 import utilities_language_general.esp_constants as esp_constants
-from …
-from utilities_language_general.…
-from utilities_language_general.esp_constants import …
-…
-from utilities_language_general.esp_utils import compute_frequency_dict
-from utilities_language_general.esp_constants import BAD_USER_TARGET_WORDS
+from utilities_language_w2v.esp_sentence_w2v import TASK, SENTENCE
+from utilities_language_general.esp_utils import prepare_tasks, prepare_target_words, compute_frequency_dict
+from utilities_language_general.esp_constants import st, load_w2v, load_classifiers, nlp, summarization, BAD_USER_TARGET_WORDS, MINIMUM_SETS
+
 
 
 def main_workflow(
-        file: UploadedFile…
+        file: Union[UploadedFile, None],
         text: str,
-        logs…
-        progress…
-        progress_d…
+        logs,
+        progress,
+        progress_d,
         level: str,
         tw_mode_automatic_mode: str,
         target_words: str,
@@ -68,15 +62,15 @@ def main_workflow(
     elif text != '':
         current_text = text
     else:
-        …
+        st.warning('Вы и текст не вставили, и файл не выбрали 😢')
         current_text = ''
-        …
+        st.stop()
 
     # Process target words
     if tw_mode_automatic_mode == 'Самостоятельно':
         if target_words == '':
-            …
-            …
+            st.warning('Вы не ввели целевые слова')
+            st.stop()
         # Cannot make up paradigm, so only USER_TARGET_WORDS is used
         USER_TARGET_WORDS = prepare_target_words(target_words)
         tw_mode_automatic_mode = False
@@ -107,27 +101,8 @@ def main_workflow(
     progress.progress(15)
 
     # Choose necessary language minimum according to user's input
-    if level == 'A1':
-        target_minimum = esp_constants.a1_target_set
-        distractor_minimum = esp_constants.a1_distractor_set
-    elif level == 'A2':
-        target_minimum = esp_constants.a2_target_set
-        distractor_minimum = esp_constants.a2_distractor_set
-    elif level == 'B1':
-        target_minimum = esp_constants.b1_target_set
-        distractor_minimum = esp_constants.b1_distractor_set
-    elif level == 'B2':
-        target_minimum = esp_constants.b2_target_set
-        distractor_minimum = esp_constants.b2_distractor_set
-    elif level == 'C1':
-        target_minimum = esp_constants.c1_target_set
-        distractor_minimum = esp_constants.c1_distractor_set
-    elif level == 'C2':
-        target_minimum = esp_constants.c2_target_set
-        distractor_minimum = esp_constants.c2_distractor_set
-    elif level == 'Без уровня':
-        target_minimum = None
-        distractor_minimum = None
+    if level:
+        target_minimum, distractor_minimum = MINIMUM_SETS[level]
     else:
         target_minimum = None
         distractor_minimum = None
@@ -137,9 +112,11 @@ def main_workflow(
     # Define which model is used for distractor generation
     logs.update(label='Загружаем языковые модели и другие данные', state='running')
     if model_name == 'Модель-1':
-        mask_filler = load_w2v(…)
+        mask_filler = load_w2v('model1')
+        pos_dict, scaler, classifier = load_classifiers('model1')
     else:
-        mask_filler = load_w2v(…)
+        mask_filler = load_w2v('model2')
+        pos_dict, scaler, classifier = load_classifiers('model1')
 
     # Start generation process
     workflow = [SENTENCE(original=sent.strip(), n_sentence=num, max_num_distractors=num_distractors)
@@ -197,6 +174,9 @@ def main_workflow(
 
     for sentence in workflow:
         sentence.attach_distractors_to_target_word(model=mask_filler,
+                                                   scaler=scaler,
+                                                   classifier=classifier,
+                                                   pos_dict=pos_dict,
                                                    global_distractors=GLOBAL_DISTRACTORS,
                                                    distractor_minimum=distractor_minimum,
                                                    level_name=level,
@@ -235,8 +215,8 @@ def main_workflow(
         NUMBER_TASKS = 10
     else:
         NUMBER_TASKS = len(RESULT_TASKS)
-    RESULT_TASKS_in_summary = filter(lambda task: task.in_summary, RESULT_TASKS)
-    RESULT_TASTS_not_in_summary = filter(lambda task: not task.in_summary, RESULT_TASKS)
+    RESULT_TASKS_in_summary = list(filter(lambda task: task.in_summary, RESULT_TASKS))
+    RESULT_TASTS_not_in_summary = list(filter(lambda task: not task.in_summary, RESULT_TASKS))
    if len(RESULT_TASKS_in_summary) >= NUMBER_TASKS:
         RESULT_TASKS = RESULT_TASKS_in_summary
     else:
```
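The last hunk fixes a genuine bug: filter() returns a lazy iterator in Python 3, so the subsequent len(RESULT_TASKS_in_summary) would raise TypeError. A short demonstration of why the list() wrapper is needed:

```python
tasks = [1, 2, 3]
lazy = filter(lambda t: t > 1, tasks)
# len(lazy) raises TypeError: object of type 'filter' has no len()
eager = list(filter(lambda t: t > 1, tasks))
assert len(eager) == 2
```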
utilities_language_w2v/esp_sentence_w2v.py CHANGED

```diff
@@ -1,13 +1,9 @@
 import string
-from random import random
-from random import sample
-from utilities_language_general.esp_constants import nlp
+from random import random, sample
 from utilities_language_general.morphology import inflect
-from utilities_language_general.…
-from utilities_language_general.…
-…
-from utilities_language_general.esp_constants import BAD_USER_TARGET_WORDS
-from utilities_language_general.esp_utils import get_distractors_from_model
+from utilities_language_general.esp_constants import nlp, PHRASES, BAD_USER_TARGET_WORDS
+from utilities_language_general.esp_utils import check_token, fix_irregular_lemma, get_distractors_from_model
+
 
 
 class SENTENCE:
@@ -46,6 +42,7 @@ class SENTENCE:
             if not previous_was_phrase:
                 self.sentence_phrases.append(self.sentence_lemma_pos[i][1])
                 previous_was_phrase = False
+        self.sentence_phrases.append(self.sentence_lemma_pos[-1][1])
 
     def search_target_words_automatically(self, model, target_minimum: set, frequency_dict: dict = None, summary:list=None):
         for token in self.sentence_phrases:
@@ -152,18 +149,15 @@ class SENTENCE:
         self.search_user_target_words(model=model, user_target_words=user_target_words,
                                       frequency_dict=frequency_dict, summary=summary)
 
-    def attach_distractors_to_target_word(self, model, global_distractors, distractor_minimum, level_name,
-                                          max_frequency, logs, progress):
+    def attach_distractors_to_target_word(self, model, scaler, classifier, pos_dict, global_distractors, distractor_minimum, level_name, max_frequency, logs, progress):
         n_target_words = len(self.target_words)
         bad_target_words = []
         for i, target_word in enumerate(self.target_words):
-            distractors = get_distractors_from_model(model,
-                                                     …
-                                                     global_distractors=global_distractors,
-                                                     distractor_minimum=distractor_minimum,
-                                                     max_num_distractors=self.max_num_distractors)
+            distractors = get_distractors_from_model(doc=self.parsed, model=model, scaler=scaler, classifier=classifier, pos_dict=pos_dict,
+                                                     target_text=target_word['original_text'], lemma=target_word['lemma'], pos=target_word['pos'], gender=target_word['gender'],
+                                                     lemma_index=target_word['position_in_sentence'], global_distractors=global_distractors,
+                                                     distractor_minimum=distractor_minimum, level_name=level_name, max_num_distractors=self.max_num_distractors)
             if distractors is None or target_word['frequency_in_text'] > max_frequency:
-                target_word['distractors'] = distractors
                 bad_target_words.append(target_word)
             target_word['distractors'] = distractors
             target_word['distractors_number'] = len(distractors) if distractors is not None else 0
```