a-v-bely committed
Commit: ea7c789
Parent(s): 307a5f3

Towards distractor classification
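This commit wires a trained distractor classifier into the generation pipeline: esp_constants.py gains load_classifiers() (per-model pickles whose file names suggest a MinMaxScaler, a CatBoost classifier and a POS dictionary), the new similarity_measures.py computes WordNet- and embedding-based features and exposes make_decision(), and the distractor generators consult that decision before accepting a candidate. A rough sketch of the intended call chain, pieced together from the diffs below; the example sentence and the glue code are illustrative assumptions, only the imported names and signatures come from this commit:

# Assumed usage sketch (not part of the repository)
from utilities_language_general.esp_constants import nlp, load_w2v, load_classifiers
from utilities_language_general.similarity_measures import make_decision

model = load_w2v('model1')                                  # gensim KeyedVectors
pos_dict, scaler, classifier = load_classifiers('model1')   # pickled POS dict, scaler, classifier

doc = nlp('Ejemplo de oración con una palabra objetivo.')   # spaCy Doc of the source sentence
keep = make_decision(doc, model_type='w2v', model=model, scaler=scaler, classifier=classifier,
                     pos_dict=pos_dict, level='B1',
                     target_lemma='palabra_NOUN', target_text='palabra', target_pos='NOUN',
                     target_position=5, substitute_lemma='vocablo', substitute_pos='NOUN')
# keep is True when the classifier accepts the candidate distractor.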
utilities_language_bert/esp_sentence_bert.py CHANGED
@@ -1,13 +1,9 @@
 import string
-from random import random
-from random import sample
-from utilities_language_general.esp_constants import nlp
+from random import random, sample
 from utilities_language_general.morphology import inflect
-from utilities_language_general.esp_constants import PHRASES
-from utilities_language_general.esp_utils import check_token_bert
-from utilities_language_general.esp_utils import fix_irregular_lemma
-from utilities_language_general.esp_constants import BAD_USER_TARGET_WORDS
-from utilities_language_general.esp_utils import get_distractors_from_model_bert
+from utilities_language_general.esp_constants import nlp, PHRASES, BAD_USER_TARGET_WORDS
+from utilities_language_general.esp_utils import check_token_bert, fix_irregular_lemma, get_distractors_from_model_bert
+


 class SENTENCE:
@@ -195,12 +191,6 @@ class TASK:
     def attach_distractors_to_target_word(self, model, global_distractors, distractor_minimum,
                                           level_name, max_frequency):
         pos = self.pos[0] if self.pos[0] == 'phrase' else self.pos[1]
-        # distractors_full_text = get_distractors_from_model_bert(model=model, lemma=self.lemma, pos=pos,
-        #                                                         gender=self.gender, level_name=level_name,
-        #                                                         text_with_masked_task=self.text_with_masked_task,
-        #                                                         global_distractors=global_distractors,
-        #                                                         distractor_minimum=distractor_minimum,
-        #                                                         max_num_distractors=self.max_num_distractors)
         distractors_sentence = get_distractors_from_model_bert(model=model, lemma=self.lemma, pos=pos,
                                                                gender=self.gender, level_name=level_name,
                                                                text_with_masked_task=self.masked_sentence,
utilities_language_general/esp_constants.py CHANGED
@@ -2,15 +2,20 @@ import json
 import spacy
 import gensim
 import streamlit as st
+
+from pickle import load
 from transformers import pipeline
 from summarizer import Summarizer


 @st.cache_resource
-def load_w2v(model_path):
+def load_w2v(model):
     with st.spinner('Загружаю языковую модель'):
-        _w2v_model = gensim.models.KeyedVectors.load_word2vec_format(model_path, binary=True)
-        return _w2v_model
+        if model == 'model1':
+            model_path = r'language_data/model1.gz'
+        else:
+            model_path = r'language_data/model2.gz'
+        return gensim.models.KeyedVectors.load_word2vec_format(model_path, binary=True)


 @st.cache_resource
@@ -26,14 +31,31 @@ def load_bert():
     _pipeline = pipeline(task="fill-mask", model="a-v-white/bert-base-spanish-wwm-cased-finetuned-literature-pro")
     return _pipeline

+
 @st.cache_resource
 def load_summarizer():
     return Summarizer()

+
+@st.cache_resource
+def load_classifiers(model):
+    if model == 'model1':
+        scaler_path = 'language_data/model1_with_wn_minmaxscaler.pickle'
+        classifier_path = 'language_data/model1_with_wn_catboost_classifier.pickle'
+    elif model == 'model2':
+        scaler_path = 'language_data/model2_with_wn_minmaxscaler.pickle'
+        classifier_path = 'language_data/model2_with_wn_catboost_classifier.pickle'
+    else:
+        scaler_path = 'language_data/model3_with_wn_minmaxscaler.pickle'
+        classifier_path = 'language_data/model3_with_wn_catboost_classifier.pickle'
+    with (open(scaler_path, 'rb') as f1, open(classifier_path, 'rb') as f2, open('language_data/pos_dict.pickle', 'rb') as f3):
+        scaler = load(f1)
+        classifier = load(f2)
+        pos_dict = load(f3)
+    return pos_dict, scaler, classifier
+
 nlp = load_spacy()
 summarization = load_summarizer()
-w2v_model_1_path = r'model1.gz'
-w2v_model_2_path = r'model2.gz'

 # Upload minimums
 a1_path, a1_target_set = r'lexical_minimums/A1_MINIMUM.txt', set()
@@ -42,19 +64,25 @@ b1_path, b1_target_set = r'lexical_minimums/B1_MINIMUM.txt', set()
 b2_path, b2_target_set = r'lexical_minimums/B2_MINIMUM.txt', set()
 c1_path, c1_target_set = r'lexical_minimums/C1_MINIMUM.txt', set()
 c2_path, c2_target_set = r'lexical_minimums/C2_MINIMUM.txt', set()
-minimums_paths = (a1_path, a2_path, b1_path, b2_path)
+
+minimums_paths = (a1_path, a2_path, b1_path, b2_path, c1_path, c2_path)
 minimums_sets = (a1_target_set, a2_target_set, b1_target_set, b2_target_set, c1_target_set, c2_target_set)
 for i in range(len(minimums_paths)):
     with open(minimums_paths[i], 'r', encoding='utf-8') as read_file:
         for line in read_file:
             minimums_sets[i].add(line.strip())

-a1_distractor_set = a1_target_set
-a2_distractor_set = a2_target_set.union(a1_target_set)
-b1_distractor_set = b1_target_set.union(a2_target_set)
-b2_distractor_set = b2_target_set.union(b1_target_set)
-c1_distractor_set = c1_target_set.union(b2_target_set)
-c2_distractor_set = c2_target_set.union(c1_target_set)
+MINIMUM_SETS = {
+    'A1': (a1_target_set, a1_target_set),
+    'A2': (a2_target_set, a2_target_set.union(a1_target_set)),
+    'B1': (b1_target_set, b1_target_set.union(a2_target_set)),
+    'B2': (b2_target_set, b2_target_set.union(b1_target_set)),
+    'C1': (c1_target_set, c1_target_set.union(b2_target_set)),
+    'C2': (c2_target_set, c2_target_set.union(c1_target_set)),
+    'Без уровня': (None, None)
+}
+
+LEVEL_NUMBERS = {'A1': 1, 'A2': 1, 'B1': 2, 'B2': 3, 'C1': 4, 'C2': 4}

 with open('language_data/phrases.json', 'r', encoding='utf-8') as f:
     PHRASES = set(json.load(f)['PHRASES'])
@@ -77,6 +105,8 @@ COMBINE_POS = {
                'ADJ':['NOUN'], 'NOUN': ['ADJ']},
         'C2': {'PRON': ['DET'], 'DET': ['PRON'], 'VERB': ['AUX', 'AUX_VERB'], 'AUX': ['VERB', 'AUX_VERB'], 'AUX_VERB': ['VERB', 'AUX'], 'AUX_AUX': ['AUX'],
                'ADJ':['NOUN'], 'NOUN': ['ADJ']},
+        'Без уровня': {'PRON': ['DET'], 'DET': ['PRON'], 'VERB': ['AUX', 'AUX_VERB'], 'AUX': ['VERB', 'AUX_VERB'],
+                       'AUX_VERB': ['VERB', 'AUX'], 'AUX_AUX': ['AUX'], 'AUX_ADJ': ['PRON_VERB'], 'PRON_VERB': ['AUX_ADJ'], 'ADP': ['SCONJ', 'ADV'], 'ADJ': ['NOUN'], 'NOUN': ['ADJ']}
        },
     'phrase':
        {
@@ -89,5 +119,8 @@ COMBINE_POS = {
                'ADJ':['NOUN'], 'NOUN': ['ADJ']},
         'C2': {'PRON': ['DET'], 'DET': ['PRON'], 'VERB': ['AUX', 'AUX_VERB'], 'AUX': ['VERB', 'AUX_VERB'], 'AUX_VERB': ['VERB', 'AUX'], 'AUX_AUX': ['AUX'],
                'ADJ':['NOUN'], 'NOUN': ['ADJ']},
+        'Без уровня': {'PRON': ['DET'], 'DET': ['PRON'], 'VERB': ['AUX', 'AUX_VERB'], 'AUX': ['VERB', 'AUX_VERB'],
+                       'AUX_VERB': ['VERB', 'AUX'], 'AUX_AUX': ['AUX'], 'AUX_ADJ': ['PRON_VERB'], 'PRON_VERB': ['AUX_ADJ'],
+                       'ADP': ['SCONJ', 'ADV'], 'ADJ': ['NOUN'], 'NOUN': ['ADJ']}
        },
 }
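For reference, MINIMUM_SETS packs, per CEFR level, the pair (target minimum, distractor minimum), the distractor set also absorbing the previous level's vocabulary; the long if/elif chain in esp_main_workflow_w2v.py below collapses to a single lookup. A small assumed usage sketch:

# Assumed usage of the new constants (mirrors the change in esp_main_workflow_w2v.py below)
from utilities_language_general.esp_constants import MINIMUM_SETS, LEVEL_NUMBERS

target_minimum, distractor_minimum = MINIMUM_SETS['B1']   # B1 targets, B1 plus A2 distractor vocabulary
level_feature = LEVEL_NUMBERS['B1']                       # 2: numeric level feature later fed to the classifier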
utilities_language_general/esp_utils.py CHANGED
@@ -1,8 +1,8 @@
 from nltk import edit_distance
 from utilities.utils import answer_letter
-from utilities_language_general.esp_constants import nlp
-from utilities_language_general.esp_constants import FIX_LEMMA
-from utilities_language_general.esp_constants import COMBINE_POS
+from utilities_language_general.similarity_measures import make_decision
+from utilities_language_general.esp_constants import nlp, FIX_LEMMA, COMBINE_POS
+


 def prepare_target_words(target_words):
@@ -107,12 +107,13 @@ def check_token_bert(token, current_minimum: set = None, check_allowed_pos: set
         return False


-def get_distractors_from_model(model, lemma: str, pos: str, gender: str or None, global_distractors: set,
-                               distractor_minimum: set, level_name: str, max_num_distractors: int,
+def get_distractors_from_model(doc, model, scaler, classifier, pos_dict:dict, target_text:str, lemma: str, pos: str, gender: str, lemma_index:int, global_distractors: set,
+                               distractor_minimum: set, level_name: str, max_num_distractors: int,
                                max_length_ratio=5, min_edit_distance_ratio=0.5):

    distractors = []
    query = lemma if '_' in lemma else f'{lemma}_{pos}'
+    raw_lemma = query
    lemma = '_'.join(lemma.split('_')[::2])
    if model.has_index_for(query):
        candidates = model.most_similar(query, topn=max_num_distractors + 100)
@@ -126,19 +127,23 @@ def get_distractors_from_model(model, lemma: str, pos: str, gender: str or None,
         for candidate in candidates:
             if candidate[0].count('_') == 1 and pos != 'phrase':
                 distractor_lemma, distractor_pos = candidate[0].split('_')
+                decision = make_decision(doc, model_type='w2v', model=model, scaler=scaler, classifier=classifier, pos_dict=pos_dict,
+                                         level=level_name, target_lemma=raw_lemma, target_text=target_text, target_pos=pos, target_position=lemma_index,
+                                         substitute_lemma=distractor_lemma, substitute_pos=distractor_pos)
                 distractor_similarity = candidate[1]
                 candidate_gender = get_tags(distractor_lemma).get('Gender')
                 length_ratio = abs(len(lemma) - len(distractor_lemma))
                 condition = ((distractor_pos == pos
                               or (COMBINE_POS['simple'][level_name].get(pos) is not None and COMBINE_POS['simple'][level_name].get(distractor_pos) is not None
-                                  and distractor_pos in COMBINE_POS['simple'][level_name][pos] and pos in COMBINE_POS['simple'][level_name][distractor_pos])
-                              )
+                                  and distractor_pos in COMBINE_POS['simple'][level_name][pos] and pos in COMBINE_POS['simple'][level_name][distractor_pos]))
+                             and decision
                              and distractor_lemma != lemma
+                             and distractor_lemma not in lemma
+                             and lemma not in distractor_lemma
                              and (candidate_gender == gender and level_name in ('B1', 'B2', 'C1', 'C2'))
                              and length_ratio <= max_length_ratio
                              and distractor_lemma not in global_distractors
-                             and edit_distance(lemma, distractor_lemma) / ((len(lemma) + len(distractor_lemma)) / 2) >
-                             min_edit_distance_ratio)
+                             and edit_distance(lemma, distractor_lemma) / ((len(lemma) + len(distractor_lemma)) / 2) > min_edit_distance_ratio)
                 if condition:
                     if distractor_minimum is not None:
                         if distractor_lemma in distractor_minimum:
@@ -146,14 +151,18 @@ def get_distractors_from_model(model, lemma: str, pos: str, gender: str or None,
                             global_distractors.add(distractor_lemma)
                     else:
                         distractors.append((distractor_lemma, distractor_similarity))
-                    global_distractors.add(distractor_lemma)
+                        global_distractors.add(distractor_lemma)
             else:
-                if candidate[0].count('_') > 3 or pos in ('NOUN', 'ADJ', 'NUM'):
+                if (candidate[0].count('_') == 1  # REMOVE HOTFIX
+                        or candidate[0].count('_') > 3 or pos in ('NOUN', 'ADJ', 'NUM')):
                     continue
                 d1_lemma, d1_pos, d2_lemma, d2_pos = candidate[0].split('_')
                 d_pos = f'{d1_pos}_{d2_pos}'
                 distractor_lemma = f'{d1_lemma}_{d2_lemma}'
                 distractor_similarity = candidate[1]
+                decision = make_decision(doc, model_type='w2v', model=model, scaler=scaler, classifier=classifier, pos_dict=pos_dict,
+                                         level=level_name, target_lemma=raw_lemma, target_text=target_text, target_pos=pos, target_position=lemma_index,
+                                         substitute_lemma=candidate[0], substitute_pos=d_pos)
                 condition = (((d1_pos == pos or d2_pos == pos)
                               or (COMBINE_POS['phrase'][level_name].get(d_pos) is not None and COMBINE_POS['phrase'][level_name].get(pos) is not None
                                   and d_pos in COMBINE_POS['phrase'][level_name].get(d_pos) and pos in COMBINE_POS['phrase'][level_name].get(pos) )
@@ -161,7 +170,10 @@ def get_distractors_from_model(model, lemma: str, pos: str, gender: str or None,
                               and pos in ('phrase', 'VERB', 'AUX', 'SCONJ', 'ADP'))
                              or (d2_pos in ('VERB', 'AUX', 'SCONJ', 'ADP')
                                  and pos in ('phrase', 'VERB', 'AUX', 'SCONJ', 'ADP')))
+                             and decision
                              and candidate[0] != lemma
+                             and distractor_lemma not in lemma
+                             and lemma not in distractor_lemma
                              and distractor_lemma != lemma
                              and distractor_lemma not in global_distractors)
                 if condition:
@@ -180,8 +192,8 @@ def get_distractors_from_model(model, lemma: str, pos: str, gender: str or None,
     return distractors


-def get_distractors_from_model_bert(model, text_with_masked_task: str, lemma: str, pos: str, gender: str or None,
-                                    global_distractors: set, distractor_minimum: set, level_name: str,
+def get_distractors_from_model_bert(doc, model, scaler, classifier, text_with_masked_task: str, lemma: str, pos: str, gender: str, lemma_index:int,
+                                    global_distractors: set, distractor_minimum: set, level_name: str, pos_dict:dict,
                                     max_num_distractors: int, max_length_ratio=5, min_edit_distance_ratio=0.5):
     _distractors = []
     try:
@@ -205,9 +217,13 @@ def get_distractors_from_model_bert(model, text_with_masked_task: str, lemma: st
             distractor_similarity = candidate_distractor[1]
             candidate_gender = get_tags(distractor_lemma).get('Gender')
             length_ratio = abs(len(lemma) - len(distractor_lemma))
+            decision = make_decision(doc, model_type='bert', scaler=scaler, classifier=classifier, pos_dict=pos_dict,
+                                     level=level_name, target_text=lemma, target_pos=pos, target_position=lemma_index,
+                                     substitute_text=distractor_lemma, substitute_pos=distractor_pos)
             if ((distractor_pos == pos
                  or (COMBINE_POS['simple'][level_name].get(pos) is not None and COMBINE_POS['simple'][level_name].get(distractor_pos) is not None
                      and distractor_pos in COMBINE_POS['simple'][level_name][pos] and pos in COMBINE_POS['simple'][level_name][distractor_pos]))
+                and decision
                 and distractor_lemma != lemma
                 and (len(_distractors) < max_num_distractors+100)
                 and (candidate_gender == gender and level_name in ('B1', 'B2', 'C1', 'C2'))
utilities_language_general/similarity_measures.py ADDED
@@ -0,0 +1,283 @@
+import numpy as np
+from math import pow
+from nltk.corpus import wordnet as wn
+from utilities_language_general.esp_constants import nlp, PHRASES, LEVEL_NUMBERS
+
+
+def eucledian_distance(x, y):
+    return np.sqrt(np.sum((x - y) ** 2))
+
+def cosine_similarity(x, y):
+    out = np.dot(x, y) / (np.sqrt(np.dot(x, x)) * np.sqrt(np.dot(y, y)))
+    if str(out) != 'nan':
+        return out
+    return None
+
+def get_vector_for_token(model, token):
+    vector = None
+
+    splitted = token.split('_')
+    token_list = [f'{splitted[i]}_{splitted[i+1]}' for i in range(len(splitted)-1)]
+
+    if model.has_index_for(token):
+        vector = model.get_vector(token)
+    else:
+        try:
+            vector = model.get_mean_vector(token_list)
+        except ValueError:
+            return None
+    return vector
+
+def compute_metric(func, vector1, vector2):
+    if vector1 is not None and vector2 is not None:
+        return func(vector1, vector2)
+    else:
+        return None
+
+def compute_positive_cos(x, y):
+    cos_sim = cosine_similarity(x, y)
+    if cos_sim:
+        return (cos_sim + 1) / 2
+    else:
+        return None
+
+def addition_metric(substitute, target, context):
+    substitute_target_cos = compute_metric(cosine_similarity, substitute, target)
+    if not substitute_target_cos:
+        return None
+    if not context:
+        return None
+
+    context_vectors = []
+    for context_tk in context:
+        substitute_context_cos = compute_metric(cosine_similarity, substitute, context_tk)
+        if substitute_context_cos:
+            context_vectors.append(substitute_context_cos)
+    sum_of_context_vectors = np.sum(context_vectors)
+
+    metric = (substitute_target_cos + sum_of_context_vectors) / (len(context) + 1)
+    return metric
+
+def balanced_addition_metric(substitute, target, context):
+    substitute_target_cos = compute_metric(cosine_similarity, substitute, target)
+    if not substitute_target_cos:
+        return None
+    if not context:
+        return None
+
+    context_vectors = []
+    for context_tk in context:
+        substitute_context_cos = compute_metric(cosine_similarity, substitute, context_tk)
+        if substitute_context_cos:
+            context_vectors.append(substitute_context_cos)
+    sum_of_context_vectors = np.sum(context_vectors)
+
+    context_len = len(context)
+    metric = (context_len * substitute_target_cos + sum_of_context_vectors) / (2 * context_len)
+    return metric
+
+def multiplication_metric(substitute, target, context):
+    substitute_target_cos = compute_metric(compute_positive_cos, substitute, target)
+    if not substitute_target_cos:
+        return None
+    if not context:
+        return None
+
+    context_vectors = []
+    for context_tk in context:
+        substitute_context_positive_cos = compute_metric(compute_positive_cos, substitute, context_tk)
+        if substitute_context_positive_cos:
+            context_vectors.append(substitute_context_positive_cos)
+    prod_of_context_vectors = np.prod(context_vectors)
+    try:
+        metric = pow((substitute_target_cos + prod_of_context_vectors), 1 / (len(context) + 1))
+    except ValueError:
+        return None
+    return metric
+
+def balanced_multiplication_metric(substitute, target, context):
+    substitute_target_cos = compute_metric(compute_positive_cos, substitute, target)
+    if not substitute_target_cos:
+        return None
+    if not context:
+        return None
+
+    context_vectors = []
+    for context_tk in context:
+        substitute_context_positive_cos = compute_metric(compute_positive_cos, substitute, context_tk)
+        if substitute_context_positive_cos:
+            context_vectors.append(substitute_context_positive_cos)
+    prod_of_context_vectors = np.prod(context_vectors)
+
+    context_len = len(context)
+    try:
+        metric = pow((pow(substitute_target_cos, context_len) + prod_of_context_vectors), 1 / (2 * context_len))
+    except ValueError:
+        return None
+    return metric
+
+def bind_phrases(context_list):
+    context = []
+    previous_was_phrase = False
+    for i in range(len(context_list)-1):
+        phrase_candidate = f'{context_list[i]}_{context_list[i+1]}'
+        if phrase_candidate in PHRASES and not previous_was_phrase:
+            context.append(phrase_candidate)
+            previous_was_phrase = True
+        else:
+            if not previous_was_phrase:
+                context.append(context_list[i])
+            previous_was_phrase = False
+    if context_list:
+        if not context:
+            context.append(context_list[-1])
+        elif not context_list[-1] in context[-1]:
+            context.append(context_list[-1])
+    return context
+
+def get_context_windows(doc, target_text, window_size):
+    sentence_str = doc.text
+    sentence_masked = sentence_str.lower().replace(target_text.lower().strip(), ' [MASK] ')
+    alpha_tokens_lemma_pos = [f'{tk.lemma_.lower()}_{tk.pos_}' for tk in nlp(sentence_masked) if tk.text.isalpha()]
+    alpha_tokens_lemma_pos_no_stop = [f'{tk.lemma_.lower()}_{tk.pos_}' for tk in nlp(sentence_masked) if tk.text.isalpha() and not tk.is_stop]
+    try:
+        mask_token_index = alpha_tokens_lemma_pos.index('mask_NUM')
+        mask_token_index_no_stop = alpha_tokens_lemma_pos_no_stop.index('mask_NUM')
+    except ValueError:
+        return None
+
+    left_border = max(mask_token_index-window_size, 0)
+    right_border = min(mask_token_index+window_size, len(alpha_tokens_lemma_pos))
+    l_context = alpha_tokens_lemma_pos[left_border:mask_token_index]
+    r_context = alpha_tokens_lemma_pos[mask_token_index+1:right_border+1]
+
+    left_border_no_stop = max(mask_token_index_no_stop-window_size, 0)
+    right_border_no_stop = min(mask_token_index_no_stop+window_size, len(alpha_tokens_lemma_pos_no_stop))
+    l_context_no_stop = alpha_tokens_lemma_pos_no_stop[left_border_no_stop:mask_token_index_no_stop]
+    r_context_no_stop = alpha_tokens_lemma_pos_no_stop[mask_token_index_no_stop+1:right_border_no_stop+1]
+    return (bind_phrases(l_context) + bind_phrases(r_context), bind_phrases(l_context_no_stop) + bind_phrases(r_context_no_stop))
+
+def get_context_linked_words(doc, target_position, target_text):
+    answer_list = target_text.split(' ')
+    context_words = []
+    for tk in doc:
+        if tk.text.isalpha():
+            if (tk.text in answer_list and abs(target_position - tk.idx) <= sum([len(t) for t in answer_list])):
+                context_words.extend([t for t in tk.subtree if t.text.isalpha() and not t.is_stop])
+                context_words.extend([t for t in tk.children if t.text.isalpha() and not t.is_stop])
+                context_words.extend([t for t in tk.ancestors if t.text.isalpha() and not t.is_stop])
+    context_words = [(tk, f'{tk.lemma_}_{tk.pos_}') for tk in sorted(set(context_words), key=lambda tk: tk.i) if tk.text not in answer_list]
+    context = []
+    previous_was_phrase = False
+    for i in range(len(context_words)-1):
+        phrase_candidate = f'{context_words[i][1]}_{context_words[i+1][1]}'
+        if phrase_candidate in PHRASES and not previous_was_phrase and abs(context_words[i][0].i - context_words[i+1][0].i) <=1:
+            context.append(phrase_candidate)
+            previous_was_phrase = True
+        else:
+            if not previous_was_phrase:
+                context.append(context_words[i][1])
+    if context and context_words:
+        if not context_words[-1][1] in context[-1]:
+            context.append(context_words[-1][1])
+    elif context_words:
+        context.append(context_words[-1][1])
+    return context
+
+def get_word_net_similarity(token1, token2, metric):
+    token1_list = token1.split('_')[::2]
+    token2_list = token2.split('_')[::2]
+    data = []
+    for token1_part in token1_list:
+        for syn1 in wn.synsets(token1_part, lang='spa'):
+            for token2_part in token2_list:
+                for syn2 in wn.synsets(token2_part, lang='spa'):
+                    if syn1.pos() == syn2.pos():
+                        data.append(metric(syn1, syn2))
+    if data:
+        data = np.array(data)
+        return data.min(), data.max(), data.mean(), data.std()
+    else:
+        return None, None, None, None
+
+def compute_all_necessary_metrics(target_lemma, target_text, target_position, substitute_lemma, doc, model_type:str, model=None):
+
+    path_similarity_min, path_similarity_max, path_similarity_mean, path_similarity_std = get_word_net_similarity(target_lemma, substitute_lemma, wn.path_similarity)
+    wup_similarity_min, wup_similarity_max, wup_similarity_mean, wup_similarity_std = get_word_net_similarity(target_lemma, substitute_lemma, wn.wup_similarity)
+    lch_similarity_min, lch_similarity_max, lch_similarity_mean, lch_similarity_std = get_word_net_similarity(target_lemma, substitute_lemma, wn.lch_similarity)
+
+    if model_type == 'bert':
+        return (path_similarity_min, path_similarity_max, path_similarity_mean, path_similarity_std,
+                wup_similarity_min, wup_similarity_max, wup_similarity_mean, wup_similarity_std,
+                lch_similarity_min, lch_similarity_max, lch_similarity_mean, lch_similarity_std)
+
+    target_vector = get_vector_for_token(model, target_lemma)
+    substitute_vector = get_vector_for_token(model, substitute_lemma)
+
+    cosimilarity = compute_metric(cosine_similarity, substitute_vector, target_vector)
+    eucledian_similarity = compute_metric(eucledian_distance, substitute_vector, target_vector)
+
+    context_window3, context_window3_no_stop = get_context_windows(doc=doc, target_text=target_text, window_size=3)
+    context_window5, context_window5_no_stop = get_context_windows(doc=doc, target_text=target_text, window_size=5)
+    context_window_synt = get_context_linked_words(doc, target_position, target_text)
+
+    context_window3 = [get_vector_for_token(model, token) for token in context_window3]
+    context_window3_no_stop = [get_vector_for_token(model, token) for token in context_window3_no_stop]
+    context_window5 = [get_vector_for_token(model, token) for token in context_window5]
+    context_window5_no_stop = [get_vector_for_token(model, token) for token in context_window5_no_stop]
+    context_window_synt = [get_vector_for_token(model, token) for token in context_window_synt]
+
+    add_metric_window3 = addition_metric(target_vector, substitute_vector, context_window3)
+    bal_add_metric_window3 = balanced_addition_metric(target_vector, substitute_vector, context_window3)
+    add_metric_window3_no_stop = addition_metric(target_vector, substitute_vector, context_window3_no_stop)
+    bal_add_metric_window3_no_stop = balanced_addition_metric(target_vector, substitute_vector, context_window3_no_stop)
+
+    mult_metric_window3 = multiplication_metric(target_vector, substitute_vector, context_window3)
+    bal_mult_metric_window3 = balanced_multiplication_metric(target_vector, substitute_vector, context_window3)
+    mult_metric_window3_no_stop = multiplication_metric(target_vector, substitute_vector, context_window3_no_stop)
+    bal_mult_metric_window3_no_stop = balanced_multiplication_metric(target_vector, substitute_vector, context_window3_no_stop)
+
+    add_metric_window5 = addition_metric(target_vector, substitute_vector, context_window5)
+    bal_add_metric_window5 = balanced_addition_metric(target_vector, substitute_vector, context_window5)
+    add_metric_window5_no_stop = addition_metric(target_vector, substitute_vector, context_window5_no_stop)
+    bal_add_metric_window5_no_stop = balanced_addition_metric(target_vector, substitute_vector, context_window5_no_stop)
+
+    mult_metric_window5 = multiplication_metric(target_vector, substitute_vector, context_window5)
+    bal_mult_metric_window5 = balanced_multiplication_metric(target_vector, substitute_vector, context_window5)
+    mult_metric_window5_no_stop = multiplication_metric(target_vector, substitute_vector, context_window5_no_stop)
+    bal_mult_metric_window5_no_stop = balanced_multiplication_metric(target_vector, substitute_vector, context_window5_no_stop)
+
+    add_metric_synt = addition_metric(target_vector, substitute_vector, context_window_synt)
+    bal_add_metric_synt = balanced_addition_metric(target_vector, substitute_vector, context_window_synt)
+
+    mult_metric_synt = multiplication_metric(target_vector, substitute_vector, context_window_synt)
+    bal_mult_metric_synt = balanced_multiplication_metric(target_vector, substitute_vector, context_window_synt)
+
+    return (cosimilarity, eucledian_similarity,
+            add_metric_window3, bal_add_metric_window3,
+            mult_metric_window3, bal_mult_metric_window3,
+            add_metric_window3_no_stop, bal_add_metric_window3_no_stop,
+            mult_metric_window3_no_stop, bal_mult_metric_window3_no_stop,
+            add_metric_window5, bal_add_metric_window5,
+            mult_metric_window5, bal_mult_metric_window5,
+            add_metric_window5_no_stop, bal_add_metric_window5_no_stop,
+            mult_metric_window5_no_stop, bal_mult_metric_window5_no_stop,
+            add_metric_synt, bal_add_metric_synt,
+            mult_metric_synt, bal_mult_metric_synt,
+            path_similarity_min, path_similarity_mean, path_similarity_std, path_similarity_max,
+            wup_similarity_min, wup_similarity_mean, wup_similarity_std, wup_similarity_max,
+            lch_similarity_min, lch_similarity_mean, lch_similarity_std, lch_similarity_max)
+
+def make_decision(doc, model_type, scaler, classifier, pos_dict, level, target_lemma, target_text, target_pos, target_position,
+                  substitute_lemma, substitute_pos, model=None, bert_score=None):
+    # return True
+    metrics = compute_all_necessary_metrics(target_lemma=target_lemma, target_text=target_text, target_position=target_position,
+                                            substitute_lemma=substitute_lemma, doc=doc, model_type=model_type, model=model)
+    target_multiword, substitute_multiword = target_lemma.count('_') > 2, substitute_lemma.count('_') > 2
+    data = [LEVEL_NUMBERS.get(level), pos_dict.get(target_pos), target_multiword, pos_dict.get(substitute_pos), substitute_multiword] + scaler.transform([metrics]).tolist()[0]
+    if model_type == 'bert':
+        data = [LEVEL_NUMBERS.get(level), pos_dict.get(target_pos), target_multiword, pos_dict.get(substitute_pos), substitute_multiword, bert_score] + scaler.transform([metrics]).tolist()[0]
+    predict = classifier.predict(data)
+    return bool(predict)
+
+
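The context metrics in this new module combine cosine similarities between the substitute, the target and the context word vectors; addition_metric, for instance, averages the target cosine with the context cosines, (cos(s,t) + sum_i cos(s,c_i)) / (|C|+1). A toy run with made-up vectors, assuming the module and its data files are importable:

# Toy invocation with fabricated 3-d vectors; real calls use gensim word vectors.
import numpy as np
from utilities_language_general.similarity_measures import addition_metric, multiplication_metric

target = np.array([0.1, 0.9, 0.0])
substitute = np.array([0.2, 0.8, 0.1])
context = [np.array([0.0, 1.0, 0.0]), np.array([0.5, 0.5, 0.0])]

print(addition_metric(substitute, target, context))        # average of target cosine and the two context cosines
print(multiplication_metric(substitute, target, context))  # (n+1)-th root of positive target cosine plus product of positive context cosines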
utilities_language_w2v/esp_main_workflow_w2v.py CHANGED
@@ -1,30 +1,24 @@
 import datetime
 from io import StringIO
+from typing import Union
 from random import sample
 from collections import defaultdict
 from streamlit import progress as st_progress
 from streamlit.elements import WIDGETS as ST_WIDGETS
-from utilities_language_general.esp_constants import st
-from utilities_language_w2v.esp_sentence_w2v import TASK
-from utilities_language_w2v.esp_sentence_w2v import SENTENCE
-from utilities_language_general.esp_constants import load_w2v
-from utilities_language_general.esp_utils import prepare_tasks
 from streamlit.runtime.uploaded_file_manager import UploadedFile
 import utilities_language_general.esp_constants as esp_constants
-from utilities_language_general.esp_constants import summarization
-from utilities_language_general.esp_constants import w2v_model_1_path
-from utilities_language_general.esp_constants import w2v_model_2_path
-from utilities_language_general.esp_utils import prepare_target_words
-from utilities_language_general.esp_utils import compute_frequency_dict
-from utilities_language_general.esp_constants import BAD_USER_TARGET_WORDS
+from utilities_language_w2v.esp_sentence_w2v import TASK, SENTENCE
+from utilities_language_general.esp_utils import prepare_tasks, prepare_target_words, compute_frequency_dict
+from utilities_language_general.esp_constants import st, load_w2v, load_classifiers, nlp, summarization, BAD_USER_TARGET_WORDS, MINIMUM_SETS
+


 def main_workflow(
-        file: UploadedFile or None,
+        file: Union[UploadedFile, None],
         text: str,
-        logs: ST_WIDGETS,
-        progress: st_progress,
-        progress_d: st_progress,
+        logs,
+        progress,
+        progress_d,
         level: str,
         tw_mode_automatic_mode: str,
         target_words: str,
@@ -68,15 +62,15 @@ def main_workflow(
     elif text != '':
         current_text = text
     else:
-        esp_constants.st.warning('Вы и текст не вставили, и файл не выбрали 😢')
+        st.warning('Вы и текст не вставили, и файл не выбрали 😢')
         current_text = ''
-        esp_constants.st.stop()
+        st.stop()

     # Process target words
     if tw_mode_automatic_mode == 'Самостоятельно':
         if target_words == '':
-            esp_constants.st.warning('Вы не ввели целевые слова')
-            esp_constants.st.stop()
+            st.warning('Вы не ввели целевые слова')
+            st.stop()
         # Cannot make up paradigm, so only USER_TARGET_WORDS is used
         USER_TARGET_WORDS = prepare_target_words(target_words)
         tw_mode_automatic_mode = False
@@ -107,27 +101,8 @@ def main_workflow(
     progress.progress(15)

     # Choose necessary language minimum according to user's input
-    if level == 'A1':
-        target_minimum = esp_constants.a1_target_set
-        distractor_minimum = esp_constants.a1_distractor_set
-    elif level == 'A2':
-        target_minimum = esp_constants.a2_target_set
-        distractor_minimum = esp_constants.a2_distractor_set
-    elif level == 'B1':
-        target_minimum = esp_constants.b1_target_set
-        distractor_minimum = esp_constants.b1_distractor_set
-    elif level == 'B2':
-        target_minimum = esp_constants.b2_target_set
-        distractor_minimum = esp_constants.b2_distractor_set
-    elif level == 'C1':
-        target_minimum = esp_constants.c1_target_set
-        distractor_minimum = esp_constants.c1_distractor_set
-    elif level == 'C2':
-        target_minimum = esp_constants.c2_target_set
-        distractor_minimum = esp_constants.c2_distractor_set
-    elif level == 'Без уровня':
-        target_minimum = None
-        distractor_minimum = None
+    if level:
+        target_minimum, distractor_minimum = MINIMUM_SETS[level]
     else:
         target_minimum = None
         distractor_minimum = None
@@ -137,9 +112,11 @@ def main_workflow(
     # Define which model is used for distractor generation
     logs.update(label='Загружаем языковые модели и другие данные', state='running')
     if model_name == 'Модель-1':
-        mask_filler = load_w2v(w2v_model_1_path)
+        mask_filler = load_w2v('model1')
+        pos_dict, scaler, classifier = load_classifiers('model1')
     else:
-        mask_filler = load_w2v(w2v_model_2_path)
+        mask_filler = load_w2v('model2')
+        pos_dict, scaler, classifier = load_classifiers('model1')

     # Start generation process
     workflow = [SENTENCE(original=sent.strip(), n_sentence=num, max_num_distractors=num_distractors)
@@ -197,6 +174,9 @@ def main_workflow(

     for sentence in workflow:
         sentence.attach_distractors_to_target_word(model=mask_filler,
+                                                   scaler=scaler,
+                                                   classifier=classifier,
+                                                   pos_dict=pos_dict,
                                                    global_distractors=GLOBAL_DISTRACTORS,
                                                    distractor_minimum=distractor_minimum,
                                                    level_name=level,
@@ -235,8 +215,8 @@ def main_workflow(
         NUMBER_TASKS = 10
     else:
         NUMBER_TASKS = len(RESULT_TASKS)
-    RESULT_TASKS_in_summary = filter(lambda task: task.in_summary, RESULT_TASKS)
-    RESULT_TASTS_not_in_summary = filter(lambda task: not task.in_summary, RESULT_TASKS)
+    RESULT_TASKS_in_summary = list(filter(lambda task: task.in_summary, RESULT_TASKS))
+    RESULT_TASTS_not_in_summary = list(filter(lambda task: not task.in_summary, RESULT_TASKS))
     if len(RESULT_TASKS_in_summary) >= NUMBER_TASKS:
         RESULT_TASKS = RESULT_TASKS_in_summary
     else:
utilities_language_w2v/esp_sentence_w2v.py CHANGED
@@ -1,13 +1,9 @@
 import string
-from random import random
-from random import sample
-from utilities_language_general.esp_constants import nlp
+from random import random, sample
 from utilities_language_general.morphology import inflect
-from utilities_language_general.esp_utils import check_token
-from utilities_language_general.esp_constants import PHRASES
-from utilities_language_general.esp_utils import fix_irregular_lemma
-from utilities_language_general.esp_constants import BAD_USER_TARGET_WORDS
-from utilities_language_general.esp_utils import get_distractors_from_model
+from utilities_language_general.esp_constants import nlp, PHRASES, BAD_USER_TARGET_WORDS
+from utilities_language_general.esp_utils import check_token, fix_irregular_lemma, get_distractors_from_model
+


 class SENTENCE:
@@ -46,6 +42,7 @@ class SENTENCE:
             if not previous_was_phrase:
                 self.sentence_phrases.append(self.sentence_lemma_pos[i][1])
                 previous_was_phrase = False
+        self.sentence_phrases.append(self.sentence_lemma_pos[-1][1])

     def search_target_words_automatically(self, model, target_minimum: set, frequency_dict: dict = None, summary:list=None):
         for token in self.sentence_phrases:
@@ -152,18 +149,15 @@ class SENTENCE:
         self.search_user_target_words(model=model, user_target_words=user_target_words,
                                       frequency_dict=frequency_dict, summary=summary)

-    def attach_distractors_to_target_word(self, model, global_distractors, distractor_minimum, level_name,
-                                          max_frequency, logs, progress):
+    def attach_distractors_to_target_word(self, model, scaler, classifier, pos_dict, global_distractors, distractor_minimum, level_name, max_frequency, logs, progress):
         n_target_words = len(self.target_words)
         bad_target_words = []
         for i, target_word in enumerate(self.target_words):
-            distractors = get_distractors_from_model(model, lemma=target_word['lemma'], pos=target_word['pos'],
-                                                     gender=target_word['gender'], level_name=level_name,
-                                                     global_distractors=global_distractors,
-                                                     distractor_minimum=distractor_minimum,
-                                                     max_num_distractors=self.max_num_distractors)
+            distractors = get_distractors_from_model(doc=self.parsed, model=model, scaler=scaler, classifier=classifier, pos_dict=pos_dict,
+                                                     target_text=target_word['original_text'], lemma=target_word['lemma'], pos=target_word['pos'], gender=target_word['gender'],
+                                                     lemma_index=target_word['position_in_sentence'], global_distractors=global_distractors,
+                                                     distractor_minimum=distractor_minimum, level_name=level_name, max_num_distractors=self.max_num_distractors)
             if distractors is None or target_word['frequency_in_text'] > max_frequency:
-                target_word['distractors'] = distractors
                 bad_target_words.append(target_word)
             target_word['distractors'] = distractors
             target_word['distractors_number'] = len(distractors) if distractors is not None else 0