togokah commited on
Commit
9efc4ef
1 Parent(s): 015d17f

Prepare for the experiment and add morphology to BERT

Browse files
pages/2_👨‍🏫_Начало_работы.py CHANGED
@@ -128,13 +128,13 @@ if st.session_state.get('-LOGGED_IN_BOOL-'):
128
  key='-TARGET_WORDS_MODE-', horizontal=True)
129
  DISTRACTOR_MODEL = DISTRACTOR_MODEL_COL.radio(
130
  label='**Модель для выбора неправильных вариантов**',
131
- options=['Худ. лит-ра', 'Новости', 'Все вместе', 'BERT'],
132
  key='-DISTRACTOR_MODEL_MODE-', horizontal=True)
133
  CEFR_NUM_DISTRACTORS_COL, UTW_COL = FORM.columns([2, 2])
134
  with CEFR_NUM_DISTRACTORS_COL:
135
  CEFR_TEXT_LEVEL = custom_select_box(
136
  'Укажите уровень по CEFR:',
137
- ['A1', 'A2', 'B1', 'B2', 'C1', 'C2', 'Нет'],
138
  no_selection_label='-Выберите языковой уровень-')
139
  st.session_state['-CEFR_TEXT_LEVEL-'] = CEFR_TEXT_LEVEL
140
  NUMBER_DISTRACTORS = CEFR_NUM_DISTRACTORS_COL.number_input(
@@ -186,7 +186,7 @@ if st.session_state.get('-LOGGED_IN_BOOL-'):
186
  PROGRESS_BAR_S = st.progress(0)
187
 
188
  # Start generation process. Everything happens inside main_workflow func
189
- if DISTRACTOR_MODEL == 'BERT':
190
  from utilities_language_bert.esp_main_workflow_bert import main_workflow
191
  __TASK_DATA__ = main_workflow(
192
  file=UPLOAD_FILE,
 
128
  key='-TARGET_WORDS_MODE-', horizontal=True)
129
  DISTRACTOR_MODEL = DISTRACTOR_MODEL_COL.radio(
130
  label='**Модель для выбора неправильных вариантов**',
131
+ options=['Модель-1', 'Модель-2'],
132
  key='-DISTRACTOR_MODEL_MODE-', horizontal=True)
133
  CEFR_NUM_DISTRACTORS_COL, UTW_COL = FORM.columns([2, 2])
134
  with CEFR_NUM_DISTRACTORS_COL:
135
  CEFR_TEXT_LEVEL = custom_select_box(
136
  'Укажите уровень по CEFR:',
137
+ ['Без уровня', 'A1', 'A2', 'B1', 'B2', 'C1', 'C2'],
138
  no_selection_label='-Выберите языковой уровень-')
139
  st.session_state['-CEFR_TEXT_LEVEL-'] = CEFR_TEXT_LEVEL
140
  NUMBER_DISTRACTORS = CEFR_NUM_DISTRACTORS_COL.number_input(
 
186
  PROGRESS_BAR_S = st.progress(0)
187
 
188
  # Start generation process. Everything happens inside main_workflow func
189
+ if DISTRACTOR_MODEL == 'Модель-2':
190
  from utilities_language_bert.esp_main_workflow_bert import main_workflow
191
  __TASK_DATA__ = main_workflow(
192
  file=UPLOAD_FILE,
utilities_language_bert/esp_main_workflow_bert.py CHANGED
@@ -122,7 +122,7 @@ def main_workflow(
122
  elif level == 'C2':
123
  target_minimum = esp_constants.c2_target_set
124
  distractor_minimum = esp_constants.c2_distractor_set
125
- elif level == 'Нет':
126
  target_minimum = None
127
  distractor_minimum = None
128
  else:
@@ -150,8 +150,8 @@ def main_workflow(
150
  target_minimum=target_minimum,
151
  user_target_words=USER_TARGET_WORDS,
152
  frequency_dict=FREQ_DICT)
153
- progress.progress(int(30 + (j * (30 / len(workflow)))))
154
- progress_s.progress(60)
155
  DUPLICATE_TARGET_WORDS = defaultdict(list)
156
  for sentence in workflow:
157
  for target_word in sentence.target_words:
@@ -164,7 +164,7 @@ def main_workflow(
164
  if target_word not in RESULT_TW:
165
  global_bad_target_words.append(target_word['original_text'])
166
  sentence.target_words.remove(target_word)
167
- progress_s.progress(65)
168
  logs.success('Выбрали слова-пропуски!')
169
 
170
  for sentence in workflow:
@@ -176,7 +176,7 @@ def main_workflow(
176
 
177
  for sentence in workflow:
178
  sentence.filter_target_words(target_words_automatic_mode=tw_mode_automatic_mode)
179
- progress_s.progress(65)
180
 
181
  RESULT_TASKS = []
182
  for sentence in workflow:
@@ -193,9 +193,14 @@ def main_workflow(
193
  f'Обработали {num}/{len(RESULT_TASKS)} целевых слов!')
194
  logs_d.success(
195
  f'Обработали {len(RESULT_TASKS)}/{len(RESULT_TASKS)} целевых слов!')
 
 
196
 
 
 
197
  progress_s.progress(70)
198
- logs.success('Подобрали неправильные варианты!')
 
199
  for task in RESULT_TASKS:
200
  task.sample_distractors(num_distractors=num_distractors)
201
  progress_s.progress(75)
 
122
  elif level == 'C2':
123
  target_minimum = esp_constants.c2_target_set
124
  distractor_minimum = esp_constants.c2_distractor_set
125
+ elif level == 'Без уровня':
126
  target_minimum = None
127
  distractor_minimum = None
128
  else:
 
150
  target_minimum=target_minimum,
151
  user_target_words=USER_TARGET_WORDS,
152
  frequency_dict=FREQ_DICT)
153
+ progress.progress(int(30 + (j * (20 / len(workflow)))))
154
+ progress_s.progress(50)
155
  DUPLICATE_TARGET_WORDS = defaultdict(list)
156
  for sentence in workflow:
157
  for target_word in sentence.target_words:
 
164
  if target_word not in RESULT_TW:
165
  global_bad_target_words.append(target_word['original_text'])
166
  sentence.target_words.remove(target_word)
167
+ progress_s.progress(55)
168
  logs.success('Выбрали слова-пропуски!')
169
 
170
  for sentence in workflow:
 
176
 
177
  for sentence in workflow:
178
  sentence.filter_target_words(target_words_automatic_mode=tw_mode_automatic_mode)
179
+ progress_s.progress(60)
180
 
181
  RESULT_TASKS = []
182
  for sentence in workflow:
 
193
  f'Обработали {num}/{len(RESULT_TASKS)} целевых слов!')
194
  logs_d.success(
195
  f'Обработали {len(RESULT_TASKS)}/{len(RESULT_TASKS)} целевых слов!')
196
+ progress_s.progress(65)
197
+ logs.success('Подобрали неправильные варианты!')
198
 
199
+ for task in RESULT_TASKS:
200
+ task.inflect_distractors()
201
  progress_s.progress(70)
202
+ logs.success('Просклоняли и проспрягали неправильные варианты!')
203
+
204
  for task in RESULT_TASKS:
205
  task.sample_distractors(num_distractors=num_distractors)
206
  progress_s.progress(75)
utilities_language_bert/esp_sentence_bert.py CHANGED
@@ -2,6 +2,7 @@ import string
2
  from random import random
3
  from random import sample
4
  from utilities_language_general.esp_constants import nlp
 
5
  from utilities_language_general.esp_constants import PHRASES
6
  from utilities_language_general.esp_utils import check_token_bert
7
  from utilities_language_general.esp_utils import fix_irregular_lemma
@@ -169,7 +170,9 @@ class TASK:
169
  self.distractors = None
170
  self.distractors_number = 0
171
  self.bad_target_word = False
 
172
  self.pos = task_data['pos']
 
173
  self.lemma = task_data['lemma']
174
  self.gender = task_data['gender']
175
  self.max_num_distractors = max_num_distractors
@@ -208,20 +211,40 @@ class TASK:
208
  self.distractors = [d[0] for i, d in enumerate(distractors_sentence) if i < 15]
209
  self.distractors_number = len(distractors_sentence) if distractors_sentence is not None else 0
210
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
211
  def sample_distractors(self, num_distractors):
212
  if not self.bad_target_word:
213
  num_distractors = min(self.distractors_number, num_distractors) if num_distractors >= 4 else num_distractors
214
  self.distractors = sample(self.distractors[:min(self.distractors_number, 10)], num_distractors)
215
 
216
  def compile_task(self, max_num_distractors):
217
- len_distractors = len(self.distractors)
218
  len_variants = min(len_distractors, max_num_distractors) if max_num_distractors > 4 \
219
  else max_num_distractors
220
  letters = (f'({letter})' for letter in string.ascii_lowercase[:len_variants + 1])
221
  try:
222
- distractors = sample(self.distractors, len_variants) + [self.original_text, ]
223
  except ValueError:
224
- distractors = self.distractors + [self.original_text, ]
225
  tmp_vars = [f'{item[0]} {item[1].replace("_", " ")}'
226
  for item in zip(letters, sorted(distractors, key=lambda _: random()))]
227
  self.variants.append((self.original_text, tmp_vars))
 
2
  from random import random
3
  from random import sample
4
  from utilities_language_general.esp_constants import nlp
5
+ from utilities_language_general.morphology import inflect
6
  from utilities_language_general.esp_constants import PHRASES
7
  from utilities_language_general.esp_utils import check_token_bert
8
  from utilities_language_general.esp_utils import fix_irregular_lemma
 
170
  self.distractors = None
171
  self.distractors_number = 0
172
  self.bad_target_word = False
173
+ self.inflected_distractors = None
174
  self.pos = task_data['pos']
175
+ self.tags = task_data['tags']
176
  self.lemma = task_data['lemma']
177
  self.gender = task_data['gender']
178
  self.max_num_distractors = max_num_distractors
 
211
  self.distractors = [d[0] for i, d in enumerate(distractors_sentence) if i < 15]
212
  self.distractors_number = len(distractors_sentence) if distractors_sentence is not None else 0
213
 
214
+ def inflect_distractors(self):
215
+ inflected_distractors = []
216
+ for distractor_lemma, distractor_similarity in self.distractors:
217
+ if distractor_lemma.count('_') > 1:
218
+ if distractor_lemma.startswith('haber_'):
219
+ distractor_lemma = distractor_lemma.split('_')[-2]
220
+ inflected = inflect(lemma=distractor_lemma, target_pos=self.pos, target_tags=self.tags)
221
+ else:
222
+ continue
223
+ else:
224
+ inflected = inflect(lemma=distractor_lemma, target_pos=self.pos, target_tags=self.tags)
225
+ if inflected is not None:
226
+ inflected_distractors.append(inflected)
227
+ num_distractors = min(4, self.max_num_distractors) if self.max_num_distractors >= 4 \
228
+ else self.max_num_distractors
229
+ if len(inflected_distractors) < num_distractors:
230
+ self.bad_target_word = True
231
+ else:
232
+ self.inflected_distractors = inflected_distractors
233
+
234
  def sample_distractors(self, num_distractors):
235
  if not self.bad_target_word:
236
  num_distractors = min(self.distractors_number, num_distractors) if num_distractors >= 4 else num_distractors
237
  self.distractors = sample(self.distractors[:min(self.distractors_number, 10)], num_distractors)
238
 
239
  def compile_task(self, max_num_distractors):
240
+ len_distractors = len(self.inflected_distractors)
241
  len_variants = min(len_distractors, max_num_distractors) if max_num_distractors > 4 \
242
  else max_num_distractors
243
  letters = (f'({letter})' for letter in string.ascii_lowercase[:len_variants + 1])
244
  try:
245
+ distractors = sample(self.inflected_distractors, len_variants) + [self.original_text, ]
246
  except ValueError:
247
+ distractors = self.inflected_distractors + [self.original_text, ]
248
  tmp_vars = [f'{item[0]} {item[1].replace("_", " ")}'
249
  for item in zip(letters, sorted(distractors, key=lambda _: random()))]
250
  self.variants.append((self.original_text, tmp_vars))
utilities_language_general/esp_constants.py CHANGED
@@ -23,9 +23,7 @@ def load_bert():
23
 
24
 
25
  nlp = load_spacy()
26
- news_model_path = r'news_phrases_s300_cw10_mc50_w4_negative5-075_mean_e10_notshr.bin.gz'
27
- all_model_path = r'ALL_annot_all_pos_spell_g_h_new_phrases_s300_cw10_mc100_w4_negative5-075_mean_e10_shr.bin.gz'
28
- lit_model_path = r'LITERATURA_annot_all_pos_spell_g_h_phrases_s300_cw10_mc50_w4_negative_5-075_mean_e20_shr.bin.gz'
29
 
30
  # Upload minimums
31
  a1_path, a1_target_set = r'lexical_minimums/A1_MINIMUM.txt', set()
 
23
 
24
 
25
  nlp = load_spacy()
26
+ w2v_model_path = r'ALL_annot_all_pos_spell_g_h_new_phrases_s300_cw10_mc100_w4_negative5-075_mean_e10_shr.bin.gz'
 
 
27
 
28
  # Upload minimums
29
  a1_path, a1_target_set = r'lexical_minimums/A1_MINIMUM.txt', set()
utilities_language_general/esp_utils.py CHANGED
@@ -190,23 +190,22 @@ def get_distractors_from_model_bert(model, text_with_masked_task: str, lemma: st
190
  targets=list(distractor_minimum))]
191
  else:
192
  bert_candidates = [token for token in model(text_with_masked_task, top_k=max_num_distractors + 100)]
193
- inflected_candidates = []
194
  for candidate in bert_candidates:
195
  if isinstance(candidate, list):
196
  bert_candidates = candidate
197
  continue
198
  if candidate['token_str'].isalpha():
199
  candidate_morph = nlp(candidate['token_str'])[0]
200
- inflected_candidates.append((f"{candidate_morph.lemma_}_{candidate_morph.text}_{candidate_morph.pos_}",
201
- candidate['score']))
202
  except KeyError:
203
  return None
204
- for candidate_distractor in inflected_candidates:
205
  if '_' in candidate_distractor[0]:
206
- distractor_lemma, distractor_text, distractor_pos, = candidate_distractor[0].split('_')
207
  else:
208
- distractor_lemma, distractor_text, distractor_pos = \
209
- nlp(candidate_distractor[0])[0].lemma_, candidate_distractor[0], nlp(candidate_distractor[0])[0].pos_
210
  distractor_similarity = candidate_distractor[1]
211
  candidate_gender = get_tags(distractor_lemma).get('Gender')
212
  length_ratio = abs(len(lemma) - len(distractor_lemma))
@@ -222,10 +221,10 @@ def get_distractors_from_model_bert(model, text_with_masked_task: str, lemma: st
222
  / ((len(lemma) + len(distractor_lemma)) / 2) > min_edit_distance_ratio)):
223
  if distractor_minimum is not None:
224
  if distractor_lemma in distractor_minimum:
225
- _distractors.append((distractor_text, candidate_distractor[1]))
226
  global_distractors.add(distractor_lemma)
227
  else:
228
- _distractors.append((distractor_text, candidate_distractor[1]))
229
  num_distractors = min(4, max_num_distractors) if max_num_distractors >= 4 else max_num_distractors
230
  if len(_distractors) < num_distractors:
231
  return None
 
190
  targets=list(distractor_minimum))]
191
  else:
192
  bert_candidates = [token for token in model(text_with_masked_task, top_k=max_num_distractors + 100)]
193
+ candidates = []
194
  for candidate in bert_candidates:
195
  if isinstance(candidate, list):
196
  bert_candidates = candidate
197
  continue
198
  if candidate['token_str'].isalpha():
199
  candidate_morph = nlp(candidate['token_str'])[0]
200
+ candidates.append((f"{candidate_morph.lemma_}_{candidate_morph.pos_}", candidate['score']))
 
201
  except KeyError:
202
  return None
203
+ for candidate_distractor in candidates:
204
  if '_' in candidate_distractor[0]:
205
+ distractor_lemma, distractor_pos = candidate_distractor[0].split('_')
206
  else:
207
+ candidate_morph = nlp(candidate_distractor[0])[0]
208
+ distractor_lemma, distractor_pos = candidate_morph.lemma_, candidate_morph.pos_
209
  distractor_similarity = candidate_distractor[1]
210
  candidate_gender = get_tags(distractor_lemma).get('Gender')
211
  length_ratio = abs(len(lemma) - len(distractor_lemma))
 
221
  / ((len(lemma) + len(distractor_lemma)) / 2) > min_edit_distance_ratio)):
222
  if distractor_minimum is not None:
223
  if distractor_lemma in distractor_minimum:
224
+ _distractors.append((distractor_lemma, candidate_distractor[1]))
225
  global_distractors.add(distractor_lemma)
226
  else:
227
+ _distractors.append((distractor_lemma, candidate_distractor[1]))
228
  num_distractors = min(4, max_num_distractors) if max_num_distractors >= 4 else max_num_distractors
229
  if len(_distractors) < num_distractors:
230
  return None
utilities_language_w2v/esp_main_workflow_w2v.py CHANGED
@@ -11,9 +11,7 @@ from utilities_language_general.esp_constants import load_w2v
11
  from utilities_language_general.esp_utils import prepare_tasks
12
  from streamlit.runtime.uploaded_file_manager import UploadedFile
13
  import utilities_language_general.esp_constants as esp_constants
14
- from utilities_language_general.esp_constants import all_model_path
15
- from utilities_language_general.esp_constants import lit_model_path
16
- from utilities_language_general.esp_constants import news_model_path
17
  from utilities_language_general.esp_utils import prepare_target_words
18
  from utilities_language_general.esp_utils import compute_frequency_dict
19
  from utilities_language_general.esp_constants import BAD_USER_TARGET_WORDS
@@ -65,12 +63,7 @@ def main_workflow(
65
  MAX_FREQUENCY = 0
66
 
67
  # Define which model is used for distractor generation
68
- if distractor_model == 'Худ. лит-ра':
69
- mask_filler = load_w2v(lit_model_path)
70
- elif distractor_model == 'Новости':
71
- mask_filler = load_w2v(news_model_path)
72
- else:
73
- mask_filler = load_w2v(all_model_path)
74
 
75
  # Get input text
76
  if file is not None:
@@ -136,7 +129,7 @@ def main_workflow(
136
  elif level == 'C2':
137
  target_minimum = esp_constants.c2_target_set
138
  distractor_minimum = esp_constants.c2_distractor_set
139
- elif level == 'Нет':
140
  target_minimum = None
141
  distractor_minimum = None
142
  else:
 
11
  from utilities_language_general.esp_utils import prepare_tasks
12
  from streamlit.runtime.uploaded_file_manager import UploadedFile
13
  import utilities_language_general.esp_constants as esp_constants
14
+ from utilities_language_general.esp_constants import w2v_model_path
 
 
15
  from utilities_language_general.esp_utils import prepare_target_words
16
  from utilities_language_general.esp_utils import compute_frequency_dict
17
  from utilities_language_general.esp_constants import BAD_USER_TARGET_WORDS
 
63
  MAX_FREQUENCY = 0
64
 
65
  # Define which model is used for distractor generation
66
+ mask_filler = load_w2v(w2v_model_path)
 
 
 
 
 
67
 
68
  # Get input text
69
  if file is not None:
 
129
  elif level == 'C2':
130
  target_minimum = esp_constants.c2_target_set
131
  distractor_minimum = esp_constants.c2_distractor_set
132
+ elif level == 'Без уровня':
133
  target_minimum = None
134
  distractor_minimum = None
135
  else: