a-v-bely committed
Commit 686cd54 · Parent: cd5e287

Update app
pages/2_👨‍🏫_Начало_работы.py CHANGED
@@ -4,7 +4,6 @@ from utilities_database.user_database_utils import load_user_tasks_data
4
  from utilities_database.user_database_utils import save_data_in_database
5
  from utilities_database.user_database_widgets import user_save_text_table
6
  from utilities_database.user_database_utils import load_users_particular_task
7
- from streamlit_extras.no_default_selectbox import selectbox as custom_select_box
8
 
9
  # Interface
10
  if st.session_state.get('-LOGGED_IN_BOOL-'):
@@ -15,52 +14,38 @@ if st.session_state.get('-LOGGED_IN_BOOL-'):
15
  '\n\n**_:red[СОЗДАНИЕ ЗАДАНИЙ]_**'
16
  '\n\nПосле выбора данного режима работы появится форма, которую необходимо заполнить:'
17
  '\n\n1. Придумайте **название** для файла с заданиями. '
18
- 'Вы можете оставить это поле пустым - именем по умолчанию служит текущая дата и первые 20 символов '
19
- 'введенного Вами текста.'
20
- '\n\n2. Введите **текст** или выберите **текстовый файл** с исходным текстом, на основе которого Вы хотите '
21
- 'создать задания. '
22
  '\n\n3. Укажите *способ выбора целевых слов*:'
23
  '\n\t* *:green[Автоматически]*: программа сама выберет подходящие по сложности целевые слова.'
24
- '\n\t* *:blue[Самостоятельно]*: введите в соответствующее поле целевые слова через запятую в той форме, '
25
- 'в которой они встречаются в тексте. В этом случае *:orange[языковой уровень]* можно не указывать, но тогда '
26
- 'дистракторы будут полностью случайными и несоотнесёнными с уровнем.'
27
- '\n4. Если Вы выбрали *:green[автоматический поиск целевых слов]*, **_:red[обязательно]_** укажите '
28
- '*:orange[языковой уровень]*. Данный параметр отвечает за выбор лексического минимума, использующегося при '
29
- 'подборе дистракторов.'
30
- '\n5. Если Вы выбрали *:blue[самостоятельный ввод целевых слов]*, проверьте, что заполнили соответствующее '
31
- 'поле. ️ ❗ **:red[Введите слова в той форме, в которой они встречаются в тексте]**.'
32
- '\n6. Укажите число дистракторов - неправильных вариантов ответа. Если указано _более четырех_ '
33
- 'дистракторов, возможно, что в некоторых заданиях будет выведено _меньшее количество, но не менее четырех_ '
34
- 'вариантов. Данное обстоятельство связано с проверкой наличия дистракторов в лексических минимумах.'
35
  '\n7. Выберите **способы вывода** готовых материалов.'
36
- '\n8. Для начала работы нажмите на кнопку **"Запуск"**. Если все поля заполнены верно, '
37
- 'начнется процесс генерации заданий. Прогресс будет отображаться на экране.'
38
  '\n9. По окончании процесса генерации заданий будет выведено **_:green[соответствующее сообщение]_**. '
39
  'Затем Вы можете перейти на вкладки **просмотра и 📥 сохранения** заданий, а также 📝**онлайн-теста**.'
40
  '\n\n**_:red[ЗАГРУЗКА ИЗ АРХИВА]_**'
41
- '\n\nПосле выбора данного режима работы появится таблица, в которой перечислены названия заданий, '
42
- 'которые Вы сохранили, языковой уровень и дата их создания.'
43
  ' Для загрузки определенного файла с заданиями:'
44
  '\n1. Введите (или скопируйте из таблицы) название.'
45
  '\n2. Укажите соответствующий языковой уровень.'
46
  '\n3. Нажмите на кнопку **"Загрузить"**.'
47
  '\n4. Если все поля заполнены верно, Вы увидите сообщение о том, что **:green[задания успешно загружены]**.'
48
- '\n\n\nДля того, чтобы свернуть/развернуть блоки **Инструкций** или **Важной информации**, '
49
- 'кликните по заголовку этого блока или по стрелке (ᐯ / ᐱ), располагающейся в его правом верхнем углу.')
50
- ANNOUNCES = st.expander('**ВАЖНАЯ ИНФОРМАЦИЯ**', expanded=False)
51
- ANNOUNCES.success(
52
- '**Уважаемые пользователи, пожалуйста, после генерации заданий перейдите на вкладку "📝Онлайн-тест" '
53
- 'и заполните там опросник. Таким образом Вы очень поможете в улучшении качества заданий! Спасибо!🤗**')
54
- ANNOUNCES.warning(
55
- '**Сейчас генератор проходит завершающую настройку и отладку, для которой необходимо большое количество '
56
- 'данных об уместности выбранных целевых слов и дистракторов к ним. Поэтому просим Вас отнестись с пониманием '
57
- 'к излишне большому количеству заданий.**')
58
- ANNOUNCES.warning(
59
- '**❗️ㅤУбедительно просим Вас дожидаться окончания генерации или загрузки и не переходить на '
60
- 'другие вкладки до выведения соответствующего сообщения.**')
61
- ANNOUNCES.warning(
62
- '**❗ㅤВ случае появления красных сообщений об ошибке, как правило, проблема решается '
63
- 'повторными нажатиями на нужный Вам элемент. Приносим извинения за неудобства.**')
64
  WHAT_TO_DO = st.radio(
65
  label='**Выберите режим работы**',
66
  options=[
@@ -79,10 +64,11 @@ if st.session_state.get('-LOGGED_IN_BOOL-'):
79
  COL1, COL2 = LOAD_FORM.columns([1, 1])
80
  UPLOAD_CLOUD_FILE_NAME = COL1.text_input('Введите название заданий', placeholder='Жду название')
81
  with COL2:
82
- UPLOAD_CLOUD_CEFR_LEVEL = custom_select_box(
83
- 'Выберите языковой уровень',
84
- ['A1', 'A2', 'B1', 'B2', 'Без уровня'],
85
- no_selection_label='-Выберите языковой уровень-')
 
86
  st.session_state['-UPLOAD_CLOUD_CEFR_LEVEL-'] = UPLOAD_CLOUD_CEFR_LEVEL
87
  LOAD_BUTTON = LOAD_FORM.form_submit_button('Загрузить')
88
  if LOAD_BUTTON:
@@ -132,22 +118,23 @@ if st.session_state.get('-LOGGED_IN_BOOL-'):
132
  key='-DISTRACTOR_MODEL_MODE-', horizontal=True)
133
  CEFR_NUM_DISTRACTORS_COL, UTW_COL = FORM.columns([2, 2])
134
  with CEFR_NUM_DISTRACTORS_COL:
135
- CEFR_TEXT_LEVEL = custom_select_box(
136
- 'Укажите уровень по CEFR:',
137
- ['Без уровня', 'A1', 'A2', 'B1', 'B2'],
138
- no_selection_label='-Выберите языковой уровень-')
 
139
  st.session_state['-CEFR_TEXT_LEVEL-'] = CEFR_TEXT_LEVEL
140
  NUMBER_DISTRACTORS = CEFR_NUM_DISTRACTORS_COL.number_input(
141
  label='**Выберите количество дистракторов в задании:**',
142
  min_value=1,
143
  max_value=9,
144
- value=2,
145
  key='-NUM_DISTRACTORS-')
146
  TARGET_WORDS = UTW_COL.text_area(
147
  label='**Если "Самостоятельно", введите целевые слова:**',
148
  value='',
149
  height=120,
150
- placeholder='Через запятую и пробел',
151
  key='-INPUT_TARGET_WORDS-')
152
  FORM.markdown('**Выберите формат(-ы) вывода:**')
153
  col1, col2, col3 = FORM.columns(3)
@@ -166,47 +153,44 @@ if st.session_state.get('-LOGGED_IN_BOOL-'):
166
 
167
  START_COL, RERUN_COL, EXIT_COL = FORM.columns([1, 1, 1])
168
  START_BUTTON = START_COL.form_submit_button(
169
- label='**Запуск**')
 
170
  RERUN_BUTTON = RERUN_COL.form_submit_button(
171
- label='**Перезагрузка**')
 
172
  EXIT_BUTTON = EXIT_COL.form_submit_button(
173
- label='**Выход**')
 
174
 
175
  if START_BUTTON:
176
- LOGS = st.empty()
177
- LOGS.success('**Готовые задания или онлайн-тест появятся на отдельных страницах**')
178
-
179
  # Initiate interface structure
180
- PROGRESS_BAR = st.progress(0)
181
- LOGS_D = st.empty()
182
- PROGRESS_BAR_S = st.progress(0)
 
183
 
184
  # Start generation process. Everything happens inside main_workflow func
185
  if DISTRACTOR_MODEL == 'Модель-3':
186
- from utilities_language_bert.rus_main_workflow_bert import main_workflow_bert
187
- __TASK_DATA__ = main_workflow_bert(
188
  file=UPLOAD_FILE,
189
  text=UPLOAD_TEXT,
190
  logs=LOGS,
191
- logs_d=LOGS_D,
192
  progress=PROGRESS_BAR,
193
- progress_s=PROGRESS_BAR_S,
194
  level=CEFR_TEXT_LEVEL,
195
  tw_mode_automatic_mode=TARGET_WORDS_MODE,
196
  target_words=TARGET_WORDS,
197
  num_distractors=NUMBER_DISTRACTORS,
198
  save_name=USER__SAVE_IN_CLOUD_FILE_NAME)
199
  else:
200
- PROGRESS_BAR_D = st.progress(0)
201
- from utilities_language_w2v.rus_main_workflow_w2v import main_workflow_w2v
202
- __TASK_DATA__ = main_workflow_w2v(
203
  file=UPLOAD_FILE,
204
  text=UPLOAD_TEXT,
205
  logs=LOGS,
206
- logs_d=LOGS_D,
207
  progress=PROGRESS_BAR,
208
- progress_d=PROGRESS_BAR_D,
209
- progress_s=PROGRESS_BAR_S,
210
  level=CEFR_TEXT_LEVEL,
211
  tw_mode_automatic_mode=TARGET_WORDS_MODE,
212
  target_words=TARGET_WORDS,
@@ -225,28 +209,18 @@ if st.session_state.get('-LOGGED_IN_BOOL-'):
225
  st.session_state['-UPLOAD_CLOUD_FILE_NAME-'] = USER__SAVE_IN_CLOUD_FILE_NAME
226
 
227
  PROGRESS_BAR.progress(100)
228
- PROGRESS_BAR_S.progress(100)
229
- if SAVE_IN_CLOUD:
230
- save_data_in_database(
231
- user_task_database=user_save_text_table,
232
- save_type='download',
233
- save_name=USER__SAVE_IN_CLOUD_FILE_NAME,
234
- cefr_level=CEFR_TEXT_LEVEL,
235
- created_at=str(datetime.datetime.now())[:-7],
236
- creator_name=st.session_state.get('-USER_NAME-'),
237
- generated_result=__TASK_DATA__,
238
- distractor_model=DISTRACTOR_MODEL)
239
- LOGS.success('**Все готово! Сохраняем задания в облако! Чтобы скачать задания перейдите в '
240
- 'соответсвующую вкладку**')
241
- else:
242
- LOGS.success('**Все готово! Готовые задания и/или онлайн-тест доступны в соответствующих вкладках.**')
243
-
244
- # if TARGET_WORDS_MODE == 'Самостоятельно':
245
- # st.error('По разным причинам не смогли не смогли придумать задания со словами: ' +
246
- # ', '.join(__TASK_DATA__['BAD_USER_TARGET_WORDS']))
247
- PROGRESS_BAR = ''
248
- PROGRESS_BAR_S = ''
249
- LOGS_D = ''
250
 
251
  if EXIT_BUTTON:
252
  for key in st.session_state:
@@ -260,12 +234,6 @@ if st.session_state.get('-LOGGED_IN_BOOL-'):
260
  st.error('Что-то пошло не так?! Перезагружаюсь!')
261
  st.session_state["START_GENERATION"] = False
262
  st.stop()
263
- st.experimental_rerun()
264
-
265
- # LABEL
266
- # st.markdown('*Автор-разработчик: А.В.Белый, кафедра математической лингвистики, филологический факультет СПбГУ,'
267
- # ' 4 курс, бакалавриат, "Прикладная, компьютерная и математическая лингвистика (английский язык)"*'
268
- # '\n\n*Научный руководитель: канд. филол. наук, доц. О.А.Митрофанова*')
269
- # st.markdown('*E-mail: st087202@student.spbu.ru*')
270
  else:
271
  st.warning('**Войдите или зарегистрируйтесь**')
 
4
  from utilities_database.user_database_utils import save_data_in_database
5
  from utilities_database.user_database_widgets import user_save_text_table
6
  from utilities_database.user_database_utils import load_users_particular_task
 
7
 
8
  # Interface
9
  if st.session_state.get('-LOGGED_IN_BOOL-'):
 
14
  '\n\n**_:red[СОЗДАНИЕ ЗАДАНИЙ]_**'
15
  '\n\nПосле выбора данного режима работы появится форма, которую необходимо заполнить:'
16
  '\n\n1. Придумайте **название** для файла с заданиями. '
17
+ 'Вы можете оставить это поле пустым - именем по умолчанию служит текущая дата и первые 20 символов'
18
+ ' введенного Вами текста.'
19
+ '\n\n2. Введите **текст** или выберите **текстовый файл** с исходным текстом, на основе которого Вы хотите'
20
+ ' создать задания. '
21
  '\n\n3. Укажите *способ выбора целевых слов*:'
22
  '\n\t* *:green[Автоматически]*: программа сама выберет подходящие по сложности целевые слова.'
23
+ '\n\t* *:blue[Самостоятельно]*: введите в соответствующее поле целевые слова через запятую в той форме,'
24
+ ' в которой они встречаются в тексте. В этом случае *:orange[языковой уровень]* можно не указывать, но тогда'
25
+ ' дистракторы будут полностью случайными и несоотнесёнными с уровнем.'
26
+ '\n4. Если Вы выбрали *:green[автоматический поиск целевых слов]*, **_:red[обязательно]_** укажите'
27
+ ' *:orange[языковой уровень]*. Данный параметр отвечает за выбор лексического минимума, использующегося при'
28
+ ' подборе дистракторов.'
29
+ '\n5. Если Вы выбрали *:blue[самостоятельный ввод целевых слов]*, проверьте, что заполнили соответствующее'
30
+ ' поле. ️ ❗ **:red[Введите слова в той форме, в которой они встречаются в тексте]**.'
31
+ '\n6. Укажите число дистракторов - неправильных вариантов ответа. Если указано _более четырех_'
32
+ ' дистракторов, возможно, что в некоторых заданиях будет выведено _меньшее количество, но не менее четырех_'
33
+ ' вариантов. Данное обстоятельство связано с проверкой наличия дистракторов в лексических минимумах.'
34
  '\n7. Выберите **способы вывода** готовых материалов.'
35
+ '\n8. Для начала работы нажмите на кнопку **"Запуск"**. Если все поля заполнены верно,'
36
+ ' начнется процесс генерации заданий. Прогресс будет отображаться на экране.'
37
  '\n9. По окончании процесса генерации заданий будет выведено **_:green[соответствующее сообщение]_**. '
38
  'Затем Вы можете перейти на вкладки **просмотра и 📥 сохранения** заданий, а также 📝**онлайн-теста**.'
39
  '\n\n**_:red[ЗАГРУЗКА ИЗ АРХИВА]_**'
40
+ '\n\nПосле выбора данного режима работы появится таблица, в которой перечислены названия заданий,'
41
+ ' которые Вы сохранили, языковой уровень и дата их создания.'
42
  ' Для загрузки определенного файла с заданиями:'
43
  '\n1. Введите (или скопируйте из таблицы) название.'
44
  '\n2. Укажите соответствующий языковой уровень.'
45
  '\n3. Нажмите на кнопку **"Загрузить"**.'
46
  '\n4. Если все поля заполнены верно, Вы увидите сообщение о том, что **:green[задания успешно загружены]**.'
47
+ '\n\n\nДля того, чтобы свернуть/развернуть блоки **Инструкций** или **Важной информации**,'
48
+ ' кликните по заголовку этого блока или по стрелке (ᐯ / ᐱ), располагающейся в его правом верхнем углу.')

49
  WHAT_TO_DO = st.radio(
50
  label='**Выберите режим работы**',
51
  options=[
 
64
  COL1, COL2 = LOAD_FORM.columns([1, 1])
65
  UPLOAD_CLOUD_FILE_NAME = COL1.text_input('Введите название заданий', placeholder='Жду название')
66
  with COL2:
67
+ UPLOAD_CLOUD_CEFR_LEVEL = st.selectbox(
68
+ label='Выберите языковой уровень',
69
+ options=['A1', 'A2', 'B1', 'B2', 'C1', 'Без уровня'],
70
+ index=None,
71
+ placeholder='-Выберите языковой уровень-')
72
  st.session_state['-UPLOAD_CLOUD_CEFR_LEVEL-'] = UPLOAD_CLOUD_CEFR_LEVEL
73
  LOAD_BUTTON = LOAD_FORM.form_submit_button('Загрузить')
74
  if LOAD_BUTTON:
 
118
  key='-DISTRACTOR_MODEL_MODE-', horizontal=True)
119
  CEFR_NUM_DISTRACTORS_COL, UTW_COL = FORM.columns([2, 2])
120
  with CEFR_NUM_DISTRACTORS_COL:
121
+ CEFR_TEXT_LEVEL = st.selectbox(
122
+ label='Выберите языковой уровень',
123
+ options=['A1', 'A2', 'B1', 'B2', 'Без уровня'],
124
+ index=None,
125
+ placeholder='-Выберите языковой уровень-')
126
  st.session_state['-CEFR_TEXT_LEVEL-'] = CEFR_TEXT_LEVEL
127
  NUMBER_DISTRACTORS = CEFR_NUM_DISTRACTORS_COL.number_input(
128
  label='**Выберите количество дистракторов в задании:**',
129
  min_value=1,
130
  max_value=9,
131
+ value=3,
132
  key='-NUM_DISTRACTORS-')
133
  TARGET_WORDS = UTW_COL.text_area(
134
  label='**Если "Самостоятельно", введите целевые слова:**',
135
  value='',
136
  height=120,
137
+ placeholder='Через запятую',
138
  key='-INPUT_TARGET_WORDS-')
139
  FORM.markdown('**Выберите формат(-ы) вывода:**')
140
  col1, col2, col3 = FORM.columns(3)
 
153
 
154
  START_COL, RERUN_COL, EXIT_COL = FORM.columns([1, 1, 1])
155
  START_BUTTON = START_COL.form_submit_button(
156
+ label='**Запуск**',
157
+ use_container_width=True)
158
  RERUN_BUTTON = RERUN_COL.form_submit_button(
159
+ label='**Перезагрузка**',
160
+ use_container_width=True)
161
  EXIT_BUTTON = EXIT_COL.form_submit_button(
162
+ label='**Выход**',
163
+ use_container_width=True)
164
 
165
  if START_BUTTON:
 
 
 
166
  # Initiate interface structure
167
+ LOGS = st.status(label='Прогресс выполнения', expanded=True)
168
+
169
+ PROGRESS_BAR = LOGS.progress(0)
170
+ PROGRESS_BAR_DISTRACTORS = LOGS.progress(0)
171
 
172
  # Start generation process. Everything happens inside main_workflow func
173
  if DISTRACTOR_MODEL == 'Модель-3':
174
+ from utilities_language_bert.rus_main_workflow_bert import main_workflow
175
+ __TASK_DATA__ = main_workflow(
176
  file=UPLOAD_FILE,
177
  text=UPLOAD_TEXT,
178
  logs=LOGS,
 
179
  progress=PROGRESS_BAR,
180
+ progress_d=PROGRESS_BAR_DISTRACTORS,
181
  level=CEFR_TEXT_LEVEL,
182
  tw_mode_automatic_mode=TARGET_WORDS_MODE,
183
  target_words=TARGET_WORDS,
184
  num_distractors=NUMBER_DISTRACTORS,
185
  save_name=USER__SAVE_IN_CLOUD_FILE_NAME)
186
  else:
187
+ from utilities_language_w2v.rus_main_workflow_w2v import main_workflow
188
+ __TASK_DATA__ = main_workflow(
 
189
  file=UPLOAD_FILE,
190
  text=UPLOAD_TEXT,
191
  logs=LOGS,
 
192
  progress=PROGRESS_BAR,
193
+ progress_d=PROGRESS_BAR_DISTRACTORS,
 
194
  level=CEFR_TEXT_LEVEL,
195
  tw_mode_automatic_mode=TARGET_WORDS_MODE,
196
  target_words=TARGET_WORDS,
 
209
  st.session_state['-UPLOAD_CLOUD_FILE_NAME-'] = USER__SAVE_IN_CLOUD_FILE_NAME
210
 
211
  PROGRESS_BAR.progress(100)
212
+ PROGRESS_BAR_DISTRACTORS.progress(100)
213
+ LOGS.update(label='**Все готово! Готовые задания и/или онлайн-тест доступны в соответствующих вкладках.**',
214
+ state='complete', expanded=False)
215
+ save_data_in_database(
216
+ user_task_database=user_save_text_table,
217
+ save_type='download',
218
+ save_name=USER__SAVE_IN_CLOUD_FILE_NAME,
219
+ cefr_level=CEFR_TEXT_LEVEL,
220
+ created_at=str(datetime.datetime.now())[:-7],
221
+ creator_name=st.session_state.get('-USER_NAME-'),
222
+ generated_result=__TASK_DATA__,
223
+ distractor_model=DISTRACTOR_MODEL, allow=SAVE_IN_CLOUD)

224
 
225
  if EXIT_BUTTON:
226
  for key in st.session_state:
 
234
  st.error('Что-то пошло не так?! Перезагружаюсь!')
235
  st.session_state["START_GENERATION"] = False
236
  st.stop()
237
+ st.rerun()

238
  else:
239
  st.warning('**Войдите или зарегистрируйтесь**')
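In pages/2, the commit drops streamlit_extras.no_default_selectbox in favor of the native st.selectbox (index=None plus placeholder, supported by the newly pinned streamlit==1.32.2), collapses the old st.empty/st.progress widgets into a single st.status container, and replaces the removed st.experimental_rerun with st.rerun. A minimal sketch of the new pattern (label strings are illustrative):

```python
import streamlit as st

# Native selectbox with no default choice: returns None until the user picks
# an option, so the streamlit-extras wrapper is no longer needed.
level = st.selectbox(
    label='Выберите языковой уровень',
    options=['A1', 'A2', 'B1', 'B2', 'Без уровня'],
    index=None,
    placeholder='-Выберите языковой уровень-')

# One st.status container replaces st.empty() plus separate progress bars:
# child widgets render inside it, and update() changes its label and state.
status = st.status(label='Прогресс выполнения', expanded=True)
bar = status.progress(0)
bar.progress(50)
status.update(label='Все готово!', state='complete', expanded=False)
```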
pages/3_📥_Скачать.py CHANGED
@@ -1,7 +1,7 @@
1
  import streamlit as st
2
  from utilities_ui.custom_download_button import download_button as d_button
3
 
4
- st.set_page_config(page_title='Скачать', layout="wide", page_icon=':ru:')
5
  if st.session_state.get('-LOGGED_IN_BOOL-') and (st.session_state.get('-DISPLAY_READY-')
6
  or st.session_state.get('-DOWNLOAD_VERSION-')):
7
  result = st.session_state.get('RESULT')
@@ -10,7 +10,7 @@ if st.session_state.get('-LOGGED_IN_BOOL-') and (st.session_state.get('-DISPLAY_
10
  st.stop()
11
  # Download buttons
12
  if st.session_state.get('-DOWNLOAD_VERSION-'):
13
- invite, tasks_col, tasks_with_answers_col, full_coll, rest = st.columns([1, 1, 2, 3, 1])
14
  invite.write('Скачать:')
15
  with tasks_col:
16
  d_button(
@@ -22,6 +22,11 @@ if st.session_state.get('-LOGGED_IN_BOOL-') and (st.session_state.get('-DISPLAY_
22
  label='Задания+Ключи',
23
  data=result['TEACHER_OUT'],
24
  file_name=f'{result["name"]}_tasks_and_keys.txt')

25
  with full_coll:
26
  d_button(
27
  label='Исходник+Задания+Ключи',
 
1
  import streamlit as st
2
  from utilities_ui.custom_download_button import download_button as d_button
3
 
4
+ st.set_page_config(page_title='Скачать', layout="wide", page_icon=':ru:', initial_sidebar_state='collapsed')
5
  if st.session_state.get('-LOGGED_IN_BOOL-') and (st.session_state.get('-DISPLAY_READY-')
6
  or st.session_state.get('-DOWNLOAD_VERSION-')):
7
  result = st.session_state.get('RESULT')
 
10
  st.stop()
11
  # Download buttons
12
  if st.session_state.get('-DOWNLOAD_VERSION-'):
13
+ invite, tasks_col, tasks_with_answers_col, keys_only_col, full_coll, rest = st.columns([1, 1, 2, 1, 3, 1])
14
  invite.write('Скачать:')
15
  with tasks_col:
16
  d_button(
 
22
  label='Задания+Ключи',
23
  data=result['TEACHER_OUT'],
24
  file_name=f'{result["name"]}_tasks_and_keys.txt')
25
+ with keys_only_col:
26
+ d_button(
27
+ label='Ключи',
28
+ data=result['KEYS_ONLY'],
29
+ file_name=f'{result["name"]}_keys.txt')
30
  with full_coll:
31
  d_button(
32
  label='Исходник+Задания+Ключи',
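In pages/3, the column spec widens from [1, 1, 2, 3, 1] to [1, 1, 2, 1, 3, 1] to make room for a keys-only download button fed by result['KEYS_ONLY'], which the generation workflow already produces. A sketch of the new slot, with st.download_button standing in for the project's custom d_button:

```python
import streamlit as st

# Six columns instead of five; the new 1-wide slot hosts the keys-only button.
invite, tasks_col, tasks_with_answers_col, keys_only_col, full_coll, rest = \
    st.columns([1, 1, 2, 1, 3, 1])

with keys_only_col:
    st.download_button(              # d_button wraps the same interface
        label='Ключи',
        data='1. A\n2. C\n3. B',     # result['KEYS_ONLY'] in the app
        file_name='example_keys.txt')
```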
pages/4_📝_Онлайн-тест (эксперимент).py CHANGED
@@ -4,19 +4,19 @@ import streamlit as st
4
  from utilities_database.user_database_utils import save_data_in_database
5
  from utilities_database.user_database_widgets import user_save_text_table
6
 
7
- st.set_page_config(page_title='Онлайн-тест', layout="wide", page_icon=':ru:')
8
  if st.session_state.get('-ONLINE_TEST_READY-') and st.session_state.get('-LOGGED_IN_BOOL-'):
9
- INSTRUCTION = st.expander(label='**ИНСТРУКЦИЯ**', expanded=True)
10
  INSTRUCTION.markdown(
11
  'Уважаемые пользователи, предлагаем Вам заполнить опросник по оценке качества созданных заданий. '
12
  '\n\nНиже находится анкета с заданиями в таблице.'
13
  '\n\n- В **первом столбце** приводится ответ - слово, удаленное из оригинального текста.'
14
  '\n\n- Отметьте во **втором столбце**, уместно ли создавать задание с данным словом.'
15
  '\n\n- В **третьем столбце** приведены подобранные программой дистракторы.'
16
- '\n\n- Введите в **четвертый столбец** дистракторы (целиком или букву), которые, по Вашему мнению, '
17
- '**:red[не уместны]**. '
18
- '\n\n**:green[Уместными дистракторами]** мы предлагаем считать те, которые одновременно удовлетворяют '
19
- 'следующим условиям в рамках языкового уровня, для которого они созданы:'
20
  '\n\n1. не слишком очевидно являются неправильными вариантами (*варить суп/стол*);'
21
  '\n\n2. и при этом не могут быть полноценной заменой удаленного слова (*варить суп/кашу*)'
22
  )
@@ -31,25 +31,26 @@ if st.session_state.get('-ONLINE_TEST_READY-') and st.session_state.get('-LOGGED
31
  ONLINE_TEST = st.form('Онлайн тест')
32
  ONLINE_TEST.write(result['TEXT_WITH_GAPS'].replace('_', '\_'))
33
  BAD_DISTRACTORS_AND_ANSWERS_temp = ONLINE_TEST.data_editor(
34
- pd.DataFrame([{"Задание №": i+1,
35
  "Ответ": [answers[i][1]],
36
  "Задание уместно": False,
37
  "Дистракторы": tasks[i][1],
38
  "Неуместные дистракторы": ''}
39
  for i in range(len(tasks))]),
40
  num_rows="fixed",
41
- height=45*len_answers,
 
42
  use_container_width=True)
43
- COMMENTS = ONLINE_TEST.text_input(label='**Прокомментировать**',
44
- placeholder='Напишите комментарий')
45
- SUBMIT = ONLINE_TEST.form_submit_button('READY')
46
  if SUBMIT:
47
  points = test_mark = 'Teacher'
48
  appropriate_tasks = BAD_DISTRACTORS_AND_ANSWERS_temp["Задание уместно"].values.tolist()
49
  inappropriate_distractors = BAD_DISTRACTORS_AND_ANSWERS_temp["Неуместные дистракторы"].values.tolist()
50
  RETURN_TEST_DATA = [{'ANSWER': answers[i],
51
  'APPROPRIATE_TASK': appropriate_tasks[i],
52
- 'INAPPROPRIATE_DISTRACTORS': inappropriate_distractors[i]} for i in range(len_answers)]
53
  save_data_in_database(user_task_database=user_save_text_table,
54
  save_type='online_test',
55
  save_name=st.session_state['-UPLOAD_CLOUD_FILE_NAME-'],
@@ -57,8 +58,8 @@ if st.session_state.get('-ONLINE_TEST_READY-') and st.session_state.get('-LOGGED
57
  created_at=str(datetime.datetime.now())[:-7],
58
  creator_name=st.session_state.get('-USER_NAME-'),
59
  test_taker_name=st.session_state.get('-USER_NAME-'),
60
- test_taker_answers=RETURN_TEST_DATA,
61
  generated_result=result,
 
62
  test_taker_result={'Баллов': points, 'Всего': len_answers, 'Оценка': test_mark},
63
  comments=COMMENTS)
64
  elif st.session_state.get('-LOGGED_IN_BOOL-'):
 
4
  from utilities_database.user_database_utils import save_data_in_database
5
  from utilities_database.user_database_widgets import user_save_text_table
6
 
7
+ st.set_page_config(page_title='Онлайн-тест', layout="wide", page_icon=':ru:', initial_sidebar_state='collapsed')
8
  if st.session_state.get('-ONLINE_TEST_READY-') and st.session_state.get('-LOGGED_IN_BOOL-'):
9
+ INSTRUCTION = st.expander(label='**ИНСТРУКЦИЯ**', expanded=False)
10
  INSTRUCTION.markdown(
11
  'Уважаемые пользователи, предлагаем Вам заполнить опросник по оценке качества созданных заданий. '
12
  '\n\nНиже находится анкета с заданиями в таблице.'
13
  '\n\n- В **первом столбце** приводится ответ - слово, удаленное из оригинального текста.'
14
  '\n\n- Отметьте во **втором столбце**, уместно ли создавать задание с данным словом.'
15
  '\n\n- В **третьем столбце** приведены подобранные программой дистракторы.'
16
+ '\n\n- Введите в **четвертый столбец** дистракторы (целиком или букву), которые, по Вашему мнению,'
17
+ ' **:red[не уместны]**. '
18
+ '\n\n**:green[Уместными дистракторами]** мы предлагаем считать те, которые одновременно удовлетворяют'
19
+ ' следующим условиям в рамках языкового уровня, для которого они созданы:'
20
  '\n\n1. не слишком очевидно являются неправильными вариантами (*варить суп/стол*);'
21
  '\n\n2. и при этом не могут быть полноценной заменой удаленного слова (*варить суп/кашу*)'
22
  )
 
31
  ONLINE_TEST = st.form('Онлайн тест')
32
  ONLINE_TEST.write(result['TEXT_WITH_GAPS'].replace('_', '\_'))
33
  BAD_DISTRACTORS_AND_ANSWERS_temp = ONLINE_TEST.data_editor(
34
+ pd.DataFrame([{"Задание №": i + 1,
35
  "Ответ": [answers[i][1]],
36
  "Задание уместно": False,
37
  "Дистракторы": tasks[i][1],
38
  "Неуместные дистракторы": ''}
39
  for i in range(len(tasks))]),
40
  num_rows="fixed",
41
+ height=40 * len_answers,
42
+ hide_index=True,
43
  use_container_width=True)
44
+ COMMENTS = ONLINE_TEST.text_area(label='**Прокомментировать**',
45
+ placeholder='Напишите комментарий')
46
+ SUBMIT = ONLINE_TEST.form_submit_button('ГОТОВО')
47
  if SUBMIT:
48
  points = test_mark = 'Teacher'
49
  appropriate_tasks = BAD_DISTRACTORS_AND_ANSWERS_temp["Задание уместно"].values.tolist()
50
  inappropriate_distractors = BAD_DISTRACTORS_AND_ANSWERS_temp["Неуместные дистракторы"].values.tolist()
51
  RETURN_TEST_DATA = [{'ANSWER': answers[i],
52
  'APPROPRIATE_TASK': appropriate_tasks[i],
53
+ 'INAPPROPRIATE_DISTRACTORS': inappropriate_distractors[i]} for i in range(len_answers)]
54
  save_data_in_database(user_task_database=user_save_text_table,
55
  save_type='online_test',
56
  save_name=st.session_state['-UPLOAD_CLOUD_FILE_NAME-'],
 
58
  created_at=str(datetime.datetime.now())[:-7],
59
  creator_name=st.session_state.get('-USER_NAME-'),
60
  test_taker_name=st.session_state.get('-USER_NAME-'),
 
61
  generated_result=result,
62
+ test_taker_answers=RETURN_TEST_DATA,
63
  test_taker_result={'Баллов': points, 'Всего': len_answers, 'Оценка': test_mark},
64
  comments=COMMENTS)
65
  elif st.session_state.get('-LOGGED_IN_BOOL-'):
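In pages/4, the survey form is tightened: the task table hides the DataFrame index and shrinks the per-row height accordingly, the comment field becomes a multi-line st.text_area, and the submit label is localized. A self-contained sketch of the reworked st.data_editor call:

```python
import pandas as pd
import streamlit as st

# Toy stand-in for the generated tasks; the app builds this from tasks/answers.
df = pd.DataFrame([{"Задание №": 1,
                    "Ответ": "слово",
                    "Задание уместно": False,
                    "Дистракторы": "вода, огонь, земля",
                    "Неуместные дистракторы": ""}])

edited = st.data_editor(
    df,
    num_rows="fixed",        # cells are editable, rows cannot be added/removed
    height=40 * len(df),     # ~40 px per row once the index column is gone
    hide_index=True,         # new in this commit: hide the DataFrame index
    use_container_width=True)
```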
requirements.txt CHANGED
@@ -1,15 +1,17 @@
1
- supabase>=2.4.0
2
- nltk>=3.8.1
3
- spacy>=3.7.2
4
- torch>=2.1.0
5
- gensim>=4.3.2
6
- pandas>=2.2.0
7
- requests>=2.31.0
8
- pymorphy2>=0.9.1
9
- trycourier>=5.0.0
10
- streamlit==1.30.0
11
- argon2-cffi>=21.3.0
12
- cryptography>=42.0.3
13
- transformers>=4.37.2
14
- streamlit-extras>=0.4.0
 
 
15
  ru_core_news_lg @ https://github.com/explosion/spacy-models/releases/download/ru_core_news_lg-3.7.0/ru_core_news_lg-3.7.0-py3-none-any.whl
 
1
+ nltk==3.8.1
2
+ spacy==3.7.2
3
+ torch==2.1.0
4
+ gensim==4.3.2
5
+ pandas==2.2.0
6
+ catboost==1.2.2
7
+ supabase==2.4.0
8
+ requests==2.31.0
9
+ pymorphy2==0.9.1
10
+ trycourier==5.0.0
11
+ streamlit==1.32.2
12
+ argon2-cffi==21.1.0
13
+ cryptography==42.0.3
14
+ transformers==4.38.2
15
+ streamlit-extras==0.4.0
16
+ bert-extractive-summarizer==0.10.1
17
  ru_core_news_lg @ https://github.com/explosion/spacy-models/releases/download/ru_core_news_lg-3.7.0/ru_core_news_lg-3.7.0-py3-none-any.whl
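requirements.txt moves from `>=` ranges to exact pins and adds catboost and bert-extractive-summarizer, which the reworked workflow below relies on (load_classifiers, Summarizer). An optional startup check, not part of the commit, that the installed versions match the pins:

```python
# Hypothetical sanity check: fail fast if the environment drifts from the pins.
from importlib.metadata import version

PINS = {'streamlit': '1.32.2', 'transformers': '4.38.2', 'catboost': '1.2.2'}

for package, expected in PINS.items():
    installed = version(package)
    assert installed == expected, f'{package}: expected {expected}, got {installed}'
```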
utilities_cookies/cookie_manager.py CHANGED
@@ -1,10 +1,8 @@
1
  import streamlit as st
2
  from pathlib import Path
3
- from typing import Mapping
4
- from datetime import datetime
5
- from datetime import timedelta
6
  from urllib.parse import unquote
7
- from typing import MutableMapping
 
8
  from streamlit.components.v1 import components
9
 
10
 
 
1
  import streamlit as st
2
  from pathlib import Path
 
 
 
3
  from urllib.parse import unquote
4
+ from datetime import datetime, timedelta
5
+ from typing import Mapping, MutableMapping
6
  from streamlit.components.v1 import components
7
 
8
 
utilities_cookies/encrypted_cookie_manager.py CHANGED
@@ -1,12 +1,10 @@
1
  import os
2
  import base64
3
  import streamlit as st
4
- from typing import Tuple
5
- from typing import Optional
6
  from cryptography import fernet
7
- from typing import MutableMapping
8
  from cryptography.fernet import Fernet
9
  from cryptography.hazmat.primitives import hashes
 
10
  from utilities_cookies.cookie_manager import CookieManager
11
  from cryptography.hazmat.primitives.kdf.pbkdf2 import PBKDF2HMAC
12
 
 
1
  import os
2
  import base64
3
  import streamlit as st
 
 
4
  from cryptography import fernet
 
5
  from cryptography.fernet import Fernet
6
  from cryptography.hazmat.primitives import hashes
7
+ from typing import Tuple, Optional, MutableMapping
8
  from utilities_cookies.cookie_manager import CookieManager
9
  from cryptography.hazmat.primitives.kdf.pbkdf2 import PBKDF2HMAC
10
 
utilities_database/user_database_utils.py CHANGED
@@ -1,9 +1,9 @@
1
- import re
2
- import json
3
- import secrets
4
  import pandas as pd
5
  import streamlit as st
 
 
6
  from trycourier import Courier
 
7
  from argon2 import PasswordHasher
8
  from argon2.exceptions import VerifyMismatchError
9
 
@@ -37,7 +37,7 @@ def check_valid_name(name_sign_up: str) -> bool:
37
  name_regex_eng = r'^[A-Za-z_]\w *'
38
  name_regex_rus = r'^[А-Яа-я_][А-Яа-я0-9_] *'
39
 
40
- if re.search(name_regex_eng, name_sign_up) or re.search(name_regex_rus, name_sign_up):
41
  return True
42
  return False
43
 
@@ -46,7 +46,7 @@ def check_valid_email(email_sign_up: str) -> bool:
46
  """
47
  Checks if the user entered a valid email while creating the account.
48
  """
49
- regex = re.compile(r'([A-Za-z0-9]+[.-_])*[A-Za-z0-9]+@[A-Za-z0-9-]+(\.[A-Z|a-z]{2,})+')
50
  return True
51
 
52
  # if re.fullmatch(regex, email_sign_up):
@@ -133,7 +133,7 @@ def generate_random_passwd() -> str:
133
  Generates a random password to be sent in email.
134
  """
135
  password_length = 10
136
- return secrets.token_urlsafe(password_length)
137
 
138
 
139
  def send_passwd_in_email(auth_token: str, user_name_forgot_passwd: str, email_forgot_passwd: str, company_name: str,
@@ -266,5 +266,5 @@ def load_users_particular_task(user_task_database, load_mode, creator_name, save
266
  .eq('save_name', save_name)\
267
  .eq('save_type', load_mode)\
268
  .eq('cefr_level',cefr_level).execute().data[0]['generated_result']
269
- return_data = json.loads(return_data.replace("'", '"'), strict=False)
270
  return return_data
 
 
 
 
1
  import pandas as pd
2
  import streamlit as st
3
+ from json import loads
4
+ from re import search, compile
5
  from trycourier import Courier
6
+ from secrets import token_urlsafe
7
  from argon2 import PasswordHasher
8
  from argon2.exceptions import VerifyMismatchError
9
 
 
37
  name_regex_eng = r'^[A-Za-z_]\w *'
38
  name_regex_rus = r'^[А-Яа-я_][А-Яа-я0-9_] *'
39
 
40
+ if search(name_regex_eng, name_sign_up) or search(name_regex_rus, name_sign_up):
41
  return True
42
  return False
43
 
 
46
  """
47
  Checks if the user entered a valid email while creating the account.
48
  """
49
+ regex = compile(r'([A-Za-z0-9]+[.-_])*[A-Za-z0-9]+@[A-Za-z0-9-]+(\.[A-Z|a-z]{2,})+')
50
  return True
51
 
52
  # if re.fullmatch(regex, email_sign_up):
 
133
  Generates a random password to be sent in email.
134
  """
135
  password_length = 10
136
+ return token_urlsafe(password_length)
137
 
138
 
139
  def send_passwd_in_email(auth_token: str, user_name_forgot_passwd: str, email_forgot_passwd: str, company_name: str,
 
266
  .eq('save_name', save_name)\
267
  .eq('save_type', load_mode)\
268
  .eq('cefr_level',cefr_level).execute().data[0]['generated_result']
269
+ return_data = loads(return_data.replace("'", '"'), strict=False)
270
  return return_data
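load_users_particular_task still deserializes the stored result with loads(return_data.replace("'", '"'), strict=False); the quote swap suggests the payload is a Python repr, and it silently corrupts any value that itself contains an apostrophe. A hypothetical hardening, not in this commit, that parses the repr directly:

```python
# ast.literal_eval evaluates Python literals safely, so no quote rewriting is
# needed and apostrophes inside values survive intact.
from ast import literal_eval

payload = "{'name': \"д'Артаньян\", 'tasks': [1, 2]}"   # illustrative repr string
data = literal_eval(payload)
assert data['tasks'] == [1, 2] and data['name'] == "д'Артаньян"
```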
utilities_database/user_database_widgets.py CHANGED
@@ -3,7 +3,6 @@ from datetime import datetime
3
  from supabase import create_client, Client
4
  from utilities_option_menu.option_menu import option_menu
5
  import utilities_database.user_database_utils as db_utils
6
- from utilities_database.user_database_utils import check_usr_pass
7
  from utilities_cookies.encrypted_cookie_manager import EncryptedCookieManager
8
 
9
  DB_URL = st.secrets['SUPABASE_URL']
@@ -91,9 +90,9 @@ class LogIn:
91
  login_submit_button = st.form_submit_button(label='Войти')
92
 
93
  if login_submit_button:
94
- authenticate_user_check = check_usr_pass(user_log_in_database=user_login_table,
95
- user_name=user_name,
96
- password=password)
97
 
98
  if not authenticate_user_check:
99
  st.error("Неверное имя пользователя или пароль!")
 
3
  from supabase import create_client, Client
4
  from utilities_option_menu.option_menu import option_menu
5
  import utilities_database.user_database_utils as db_utils
 
6
  from utilities_cookies.encrypted_cookie_manager import EncryptedCookieManager
7
 
8
  DB_URL = st.secrets['SUPABASE_URL']
 
90
  login_submit_button = st.form_submit_button(label='Войти')
91
 
92
  if login_submit_button:
93
+ authenticate_user_check = db_utils.check_usr_pass(user_log_in_database=user_login_table,
94
+ user_name=user_name,
95
+ password=password)
96
 
97
  if not authenticate_user_check:
98
  st.error("Неверное имя пользователя или пароль!")
utilities_language_bert/rus_main_workflow_bert.py CHANGED
@@ -1,28 +1,20 @@
1
  import datetime
2
  from io import StringIO
 
3
  from random import sample
4
  from collections import defaultdict
5
- from streamlit import progress as st_progress
6
- from utilities_language_general.rus_constants import st
7
- from streamlit.elements import WIDGETS as ST_WIDGETS
8
- from utilities_language_bert.rus_sentence_bert import TASK
9
- from utilities_language_general.rus_constants import load_bert
10
- from utilities_language_general.rus_utils import prepare_tasks
11
- from utilities_language_bert.rus_sentence_bert import SENTENCE
12
- import utilities_language_general.rus_constants as esp_constants
13
- from utilities_language_general.rus_utils import prepare_target_words
14
- from utilities_language_general.rus_utils import compute_frequency_dict
15
  from streamlit.runtime.uploaded_file_manager import UploadedFile
16
- from utilities_language_general.rus_constants import BAD_USER_TARGET_WORDS
 
 
17
 
18
 
19
- def main_workflow_bert(
20
- file: UploadedFile or None,
21
  text: str,
22
- logs: ST_WIDGETS,
23
- logs_d: ST_WIDGETS,
24
- progress: st_progress,
25
- progress_s: st_progress,
26
  level: str,
27
  tw_mode_automatic_mode: str,
28
  target_words: str,
@@ -31,13 +23,15 @@ def main_workflow_bert(
31
  global_bad_target_words=BAD_USER_TARGET_WORDS):
32
 
33
  # Clear bad target_words each time
34
- global_bad_target_words = []
 
35
 
36
  # Define main global variables
37
- logs.write()
38
  GLOBAL_DISTRACTORS = set()
39
  MAX_FREQUENCY = 0
40
 
 
 
41
  mask_filler = load_bert()
42
 
43
  # Get input text
@@ -47,15 +41,15 @@ def main_workflow_bert(
47
  elif text != '':
48
  current_text = text
49
  else:
50
- esp_constants.st.warning('Вы и текст не вставили, и файл не выбрали 😢')
51
  current_text = ''
52
- esp_constants.st.stop()
53
 
54
  # Process target words
55
  if tw_mode_automatic_mode == 'Самостоятельно':
56
  if target_words == '':
57
- esp_constants.st.warning('Вы не ввели целевые слова')
58
- esp_constants.st.stop()
59
  # Cannot make up paradigm, so only USER_TARGET_WORDS is used
60
  USER_TARGET_WORDS = prepare_target_words(target_words)
61
  tw_mode_automatic_mode = False
@@ -68,8 +62,8 @@ def main_workflow_bert(
68
  current_text = current_text.replace('.', '. ').replace('. . .', '...').replace(' ', ' ').replace('…', '...') \
69
  .replace('…', '...').replace('—', '-').replace('\u2014', '-').replace('—', '-').replace('-\n', '') \
70
  .replace('\n', '%^&*')
71
- current_text_sentences = [sent.text.strip() for sent in esp_constants.nlp(current_text).sents]
72
- logs.success('Получили Ваш текст!')
73
  progress.progress(10)
74
 
75
  # Compute frequency dict
@@ -81,31 +75,12 @@ def main_workflow_bert(
81
  if j < len(FREQ_DICT) * _frequency_barrier_percent:
82
  MAX_FREQUENCY = tp[1]
83
  MAX_FREQUENCY = 3 if MAX_FREQUENCY < 3 else MAX_FREQUENCY
84
- logs.success("Посчитали немного статистики!")
85
  progress.progress(15)
86
 
87
  # Choose necessary language minimum according to user's input
88
- if level == 'A1':
89
- target_minimum = esp_constants.a1_target_set
90
- distractor_minimum = esp_constants.a1_distractor_set
91
- elif level == 'A2':
92
- target_minimum = esp_constants.a2_target_set
93
- distractor_minimum = esp_constants.a2_distractor_set
94
- elif level == 'B1':
95
- target_minimum = esp_constants.b1_target_set
96
- distractor_minimum = esp_constants.b1_distractor_set
97
- elif level == 'B2':
98
- target_minimum = esp_constants.b2_target_set
99
- distractor_minimum = esp_constants.b2_distractor_set
100
- elif level == 'C1':
101
- target_minimum = esp_constants.c1_target_set
102
- distractor_minimum = esp_constants.c1_distractor_set
103
- elif level == 'C2':
104
- target_minimum = esp_constants.c2_target_set
105
- distractor_minimum = esp_constants.c2_distractor_set
106
- elif level == 'Без уровня':
107
- target_minimum = None
108
- distractor_minimum = None
109
  else:
110
  target_minimum = None
111
  distractor_minimum = None
@@ -115,24 +90,41 @@ def main_workflow_bert(
115
  # Start generation process
116
  workflow = [SENTENCE(original=sent.strip(), n_sentence=num, max_num_distractors=num_distractors)
117
  for num, sent in enumerate(current_text_sentences)]
118
- logs.success("Запускаем процесс генерации заданий!")
119
  progress.progress(20)
120

121
  for sentence in workflow:
122
  sentence.lemmatize_sentence()
123
 
124
  for sentence in workflow:
125
  sentence.bind_phrases()
126
- logs.success("Подготовили предложения для дальнейшей работы!")
127
  progress.progress(30)
128
 
129
  for j, sentence in enumerate(workflow):
130
  sentence.search_target_words(target_words_automatic_mode=tw_mode_automatic_mode,
131
  target_minimum=target_minimum,
132
  user_target_words=USER_TARGET_WORDS,
133
- frequency_dict=FREQ_DICT)
 
134
  progress.progress(int(30 + (j * (20 / len(workflow)))))
135
- progress_s.progress(50)
136
  DUPLICATE_TARGET_WORDS = defaultdict(list)
137
  for sentence in workflow:
138
  for target_word in sentence.target_words:
@@ -145,8 +137,8 @@ def main_workflow_bert(
145
  if target_word not in RESULT_TW:
146
  global_bad_target_words.append(target_word['original_text'])
147
  sentence.target_words.remove(target_word)
148
- progress_s.progress(55)
149
- logs.success('Выбрали слова-пропуски!')
150
 
151
  for sentence in workflow:
152
  for i, target_word in enumerate(sentence.target_words):
@@ -157,7 +149,7 @@ def main_workflow_bert(
157
 
158
  for sentence in workflow:
159
  sentence.filter_target_words(target_words_automatic_mode=tw_mode_automatic_mode)
160
- progress_s.progress(60)
161
 
162
  RESULT_TASKS = []
163
  for sentence in workflow:
@@ -166,25 +158,29 @@ def main_workflow_bert(
166
  RESULT_TASKS.append(task)
167
 
168
  for num, task in enumerate(RESULT_TASKS):
169
- task.attach_distractors_to_target_word(model=mask_filler, level_name=level,

170
  global_distractors=GLOBAL_DISTRACTORS,
171
  distractor_minimum=distractor_minimum,
172
  max_frequency=MAX_FREQUENCY)
173
- logs_d.success(
174
- f'Обработали {num}/{len(RESULT_TASKS)} целевых слов!')
175
- logs_d.success(
176
- f'Обработали {len(RESULT_TASKS)}/{len(RESULT_TASKS)} целевых слов!')
177
- progress_s.progress(65)
178
- logs.success('Подобрали неправильные варианты!')
179
 
180
  for task in RESULT_TASKS:
181
  task.inflect_distractors()
182
- progress_s.progress(70)
183
- logs.success('П��осклоняли и проспрягали неправильные варианты!')
184
 
185
  for task in RESULT_TASKS:
186
  task.sample_distractors(num_distractors=num_distractors)
187
- progress_s.progress(75)
188
  RESULT_TASKS = list(filter(lambda t: not t.bad_target_word, RESULT_TASKS))
189
 
190
  for task in RESULT_TASKS[::-1]:
@@ -202,13 +198,18 @@ def main_workflow_bert(
202
  NUMBER_TASKS = 10
203
  else:
204
  NUMBER_TASKS = len(RESULT_TASKS)
205
- RESULT_TASKS = sample(RESULT_TASKS, NUMBER_TASKS)

206
  RESULT_TASKS = sorted(RESULT_TASKS, key=lambda t: (t.sentence_number, t.position_in_sentence))
207
 
208
  for task in RESULT_TASKS:
209
  task.compile_task(max_num_distractors=num_distractors)
210
- progress_s.progress(85)
211
- logs.success('Отобрали лучшие задания!')
212
 
213
  TEXT_WITH_GAPS = []
214
  VARIANTS = []
@@ -222,9 +223,6 @@ def main_workflow_bert(
222
  TEXT_WITH_GAPS.append(sentence)
223
  del RESULT_TASKS
224
 
225
- logs.success('Сейчас все будет готово!')
226
- progress_s.progress(90)
227
-
228
  TEXT_WITH_GAPS = ' '.join([sentence for sentence in TEXT_WITH_GAPS]).replace('%^&*', '\n')
229
  PREPARED_TASKS = prepare_tasks(VARIANTS)
230
  STUDENT_OUT = f'{TEXT_WITH_GAPS}\n\n{"=" * 70}\n\n{PREPARED_TASKS["TASKS_STUDENT"]}'
@@ -232,8 +230,8 @@ def main_workflow_bert(
232
  f'{PREPARED_TASKS["KEYS_ONLY"]}'
233
  TOTAL_OUT = f'{original_text}\n\n{"$" * 70}\n\n{STUDENT_OUT}\n\n{"=" * 70}\n\n{PREPARED_TASKS["TASKS_TEACHER"]}' \
234
  f'\n\n{"$" * 70}\n\n{PREPARED_TASKS["KEYS_ONLY"]}'
235
- logs.success('Сейчас все будет готово!')
236
- progress_s.progress(90)
237
  save_name = save_name if save_name != '' else f'{str(datetime.datetime.now())[:-7]}_{original_text[:20]}'
238
  out = {
239
  'name': save_name,
 
1
  import datetime
2
  from io import StringIO
3
+ from typing import Union
4
  from random import sample
5
  from collections import defaultdict

6
  from streamlit.runtime.uploaded_file_manager import UploadedFile
7
+ from utilities_language_bert.rus_sentence_bert import TASK, SENTENCE
8
+ from utilities_language_general.rus_utils import compute_frequency_dict, prepare_tasks, prepare_target_words
9
+ from utilities_language_general.rus_constants import st, load_bert, load_classifiers, nlp, summarization, BAD_USER_TARGET_WORDS, MINIMUM_SETS
10
 
11
 
12
+ def main_workflow(
13
+ file: Union[UploadedFile, None],
14
  text: str,
15
+ logs,
16
+ progress,
17
+ progress_d,
 
18
  level: str,
19
  tw_mode_automatic_mode: str,
20
  target_words: str,
 
23
  global_bad_target_words=BAD_USER_TARGET_WORDS):
24
 
25
  # Clear bad target_words each time
26
+ if global_bad_target_words:
27
+ global_bad_target_words = []
28
 
29
  # Define main global variables
 
30
  GLOBAL_DISTRACTORS = set()
31
  MAX_FREQUENCY = 0
32
 
33
+ logs.update(label='Загружаем языковые модели и другие данные', state='running')
34
+ pos_dict, scaler, classifier = load_classifiers('model3')
35
  mask_filler = load_bert()
36
 
37
  # Get input text
 
41
  elif text != '':
42
  current_text = text
43
  else:
44
+ st.warning('Вы ни текст не вставили, ни файл не выбрали 😢')
45
  current_text = ''
46
+ st.stop()
47
 
48
  # Process target words
49
  if tw_mode_automatic_mode == 'Самостоятельно':
50
  if target_words == '':
51
+ st.warning('Вы не ввели целевые слова')
52
+ st.stop()
53
  # Cannot make up paradigm, so only USER_TARGET_WORDS is used
54
  USER_TARGET_WORDS = prepare_target_words(target_words)
55
  tw_mode_automatic_mode = False
 
62
  current_text = current_text.replace('.', '. ').replace('. . .', '...').replace(' ', ' ').replace('…', '...') \
63
  .replace('…', '...').replace('—', '-').replace('\u2014', '-').replace('—', '-').replace('-\n', '') \
64
  .replace('\n', '%^&*')
65
+ current_text_sentences = [sent.text.strip() for sent in nlp(current_text).sents]
66
+ logs.update(label='Получили Ваш текст!', state='running')
67
  progress.progress(10)
68
 
69
  # Compute frequency dict
 
75
  if j < len(FREQ_DICT) * _frequency_barrier_percent:
76
  MAX_FREQUENCY = tp[1]
77
  MAX_FREQUENCY = 3 if MAX_FREQUENCY < 3 else MAX_FREQUENCY
78
+ logs.update(label="Посчитали немного статистики!", state='running')
79
  progress.progress(15)
80
 
81
  # Choose necessary language minimum according to user's input
82
+ if level:
83
+ target_minimum, distractor_minimum = MINIMUM_SETS[level]

84
  else:
85
  target_minimum = None
86
  distractor_minimum = None
 
90
  # Start generation process
91
  workflow = [SENTENCE(original=sent.strip(), n_sentence=num, max_num_distractors=num_distractors)
92
  for num, sent in enumerate(current_text_sentences)]
93
+ logs.update(label="Запускаем процесс генерации заданий!", state='running')
94
  progress.progress(20)
95
 
96
+ # Define summary length
97
+ text_length = len(current_text_sentences)
98
+ if text_length <= 15:
99
+ summary_length = text_length
100
+ elif text_length <= 25:
101
+ summary_length = 15
102
+ else:
103
+ n = (text_length - 20) // 5
104
+ summary_length = 15 + 2 * n
105
+ round_summary_length = summary_length - (summary_length % - 10)
106
+
107
+ # Get summary. May choose between round_summary_length and summary_length
108
+ SUMMARY = summarization(current_text, num_sentences=round_summary_length)
109
+ logs.update(label='Нашли интересные предложения. Пригодятся!', state='running')
110
+ progress.progress(25)
111
+
112
  for sentence in workflow:
113
  sentence.lemmatize_sentence()
114
 
115
  for sentence in workflow:
116
  sentence.bind_phrases()
117
+ logs.update(label="Подготовили предложения для дальнейшей работы!", state='running')
118
  progress.progress(30)
119
 
120
  for j, sentence in enumerate(workflow):
121
  sentence.search_target_words(target_words_automatic_mode=tw_mode_automatic_mode,
122
  target_minimum=target_minimum,
123
  user_target_words=USER_TARGET_WORDS,
124
+ frequency_dict=FREQ_DICT,
125
+ summary=SUMMARY)
126
  progress.progress(int(30 + (j * (20 / len(workflow)))))
127
+ progress.progress(50)
128
  DUPLICATE_TARGET_WORDS = defaultdict(list)
129
  for sentence in workflow:
130
  for target_word in sentence.target_words:
 
137
  if target_word not in RESULT_TW:
138
  global_bad_target_words.append(target_word['original_text'])
139
  sentence.target_words.remove(target_word)
140
+ progress.progress(55)
141
+ logs.update(label='Выбрали слова-пропуски!', state='running')
142
 
143
  for sentence in workflow:
144
  for i, target_word in enumerate(sentence.target_words):
 
149
 
150
  for sentence in workflow:
151
  sentence.filter_target_words(target_words_automatic_mode=tw_mode_automatic_mode)
152
+ progress.progress(60)
153
 
154
  RESULT_TASKS = []
155
  for sentence in workflow:
 
158
  RESULT_TASKS.append(task)
159
 
160
  for num, task in enumerate(RESULT_TASKS):
161
+ task.attach_distractors_to_target_word(model=mask_filler,
162
+ scaler=scaler,
163
+ classifier=classifier,
164
+ pos_dict=pos_dict,
165
+ level_name=level,
166
  global_distractors=GLOBAL_DISTRACTORS,
167
  distractor_minimum=distractor_minimum,
168
  max_frequency=MAX_FREQUENCY)
169
+ progress_d.progress(num / len(RESULT_TASKS))
170
+ logs.update(label=f'Обработали {num}/{len(RESULT_TASKS)} целевых слов!', state='running')
171
+ logs.update(label=f'Обработали {len(RESULT_TASKS)}/{len(RESULT_TASKS)} целевых слов!', state='running')
172
+ progress_d.progress(100)
173
+ progress.progress(70)
174
+ logs.update(label='Подобрали неправильные варианты!', state='running')
175
 
176
  for task in RESULT_TASKS:
177
  task.inflect_distractors()
178
+ progress.progress(80)
179
+ logs.update(label='Просклоняли и проспрягали неправильные варианты!', state='running')
180
 
181
  for task in RESULT_TASKS:
182
  task.sample_distractors(num_distractors=num_distractors)
183
+ progress.progress(85)
184
  RESULT_TASKS = list(filter(lambda t: not t.bad_target_word, RESULT_TASKS))
185
 
186
  for task in RESULT_TASKS[::-1]:
 
198
  NUMBER_TASKS = 10
199
  else:
200
  NUMBER_TASKS = len(RESULT_TASKS)
201
+ RESULT_TASKS_in_summary = list(filter(lambda task: task.in_summary, RESULT_TASKS))
202
+ RESULT_TASKS_not_in_summary = list(filter(lambda task: not task.in_summary, RESULT_TASKS))
203
+ if len(RESULT_TASKS_in_summary) >= NUMBER_TASKS:
204
+ RESULT_TASKS = RESULT_TASKS_in_summary
205
+ else:
206
+ RESULT_TASKS = RESULT_TASKS_in_summary + sample(RESULT_TASKS_not_in_summary, NUMBER_TASKS - len(RESULT_TASKS_in_summary))
207
  RESULT_TASKS = sorted(RESULT_TASKS, key=lambda t: (t.sentence_number, t.position_in_sentence))
208
 
209
  for task in RESULT_TASKS:
210
  task.compile_task(max_num_distractors=num_distractors)
211
+ progress.progress(90)
212
+ logs.update(label='Отобрали лучшие задания!', state='running')
213
 
214
  TEXT_WITH_GAPS = []
215
  VARIANTS = []
 
223
  TEXT_WITH_GAPS.append(sentence)
224
  del RESULT_TASKS
225
 
 
 
 
226
  TEXT_WITH_GAPS = ' '.join([sentence for sentence in TEXT_WITH_GAPS]).replace('%^&*', '\n')
227
  PREPARED_TASKS = prepare_tasks(VARIANTS)
228
  STUDENT_OUT = f'{TEXT_WITH_GAPS}\n\n{"=" * 70}\n\n{PREPARED_TASKS["TASKS_STUDENT"]}'
 
230
  f'{PREPARED_TASKS["KEYS_ONLY"]}'
231
  TOTAL_OUT = f'{original_text}\n\n{"$" * 70}\n\n{STUDENT_OUT}\n\n{"=" * 70}\n\n{PREPARED_TASKS["TASKS_TEACHER"]}' \
232
  f'\n\n{"$" * 70}\n\n{PREPARED_TASKS["KEYS_ONLY"]}'
233
+ logs.update(label='Сейчас все будет готово!', state='running')
234
+ progress.progress(95)
235
  save_name = save_name if save_name != '' else f'{str(datetime.datetime.now())[:-7]}_{original_text[:20]}'
236
  out = {
237
  'name': save_name,
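This workflow file carries the heaviest rework: main_workflow_bert becomes main_workflow, the six-branch CEFR ladder collapses into a MINIMUM_SETS lookup, CatBoost scaler/classifier objects now filter distractors, and an extractive summary biases task selection toward sentences that made the summary (tasks with in_summary=True are kept first, the rest sampled). The summary-length arithmetic rounds up to the next multiple of ten via Python's negative-divisor modulo; a worked example:

```python
# n % -10 lies in (-10, 0], so subtracting it rounds n up to a multiple of 10.
def round_up_to_ten(n: int) -> int:
    return n - (n % -10)

for text_length in (10, 20, 23, 40):
    if text_length <= 15:
        summary_length = text_length
    elif text_length <= 25:
        summary_length = 15
    else:
        summary_length = 15 + 2 * ((text_length - 20) // 5)
    print(text_length, summary_length, round_up_to_ten(summary_length))
# 10 -> 10 -> 10;  20 -> 15 -> 20;  23 -> 15 -> 20;  40 -> 23 -> 30
```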
utilities_language_bert/rus_sentence_bert.py CHANGED
@@ -1,15 +1,7 @@
1
- import copy
2
  import string
3
- from random import random
4
- from random import sample
5
- from utilities_language_general.rus_constants import nlp
6
- from utilities_language_general.rus_utils import get_tags
7
- from utilities_language_general.rus_constants import PHRASES
8
- from utilities_language_general.rus_utils import define_gender
9
- from utilities_language_general.rus_utils import make_inflection
10
- from utilities_language_general.rus_utils import check_token_bert
11
- from utilities_language_general.rus_constants import BAD_USER_TARGET_WORDS
12
- from utilities_language_general.rus_utils import get_distractors_from_model_bert
13
 
14
 
15
  class SENTENCE:
@@ -47,11 +39,10 @@ class SENTENCE:
47
  if not previous_was_phrase:
48
  self.sentence_phrases.append(self.sentence_lemma_pos[i][1])
49
  previous_was_phrase = False
 
50
 
51
- def search_target_words_automatically(self, target_minimum: set, frequency_dict: dict = None):
52
  for token in self.sentence_phrases:
53
- # TODO: Still do not have w2v model with phrases
54
- # therefore cannot come up with the criteria
55
  if isinstance(token, list): # if token is a phrase
56
  original_token1 = token[1]['original_token1']
57
  original_token2 = token[1]['original_token2']
@@ -71,7 +62,8 @@ class SENTENCE:
71
  'tags': tags,
72
  'position_in_sentence': self.original.find(original_token1.text),
73
  'not_named_entity': not_ner,
74
- 'frequency_in_text': 0
 
75
  }
76
  self.target_words.append(target_word)
77
  else: # if token is just a spacy.nlp token
@@ -89,10 +81,11 @@ class SENTENCE:
89
  'position_in_sentence': self.original.find(token.text),
90
  'not_named_entity': True if token.ent_type == 0 else False,
91
  'frequency_in_text': frequency_dict.get(token.lemma_, 1),
 
92
  }
93
  self.target_words.append(target_word)
94
 
95
- def search_user_target_words(self, user_target_words: set = None, frequency_dict: dict = None):
96
  for _utw in user_target_words:
97
  if _utw in self.original:
98
  parse_utw = nlp(_utw)
@@ -118,19 +111,20 @@ class SENTENCE:
118
  'tags': user_target_word_tags,
119
  'position_in_sentence': self.original.find(_utw),
120
  'not_named_entity': not_ner,
121
- 'frequency_in_text': frequency_dict.get(user_target_word_lemma, 1)
 
122
  }
123
  self.target_words.append(target_word)
124
 
125
  def search_target_words(self, target_words_automatic_mode: bool, target_minimum,
126
  user_target_words: set = None,
127
- frequency_dict: dict = None):
128
  if target_words_automatic_mode:
129
  self.search_target_words_automatically(target_minimum=target_minimum,
130
- frequency_dict=frequency_dict)
131
  else:
132
  self.search_user_target_words(user_target_words=user_target_words,
133
- frequency_dict=frequency_dict)
134
 
135
  def filter_target_words(self, target_words_automatic_mode):
136
  c_position = 0
@@ -170,17 +164,11 @@ class TASK:
170
  def __repr__(self):
171
  return '\n'.join([f'{key}\t=\t{value}' for key, value in self.__dict__.items()])
172
 
173
- def attach_distractors_to_target_word(self, model, global_distractors, distractor_minimum,
174
- level_name, max_frequency):
175
  pos = self.pos[0] if self.pos[0] == 'phrase' else self.pos[1]
176
- # distractors_full_text = get_distractors_from_model_bert(model=model, lemma=self.lemma, pos=pos,
177
- # gender=self.gender, level_name=level_name,
178
- # text_with_masked_task=self.text_with_masked_task,
179
- # global_distractors=global_distractors,
180
- # distractor_minimum=distractor_minimum,
181
- # max_num_distractors=self.max_num_distractors)
182
- distractors_sentence = get_distractors_from_model_bert(model=model, lemma=self.lemma, pos=pos,
183
- gender=self.gender, level_name=level_name,
184
  text_with_masked_task=self.masked_sentence,
185
  global_distractors=global_distractors,
186
  distractor_minimum=distractor_minimum,
@@ -201,17 +189,6 @@ class TASK:
201
  inflected = make_inflection(text=distractor_lemma, pos=self.pos[1], tags=self.tags)
202
  if inflected is not None:
203
  inflected_distractors.append(inflected)
204
- else:
205
- new_tags = copy.deepcopy(self.tags)
206
- if 'NOUN' in self.tags and 'inan' in self.tags:
207
- new_tags.discard('inan')
208
- new_tags.add('anim')
209
- elif 'NOUN' in self.tags and 'anim' in self.tags:
210
- new_tags.discard('anim')
211
- new_tags.add('inan')
212
- inflected = make_inflection(text=distractor_lemma, pos=self.pos[1], tags=new_tags)
213
- if inflected is not None:
214
- inflected_distractors.append(inflected)
215
  num_distractors = min(4, self.max_num_distractors) if self.max_num_distractors >= 4 \
216
  else self.max_num_distractors
217
  if len(inflected_distractors) < num_distractors:
 
 
1
  import string
2
+ from random import random, sample
3
+ from utilities_language_general.rus_constants import nlp, PHRASES, BAD_USER_TARGET_WORDS
4
+ from utilities_language_general.rus_utils import get_tags, define_gender, make_inflection, check_token_bert, get_distractors_from_model_bert

5
 
6
 
7
  class SENTENCE:
 
39
  if not previous_was_phrase:
40
  self.sentence_phrases.append(self.sentence_lemma_pos[i][1])
41
  previous_was_phrase = False
42
+ self.sentence_phrases.append(self.sentence_lemma_pos[-1][1])
43
 
44
+ def search_target_words_automatically(self, target_minimum: set, frequency_dict: dict = None, summary: list=None):
45
  for token in self.sentence_phrases:
 
 
46
  if isinstance(token, list): # if token is a phrase
47
  original_token1 = token[1]['original_token1']
48
  original_token2 = token[1]['original_token2']
 
62
  'tags': tags,
63
  'position_in_sentence': self.original.find(original_token1.text),
64
  'not_named_entity': not_ner,
65
+ 'frequency_in_text': 0,
66
+ 'in_summary': self.original in summary
67
  }
68
  self.target_words.append(target_word)
69
  else: # if token is just a spacy.nlp token
 
81
  'position_in_sentence': self.original.find(token.text),
82
  'not_named_entity': True if token.ent_type == 0 else False,
83
  'frequency_in_text': frequency_dict.get(token.lemma_, 1),
84
+ 'in_summary': self.original in summary
85
  }
86
  self.target_words.append(target_word)
87
 
88
+ def search_user_target_words(self, user_target_words: set = None, frequency_dict: dict = None, summary: list=None):
89
  for _utw in user_target_words:
90
  if _utw in self.original:
91
  parse_utw = nlp(_utw)
 
111
  'tags': user_target_word_tags,
112
  'position_in_sentence': self.original.find(_utw),
113
  'not_named_entity': not_ner,
114
+ 'frequency_in_text': frequency_dict.get(user_target_word_lemma, 1),
115
+ 'in_summary': self.original in summary
116
  }
117
  self.target_words.append(target_word)
118
 
119
  def search_target_words(self, target_words_automatic_mode: bool, target_minimum,
120
  user_target_words: set = None,
121
+ frequency_dict: dict = None, summary: list=None):
122
  if target_words_automatic_mode:
123
  self.search_target_words_automatically(target_minimum=target_minimum,
124
+ frequency_dict=frequency_dict, summary=summary)
125
  else:
126
  self.search_user_target_words(user_target_words=user_target_words,
127
+ frequency_dict=frequency_dict, summary=summary)
128
 
129
  def filter_target_words(self, target_words_automatic_mode):
130
  c_position = 0
 
164
  def __repr__(self):
165
  return '\n'.join([f'{key}\t=\t{value}' for key, value in self.__dict__.items()])
166
 
167
+ def attach_distractors_to_target_word(self, model, scaler, classifier, pos_dict,
168
+ global_distractors, distractor_minimum, level_name, max_frequency):
169
  pos = self.pos[0] if self.pos[0] == 'phrase' else self.pos[1]
170
+ distractors_sentence = get_distractors_from_model_bert(model=model, scaler=scaler, classifier=classifier, pos_dict=pos_dict,
171
+ level_name=level_name, lemma=self.lemma, pos=pos, gender=self.gender,

172
  text_with_masked_task=self.masked_sentence,
173
  global_distractors=global_distractors,
174
  distractor_minimum=distractor_minimum,
 
189
  inflected = make_inflection(text=distractor_lemma, pos=self.pos[1], tags=self.tags)
190
  if inflected is not None:
191
  inflected_distractors.append(inflected)
 
 
 
 
 
 
 
 
 
 
 
192
  num_distractors = min(4, self.max_num_distractors) if self.max_num_distractors >= 4 \
193
  else self.max_num_distractors
194
  if len(inflected_distractors) < num_distractors:
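SENTENCE now records an in_summary flag computed as `self.original in summary`. With bert-extractive-summarizer, Summarizer() returns the summary as a single string by default, so this is a substring test; if the summary were a list of sentences it would be exact element membership instead. A sketch of the difference:

```python
sentence = 'Это важное предложение.'

summary_as_text = 'Вступление. Это важное предложение.'       # str (default)
summary_as_list = ['Вступление.', 'Это важное предложение.']  # list alternative

print(sentence in summary_as_text)              # True: substring containment
print(sentence in summary_as_list)              # True: exact element match
print('важное предложение' in summary_as_list)  # False: no element equals it
```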
utilities_language_general/rus_constants.py CHANGED
@@ -3,7 +3,9 @@ import spacy
  import gensim
  import pymorphy2
  import streamlit as st
  from transformers import pipeline


  @st.cache_resource
@@ -13,24 +15,53 @@ def load_morph():


  @st.cache_resource
- def load_w2v(model_path):
-     _w2v_model = gensim.models.KeyedVectors.load_word2vec_format(model_path, binary=True)
-     return _w2v_model


  @st.cache_resource
  def load_spacy():
-     _nlp = spacy.load('ru_core_news_lg')
      return _nlp


  @st.cache_resource
  def load_bert():
-     return pipeline("fill-mask", model="a-v-white/ruBert-base-finetuned-russian-moshkov-child-corpus-pro")


  nlp = load_spacy()
  morph = load_morph()
  w2v_model1_path = r'model1.gz'
  w2v_model2_path = r'model2.gz'

@@ -47,24 +78,49 @@ b1_path, b1_target_set = r'language_data/B1_MINIMUM.txt', set()
  b2_path, b2_target_set = r'language_data/B2_MINIMUM.txt', set()
  c1_path, c1_target_set = r'language_data/C1_MINIMUM.txt', set()
  c2_path, c2_target_set = r'language_data/C2_MINIMUM.txt', set()
- minimums_paths = (a1_path, a2_path, b1_path, b2_path)
  minimums_sets = (a1_target_set, a2_target_set, b1_target_set, b2_target_set, c1_target_set, c2_target_set)
  for i in range(len(minimums_paths)):
      with open(minimums_paths[i], 'r', encoding='utf-8') as read_file:
          for line in read_file:
              minimums_sets[i].add(line.strip())

- a1_distractor_set = a1_target_set
- a2_distractor_set = a2_target_set.union(a1_target_set)
- b1_distractor_set = b1_target_set.union(a2_target_set)
- b2_distractor_set = b2_target_set.union(b1_target_set)
- c1_distractor_set = c1_target_set.union(b2_target_set)
- c2_distractor_set = c2_target_set.union(c1_target_set)

  with open('language_data/phrases.json', 'r', encoding='utf-8') as f:
      PHRASES = set(json.load(f)['PHRASES'])

- SIMILARITY_VALUES_w2v = {'A1': 1.0, 'A2': 1.0, 'B1': 1.0, 'B2': 1.0, 'C1': 1.0, 'C2': 1.0, 'Без уровня': 1.0}
- SIMILARITY_VALUES_bert = {'A1': 1.0, 'A2': 1.0, 'B1': 1.0, 'B2': 1.0, 'C1': 1.0, 'C2': 1.0, 'Без уровня': 1.0}
-
  BAD_USER_TARGET_WORDS = []
  import gensim
  import pymorphy2
  import streamlit as st
+ from pickle import load
  from transformers import pipeline
+ from summarizer import Summarizer


  @st.cache_resource


  @st.cache_resource
+ def load_w2v(model):
+     with st.spinner('Загружаю языковую модель'):
+         if model == 'model1':
+             model_path = r'language_data/model1.gz'
+         else:
+             model_path = r'language_data/model2.gz'
+         return gensim.models.KeyedVectors.load_word2vec_format(model_path, binary=True)


  @st.cache_resource
  def load_spacy():
+     with st.spinner('Загружаю морфо-синтаксический парсер'):
+         _nlp = spacy.load('ru_core_news_lg')
      return _nlp


  @st.cache_resource
  def load_bert():
+     with st.spinner('Загружаю языковую модель'):
+         _pipeline = pipeline(task="fill-mask", model="a-v-white/bert-base-spanish-wwm-cased-finetuned-literature-pro")
+     return _pipeline


+ @st.cache_resource
+ def load_summarizer():
+     return Summarizer()
+
+ @st.cache_resource
+ def load_classifiers(model):
+     if model == 'model1':
+         scaler_path = 'language_data/model1_no_wn_minmaxscaler.pickle'
+         classifier_path = 'language_data/model1_no_wn_catboost_classifier.pickle'
+     elif model == 'model2':
+         scaler_path = 'language_data/model2_no_wn_minmaxscaler.pickle'
+         classifier_path = 'language_data/model2_no_wn_catboost_classifier.pickle'
+     else:
+         scaler_path = 'language_data/model3_no_wn_minmaxscaler.pickle'
+         classifier_path = 'language_data/model3_no_wn_catboost_classifier.pickle'
+     with (open(scaler_path, 'rb') as f1, open(classifier_path, 'rb') as f2, open('language_data/pos_dict.pickle', 'rb') as f3):
+         scaler = load(f1)
+         classifier = load(f2)
+         pos_dict = load(f3)
+     return pos_dict, scaler, classifier
+
  nlp = load_spacy()
  morph = load_morph()
+ summarization = load_summarizer()
  w2v_model1_path = r'model1.gz'
  w2v_model2_path = r'model2.gz'

  b2_path, b2_target_set = r'language_data/B2_MINIMUM.txt', set()
  c1_path, c1_target_set = r'language_data/C1_MINIMUM.txt', set()
  c2_path, c2_target_set = r'language_data/C2_MINIMUM.txt', set()
+
+ minimums_paths = (a1_path, a2_path, b1_path, b2_path, c1_path, c2_path)
  minimums_sets = (a1_target_set, a2_target_set, b1_target_set, b2_target_set, c1_target_set, c2_target_set)
  for i in range(len(minimums_paths)):
      with open(minimums_paths[i], 'r', encoding='utf-8') as read_file:
          for line in read_file:
              minimums_sets[i].add(line.strip())

+ MINIMUM_SETS = {
+     'A1': (a1_target_set, a1_target_set),
+     'A2': (a2_target_set, a2_target_set.union(a1_target_set)),
+     'B1': (b1_target_set, b1_target_set.union(a2_target_set)),
+     'B2': (b2_target_set, b2_target_set.union(b1_target_set)),
+     'C1': (c1_target_set, c1_target_set.union(b2_target_set)),
+     'C2': (c2_target_set, c2_target_set.union(c1_target_set)),
+     'Без уровня': (None, None)
+ }
+
+ LEVEL_NUMBERS = {'A1': 1, 'A2': 2, 'B1': 3, 'B2': 4, 'C1': 5, 'C2': 6}

  with open('language_data/phrases.json', 'r', encoding='utf-8') as f:
      PHRASES = set(json.load(f)['PHRASES'])

  BAD_USER_TARGET_WORDS = []
+
+
+ COMBINE_POS = {
+     'simple':
+         {
+             'A1': {'VERB': ['AUX']},
+             'A2': {'VERB': ['AUX']},
+             'B1': {'VERB': ['AUX']},
+             'B2': {'VERB': ['AUX']},
+             'C1': {'VERB': ['AUX']},
+             'C2': {'VERB': ['AUX']},
+         },
+     'phrase':
+         {
+             'A1': {'VERB': ['AUX']},
+             'A2': {'VERB': ['AUX']},
+             'B1': {'VERB': ['AUX']},
+             'B2': {'VERB': ['AUX']},
+             'C1': {'VERB': ['AUX']},
+             'C2': {'VERB': ['AUX']},
+         },
+ }
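Note (not part of the commit): a minimal sketch of how the new constants are meant to be consumed; it assumes the pickled artifacts exist under language_data/.

    from utilities_language_general.rus_constants import MINIMUM_SETS, LEVEL_NUMBERS, load_classifiers

    # Lexical minimums for one CEFR level: (target set, cumulative distractor set)
    target_minimum, distractor_minimum = MINIMUM_SETS['B1']
    # Scaler, CatBoost classifier and POS encoding for the chosen w2v model
    pos_dict, scaler, classifier = load_classifiers('model1')
    print(LEVEL_NUMBERS['B1'])  # -> 3, the numeric level feature fed to the classifier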
utilities_language_general/rus_utils.py CHANGED
@@ -1,10 +1,8 @@
  from nltk import edit_distance
  from utilities.utils import answer_letter
- from utilities_language_general.rus_constants import nlp
- from utilities_language_general.rus_constants import morph
- from utilities_language_general.rus_constants import stop_list
- from utilities_language_general.rus_constants import SIMILARITY_VALUES_w2v
- from utilities_language_general.rus_constants import SIMILARITY_VALUES_bert


  def prepare_target_words(target_words):
@@ -73,7 +71,7 @@ def get_tags(token: str):
      return set(parts), method


- def make_inflection(text: str, pos: str or list, tags: set) -> str or None:
      if isinstance(pos, list):
          if set(pos).issubset({'NOUN', 'ADJ', 'PROPN'}):
              noun_adjective_phrase_tags = {'nomn', 'gent', 'datv', 'accs', 'ablt', 'loct', 'voct',
@@ -101,6 +99,75 @@ def make_inflection(text: str, pos: str or list, tags: set) -> str or None:
              return None
      else:
          word_form = morph.parse(text)[0].inflect(tags)
      return word_form.word if word_form is not None else None


@@ -184,8 +251,8 @@ def check_token_bert(token, current_minimum: set = None, stop_words=stop_list,
      return False


- def get_distractors_from_model(model, lemma: str, pos: str, gender: str or None, global_distractors: set,
-                                distractor_minimum: set, level_name: str, max_num_distractors: int,
                                 max_length_ratio=5, min_edit_distance_ratio=0.5):
      distractors = []
      query = lemma if '_' in lemma else f'{lemma}_{pos}'
@@ -205,12 +272,16 @@ def get_distractors_from_model(model, lemma: str, pos: str, gender: str or None,
          distractor_similarity = candidate[1]
          candidate_gender = define_gender(distractor_lemma)
          length_ratio = abs(len(lemma) - len(distractor_lemma))
          condition = ((distractor_pos == pos
-                       or (distractor_pos in ('VERB', 'ADJ', 'phrase') and pos in ('VERB', 'ADJ', 'phrase')))
                       and distractor_lemma != lemma
                       and len(distractors) < 100
-                      and distractor_similarity < SIMILARITY_VALUES_w2v[level_name]
-                      and candidate_gender == gender
                       and length_ratio <= max_length_ratio
                       and distractor_lemma not in global_distractors
                       and edit_distance(lemma, distractor_lemma) / ((len(lemma) + len(distractor_lemma)) / 2) >
@@ -228,16 +299,22 @@ def get_distractors_from_model(model, lemma: str, pos: str, gender: str or None,
              continue
          d1_lemma, d1_pos, d2_lemma, d2_pos = candidate[0].split('_')
          distractor_lemma = f'{d1_lemma}_{d2_lemma}'
          distractor_similarity = candidate[1]
          condition = (((d1_pos == pos or d2_pos == pos)
                        or (d1_pos in ('VERB', 'AUX', 'SCONJ', 'ADP')
                            and pos in ('phrase', 'VERB', 'AUX', 'SCONJ', 'ADP'))
                        or (d2_pos in ('VERB', 'AUX', 'SCONJ', 'ADP')
                            and pos in ('phrase', 'VERB', 'AUX', 'SCONJ', 'ADP')))
                       and candidate[0] != lemma
                       and distractor_lemma != lemma
                       and len(distractors) < 100
-                      and distractor_similarity < SIMILARITY_VALUES_w2v[level_name]
                       and distractor_lemma not in global_distractors)
          if condition:
              if distractor_minimum is not None:
@@ -255,8 +332,8 @@ def get_distractors_from_model(model, lemma: str, pos: str, gender: str or None,
      return None


- def get_distractors_from_model_bert(model, text_with_masked_task: str, lemma: str, pos: str, gender: str or None,
-                                     global_distractors: set, distractor_minimum: set, level_name: str,
                                      max_num_distractors: int, max_length_ratio=5, min_edit_distance_ratio=0.5):
      _distractors = []
      try:
@@ -283,12 +360,16 @@ def get_distractors_from_model_bert(model, text_with_masked_task: str, lemma: st
          distractor_similarity = candidate_distractor[1]
          candidate_gender = define_gender(distractor_lemma)
          length_ratio = abs(len(lemma) - len(distractor_lemma))
          if (((distractor_pos == pos)
-              or (pos in ('VERB', 'ADJ', 'phrase') and distractor_pos in ('VERB', 'ADJ', 'phrase')))
              and distractor_lemma != lemma
              and (len(_distractors) < max_num_distractors + 10)
-             and (distractor_similarity < SIMILARITY_VALUES_bert[level_name])
-             and (candidate_gender == gender)
              and (length_ratio <= max_length_ratio)  # May be changed if case of phrases
              and (distractor_lemma not in global_distractors)
              and (edit_distance(lemma, distractor_lemma)  # May be changed if case of phrases
+ from random import randint
  from nltk import edit_distance
  from utilities.utils import answer_letter
+ from utilities_language_general.similarity_measures import make_decision
+ from utilities_language_general.rus_constants import nlp, morph, stop_list, COMBINE_POS


  def prepare_target_words(target_words):

      return set(parts), method


+ def make_inflection(text: str, pos: str or list, tags: set, level: str) -> str or None:
      if isinstance(pos, list):
          if set(pos).issubset({'NOUN', 'ADJ', 'PROPN'}):
              noun_adjective_phrase_tags = {'nomn', 'gent', 'datv', 'accs', 'ablt', 'loct', 'voct',

              return None
      else:
          word_form = morph.parse(text)[0].inflect(tags)
+         rnd = randint(0, 5)
+         if pos == 'VERB' and 'impf' in tags and level in ('A1', 'A2'):
+             tags.discard('impf')
+             tags.add('perf')
+             word_form = morph.parse(text)[0].inflect(tags)
+             if word_form is not None:
+                 return word_form.word
+         elif pos == 'VERB' and 'perf' in tags and level in ('A1', 'A2'):
+             tags.discard('perf')
+             tags.add('impf')
+             word_form = morph.parse(text)[0].inflect(tags)
+             if word_form is not None:
+                 return word_form.word
+         if pos == 'NOUN' and level in ('A1', 'A2'):
+             if word_form is None:
+                 if 'inan' in tags:
+                     tags.discard('inan')
+                     tags.add('anim')
+                 elif 'anim' in tags:
+                     tags.discard('anim')
+                     tags.add('inan')
+         if pos in ('NOUN', 'ADJ') and level in ('A1', 'A2') and rnd == 0:
+             if 'sing' in tags:
+                 tags.discard('sing')
+                 tags.add('plur')
+             elif 'plur' in tags:
+                 tags.discard('plur')
+                 tags.add('sing')
+         elif pos in ('NOUN', 'ADJ') and level in ('A1', 'A2') and rnd == 1:
+             if 'masc' in tags:
+                 tags.remove('masc')
+                 tags.add('femn')
+                 word_form = morph.parse(text)[0].inflect(tags)
+                 if word_form is not None:
+                     return word_form.word
+                 else:
+                     tags.remove('femn')
+                     tags.add('neut')
+                     word_form = morph.parse(text)[0].inflect(tags)
+                     if word_form is not None:
+                         return word_form.word
+             if 'femn' in tags:
+                 tags.remove('femn')
+                 tags.add('masc')
+                 word_form = morph.parse(text)[0].inflect(tags)
+                 if word_form is not None:
+                     return word_form.word
+                 else:
+                     tags.remove('masc')
+                     tags.add('neut')
+                     word_form = morph.parse(text)[0].inflect(tags)
+                     if word_form is not None:
+                         return word_form.word
+             if 'neut' in tags:
+                 tags.remove('neut')
+                 tags.add('masc')
+                 word_form = morph.parse(text)[0].inflect(tags)
+                 if word_form is not None:
+                     return word_form.word
+                 else:
+                     tags.remove('masc')
+                     tags.add('femn')
+                     word_form = morph.parse(text)[0].inflect(tags)
+                     if word_form is not None:
+                         return word_form.word
+         else:
+             word_form = morph.parse(text)[0].inflect(tags)
+             return word_form.word if word_form is not None else None
+         word_form = morph.parse(text)[0].inflect(tags)
      return word_form.word if word_form is not None else None


      return False


+ def get_distractors_from_model(doc, model, scaler, classifier, pos_dict: dict, target_text: str, lemma: str, pos: str, gender: str,
+                                lemma_index: int, global_distractors: set, distractor_minimum: set, level_name: str, max_num_distractors: int,
                                 max_length_ratio=5, min_edit_distance_ratio=0.5):
      distractors = []
      query = lemma if '_' in lemma else f'{lemma}_{pos}'

          distractor_similarity = candidate[1]
          candidate_gender = define_gender(distractor_lemma)
          length_ratio = abs(len(lemma) - len(distractor_lemma))
+         decision = make_decision(doc, model_type='w2v', model=model, scaler=scaler, classifier=classifier, pos_dict=pos_dict,
+                                  level=level_name, target_lemma=query, target_text=target_text, target_pos=pos, target_position=lemma_index,
+                                  substitute_lemma=distractor_lemma, substitute_pos=distractor_pos)
          condition = ((distractor_pos == pos
+                       or (COMBINE_POS['simple'][level_name].get(pos) is not None and COMBINE_POS['simple'][level_name].get(distractor_pos) is not None
+                           and distractor_pos in COMBINE_POS['simple'][level_name][pos] and pos in COMBINE_POS['simple'][level_name][distractor_pos]))
+                      and decision
                       and distractor_lemma != lemma
                       and len(distractors) < 100
+                      and (candidate_gender == gender and level_name in ('B1', 'B2', 'C1', 'C2'))
                       and length_ratio <= max_length_ratio
                       and distractor_lemma not in global_distractors
                       and edit_distance(lemma, distractor_lemma) / ((len(lemma) + len(distractor_lemma)) / 2) >

              continue
          d1_lemma, d1_pos, d2_lemma, d2_pos = candidate[0].split('_')
          distractor_lemma = f'{d1_lemma}_{d2_lemma}'
+         d_pos = f'{d1_pos}_{d2_pos}'
          distractor_similarity = candidate[1]
+         decision = make_decision(doc, model_type='w2v', model=model, scaler=scaler, classifier=classifier, pos_dict=pos_dict,
+                                  level=level_name, target_lemma=query, target_text=target_text, target_pos=pos, target_position=lemma_index,
+                                  substitute_lemma=candidate[0], substitute_pos=d_pos)
          condition = (((d1_pos == pos or d2_pos == pos)
+                       or (COMBINE_POS['simple'][level_name].get(pos) is not None and COMBINE_POS['simple'][level_name].get(distractor_pos) is not None
+                           and distractor_pos in COMBINE_POS['simple'][level_name][pos] and pos in COMBINE_POS['simple'][level_name][distractor_pos])
                        or (d1_pos in ('VERB', 'AUX', 'SCONJ', 'ADP')
                            and pos in ('phrase', 'VERB', 'AUX', 'SCONJ', 'ADP'))
                        or (d2_pos in ('VERB', 'AUX', 'SCONJ', 'ADP')
                            and pos in ('phrase', 'VERB', 'AUX', 'SCONJ', 'ADP')))
+                      and decision
                       and candidate[0] != lemma
                       and distractor_lemma != lemma
                       and len(distractors) < 100
                       and distractor_lemma not in global_distractors)
          if condition:
              if distractor_minimum is not None:

      return None


+ def get_distractors_from_model_bert(model, scaler, classifier, pos_dict: dict, level_name: str, lemma: str, pos: str, gender: str,
+                                     text_with_masked_task: str, global_distractors: set, distractor_minimum: set,
                                      max_num_distractors: int, max_length_ratio=5, min_edit_distance_ratio=0.5):
      _distractors = []
      try:

          distractor_similarity = candidate_distractor[1]
          candidate_gender = define_gender(distractor_lemma)
          length_ratio = abs(len(lemma) - len(distractor_lemma))
+         decision = make_decision(doc=None, model_type='bert', scaler=scaler, classifier=classifier, pos_dict=pos_dict, level=level_name,
+                                  target_lemma=lemma, target_text=None, target_pos=pos, target_position=None,
+                                  substitute_lemma=distractor_lemma, substitute_pos=distractor_pos, bert_score=distractor_similarity)
          if (((distractor_pos == pos)
+              or (COMBINE_POS['phrase'][level_name].get(pos) is not None and COMBINE_POS['phrase'][level_name].get(distractor_pos) is not None
+                  and distractor_pos in COMBINE_POS['phrase'][level_name][pos] and pos in COMBINE_POS['phrase'][level_name][distractor_pos]))
+             and decision
              and distractor_lemma != lemma
              and (len(_distractors) < max_num_distractors + 10)
+             and (candidate_gender == gender and level_name in ('B1', 'B2', 'C1', 'C2'))
              and (length_ratio <= max_length_ratio)  # May be changed if case of phrases
              and (distractor_lemma not in global_distractors)
              and (edit_distance(lemma, distractor_lemma)  # May be changed if case of phrases
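Note (not part of the commit): an illustrative call of the reworked make_inflection; the lemma and tag set below are hypothetical. At levels A1-A2 the function may now swap aspect, number or gender so that a distractor form can still be realized.

    from utilities_language_general.rus_utils import make_inflection

    # Hypothetical lemma/tags: at A1 a perfective tag set may be retried as imperfective
    form = make_inflection(text='читать', pos='VERB', tags={'perf', 'sing', '3per'}, level='A1')
    print(form)  # an inflected surface form, or None if pymorphy2 cannot realize the tags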
utilities_language_general/similarity_measures.py ADDED
@@ -0,0 +1,255 @@
+ import numpy as np
+ from math import pow
+ from nltk.corpus import wordnet as wn
+ from utilities_language_general.rus_constants import nlp, PHRASES, LEVEL_NUMBERS
+
+ def eucledian_distance(x, y):
+     return np.sqrt(np.sum((x - y) ** 2))
+
+ def cosine_similarity(x, y):
+     out = np.dot(x, y) / (np.sqrt(np.dot(x, x)) * np.sqrt(np.dot(y, y)))
+     if str(out) != 'nan':
+         return out
+     return None
+
+ def get_vector_for_token(model, token):
+     vector = None
+
+     splitted = token.split('_')
+     token_list = [f'{splitted[i]}_{splitted[i+1]}' for i in range(len(splitted)-1)]
+
+     if model.has_index_for(token):
+         vector = model.get_vector(token)
+     else:
+         try:
+             vector = model.get_mean_vector(token_list)
+         except ValueError:
+             return None
+     return vector
+
+ def compute_metric(func, vector1, vector2):
+     if vector1 is not None and vector2 is not None:
+         return func(vector1, vector2)
+     else:
+         return None
+
+ def compute_positive_cos(x, y):
+     cos_sim = cosine_similarity(x, y)
+     if cos_sim:
+         return (cos_sim + 1) / 2
+     else:
+         return None
+
+ def addition_metric(substitute, target, context):
+     substitute_target_cos = compute_metric(cosine_similarity, substitute, target)
+     if not substitute_target_cos:
+         return None
+     if not context:
+         return None
+
+     context_vectors = []
+     for context_tk in context:
+         substitute_context_cos = compute_metric(cosine_similarity, substitute, context_tk)
+         if substitute_context_cos:
+             context_vectors.append(substitute_context_cos)
+     sum_of_context_vectors = np.sum(context_vectors)
+
+     metric = (substitute_target_cos + sum_of_context_vectors) / (len(context) + 1)
+     return metric
+
+ def balanced_addition_metric(substitute, target, context):
+     substitute_target_cos = compute_metric(cosine_similarity, substitute, target)
+     if not substitute_target_cos:
+         return None
+     if not context:
+         return None
+
+     context_vectors = []
+     for context_tk in context:
+         substitute_context_cos = compute_metric(cosine_similarity, substitute, context_tk)
+         if substitute_context_cos:
+             context_vectors.append(substitute_context_cos)
+     sum_of_context_vectors = np.sum(context_vectors)
+
+     context_len = len(context)
+     metric = (context_len * substitute_target_cos + sum_of_context_vectors) / (2 * context_len)
+     return metric
+
+ def multiplication_metric(substitute, target, context):
+     substitute_target_cos = compute_metric(compute_positive_cos, substitute, target)
+     if not substitute_target_cos:
+         return None
+     if not context:
+         return None
+
+     context_vectors = []
+     for context_tk in context:
+         substitute_context_positive_cos = compute_metric(compute_positive_cos, substitute, context_tk)
+         if substitute_context_positive_cos:
+             context_vectors.append(substitute_context_positive_cos)
+     prod_of_context_vectors = np.prod(context_vectors)
+     try:
+         metric = pow((substitute_target_cos + prod_of_context_vectors), 1 / (len(context) + 1))
+     except ValueError:
+         return None
+     return metric
+
+ def balanced_multiplication_metric(substitute, target, context):
+     substitute_target_cos = compute_metric(compute_positive_cos, substitute, target)
+     if not substitute_target_cos:
+         return None
+     if not context:
+         return None
+
+     context_vectors = []
+     for context_tk in context:
+         substitute_context_positive_cos = compute_metric(compute_positive_cos, substitute, context_tk)
+         if substitute_context_positive_cos:
+             context_vectors.append(substitute_context_positive_cos)
+     prod_of_context_vectors = np.prod(context_vectors)
+
+     context_len = len(context)
+     try:
+         metric = pow((pow(substitute_target_cos, context_len) + prod_of_context_vectors), 1 / (2 * context_len))
+     except ValueError:
+         return None
+     return metric
+
+ def bind_phrases(context_list):
+     context = []
+     previous_was_phrase = False
+     for i in range(len(context_list)-1):
+         phrase_candidate = f'{context_list[i]}_{context_list[i+1]}'
+         if phrase_candidate in PHRASES and not previous_was_phrase:
+             context.append(phrase_candidate)
+             previous_was_phrase = True
+         else:
+             if not previous_was_phrase:
+                 context.append(context_list[i])
+             previous_was_phrase = False
+     if context_list:
+         if not context:
+             context.append(context_list[-1])
+         elif not context_list[-1] in context[-1]:
+             context.append(context_list[-1])
+     return context
+
+ def get_context_windows(doc, target_text, window_size):
+     sentence_str = doc.text
+     sentence_masked = sentence_str.lower().replace(target_text.lower().strip(), ' [MASK] ')
+     alpha_tokens_lemma_pos = [f'{tk.lemma_.lower()}_{tk.pos_}' for tk in nlp(sentence_masked) if tk.text.isalpha()]
+     alpha_tokens_lemma_pos_no_stop = [f'{tk.lemma_.lower()}_{tk.pos_}' for tk in nlp(sentence_masked) if tk.text.isalpha() and not tk.is_stop]
+     try:
+         mask_token_index = alpha_tokens_lemma_pos.index('mask_PROPN')
+         mask_token_index_no_stop = alpha_tokens_lemma_pos_no_stop.index('mask_PROPN')
+     except ValueError:
+         return None
+
+     left_border = max(mask_token_index-window_size, 0)
+     right_border = min(mask_token_index+window_size, len(alpha_tokens_lemma_pos))
+     l_context = alpha_tokens_lemma_pos[left_border:mask_token_index]
+     r_context = alpha_tokens_lemma_pos[mask_token_index+1:right_border+1]
+
+     left_border_no_stop = max(mask_token_index_no_stop-window_size, 0)
+     right_border_no_stop = min(mask_token_index_no_stop+window_size, len(alpha_tokens_lemma_pos_no_stop))
+     l_context_no_stop = alpha_tokens_lemma_pos_no_stop[left_border_no_stop:mask_token_index_no_stop]
+     r_context_no_stop = alpha_tokens_lemma_pos_no_stop[mask_token_index_no_stop+1:right_border_no_stop+1]
+     return (bind_phrases(l_context) + bind_phrases(r_context), bind_phrases(l_context_no_stop) + bind_phrases(r_context_no_stop))
+
+ def get_context_linked_words(doc, target_position, target_text):
+     answer_list = target_text.split(' ')
+     context_words = []
+     for tk in doc:
+         if tk.text.isalpha():
+             if (tk.text in answer_list and abs(target_position - tk.idx) <= sum([len(t) for t in answer_list])):
+                 context_words.extend([t for t in tk.subtree if t.text.isalpha() and not t.is_stop])
+                 context_words.extend([t for t in tk.children if t.text.isalpha() and not t.is_stop])
+                 context_words.extend([t for t in tk.ancestors if t.text.isalpha() and not t.is_stop])
+     context_words = [(tk, f'{tk.lemma_}_{tk.pos_}') for tk in sorted(set(context_words), key=lambda tk: tk.i) if tk.text not in answer_list]
+     context = []
+     previous_was_phrase = False
+     for i in range(len(context_words)-1):
+         phrase_candidate = f'{context_words[i][1]}_{context_words[i+1][1]}'
+         if phrase_candidate in PHRASES and not previous_was_phrase and abs(context_words[i][0].i - context_words[i+1][0].i) <= 1:
+             context.append(phrase_candidate)
+             previous_was_phrase = True
+         else:
+             if not previous_was_phrase:
+                 context.append(context_words[i][1])
+     if context and context_words:
+         if not context_words[-1][1] in context[-1]:
+             context.append(context_words[-1][1])
+     elif context_words:
+         context.append(context_words[-1][1])
+     return context
+
+
+ def compute_all_necessary_metrics(target_lemma, target_text, target_position, substitute_lemma, doc, model_type: str, model=None):
+
+     target_vector = get_vector_for_token(model, target_lemma)
+     substitute_vector = get_vector_for_token(model, substitute_lemma)
+
+     cosimilarity = compute_metric(cosine_similarity, substitute_vector, target_vector)
+     eucledian_similarity = compute_metric(eucledian_distance, substitute_vector, target_vector)
+
+     context_window3, context_window3_no_stop = get_context_windows(doc=doc, target_text=target_text, window_size=3)
+     context_window5, context_window5_no_stop = get_context_windows(doc=doc, target_text=target_text, window_size=5)
+     context_window_synt = get_context_linked_words(doc, target_position, target_text)
+
+     context_window3 = [get_vector_for_token(model, token) for token in context_window3]
+     context_window3_no_stop = [get_vector_for_token(model, token) for token in context_window3_no_stop]
+     context_window5 = [get_vector_for_token(model, token) for token in context_window5]
+     context_window5_no_stop = [get_vector_for_token(model, token) for token in context_window5_no_stop]
+     context_window_synt = [get_vector_for_token(model, token) for token in context_window_synt]
+
+     add_metric_window3 = addition_metric(target_vector, substitute_vector, context_window3)
+     bal_add_metric_window3 = balanced_addition_metric(target_vector, substitute_vector, context_window3)
+     add_metric_window3_no_stop = addition_metric(target_vector, substitute_vector, context_window3_no_stop)
+     bal_add_metric_window3_no_stop = balanced_addition_metric(target_vector, substitute_vector, context_window3_no_stop)
+
+     mult_metric_window3 = multiplication_metric(target_vector, substitute_vector, context_window3)
+     bal_mult_metric_window3 = balanced_multiplication_metric(target_vector, substitute_vector, context_window3)
+     mult_metric_window3_no_stop = multiplication_metric(target_vector, substitute_vector, context_window3_no_stop)
+     bal_mult_metric_window3_no_stop = balanced_multiplication_metric(target_vector, substitute_vector, context_window3_no_stop)
+
+     add_metric_window5 = addition_metric(target_vector, substitute_vector, context_window5)
+     bal_add_metric_window5 = balanced_addition_metric(target_vector, substitute_vector, context_window5)
+     add_metric_window5_no_stop = addition_metric(target_vector, substitute_vector, context_window5_no_stop)
+     bal_add_metric_window5_no_stop = balanced_addition_metric(target_vector, substitute_vector, context_window5_no_stop)
+
+     mult_metric_window5 = multiplication_metric(target_vector, substitute_vector, context_window5)
+     bal_mult_metric_window5 = balanced_multiplication_metric(target_vector, substitute_vector, context_window5)
+     mult_metric_window5_no_stop = multiplication_metric(target_vector, substitute_vector, context_window5_no_stop)
+     bal_mult_metric_window5_no_stop = balanced_multiplication_metric(target_vector, substitute_vector, context_window5_no_stop)
+
+     add_metric_synt = addition_metric(target_vector, substitute_vector, context_window_synt)
+     bal_add_metric_synt = balanced_addition_metric(target_vector, substitute_vector, context_window_synt)
+
+     mult_metric_synt = multiplication_metric(target_vector, substitute_vector, context_window_synt)
+     bal_mult_metric_synt = balanced_multiplication_metric(target_vector, substitute_vector, context_window_synt)
+
+     return (cosimilarity, eucledian_similarity,
+             add_metric_window3, bal_add_metric_window3,
+             mult_metric_window3, bal_mult_metric_window3,
+             add_metric_window3_no_stop, bal_add_metric_window3_no_stop,
+             mult_metric_window3_no_stop, bal_mult_metric_window3_no_stop,
+             add_metric_window5, bal_add_metric_window5,
+             mult_metric_window5, bal_mult_metric_window5,
+             add_metric_window5_no_stop, bal_add_metric_window5_no_stop,
+             mult_metric_window5_no_stop, bal_mult_metric_window5_no_stop,
+             add_metric_synt, bal_add_metric_synt,
+             mult_metric_synt, bal_mult_metric_synt)
+
+ def make_decision(doc, model_type, scaler, classifier, pos_dict, level, target_lemma, target_text, target_pos, target_position,
+                   substitute_lemma, substitute_pos, model=None, bert_score=None):
+     # return True
+     metrics = compute_all_necessary_metrics(target_lemma=target_lemma, target_text=target_text, target_position=target_position,
+                                             substitute_lemma=substitute_lemma, doc=doc, model_type=model_type, model=model)
+     target_multiword, substitute_multiword = target_lemma.count('_') > 2, substitute_lemma.count('_') > 2
+     data = [LEVEL_NUMBERS.get(level), pos_dict.get(target_pos), target_multiword, pos_dict.get(substitute_pos), substitute_multiword] + scaler.transform([metrics]).tolist()[0]
+     if model_type == 'bert':
+         data = [LEVEL_NUMBERS.get(level), pos_dict.get(target_pos), target_multiword, pos_dict.get(substitute_pos), substitute_multiword, bert_score]
+     predict = classifier.predict(data)
+     return bool(predict)
+
+
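Note (not part of the commit): a minimal sketch of the new classifier-based filter. The 22 context metrics from compute_all_necessary_metrics are scaled and, together with the level and POS features, passed to the pickled CatBoost classifier; the sentence, lemmas and character position below are purely illustrative.

    from utilities_language_general.rus_constants import nlp, load_w2v, load_classifiers
    from utilities_language_general.similarity_measures import make_decision

    doc = nlp('Мальчик читает интересную книгу.')
    w2v = load_w2v('model1')
    pos_dict, scaler, classifier = load_classifiers('model1')
    keep = make_decision(doc, model_type='w2v', model=w2v, scaler=scaler, classifier=classifier,
                         pos_dict=pos_dict, level='B1', target_lemma='книга_NOUN',
                         target_text='книгу', target_pos='NOUN', target_position=26,
                         substitute_lemma='журнал_NOUN', substitute_pos='NOUN')
    print(keep)  # True if the candidate distractor passes the filter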
utilities_language_w2v/rus_main_workflow_w2v.py CHANGED
@@ -1,31 +1,20 @@
1
  import datetime
2
  from io import StringIO
 
3
  from random import sample
4
  from collections import defaultdict
5
- from streamlit import progress as st_progress
6
- from streamlit.elements import WIDGETS as ST_WIDGETS
7
- from utilities_language_general.rus_constants import st
8
- from utilities_language_w2v.rus_sentence_w2v import TASK
9
- from utilities_language_general.rus_constants import load_w2v
10
- from utilities_language_general.rus_utils import prepare_tasks
11
- from utilities_language_w2v.rus_sentence_w2v import SENTENCE
12
- import utilities_language_general.rus_constants as esp_constants
13
- from utilities_language_general.rus_utils import prepare_target_words
14
- from utilities_language_general.rus_constants import w2v_model1_path
15
- from utilities_language_general.rus_constants import w2v_model2_path
16
- from utilities_language_general.rus_utils import compute_frequency_dict
17
  from streamlit.runtime.uploaded_file_manager import UploadedFile
18
- from utilities_language_general.rus_constants import BAD_USER_TARGET_WORDS
 
 
19
 
20
 
21
- def main_workflow_w2v(
22
- file: UploadedFile or None,
23
  text: str,
24
- logs: ST_WIDGETS,
25
- logs_d: ST_WIDGETS,
26
- progress: st_progress,
27
- progress_d: st_progress,
28
- progress_s: st_progress,
29
  level: str,
30
  tw_mode_automatic_mode: str,
31
  target_words: str,
@@ -56,19 +45,13 @@ def main_workflow_w2v(
56
  student_out, teacher_out, total_out, original_text
57
  """
58
  # Clear bad target_words each time
59
- global_bad_target_words = []
 
60
 
61
  # Define main global variables
62
- logs.write()
63
  GLOBAL_DISTRACTORS = set()
64
  MAX_FREQUENCY = 0
65
 
66
- # Define which model is used for distractor generation
67
- if model_name == 'Модель-1':
68
- mask_filler = load_w2v(w2v_model1_path)
69
- else:
70
- mask_filler = load_w2v(w2v_model2_path)
71
-
72
  # Get input text
73
  if file is not None:
74
  stringio = StringIO(file.getvalue().decode("utf-8"))
@@ -76,15 +59,15 @@ def main_workflow_w2v(
76
  elif text != '':
77
  current_text = text
78
  else:
79
- esp_constants.st.warning('Вы и текст не вставили, и файл не выбрали 😢')
80
  current_text = ''
81
- esp_constants.st.stop()
82
 
83
  # Process target words
84
  if tw_mode_automatic_mode == 'Самостоятельно':
85
  if target_words == '':
86
- esp_constants.st.warning('Вы не ввели целевые слова')
87
- esp_constants.st.stop()
88
  # Cannot make up paradigm, so only USER_TARGET_WORDS is used
89
  USER_TARGET_WORDS = prepare_target_words(target_words)
90
  tw_mode_automatic_mode = False
@@ -97,8 +80,8 @@ def main_workflow_w2v(
97
  current_text = current_text.replace('.', '. ').replace('. . .', '...').replace(' ', ' ').replace('…', '...') \
98
  .replace('…', '...').replace('—', '-').replace('\u2014', '-').replace('—', '-').replace('-\n', '') \
99
  .replace('\n', '%^&*')
100
- current_text_sentences = [sent.text.strip() for sent in esp_constants.nlp(current_text).sents]
101
- logs.success('Получили Ваш текст!')
102
  progress.progress(10)
103
 
104
  # Compute frequency dict
@@ -110,49 +93,55 @@ def main_workflow_w2v(
110
  if j < len(FREQ_DICT) * _frequency_barrier_percent:
111
  MAX_FREQUENCY = tp[1]
112
  MAX_FREQUENCY = 3 if MAX_FREQUENCY < 3 else MAX_FREQUENCY
113
- logs.success("Посчитали немного статистики!")
114
  progress.progress(15)
115
 
116
  # Choose necessary language minimum according to user's input
117
- if level == 'A1':
118
- target_minimum = esp_constants.a1_target_set
119
- distractor_minimum = esp_constants.a1_distractor_set
120
- elif level == 'A2':
121
- target_minimum = esp_constants.a2_target_set
122
- distractor_minimum = esp_constants.a2_distractor_set
123
- elif level == 'B1':
124
- target_minimum = esp_constants.b1_target_set
125
- distractor_minimum = esp_constants.b1_distractor_set
126
- elif level == 'B2':
127
- target_minimum = esp_constants.b2_target_set
128
- distractor_minimum = esp_constants.b2_distractor_set
129
- elif level == 'C1':
130
- target_minimum = esp_constants.c1_target_set
131
- distractor_minimum = esp_constants.c1_distractor_set
132
- elif level == 'C2':
133
- target_minimum = esp_constants.c2_target_set
134
- distractor_minimum = esp_constants.c2_distractor_set
135
- elif level == 'Без уровня':
136
- target_minimum = None
137
- distractor_minimum = None
138
  else:
139
  target_minimum = None
140
  distractor_minimum = None
141
  logs.error('Вы не выбрали языковой уровень!')
142
  st.stop()
143
 
 
 
 
 
 
 
 
 
 
144
  # Start generation process
145
  workflow = [SENTENCE(original=sent.strip(), n_sentence=num, max_num_distractors=num_distractors)
146
  for num, sent in enumerate(current_text_sentences)]
147
- logs.success("Запускаем процесс генерации заданий!")
148
  progress.progress(20)
149
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
150
  for sentence in workflow:
151
  sentence.lemmatize_sentence()
152
 
153
  for sentence in workflow:
154
  sentence.bind_phrases()
155
- logs.success("Подготовили предложения для дальнейшей работы!")
156
  progress.progress(30)
157
 
158
  for j, sentence in enumerate(workflow):
@@ -160,9 +149,10 @@ def main_workflow_w2v(
160
  target_words_automatic_mode=tw_mode_automatic_mode,
161
  target_minimum=target_minimum,
162
  user_target_words=USER_TARGET_WORDS,
163
- frequency_dict=FREQ_DICT)
 
164
  progress.progress(int(30 + (j * (30 / len(workflow)))))
165
- progress_s.progress(60)
166
  DUPLICATE_TARGET_WORDS = defaultdict(list)
167
  for sentence in workflow:
168
  for target_word in sentence.target_words:
@@ -175,31 +165,33 @@ def main_workflow_w2v(
175
  if target_word not in RESULT_TW:
176
  global_bad_target_words.append(target_word['original_text'])
177
  sentence.target_words.remove(target_word)
178
- progress_s.progress(65)
179
- logs.success('Выбрали слова-пропуски!')
180
 
181
  for sentence in workflow:
182
  sentence.attach_distractors_to_target_word(model=mask_filler,
 
 
 
183
  global_distractors=GLOBAL_DISTRACTORS,
184
  distractor_minimum=distractor_minimum,
185
  level_name=level,
186
  max_frequency=MAX_FREQUENCY,
187
- progress=progress_d,
188
- logs=logs_d)
189
- progress_s.progress(70)
190
- logs.success('Подобрали неправильные варианты!')
191
  for sentence in workflow:
192
- sentence.inflect_distractors()
193
- progress_s.progress(80)
194
- logs.success('Просклоняли и проспрягали неправильн��е варианты!')
195
 
196
  for sentence in workflow:
197
  sentence.filter_target_words(target_words_automatic_mode=tw_mode_automatic_mode)
198
 
199
  for sentence in workflow:
200
  sentence.sample_distractors(num_distractors=num_distractors)
201
- progress_s.progress(90)
202
- logs.success('Отобрали лучшие задания!')
203
 
204
  RESULT_TASKS = []
205
  for sentence in workflow:
@@ -219,7 +211,12 @@ def main_workflow_w2v(
219
  NUMBER_TASKS = 10
220
  else:
221
  NUMBER_TASKS = len(RESULT_TASKS)
222
- RESULT_TASKS = sample(RESULT_TASKS, NUMBER_TASKS)
 
 
 
 
 
223
  RESULT_TASKS = sorted(RESULT_TASKS, key=lambda t: (t.sentence_number, t.position_in_sentence))
224
 
225
  for task in RESULT_TASKS:
@@ -243,8 +240,8 @@ def main_workflow_w2v(
243
  f'{PREPARED_TASKS["KEYS_ONLY"]}'
244
  TOTAL_OUT = f'{original_text}\n\n{"$" * 70}\n\n{STUDENT_OUT}\n\n{"=" * 70}\n\n{PREPARED_TASKS["TASKS_TEACHER"]}' \
245
  f'\n\n{"$" * 70}\n\n{PREPARED_TASKS["KEYS_ONLY"]}'
246
- logs.success('Сейчас все будет готово!')
247
- progress_s.progress(90)
248
  save_name = save_name if save_name != '' else f'{str(datetime.datetime.now())[:-7]}_{original_text[:20]}'
249
  out = {
250
  'name': save_name,
 
  import datetime
  from io import StringIO
+ from typing import Union
  from random import sample
  from collections import defaultdict
  from streamlit.runtime.uploaded_file_manager import UploadedFile
+ from utilities_language_w2v.rus_sentence_w2v import SENTENCE, TASK
+ from utilities_language_general.rus_utils import compute_frequency_dict, prepare_target_words, prepare_tasks
+ from utilities_language_general.rus_constants import st, load_w2v, load_classifiers, nlp, summarization, BAD_USER_TARGET_WORDS, MINIMUM_SETS


+ def main_workflow(
+         file: Union[UploadedFile, None],
          text: str,
+         logs,
+         progress,
+         progress_d,
          level: str,
          tw_mode_automatic_mode: str,
          target_words: str,

      student_out, teacher_out, total_out, original_text
      """
      # Clear bad target_words each time
+     if global_bad_target_words:
+         global_bad_target_words = []

      # Define main global variables
      GLOBAL_DISTRACTORS = set()
      MAX_FREQUENCY = 0

      # Get input text
      if file is not None:
          stringio = StringIO(file.getvalue().decode("utf-8"))

      elif text != '':
          current_text = text
      else:
+         st.warning('Вы и текст не вставили, и файл не выбрали 😢')
          current_text = ''
+         st.stop()

      # Process target words
      if tw_mode_automatic_mode == 'Самостоятельно':
          if target_words == '':
+             st.warning('Вы не ввели целевые слова')
+             st.stop()
          # Cannot make up paradigm, so only USER_TARGET_WORDS is used
          USER_TARGET_WORDS = prepare_target_words(target_words)
          tw_mode_automatic_mode = False

      current_text = current_text.replace('.', '. ').replace('. . .', '...').replace('  ', ' ').replace('…', '...') \
          .replace('…', '...').replace('—', '-').replace('\u2014', '-').replace('—', '-').replace('-\n', '') \
          .replace('\n', '%^&*')
+     current_text_sentences = [sent.text.strip() for sent in nlp(current_text).sents]
+     logs.update(label='Получили Ваш текст!', state='running')
      progress.progress(10)

      # Compute frequency dict

          if j < len(FREQ_DICT) * _frequency_barrier_percent:
              MAX_FREQUENCY = tp[1]
      MAX_FREQUENCY = 3 if MAX_FREQUENCY < 3 else MAX_FREQUENCY
+     logs.update(label="Посчитали немного статистики!", state='running')
      progress.progress(15)

      # Choose necessary language minimum according to user's input
+     if level:
+         target_minimum, distractor_minimum = MINIMUM_SETS[level]
      else:
          target_minimum = None
          distractor_minimum = None
          logs.error('Вы не выбрали языковой уровень!')
          st.stop()

+     # Define which model is used for distractor generation
+     logs.update(label='Загружаем языковые модели и другие данные', state='running')
+     if model_name == 'Модель-1':
+         mask_filler = load_w2v('model1')
+         pos_dict, scaler, classifier = load_classifiers('model1')
+     else:
+         mask_filler = load_w2v('model2')
+         pos_dict, scaler, classifier = load_classifiers('model2')
+
      # Start generation process
      workflow = [SENTENCE(original=sent.strip(), n_sentence=num, max_num_distractors=num_distractors)
                  for num, sent in enumerate(current_text_sentences)]
+     logs.update(label="Запускаем процесс генерации заданий!", state='running')
      progress.progress(20)

+     # Define summary length
+     text_length = len(current_text_sentences)
+     if text_length <= 15:
+         summary_length = text_length
+     elif text_length <= 25:
+         summary_length = 15
+     else:
+         n = (text_length - 20) // 5
+         summary_length = 15 + 2 * n
+     round_summary_length = summary_length - (summary_length % -10)
+
+     # Get summary. May choose between round_summary_length and summary_length
+     SUMMARY = summarization(current_text, num_sentences=round_summary_length)
+     logs.success('Нашли интересные предложения. Пригодятся!')
+     progress.progress(25)
+
      for sentence in workflow:
          sentence.lemmatize_sentence()

      for sentence in workflow:
          sentence.bind_phrases()
+     logs.update(label="Подготовили предложения для дальнейшей работы!", state='running')
      progress.progress(30)

      for j, sentence in enumerate(workflow):

              target_words_automatic_mode=tw_mode_automatic_mode,
              target_minimum=target_minimum,
              user_target_words=USER_TARGET_WORDS,
+             frequency_dict=FREQ_DICT,
+             summary=SUMMARY)
          progress.progress(int(30 + (j * (30 / len(workflow)))))
+     progress.progress(60)
      DUPLICATE_TARGET_WORDS = defaultdict(list)
      for sentence in workflow:
          for target_word in sentence.target_words:

              if target_word not in RESULT_TW:
                  global_bad_target_words.append(target_word['original_text'])
                  sentence.target_words.remove(target_word)
+     progress.progress(65)
+     logs.update(label='Выбрали слова-пропуски!', state='running')

      for sentence in workflow:
          sentence.attach_distractors_to_target_word(model=mask_filler,
+                                                    scaler=scaler,
+                                                    classifier=classifier,
+                                                    pos_dict=pos_dict,
                                                     global_distractors=GLOBAL_DISTRACTORS,
                                                     distractor_minimum=distractor_minimum,
                                                     level_name=level,
                                                     max_frequency=MAX_FREQUENCY,
+                                                    logs=logs, progress=progress_d)
+     progress.progress(70)
+     logs.update(label='Подобрали неправильные варианты!', state='running')
      for sentence in workflow:
+         sentence.inflect_distractors(level_name=level)
+     progress.progress(80)
+     logs.update(label='Просклоняли и проспрягали неправильные варианты!', state='running')

      for sentence in workflow:
          sentence.filter_target_words(target_words_automatic_mode=tw_mode_automatic_mode)

      for sentence in workflow:
          sentence.sample_distractors(num_distractors=num_distractors)
+     progress.progress(90)
+     logs.update(label='Отобрали лучшие задания!', state='running')

      RESULT_TASKS = []
      for sentence in workflow:

          NUMBER_TASKS = 10
      else:
          NUMBER_TASKS = len(RESULT_TASKS)
+     RESULT_TASKS_in_summary = list(filter(lambda task: task.in_summary, RESULT_TASKS))
+     RESULT_TASKS_not_in_summary = list(filter(lambda task: not task.in_summary, RESULT_TASKS))
+     if len(RESULT_TASKS_in_summary) >= NUMBER_TASKS:
+         RESULT_TASKS = RESULT_TASKS_in_summary
+     else:
+         RESULT_TASKS = RESULT_TASKS_in_summary + sample(RESULT_TASKS_not_in_summary, NUMBER_TASKS - len(RESULT_TASKS_in_summary))
      RESULT_TASKS = sorted(RESULT_TASKS, key=lambda t: (t.sentence_number, t.position_in_sentence))

      for task in RESULT_TASKS:

          f'{PREPARED_TASKS["KEYS_ONLY"]}'
      TOTAL_OUT = f'{original_text}\n\n{"$" * 70}\n\n{STUDENT_OUT}\n\n{"=" * 70}\n\n{PREPARED_TASKS["TASKS_TEACHER"]}' \
          f'\n\n{"$" * 70}\n\n{PREPARED_TASKS["KEYS_ONLY"]}'
+     logs.update(label='Сейчас все будет готово!', state='running')
+     progress.progress(90)
      save_name = save_name if save_name != '' else f'{str(datetime.datetime.now())[:-7]}_{original_text[:20]}'
      out = {
          'name': save_name,
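Note (not part of the commit): the summary-length rule above, worked through for a hypothetical 42-sentence text.

    # n = (42 - 20) // 5 = 4, so summary_length = 15 + 2 * 4 = 23;
    # 23 % -10 == -7, so round_summary_length = 23 - (-7) = 30 (rounded up to a multiple of 10)
    text_length = 42
    if text_length <= 15:
        summary_length = text_length
    elif text_length <= 25:
        summary_length = 15
    else:
        summary_length = 15 + 2 * ((text_length - 20) // 5)
    round_summary_length = summary_length - (summary_length % -10)
    assert (summary_length, round_summary_length) == (23, 30)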
utilities_language_w2v/rus_sentence_w2v.py CHANGED
@@ -1,16 +1,7 @@
- import copy
  import string
- from random import random
- from random import sample
- from utilities_language_general.rus_constants import nlp
- from utilities_language_general.rus_utils import get_tags
- from utilities_language_general.rus_utils import check_token
- from utilities_language_general.rus_constants import PHRASES
- from utilities_language_general.rus_utils import define_gender
- from utilities_language_general.rus_utils import convert_gender
- from utilities_language_general.rus_utils import make_inflection
- from utilities_language_general.rus_constants import BAD_USER_TARGET_WORDS
- from utilities_language_general.rus_utils import get_distractors_from_model


  class SENTENCE:
@@ -48,7 +39,7 @@ class SENTENCE:
                      self.sentence_phrases.append(self.sentence_lemma_pos[i][1])
                      previous_was_phrase = False

-     def search_target_words_automatically(self, model, target_minimum: set, frequency_dict: dict = None):
          for token in self.sentence_phrases:
              # TODO: Still do not have w2v model with phrases
              # therefore cannot come up with the criteria
@@ -69,7 +60,8 @@ class SENTENCE:
                      'tags': tags,
                      'position_in_sentence': self.original.find(original_token1.text),
                      'not_named_entity': not_ner,
-                     'frequency_in_text': 0
                  }
                  self.target_words.append(target_word)
              else:  # if token is just a spacy.nlp token
@@ -86,10 +78,11 @@ class SENTENCE:
                      'position_in_sentence': self.original.find(token.text),
                      'not_named_entity': True if token.ent_type == 0 else False,
                      'frequency_in_text': frequency_dict.get(token.lemma_, 1),
                  }
                  self.target_words.append(target_word)

-     def search_user_target_words(self, model, user_target_words: set = None, frequency_dict: dict = None):
          for _utw in user_target_words:
              if _utw in self.original:
                  parse_utw = nlp(_utw)
@@ -114,7 +107,8 @@ class SENTENCE:
                      'tags': user_target_word_tags,
                      'position_in_sentence': self.original.find(_utw),
                      'not_named_entity': not_ner,
-                     'frequency_in_text': frequency_dict.get(user_target_word_lemma, 1)
                  }
                  if not (model.has_index_for(user_target_word_lemma)
                          or model.has_index_for(f'{user_target_word_lemma}_{user_target_word_pos[1]}')):
@@ -124,25 +118,24 @@ class SENTENCE:

      def search_target_words(self, model, target_words_automatic_mode: bool, target_minimum,
                              user_target_words: set = None,
-                             frequency_dict: dict = None):
          if target_words_automatic_mode:
              self.search_target_words_automatically(model=model, target_minimum=target_minimum,
-                                                    frequency_dict=frequency_dict)
          else:
              self.search_user_target_words(model=model, user_target_words=user_target_words,
-                                           frequency_dict=frequency_dict)

-     def attach_distractors_to_target_word(self, model, global_distractors, distractor_minimum, level_name,
-                                           max_frequency,
-                                           progress, logs):
          n_target_words = len(self.target_words)
          bad_target_words = []
          for i, target_word in enumerate(self.target_words):
              pos = target_word['pos'][0] if target_word['pos'][0] == 'phrase' else target_word['pos'][1]
-             distractors = get_distractors_from_model(model, lemma=target_word['lemma'], pos=pos,
-                                                      gender=target_word['gender'], level_name=level_name,
-                                                      global_distractors=global_distractors,
-                                                      distractor_minimum=distractor_minimum,
                                                       max_num_distractors=self.max_num_distractors)
              if distractors is None or target_word['frequency_in_text'] > max_frequency:
                  target_word['distractors'] = distractors
@@ -150,39 +143,29 @@ class SENTENCE:
                  target_word['distractors'] = distractors
              target_word['distractors_number'] = len(distractors) if distractors is not None else 0
              progress.progress(i / n_target_words)
-             logs.success(f'Обработали {i}/{n_target_words} слов в {self.n_sentence + 1}-м предложении')
          for btw in bad_target_words:
              BAD_USER_TARGET_WORDS.append(btw['original_text'])
              self.target_words.remove(btw)
          progress.progress(100)
-         logs.success(
-             f'Обработали {n_target_words}/{n_target_words} слов в {self.n_sentence + 1}-м предложении')

-     def inflect_distractors(self):
          bad_target_words = []
          for target_word in self.target_words:
              inflected_distractors = []
              for distractor_lemma, distractor_similarity in target_word['distractors']:
                  if distractor_lemma.count('_') > 1:
                      # TODO The same. Has to train model and test this code
-                     inflected = make_inflection(text=distractor_lemma,
                                                  pos=target_word['pos'][1], tags=target_word['tags'])
                  else:
-                     inflected = make_inflection(text=distractor_lemma,
                                                  pos=target_word['pos'][1], tags=target_word['tags'])
                  if inflected is not None:
                      inflected_distractors.append(inflected)
-                 else:
-                     new_tags = copy.deepcopy(target_word['tags'])
-                     if 'NOUN' in target_word['tags'] and 'inan' in target_word['tags']:
-                         new_tags.discard('inan')
-                         new_tags.add('anim')
-                     elif 'NOUN' in target_word['tags'] and 'anim' in target_word['tags']:
-                         new_tags.discard('anim')
-                         new_tags.add('inan')
-                     inflected = make_inflection(text=distractor_lemma, pos=target_word['pos'][1], tags=new_tags)
-                     if inflected is not None:
-                         inflected_distractors.append(inflected)
          num_distractors = min(4, self.max_num_distractors) if self.max_num_distractors >= 4 \
              else self.max_num_distractors
          if len(inflected_distractors) < num_distractors:
  import string
+ from random import random, sample
+ from utilities_language_general.rus_constants import nlp, PHRASES, BAD_USER_TARGET_WORDS
+ from utilities_language_general.rus_utils import get_tags, check_token, define_gender, convert_gender, make_inflection, get_distractors_from_model


  class SENTENCE:

                      self.sentence_phrases.append(self.sentence_lemma_pos[i][1])
                      previous_was_phrase = False

+     def search_target_words_automatically(self, model, target_minimum: set, frequency_dict: dict = None, summary: list = None):
          for token in self.sentence_phrases:
              # TODO: Still do not have w2v model with phrases
              # therefore cannot come up with the criteria

                      'tags': tags,
                      'position_in_sentence': self.original.find(original_token1.text),
                      'not_named_entity': not_ner,
+                     'frequency_in_text': 0,
+                     'in_summary': self.original in summary
                  }
                  self.target_words.append(target_word)
              else:  # if token is just a spacy.nlp token

                      'position_in_sentence': self.original.find(token.text),
                      'not_named_entity': True if token.ent_type == 0 else False,
                      'frequency_in_text': frequency_dict.get(token.lemma_, 1),
+                     'in_summary': self.original in summary
                  }
                  self.target_words.append(target_word)

+     def search_user_target_words(self, model, user_target_words: set = None, frequency_dict: dict = None, summary: list = None):
          for _utw in user_target_words:
              if _utw in self.original:
                  parse_utw = nlp(_utw)

                      'tags': user_target_word_tags,
                      'position_in_sentence': self.original.find(_utw),
                      'not_named_entity': not_ner,
+                     'frequency_in_text': frequency_dict.get(user_target_word_lemma, 1),
+                     'in_summary': self.original in summary
                  }
                  if not (model.has_index_for(user_target_word_lemma)
                          or model.has_index_for(f'{user_target_word_lemma}_{user_target_word_pos[1]}')):

      def search_target_words(self, model, target_words_automatic_mode: bool, target_minimum,
                              user_target_words: set = None,
+                             frequency_dict: dict = None, summary: list = None):
          if target_words_automatic_mode:
              self.search_target_words_automatically(model=model, target_minimum=target_minimum,
+                                                    frequency_dict=frequency_dict, summary=summary)
          else:
              self.search_user_target_words(model=model, user_target_words=user_target_words,
+                                           frequency_dict=frequency_dict, summary=summary)

+     def attach_distractors_to_target_word(self, model, scaler, classifier, pos_dict, global_distractors,
+                                           distractor_minimum, level_name, max_frequency, logs, progress):
          n_target_words = len(self.target_words)
          bad_target_words = []
          for i, target_word in enumerate(self.target_words):
              pos = target_word['pos'][0] if target_word['pos'][0] == 'phrase' else target_word['pos'][1]
+             distractors = get_distractors_from_model(doc=self.parsed, model=model, scaler=scaler, classifier=classifier, pos_dict=pos_dict,
+                                                      target_text=target_word['original_text'], lemma=target_word['lemma'],
+                                                      pos=pos, gender=target_word['gender'], lemma_index=target_word['position_in_sentence'],
+                                                      global_distractors=global_distractors, distractor_minimum=distractor_minimum, level_name=level_name,
                                                       max_num_distractors=self.max_num_distractors)
              if distractors is None or target_word['frequency_in_text'] > max_frequency:
                  target_word['distractors'] = distractors

                  target_word['distractors'] = distractors
              target_word['distractors_number'] = len(distractors) if distractors is not None else 0
              progress.progress(i / n_target_words)
+             logs.update(label=f'Обработали {i}/{n_target_words} слов в {self.n_sentence + 1}-м предложении',
+                         state='running')
          for btw in bad_target_words:
              BAD_USER_TARGET_WORDS.append(btw['original_text'])
              self.target_words.remove(btw)
          progress.progress(100)
+         logs.update(label=f'Обработали {n_target_words}/{n_target_words} слов в {self.n_sentence + 1}-м предложении',
+                     state='running')

+     def inflect_distractors(self, level_name):
          bad_target_words = []
          for target_word in self.target_words:
              inflected_distractors = []
              for distractor_lemma, distractor_similarity in target_word['distractors']:
                  if distractor_lemma.count('_') > 1:
                      # TODO The same. Has to train model and test this code
+                     inflected = make_inflection(text=distractor_lemma, level=level_name,
                                                  pos=target_word['pos'][1], tags=target_word['tags'])
                  else:
+                     inflected = make_inflection(text=distractor_lemma, level=level_name,
                                                  pos=target_word['pos'][1], tags=target_word['tags'])
                  if inflected is not None:
                      inflected_distractors.append(inflected)
          num_distractors = min(4, self.max_num_distractors) if self.max_num_distractors >= 4 \
              else self.max_num_distractors
          if len(inflected_distractors) < num_distractors: