a-v-bely committed on
Commit
94004b3
1 Parent(s): f49c9b7

distractor classification for bert

Browse files
language_data/model3_with_wn_catboost_classifier.pickle ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d27b12b7d7c7aa81da02aba229941ffef9e51879be6673c4f389bea10cd1a2db
3
+ size 2425245
language_data/model3_with_wn_minmaxscaler.pickle ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c415fb5e8b4258876b11043f43593fde8026456202629c5280cc59a1a5c5351b
3
+ size 1404
utilities_cookies/cookie_manager.py CHANGED
@@ -1,10 +1,8 @@
1
  import streamlit as st
2
  from pathlib import Path
3
- from typing import Mapping
4
- from datetime import datetime
5
- from datetime import timedelta
6
  from urllib.parse import unquote
7
- from typing import MutableMapping
 
8
  from streamlit.components.v1 import components
9
 
10
 
 
1
  import streamlit as st
2
  from pathlib import Path
 
 
 
3
  from urllib.parse import unquote
4
+ from datetime import datetime, timedelta
5
+ from typing import Mapping, MutableMapping
6
  from streamlit.components.v1 import components
7
 
8
 
utilities_cookies/encrypted_cookie_manager.py CHANGED
@@ -1,10 +1,8 @@
1
  import os
2
  import base64
3
  import streamlit as st
4
- from typing import Tuple
5
- from typing import Optional
6
  from cryptography import fernet
7
- from typing import MutableMapping
8
  from cryptography.fernet import Fernet
9
  from cryptography.hazmat.primitives import hashes
10
  from utilities_cookies.cookie_manager import CookieManager
 
1
  import os
2
  import base64
3
  import streamlit as st
4
+ from typing import Tuple, Optional, MutableMapping
 
5
  from cryptography import fernet
 
6
  from cryptography.fernet import Fernet
7
  from cryptography.hazmat.primitives import hashes
8
  from utilities_cookies.cookie_manager import CookieManager
utilities_database/user_database_utils.py CHANGED
@@ -1,9 +1,9 @@
1
- import re
2
- import json
3
- import secrets
4
  import pandas as pd
5
  import streamlit as st
 
 
6
  from trycourier import Courier
 
7
  from argon2 import PasswordHasher
8
  from argon2.exceptions import VerifyMismatchError
9
 
@@ -37,7 +37,7 @@ def check_valid_name(name_sign_up: str) -> bool:
37
  name_regex_eng = r'^[A-Za-z_]\w *'
38
  name_regex_rus = r'^[А-Яа-я_][А-Яа-я0-9_] *'
39
 
40
- if re.search(name_regex_eng, name_sign_up) or re.search(name_regex_rus, name_sign_up):
41
  return True
42
  return False
43
 
@@ -46,12 +46,8 @@ def check_valid_email(email_sign_up: str) -> bool:
46
  """
47
  Checks if the user entered a valid email while creating the account.
48
  """
49
- regex = re.compile(r'([A-Za-z0-9]+[.-_])*[A-Za-z0-9]+@[A-Za-z0-9-]+(\.[A-Z|a-z]{2,})+')
50
  return True
51
-
52
- # if re.fullmatch(regex, email_sign_up):
53
- # return True
54
- # return False
55
 
56
 
57
  def check_unique_email(user_log_in_database, email_sign_up: str) -> bool:
@@ -133,7 +129,7 @@ def generate_random_passwd() -> str:
133
  Generates a random password to be sent in email.
134
  """
135
  password_length = 10
136
- return secrets.token_urlsafe(password_length)
137
 
138
 
139
  def send_passwd_in_email(auth_token: str, user_name_forgot_passwd: str, email_forgot_passwd: str, company_name: str,
@@ -266,5 +262,5 @@ def load_users_particular_task(user_task_database, load_mode, creator_name, save
266
  .eq('save_name', save_name)\
267
  .eq('save_type', load_mode)\
268
  .eq('cefr_level',cefr_level).execute().data[0]['generated_result']
269
- return_data = json.loads(return_data.replace("'", '"'), strict=False)
270
  return return_data
 
 
 
 
1
  import pandas as pd
2
  import streamlit as st
3
+ from json import loads
4
+ from re import search, compile
5
  from trycourier import Courier
6
+ from secrets import token_urlsafe
7
  from argon2 import PasswordHasher
8
  from argon2.exceptions import VerifyMismatchError
9
 
 
37
  name_regex_eng = r'^[A-Za-z_]\w *'
38
  name_regex_rus = r'^[А-Яа-я_][А-Яа-я0-9_] *'
39
 
40
+ if search(name_regex_eng, name_sign_up) or search(name_regex_rus, name_sign_up):
41
  return True
42
  return False
43
 
 
46
  """
47
  Checks if the user entered a valid email while creating the account.
48
  """
49
+ regex = compile(r'([A-Za-z0-9]+[.-_])*[A-Za-z0-9]+@[A-Za-z0-9-]+(\.[A-Z|a-z]{2,})+')
50
  return True
 
 
 
 
51
 
52
 
53
  def check_unique_email(user_log_in_database, email_sign_up: str) -> bool:
 
129
  Generates a random password to be sent in email.
130
  """
131
  password_length = 10
132
+ return token_urlsafe(password_length)
133
 
134
 
135
  def send_passwd_in_email(auth_token: str, user_name_forgot_passwd: str, email_forgot_passwd: str, company_name: str,
 
262
  .eq('save_name', save_name)\
263
  .eq('save_type', load_mode)\
264
  .eq('cefr_level',cefr_level).execute().data[0]['generated_result']
265
+ return_data = loads(return_data.replace("'", '"'), strict=False)
266
  return return_data
utilities_database/user_database_widgets.py CHANGED
@@ -3,7 +3,6 @@ from datetime import datetime
3
  from supabase import create_client, Client
4
  from utilities_option_menu.option_menu import option_menu
5
  import utilities_database.user_database_utils as db_utils
6
- from utilities_database.user_database_utils import check_usr_pass
7
  from utilities_cookies.encrypted_cookie_manager import EncryptedCookieManager
8
 
9
  DB_URL = st.secrets['SUPABASE_URL']
@@ -91,7 +90,7 @@ class LogIn:
91
  login_submit_button = st.form_submit_button(label='Войти')
92
 
93
  if login_submit_button:
94
- authenticate_user_check = check_usr_pass(user_log_in_database=user_login_table,
95
  user_name=user_name,
96
  password=password)
97
 
 
3
  from supabase import create_client, Client
4
  from utilities_option_menu.option_menu import option_menu
5
  import utilities_database.user_database_utils as db_utils
 
6
  from utilities_cookies.encrypted_cookie_manager import EncryptedCookieManager
7
 
8
  DB_URL = st.secrets['SUPABASE_URL']
 
90
  login_submit_button = st.form_submit_button(label='Войти')
91
 
92
  if login_submit_button:
93
+ authenticate_user_check = db_utils.check_usr_pass(user_log_in_database=user_login_table,
94
  user_name=user_name,
95
  password=password)
96
 
utilities_language_bert/esp_main_workflow_bert.py CHANGED
@@ -1,29 +1,20 @@
1
  import datetime
2
  from io import StringIO
 
3
  from random import sample
4
  from collections import defaultdict
5
- from streamlit import progress as st_progress
6
- from streamlit.elements import WIDGETS as ST_WIDGETS
7
- from utilities_language_general.esp_constants import st
8
- from utilities_language_bert.esp_sentence_bert import TASK
9
- from utilities_language_bert.esp_sentence_bert import SENTENCE
10
- from utilities_language_general.esp_utils import prepare_tasks
11
- from utilities_language_general.esp_constants import load_bert
12
  from streamlit.runtime.uploaded_file_manager import UploadedFile
13
- import utilities_language_general.esp_constants as esp_constants
14
- from utilities_language_general.esp_constants import summarization
15
- from utilities_language_general.esp_utils import prepare_target_words
16
- from utilities_language_general.esp_utils import compute_frequency_dict
17
- from utilities_language_general.esp_constants import BAD_USER_TARGET_WORDS
18
-
19
 
20
 
21
  def main_workflow(
22
- file: UploadedFile or None,
23
  text: str,
24
- logs: ST_WIDGETS,
25
- progress: st_progress,
26
- progress_d: st_progress,
27
  level: str,
28
  tw_mode_automatic_mode: str,
29
  target_words: str,
@@ -58,6 +49,7 @@ def main_workflow(
58
  MAX_FREQUENCY = 0
59
 
60
  logs.update(label='Загружаем языковые модели и другие данные', state='running')
 
61
  mask_filler = load_bert()
62
 
63
  # Get input text
@@ -67,15 +59,15 @@ def main_workflow(
67
  elif text != '':
68
  current_text = text
69
  else:
70
- esp_constants.st.warning('Вы и текст не вставили, и файл не выбрали 😢')
71
  current_text = ''
72
- esp_constants.st.stop()
73
 
74
  # Process target words
75
  if tw_mode_automatic_mode == 'Самостоятельно':
76
  if target_words == '':
77
- esp_constants.st.warning('Вы не ввели целевые слова')
78
- esp_constants.st.stop()
79
  # Cannot make up paradigm, so only USER_TARGET_WORDS is used
80
  USER_TARGET_WORDS = prepare_target_words(target_words)
81
  tw_mode_automatic_mode = False
@@ -89,7 +81,7 @@ def main_workflow(
89
  .replace(' ', ' ').replace('…', '...').replace('…', '...')
90
  .replace('—', '-').replace('\u2014', '-').replace('—', '-')
91
  .replace('-\n', '').replace('\n', '%^&*'))
92
- current_text_sentences = [sent.text.strip() for sent in esp_constants.nlp(current_text).sents]
93
  logs.update(label='Получили Ваш текст!', state='running')
94
  progress.progress(10)
95
 
@@ -106,27 +98,8 @@ def main_workflow(
106
  progress.progress(15)
107
 
108
  # Choose necessary language minimum according to user's input
109
- if level == 'A1':
110
- target_minimum = esp_constants.a1_target_set
111
- distractor_minimum = esp_constants.a1_distractor_set
112
- elif level == 'A2':
113
- target_minimum = esp_constants.a2_target_set
114
- distractor_minimum = esp_constants.a2_distractor_set
115
- elif level == 'B1':
116
- target_minimum = esp_constants.b1_target_set
117
- distractor_minimum = esp_constants.b1_distractor_set
118
- elif level == 'B2':
119
- target_minimum = esp_constants.b2_target_set
120
- distractor_minimum = esp_constants.b2_distractor_set
121
- elif level == 'C1':
122
- target_minimum = esp_constants.c1_target_set
123
- distractor_minimum = esp_constants.c1_distractor_set
124
- elif level == 'C2':
125
- target_minimum = esp_constants.c2_target_set
126
- distractor_minimum = esp_constants.c2_distractor_set
127
- elif level == 'Без уровня':
128
- target_minimum = None
129
- distractor_minimum = None
130
  else:
131
  target_minimum = None
132
  distractor_minimum = None
@@ -204,7 +177,11 @@ def main_workflow(
204
  RESULT_TASKS.append(task)
205
 
206
  for num, task in enumerate(RESULT_TASKS):
207
- task.attach_distractors_to_target_word(model=mask_filler, level_name=level,
 
 
 
 
208
  global_distractors=GLOBAL_DISTRACTORS,
209
  distractor_minimum=distractor_minimum,
210
  max_frequency=MAX_FREQUENCY)
@@ -240,8 +217,8 @@ def main_workflow(
240
  NUMBER_TASKS = 10
241
  else:
242
  NUMBER_TASKS = len(RESULT_TASKS)
243
- RESULT_TASKS_in_summary = filter(lambda task: task.in_summary, RESULT_TASKS)
244
- RESULT_TASTS_not_in_summary = filter(lambda task: not task.in_summary, RESULT_TASKS)
245
  if len(RESULT_TASKS_in_summary) >= NUMBER_TASKS:
246
  RESULT_TASKS = RESULT_TASKS_in_summary
247
  else:
 
1
  import datetime
2
  from io import StringIO
3
+ from typing import Union
4
  from random import sample
5
  from collections import defaultdict
 
 
 
 
 
 
 
6
  from streamlit.runtime.uploaded_file_manager import UploadedFile
7
+ from utilities_language_bert.esp_sentence_bert import TASK, SENTENCE
8
+ from utilities_language_general.esp_utils import prepare_tasks, prepare_target_words, compute_frequency_dict
9
+ from utilities_language_general.esp_constants import st, load_bert, load_classifiers, nlp, summarization, BAD_USER_TARGET_WORDS, MINIMUM_SETS
 
 
 
10
 
11
 
12
  def main_workflow(
13
+ file: Union[UploadedFile, None],
14
  text: str,
15
+ logs,
16
+ progress,
17
+ progress_d,
18
  level: str,
19
  tw_mode_automatic_mode: str,
20
  target_words: str,
 
49
  MAX_FREQUENCY = 0
50
 
51
  logs.update(label='Загружаем языковые модели и другие данные', state='running')
52
+ pos_dict, scaler, classifier = load_classifiers('model3')
53
  mask_filler = load_bert()
54
 
55
  # Get input text
 
59
  elif text != '':
60
  current_text = text
61
  else:
62
+ st.warning('Вы и текст не вставили, и файл не выбрали 😢')
63
  current_text = ''
64
+ st.stop()
65
 
66
  # Process target words
67
  if tw_mode_automatic_mode == 'Самостоятельно':
68
  if target_words == '':
69
+ st.warning('Вы не ввели целевые слова')
70
+ st.stop()
71
  # Cannot make up paradigm, so only USER_TARGET_WORDS is used
72
  USER_TARGET_WORDS = prepare_target_words(target_words)
73
  tw_mode_automatic_mode = False
 
81
  .replace(' ', ' ').replace('…', '...').replace('…', '...')
82
  .replace('—', '-').replace('\u2014', '-').replace('—', '-')
83
  .replace('-\n', '').replace('\n', '%^&*'))
84
+ current_text_sentences = [sent.text.strip() for sent in nlp(current_text).sents]
85
  logs.update(label='Получили Ваш текст!', state='running')
86
  progress.progress(10)
87
 
 
98
  progress.progress(15)
99
 
100
  # Choose necessary language minimum according to user's input
101
+ if level:
102
+ target_minimum, distractor_minimum = MINIMUM_SETS[level]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
103
  else:
104
  target_minimum = None
105
  distractor_minimum = None
 
177
  RESULT_TASKS.append(task)
178
 
179
  for num, task in enumerate(RESULT_TASKS):
180
+ task.attach_distractors_to_target_word(model=mask_filler,
181
+ scaler=scaler,
182
+ classifier=classifier,
183
+ pos_dict=pos_dict,
184
+ level_name=level,
185
  global_distractors=GLOBAL_DISTRACTORS,
186
  distractor_minimum=distractor_minimum,
187
  max_frequency=MAX_FREQUENCY)
 
217
  NUMBER_TASKS = 10
218
  else:
219
  NUMBER_TASKS = len(RESULT_TASKS)
220
+ RESULT_TASKS_in_summary = list(filter(lambda task: task.in_summary, RESULT_TASKS))
221
+ RESULT_TASTS_not_in_summary = list(filter(lambda task: not task.in_summary, RESULT_TASKS))
222
  if len(RESULT_TASKS_in_summary) >= NUMBER_TASKS:
223
  RESULT_TASKS = RESULT_TASKS_in_summary
224
  else:
utilities_language_bert/esp_sentence_bert.py CHANGED
@@ -43,6 +43,7 @@ class SENTENCE:
43
  if not previous_was_phrase:
44
  self.sentence_phrases.append(self.sentence_lemma_pos[i][1])
45
  previous_was_phrase = False
 
46
 
47
  def search_target_words_automatically(self, target_minimum: set, frequency_dict: dict = None, summary:list=None):
48
  for token in self.sentence_phrases:
@@ -188,11 +189,11 @@ class TASK:
188
  def __repr__(self):
189
  return '\n'.join([f'{key}\t=\t{value}' for key, value in self.__dict__.items()])
190
 
191
- def attach_distractors_to_target_word(self, model, global_distractors, distractor_minimum,
192
- level_name, max_frequency):
193
  pos = self.pos[0] if self.pos[0] == 'phrase' else self.pos[1]
194
- distractors_sentence = get_distractors_from_model_bert(model=model, lemma=self.lemma, pos=pos,
195
- gender=self.gender, level_name=level_name,
196
  text_with_masked_task=self.masked_sentence,
197
  global_distractors=global_distractors,
198
  distractor_minimum=distractor_minimum,
 
43
  if not previous_was_phrase:
44
  self.sentence_phrases.append(self.sentence_lemma_pos[i][1])
45
  previous_was_phrase = False
46
+ self.sentence_phrases.append(self.sentence_lemma_pos[-1][1])
47
 
48
  def search_target_words_automatically(self, target_minimum: set, frequency_dict: dict = None, summary:list=None):
49
  for token in self.sentence_phrases:
 
189
  def __repr__(self):
190
  return '\n'.join([f'{key}\t=\t{value}' for key, value in self.__dict__.items()])
191
 
192
+ def attach_distractors_to_target_word(self, model, scaler, classifier, pos_dict,
193
+ global_distractors, distractor_minimum, level_name, max_frequency):
194
  pos = self.pos[0] if self.pos[0] == 'phrase' else self.pos[1]
195
+ distractors_sentence = get_distractors_from_model_bert(model=model, scaler=scaler, classifier=classifier, pos_dict=pos_dict,
196
+ level_name=level_name, lemma=self.lemma, pos=pos, gender=self.gender,
197
  text_with_masked_task=self.masked_sentence,
198
  global_distractors=global_distractors,
199
  distractor_minimum=distractor_minimum,
utilities_language_general/esp_constants.py CHANGED
@@ -2,7 +2,6 @@ import json
2
  import spacy
3
  import gensim
4
  import streamlit as st
5
-
6
  from pickle import load
7
  from transformers import pipeline
8
  from summarizer import Summarizer
 
2
  import spacy
3
  import gensim
4
  import streamlit as st
 
5
  from pickle import load
6
  from transformers import pipeline
7
  from summarizer import Summarizer
utilities_language_general/esp_utils.py CHANGED
@@ -192,9 +192,8 @@ def get_distractors_from_model(doc, model, scaler, classifier, pos_dict:dict, ta
192
  return distractors
193
 
194
 
195
- def get_distractors_from_model_bert(doc, model, scaler, classifier, text_with_masked_task: str, lemma: str, pos: str, gender: str, lemma_index:int,
196
- global_distractors: set, distractor_minimum: set, level_name: str, pos_dict:dict,
197
- max_num_distractors: int, max_length_ratio=5, min_edit_distance_ratio=0.5):
198
  _distractors = []
199
  try:
200
  bert_candidates = [token for token in model(text_with_masked_task, top_k=max_num_distractors + 100)]
@@ -217,9 +216,9 @@ def get_distractors_from_model_bert(doc, model, scaler, classifier, text_with_ma
217
  distractor_similarity = candidate_distractor[1]
218
  candidate_gender = get_tags(distractor_lemma).get('Gender')
219
  length_ratio = abs(len(lemma) - len(distractor_lemma))
220
- decision = make_decision(doc, model_type='bert', scaler=scaler, classifier=classifier, pos_dict=pos_dict,
221
- level=level_name, target_text=lemma, target_pos=pos, target_position=lemma_index,
222
- substitute_text=distractor_lemma, substitute_pos=distractor_pos)
223
  if ((distractor_pos == pos
224
  or (COMBINE_POS['simple'][level_name].get(pos) is not None and COMBINE_POS['simple'][level_name].get(distractor_pos) is not None
225
  and distractor_pos in COMBINE_POS['simple'][level_name][pos] and pos in COMBINE_POS['simple'][level_name][distractor_pos]))
 
192
  return distractors
193
 
194
 
195
+ def get_distractors_from_model_bert(model, scaler, classifier, pos_dict:dict, level_name: str, lemma: str, pos: str, gender: str, text_with_masked_task: str,
196
+ global_distractors: set, distractor_minimum: set, max_num_distractors: int, max_length_ratio=5, min_edit_distance_ratio=0.5):
 
197
  _distractors = []
198
  try:
199
  bert_candidates = [token for token in model(text_with_masked_task, top_k=max_num_distractors + 100)]
 
216
  distractor_similarity = candidate_distractor[1]
217
  candidate_gender = get_tags(distractor_lemma).get('Gender')
218
  length_ratio = abs(len(lemma) - len(distractor_lemma))
219
+ decision = make_decision(doc=None, model_type='bert', scaler=scaler, classifier=classifier, pos_dict=pos_dict, level=level_name,
220
+ target_lemma=lemma, target_text=None, target_pos=pos, target_position=None,
221
+ substitute_lemma=distractor_lemma, substitute_pos=distractor_pos, bert_score=distractor_similarity)
222
  if ((distractor_pos == pos
223
  or (COMBINE_POS['simple'][level_name].get(pos) is not None and COMBINE_POS['simple'][level_name].get(distractor_pos) is not None
224
  and distractor_pos in COMBINE_POS['simple'][level_name][pos] and pos in COMBINE_POS['simple'][level_name][distractor_pos]))
utilities_language_w2v/esp_main_workflow_w2v.py CHANGED
@@ -3,16 +3,12 @@ from io import StringIO
3
  from typing import Union
4
  from random import sample
5
  from collections import defaultdict
6
- from streamlit import progress as st_progress
7
- from streamlit.elements import WIDGETS as ST_WIDGETS
8
  from streamlit.runtime.uploaded_file_manager import UploadedFile
9
- import utilities_language_general.esp_constants as esp_constants
10
  from utilities_language_w2v.esp_sentence_w2v import TASK, SENTENCE
11
  from utilities_language_general.esp_utils import prepare_tasks, prepare_target_words, compute_frequency_dict
12
  from utilities_language_general.esp_constants import st, load_w2v, load_classifiers, nlp, summarization, BAD_USER_TARGET_WORDS, MINIMUM_SETS
13
 
14
 
15
-
16
  def main_workflow(
17
  file: Union[UploadedFile, None],
18
  text: str,
@@ -84,7 +80,7 @@ def main_workflow(
84
  .replace(' ', ' ').replace('…', '...').replace('…', '...')
85
  .replace('—', '-').replace('\u2014', '-').replace('—', '-')
86
  .replace('-\n', '').replace('\n', '%^&*'))
87
- current_text_sentences = [sent.text.strip() for sent in esp_constants.nlp(current_text).sents]
88
  logs.update(label='Получили Ваш текст!', state='running')
89
  progress.progress(10)
90
 
 
3
  from typing import Union
4
  from random import sample
5
  from collections import defaultdict
 
 
6
  from streamlit.runtime.uploaded_file_manager import UploadedFile
 
7
  from utilities_language_w2v.esp_sentence_w2v import TASK, SENTENCE
8
  from utilities_language_general.esp_utils import prepare_tasks, prepare_target_words, compute_frequency_dict
9
  from utilities_language_general.esp_constants import st, load_w2v, load_classifiers, nlp, summarization, BAD_USER_TARGET_WORDS, MINIMUM_SETS
10
 
11
 
 
12
  def main_workflow(
13
  file: Union[UploadedFile, None],
14
  text: str,
 
80
  .replace(' ', ' ').replace('…', '...').replace('…', '...')
81
  .replace('—', '-').replace('\u2014', '-').replace('—', '-')
82
  .replace('-\n', '').replace('\n', '%^&*'))
83
+ current_text_sentences = [sent.text.strip() for sent in nlp(current_text).sents]
84
  logs.update(label='Получили Ваш текст!', state='running')
85
  progress.progress(10)
86