Commit ea7c789 by a-v-bely
Parent: 307a5f3

Towards distractor classification

Files changed:
- utilities_language_bert/esp_sentence_bert.py +4 -14
- utilities_language_general/esp_constants.py +45 -12
- utilities_language_general/esp_utils.py +29 -13
- utilities_language_general/similarity_measures.py +283 -0
- utilities_language_w2v/esp_main_workflow_w2v.py +24 -44
- utilities_language_w2v/esp_sentence_w2v.py +10 -16
utilities_language_bert/esp_sentence_bert.py CHANGED

```diff
@@ -1,13 +1,9 @@
 import string
-from random import random
-from random import sample
-from utilities_language_general.esp_constants import nlp
+from random import random, sample
 from utilities_language_general.morphology import inflect
-from utilities_language_general.esp_constants import PHRASES
-from utilities_language_general.esp_utils import check_token_bert
-
-from utilities_language_general.esp_constants import BAD_USER_TARGET_WORDS
-from utilities_language_general.esp_utils import get_distractors_from_model_bert
+from utilities_language_general.esp_constants import nlp, PHRASES, BAD_USER_TARGET_WORDS
+from utilities_language_general.esp_utils import check_token_bert, fix_irregular_lemma, get_distractors_from_model_bert
+
 
 
 class SENTENCE:
@@ -195,12 +191,6 @@ class TASK:
     def attach_distractors_to_target_word(self, model, global_distractors, distractor_minimum,
                                           level_name, max_frequency):
         pos = self.pos[0] if self.pos[0] == 'phrase' else self.pos[1]
-        # distractors_full_text = get_distractors_from_model_bert(model=model, lemma=self.lemma, pos=pos,
-        #                                                         gender=self.gender, level_name=level_name,
-        #                                                         text_with_masked_task=self.text_with_masked_task,
-        #                                                         global_distractors=global_distractors,
-        #                                                         distractor_minimum=distractor_minimum,
-        #                                                         max_num_distractors=self.max_num_distractors)
         distractors_sentence = get_distractors_from_model_bert(model=model, lemma=self.lemma, pos=pos,
                                                                gender=self.gender, level_name=level_name,
                                                                text_with_masked_task=self.masked_sentence,
```
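The BERT path builds candidates by masking the target inside the sentence and asking a fill-mask pipeline for substitutes. A minimal sketch of that interaction, using the pipeline built in esp_constants.load_bert(); the example sentence and printed fields are illustrative, not from the repo:

```python
# Sketch: what get_distractors_from_model_bert feeds the model. The masked
# sentence (self.masked_sentence after this commit) goes to a fill-mask
# pipeline; each candidate comes back with a token string and a score.
from transformers import pipeline

fill_mask = pipeline(task="fill-mask",
                     model="a-v-white/bert-base-spanish-wwm-cased-finetuned-literature-pro")
for candidate in fill_mask("El [MASK] duerme en el sofá.")[:3]:
    print(candidate["token_str"], round(candidate["score"], 3))
```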
utilities_language_general/esp_constants.py CHANGED

```diff
@@ -2,15 +2,20 @@ import json
 import spacy
 import gensim
 import streamlit as st
+
+from pickle import load
 from transformers import pipeline
 from summarizer import Summarizer
 
 
 @st.cache_resource
-def load_w2v(…
+def load_w2v(model):
     with st.spinner('Загружаю языковую модель'):
-        …
-        …
+        if model == 'model1':
+            model_path = r'language_data/model1.gz'
+        else:
+            model_path = r'language_data/model2.gz'
+        return gensim.models.KeyedVectors.load_word2vec_format(model_path, binary=True)
 
 
 @st.cache_resource
@@ -26,14 +31,31 @@ def load_bert():
     _pipeline = pipeline(task="fill-mask", model="a-v-white/bert-base-spanish-wwm-cased-finetuned-literature-pro")
     return _pipeline
 
+
 @st.cache_resource
 def load_summarizer():
     return Summarizer()
 
+
+@st.cache_resource
+def load_classifiers(model):
+    if model == 'model1':
+        scaler_path = 'language_data/model1_with_wn_minmaxscaler.pickle'
+        classifier_path = 'language_data/model1_with_wn_catboost_classifier.pickle'
+    elif model == 'model2':
+        scaler_path = 'language_data/model2_with_wn_minmaxscaler.pickle'
+        classifier_path = 'language_data/model2_with_wn_catboost_classifier.pickle'
+    else:
+        scaler_path = 'language_data/model3_with_wn_minmaxscaler.pickle'
+        classifier_path = 'language_data/model3_with_wn_catboost_classifier.pickle'
+    with (open(scaler_path, 'rb') as f1, open(classifier_path, 'rb') as f2, open('language_data/pos_dict.pickle', 'rb') as f3):
+        scaler = load(f1)
+        classifier = load(f2)
+        pos_dict = load(f3)
+    return pos_dict, scaler, classifier
+
 nlp = load_spacy()
 summarization = load_summarizer()
-w2v_model_1_path = r'model1.gz'
-w2v_model_2_path = r'model2.gz'
 
 # Upload minimums
 a1_path, a1_target_set = r'lexical_minimums/A1_MINIMUM.txt', set()
@@ -42,19 +64,25 @@ b1_path, b1_target_set = r'lexical_minimums/B1_MINIMUM.txt', set()
 b2_path, b2_target_set = r'lexical_minimums/B2_MINIMUM.txt', set()
 c1_path, c1_target_set = r'lexical_minimums/C1_MINIMUM.txt', set()
 c2_path, c2_target_set = r'lexical_minimums/C2_MINIMUM.txt', set()
-…
+
+minimums_paths = (a1_path, a2_path, b1_path, b2_path, c1_path, c2_path)
 minimums_sets = (a1_target_set, a2_target_set, b1_target_set, b2_target_set, c1_target_set, c2_target_set)
 for i in range(len(minimums_paths)):
     with open(minimums_paths[i], 'r', encoding='utf-8') as read_file:
         for line in read_file:
            minimums_sets[i].add(line.strip())
 
-…
-…
-…
-…
-…
-…
+MINIMUM_SETS = {
+    'A1': (a1_target_set, a1_target_set),
+    'A2': (a2_target_set, a2_target_set.union(a1_target_set)),
+    'B1': (b1_target_set, b1_target_set.union(a2_target_set)),
+    'B2': (b2_target_set, b2_target_set.union(b1_target_set)),
+    'C1': (c1_target_set, c1_target_set.union(b2_target_set)),
+    'C2': (c2_target_set, c2_target_set.union(c1_target_set)),
+    'Без уровня': (None, None)
+}
+
+LEVEL_NUMBERS = {'A1': 1, 'A2': 1, 'B1': 2, 'B2': 3, 'C1': 4, 'C2': 4}
 
 with open('language_data/phrases.json', 'r', encoding='utf-8') as f:
     PHRASES = set(json.load(f)['PHRASES'])
@@ -77,6 +105,8 @@ COMBINE_POS = {
                'ADJ':['NOUN'], 'NOUN': ['ADJ']},
         'C2': {'PRON': ['DET'], 'DET': ['PRON'], 'VERB': ['AUX', 'AUX_VERB'], 'AUX': ['VERB', 'AUX_VERB'], 'AUX_VERB': ['VERB', 'AUX'], 'AUX_AUX': ['AUX'],
                'ADJ':['NOUN'], 'NOUN': ['ADJ']},
+        'Без уровня': {'PRON': ['DET'], 'DET': ['PRON'], 'VERB': ['AUX', 'AUX_VERB'], 'AUX': ['VERB', 'AUX_VERB'],
+                       'AUX_VERB': ['VERB', 'AUX'], 'AUX_AUX': ['AUX'], 'AUX_ADJ': ['PRON_VERB'], 'PRON_VERB': ['AUX_ADJ'], 'ADP': ['SCONJ', 'ADV'], 'ADJ': ['NOUN'], 'NOUN': ['ADJ']}
         },
     'phrase':
         {
@@ -89,5 +119,8 @@ COMBINE_POS = {
                'ADJ':['NOUN'], 'NOUN': ['ADJ']},
         'C2': {'PRON': ['DET'], 'DET': ['PRON'], 'VERB': ['AUX', 'AUX_VERB'], 'AUX': ['VERB', 'AUX_VERB'], 'AUX_VERB': ['VERB', 'AUX'], 'AUX_AUX': ['AUX'],
                'ADJ':['NOUN'], 'NOUN': ['ADJ']},
+        'Без уровня': {'PRON': ['DET'], 'DET': ['PRON'], 'VERB': ['AUX', 'AUX_VERB'], 'AUX': ['VERB', 'AUX_VERB'],
+                       'AUX_VERB': ['VERB', 'AUX'], 'AUX_AUX': ['AUX'], 'AUX_ADJ': ['PRON_VERB'], 'PRON_VERB': ['AUX_ADJ'],
+                       'ADP': ['SCONJ', 'ADV'], 'ADJ': ['NOUN'], 'NOUN': ['ADJ']}
         },
     }
```

(Lines shown as `…` were removed by this commit but their text was lost in extraction.)
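Taken together, the new constants API hands callers everything the distractor classifier needs. A minimal consumption sketch; the caller code is hypothetical, the names come from the diff above:

```python
# Hypothetical caller, wiring up the pieces defined in esp_constants.py.
from utilities_language_general.esp_constants import (load_w2v, load_classifiers,
                                                      MINIMUM_SETS, LEVEL_NUMBERS)

w2v = load_w2v('model1')                                   # gensim KeyedVectors
pos_dict, scaler, classifier = load_classifiers('model1')  # POS codes, MinMaxScaler, CatBoost

# Each level maps to (target_minimum, distractor_minimum); the distractor set
# is the union with the level below, so e.g. B2 distractors may also be B1 words.
target_minimum, distractor_minimum = MINIMUM_SETS['B2']
assert LEVEL_NUMBERS['B2'] == 3
```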
utilities_language_general/esp_utils.py CHANGED

```diff
@@ -1,8 +1,8 @@
 from nltk import edit_distance
 from utilities.utils import answer_letter
-from utilities_language_general.…
-from utilities_language_general.esp_constants import FIX_LEMMA
-…
+from utilities_language_general.similarity_measures import make_decision
+from utilities_language_general.esp_constants import nlp, FIX_LEMMA, COMBINE_POS
+
 
 
 def prepare_target_words(target_words):
@@ -107,12 +107,13 @@ def check_token_bert(token, current_minimum: set = None, check_allowed_pos: set
         return False
 
 
-def get_distractors_from_model(model, lemma: str, pos: str, gender: str or None, …
-                               distractor_minimum: set, level_name: str, max_num_distractors: int,
+def get_distractors_from_model(doc, model, scaler, classifier, pos_dict:dict, target_text:str, lemma: str, pos: str, gender: str, lemma_index:int, global_distractors: set,
+                               distractor_minimum: set, level_name: str, max_num_distractors: int,
                                max_length_ratio=5, min_edit_distance_ratio=0.5):
 
     distractors = []
     query = lemma if '_' in lemma else f'{lemma}_{pos}'
+    raw_lemma = query
     lemma = '_'.join(lemma.split('_')[::2])
     if model.has_index_for(query):
         candidates = model.most_similar(query, topn=max_num_distractors + 100)
@@ -126,19 +127,23 @@ def get_distractors_from_model(model, lemma: str, pos: str, gender: str or None,
         for candidate in candidates:
             if candidate[0].count('_') == 1 and pos != 'phrase':
                 distractor_lemma, distractor_pos = candidate[0].split('_')
+                decision = make_decision(doc, model_type='w2v', model=model, scaler=scaler, classifier=classifier, pos_dict=pos_dict,
+                                         level=level_name, target_lemma=raw_lemma, target_text=target_text, target_pos=pos, target_position=lemma_index,
+                                         substitute_lemma=distractor_lemma, substitute_pos=distractor_pos)
                 distractor_similarity = candidate[1]
                 candidate_gender = get_tags(distractor_lemma).get('Gender')
                 length_ratio = abs(len(lemma) - len(distractor_lemma))
                 condition = ((distractor_pos == pos
                               or (COMBINE_POS['simple'][level_name].get(pos) is not None and COMBINE_POS['simple'][level_name].get(distractor_pos) is not None
-                                  and distractor_pos in COMBINE_POS['simple'][level_name][pos] and pos in COMBINE_POS['simple'][level_name][distractor_pos])
-                             …
+                                  and distractor_pos in COMBINE_POS['simple'][level_name][pos] and pos in COMBINE_POS['simple'][level_name][distractor_pos]))
+                             and decision
                              and distractor_lemma != lemma
+                             and distractor_lemma not in lemma
+                             and lemma not in distractor_lemma
                              and (candidate_gender == gender and level_name in ('B1', 'B2', 'C1', 'C2'))
                              and length_ratio <= max_length_ratio
                              and distractor_lemma not in global_distractors
-                             and edit_distance(lemma, distractor_lemma) / ((len(lemma) + len(distractor_lemma)) / 2) >
-                             min_edit_distance_ratio)
+                             and edit_distance(lemma, distractor_lemma) / ((len(lemma) + len(distractor_lemma)) / 2) > min_edit_distance_ratio)
                 if condition:
                     if distractor_minimum is not None:
                         if distractor_lemma in distractor_minimum:
@@ -146,14 +151,18 @@ def get_distractors_from_model(model, lemma: str, pos: str, gender: str or None,
                             global_distractors.add(distractor_lemma)
                     else:
                         distractors.append((distractor_lemma, distractor_similarity))
-                        global_distractors.add(distractor_lemma)
+                        global_distractors.add(distractor_lemma)
             else:
-                if candidate[0].count('_') …
+                if (candidate[0].count('_') == 1  # REMOVE HOTFIX
+                        or candidate[0].count('_') > 3 or pos in ('NOUN', 'ADJ', 'NUM')):
                     continue
                 d1_lemma, d1_pos, d2_lemma, d2_pos = candidate[0].split('_')
                 d_pos = f'{d1_pos}_{d2_pos}'
                 distractor_lemma = f'{d1_lemma}_{d2_lemma}'
                 distractor_similarity = candidate[1]
+                decision = make_decision(doc, model_type='w2v', model=model, scaler=scaler, classifier=classifier, pos_dict=pos_dict,
+                                         level=level_name, target_lemma=raw_lemma, target_text=target_text, target_pos=pos, target_position=lemma_index,
+                                         substitute_lemma=candidate[0], substitute_pos=d_pos)
                 condition = (((d1_pos == pos or d2_pos == pos)
                               or (COMBINE_POS['phrase'][level_name].get(d_pos) is not None and COMBINE_POS['phrase'][level_name].get(pos) is not None
                               and d_pos in COMBINE_POS['phrase'][level_name].get(d_pos) and pos in COMBINE_POS['phrase'][level_name].get(pos) )
@@ -161,7 +170,10 @@ def get_distractors_from_model(model, lemma: str, pos: str, gender: str or None,
                               and pos in ('phrase', 'VERB', 'AUX', 'SCONJ', 'ADP'))
                              or (d2_pos in ('VERB', 'AUX', 'SCONJ', 'ADP')
                                  and pos in ('phrase', 'VERB', 'AUX', 'SCONJ', 'ADP')))
+                             and decision
                              and candidate[0] != lemma
+                             and distractor_lemma not in lemma
+                             and lemma not in distractor_lemma
                              and distractor_lemma != lemma
                              and distractor_lemma not in global_distractors)
                 if condition:
@@ -180,8 +192,8 @@ def get_distractors_from_model(model, lemma: str, pos: str, gender: str or None,
     return distractors
 
 
-def get_distractors_from_model_bert(model, text_with_masked_task: str, lemma: str, pos: str, gender: str …
-                                    global_distractors: set, distractor_minimum: set, level_name: str,
+def get_distractors_from_model_bert(doc, model, scaler, classifier, text_with_masked_task: str, lemma: str, pos: str, gender: str, lemma_index:int,
+                                    global_distractors: set, distractor_minimum: set, level_name: str, pos_dict:dict,
                                     max_num_distractors: int, max_length_ratio=5, min_edit_distance_ratio=0.5):
     _distractors = []
     try:
@@ -205,9 +217,13 @@ def get_distractors_from_model_bert(model, text_with_masked_task: str, lemma: str,
                 distractor_similarity = candidate_distractor[1]
                 candidate_gender = get_tags(distractor_lemma).get('Gender')
                 length_ratio = abs(len(lemma) - len(distractor_lemma))
+                decision = make_decision(doc, model_type='bert', scaler=scaler, classifier=classifier, pos_dict=pos_dict,
+                                         level=level_name, target_text=lemma, target_pos=pos, target_position=lemma_index,
+                                         substitute_text=distractor_lemma, substitute_pos=distractor_pos)
                 if ((distractor_pos == pos
                      or (COMBINE_POS['simple'][level_name].get(pos) is not None and COMBINE_POS['simple'][level_name].get(distractor_pos) is not None
                          and distractor_pos in COMBINE_POS['simple'][level_name][pos] and pos in COMBINE_POS['simple'][level_name][distractor_pos]))
+                    and decision
                     and distractor_lemma != lemma
                     and (len(_distractors) < max_num_distractors+100)
                     and (candidate_gender == gender and level_name in ('B1', 'B2', 'C1', 'C2'))
```
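Both branches of the w2v generator now gate every candidate on make_decision, the classifier call defined in the new similarity_measures.py below, and on two substring guards that drop candidates sharing a stem with the target. (The BERT branch gains the same gate but passes substitute_text=, a keyword make_decision does not declare, so that branch appears to be mid-migration.) The guards in isolation, with toy values not from the repo:

```python
# The new guards reject a candidate whose lemma contains, or is contained in,
# the target lemma: cheap stem-overlap filtering before the classifier runs.
lemma, distractor_lemma = 'gato', 'gatos'
passes = distractor_lemma not in lemma and lemma not in distractor_lemma
assert passes is False  # 'gato' is a substring of 'gatos', so it is filtered out
```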
utilities_language_general/similarity_measures.py ADDED (new file, 283 lines)

```python
import numpy as np
from math import pow
from nltk.corpus import wordnet as wn
from utilities_language_general.esp_constants import nlp, PHRASES, LEVEL_NUMBERS


def eucledian_distance(x, y):
    return np.sqrt(np.sum((x - y) ** 2))

def cosine_similarity(x, y):
    out = np.dot(x, y) / (np.sqrt(np.dot(x, x)) * np.sqrt(np.dot(y, y)))
    if str(out) != 'nan':
        return out
    return None

def get_vector_for_token(model, token):
    vector = None

    splitted = token.split('_')
    token_list = [f'{splitted[i]}_{splitted[i+1]}' for i in range(len(splitted)-1)]

    if model.has_index_for(token):
        vector = model.get_vector(token)
    else:
        try:
            vector = model.get_mean_vector(token_list)
        except ValueError:
            return None
    return vector

def compute_metric(func, vector1, vector2):
    if vector1 is not None and vector2 is not None:
        return func(vector1, vector2)
    else:
        return None

def compute_positive_cos(x, y):
    cos_sim = cosine_similarity(x, y)
    if cos_sim:
        return (cos_sim + 1) / 2
    else:
        return None

def addition_metric(substitute, target, context):
    substitute_target_cos = compute_metric(cosine_similarity, substitute, target)
    if not substitute_target_cos:
        return None
    if not context:
        return None

    context_vectors = []
    for context_tk in context:
        substitute_context_cos = compute_metric(cosine_similarity, substitute, context_tk)
        if substitute_context_cos:
            context_vectors.append(substitute_context_cos)
    sum_of_context_vectors = np.sum(context_vectors)

    metric = (substitute_target_cos + sum_of_context_vectors) / (len(context) + 1)
    return metric

def balanced_addition_metric(substitute, target, context):
    substitute_target_cos = compute_metric(cosine_similarity, substitute, target)
    if not substitute_target_cos:
        return None
    if not context:
        return None

    context_vectors = []
    for context_tk in context:
        substitute_context_cos = compute_metric(cosine_similarity, substitute, context_tk)
        if substitute_context_cos:
            context_vectors.append(substitute_context_cos)
    sum_of_context_vectors = np.sum(context_vectors)

    context_len = len(context)
    metric = (context_len * substitute_target_cos + sum_of_context_vectors) / (2 * context_len)
    return metric

def multiplication_metric(substitute, target, context):
    substitute_target_cos = compute_metric(compute_positive_cos, substitute, target)
    if not substitute_target_cos:
        return None
    if not context:
        return None

    context_vectors = []
    for context_tk in context:
        substitute_context_positive_cos = compute_metric(compute_positive_cos, substitute, context_tk)
        if substitute_context_positive_cos:
            context_vectors.append(substitute_context_positive_cos)
    prod_of_context_vectors = np.prod(context_vectors)
    try:
        metric = pow((substitute_target_cos + prod_of_context_vectors), 1 / (len(context) + 1))
    except ValueError:
        return None
    return metric

def balanced_multiplication_metric(substitute, target, context):
    substitute_target_cos = compute_metric(compute_positive_cos, substitute, target)
    if not substitute_target_cos:
        return None
    if not context:
        return None

    context_vectors = []
    for context_tk in context:
        substitute_context_positive_cos = compute_metric(compute_positive_cos, substitute, context_tk)
        if substitute_context_positive_cos:
            context_vectors.append(substitute_context_positive_cos)
    prod_of_context_vectors = np.prod(context_vectors)

    context_len = len(context)
    try:
        metric = pow((pow(substitute_target_cos, context_len) + prod_of_context_vectors), 1 / (2 * context_len))
    except ValueError:
        return None
    return metric

def bind_phrases(context_list):
    context = []
    previous_was_phrase = False
    for i in range(len(context_list)-1):
        phrase_candidate = f'{context_list[i]}_{context_list[i+1]}'
        if phrase_candidate in PHRASES and not previous_was_phrase:
            context.append(phrase_candidate)
            previous_was_phrase = True
        else:
            if not previous_was_phrase:
                context.append(context_list[i])
            previous_was_phrase = False
    if context_list:
        if not context:
            context.append(context_list[-1])
        elif not context_list[-1] in context[-1]:
            context.append(context_list[-1])
    return context

def get_context_windows(doc, target_text, window_size):
    sentence_str = doc.text
    sentence_masked = sentence_str.lower().replace(target_text.lower().strip(), ' [MASK] ')
    alpha_tokens_lemma_pos = [f'{tk.lemma_.lower()}_{tk.pos_}' for tk in nlp(sentence_masked) if tk.text.isalpha()]
    alpha_tokens_lemma_pos_no_stop = [f'{tk.lemma_.lower()}_{tk.pos_}' for tk in nlp(sentence_masked) if tk.text.isalpha() and not tk.is_stop]
    try:
        mask_token_index = alpha_tokens_lemma_pos.index('mask_NUM')
        mask_token_index_no_stop = alpha_tokens_lemma_pos_no_stop.index('mask_NUM')
    except ValueError:
        return None

    left_border = max(mask_token_index-window_size, 0)
    right_border = min(mask_token_index+window_size, len(alpha_tokens_lemma_pos))
    l_context = alpha_tokens_lemma_pos[left_border:mask_token_index]
    r_context = alpha_tokens_lemma_pos[mask_token_index+1:right_border+1]

    left_border_no_stop = max(mask_token_index_no_stop-window_size, 0)
    right_border_no_stop = min(mask_token_index_no_stop+window_size, len(alpha_tokens_lemma_pos_no_stop))
    l_context_no_stop = alpha_tokens_lemma_pos_no_stop[left_border_no_stop:mask_token_index_no_stop]
    r_context_no_stop = alpha_tokens_lemma_pos_no_stop[mask_token_index_no_stop+1:right_border_no_stop+1]
    return (bind_phrases(l_context) + bind_phrases(r_context), bind_phrases(l_context_no_stop) + bind_phrases(r_context_no_stop))

def get_context_linked_words(doc, target_position, target_text):
    answer_list = target_text.split(' ')
    context_words = []
    for tk in doc:
        if tk.text.isalpha():
            if (tk.text in answer_list and abs(target_position - tk.idx) <= sum([len(t) for t in answer_list])):
                context_words.extend([t for t in tk.subtree if t.text.isalpha() and not t.is_stop])
                context_words.extend([t for t in tk.children if t.text.isalpha() and not t.is_stop])
                context_words.extend([t for t in tk.ancestors if t.text.isalpha() and not t.is_stop])
    context_words = [(tk, f'{tk.lemma_}_{tk.pos_}') for tk in sorted(set(context_words), key=lambda tk: tk.i) if tk.text not in answer_list]
    context = []
    previous_was_phrase = False
    for i in range(len(context_words)-1):
        phrase_candidate = f'{context_words[i][1]}_{context_words[i+1][1]}'
        if phrase_candidate in PHRASES and not previous_was_phrase and abs(context_words[i][0].i - context_words[i+1][0].i) <= 1:
            context.append(phrase_candidate)
            previous_was_phrase = True
        else:
            if not previous_was_phrase:
                context.append(context_words[i][1])
    if context and context_words:
        if not context_words[-1][1] in context[-1]:
            context.append(context_words[-1][1])
    elif context_words:
        context.append(context_words[-1][1])
    return context

def get_word_net_similarity(token1, token2, metric):
    token1_list = token1.split('_')[::2]
    token2_list = token2.split('_')[::2]
    data = []
    for token1_part in token1_list:
        for syn1 in wn.synsets(token1_part, lang='spa'):
            for token2_part in token2_list:
                for syn2 in wn.synsets(token2_part, lang='spa'):
                    if syn1.pos() == syn2.pos():
                        data.append(metric(syn1, syn2))
    if data:
        data = np.array(data)
        return data.min(), data.max(), data.mean(), data.std()
    else:
        return None, None, None, None

def compute_all_necessary_metrics(target_lemma, target_text, target_position, substitute_lemma, doc, model_type:str, model=None):

    path_similarity_min, path_similarity_max, path_similarity_mean, path_similarity_std = get_word_net_similarity(target_lemma, substitute_lemma, wn.path_similarity)
    wup_similarity_min, wup_similarity_max, wup_similarity_mean, wup_similarity_std = get_word_net_similarity(target_lemma, substitute_lemma, wn.wup_similarity)
    lch_similarity_min, lch_similarity_max, lch_similarity_mean, lch_similarity_std = get_word_net_similarity(target_lemma, substitute_lemma, wn.lch_similarity)

    if model_type == 'bert':
        return (path_similarity_min, path_similarity_max, path_similarity_mean, path_similarity_std,
                wup_similarity_min, wup_similarity_max, wup_similarity_mean, wup_similarity_std,
                lch_similarity_min, lch_similarity_max, lch_similarity_mean, lch_similarity_std)

    target_vector = get_vector_for_token(model, target_lemma)
    substitute_vector = get_vector_for_token(model, substitute_lemma)

    cosimilarity = compute_metric(cosine_similarity, substitute_vector, target_vector)
    eucledian_similarity = compute_metric(eucledian_distance, substitute_vector, target_vector)

    context_window3, context_window3_no_stop = get_context_windows(doc=doc, target_text=target_text, window_size=3)
    context_window5, context_window5_no_stop = get_context_windows(doc=doc, target_text=target_text, window_size=5)
    context_window_synt = get_context_linked_words(doc, target_position, target_text)

    context_window3 = [get_vector_for_token(model, token) for token in context_window3]
    context_window3_no_stop = [get_vector_for_token(model, token) for token in context_window3_no_stop]
    context_window5 = [get_vector_for_token(model, token) for token in context_window5]
    context_window5_no_stop = [get_vector_for_token(model, token) for token in context_window5_no_stop]
    context_window_synt = [get_vector_for_token(model, token) for token in context_window_synt]

    add_metric_window3 = addition_metric(target_vector, substitute_vector, context_window3)
    bal_add_metric_window3 = balanced_addition_metric(target_vector, substitute_vector, context_window3)
    add_metric_window3_no_stop = addition_metric(target_vector, substitute_vector, context_window3_no_stop)
    bal_add_metric_window3_no_stop = balanced_addition_metric(target_vector, substitute_vector, context_window3_no_stop)

    mult_metric_window3 = multiplication_metric(target_vector, substitute_vector, context_window3)
    bal_mult_metric_window3 = balanced_multiplication_metric(target_vector, substitute_vector, context_window3)
    mult_metric_window3_no_stop = multiplication_metric(target_vector, substitute_vector, context_window3_no_stop)
    bal_mult_metric_window3_no_stop = balanced_multiplication_metric(target_vector, substitute_vector, context_window3_no_stop)

    add_metric_window5 = addition_metric(target_vector, substitute_vector, context_window5)
    bal_add_metric_window5 = balanced_addition_metric(target_vector, substitute_vector, context_window5)
    add_metric_window5_no_stop = addition_metric(target_vector, substitute_vector, context_window5_no_stop)
    bal_add_metric_window5_no_stop = balanced_addition_metric(target_vector, substitute_vector, context_window5_no_stop)

    mult_metric_window5 = multiplication_metric(target_vector, substitute_vector, context_window5)
    bal_mult_metric_window5 = balanced_multiplication_metric(target_vector, substitute_vector, context_window5)
    mult_metric_window5_no_stop = multiplication_metric(target_vector, substitute_vector, context_window5_no_stop)
    bal_mult_metric_window5_no_stop = balanced_multiplication_metric(target_vector, substitute_vector, context_window5_no_stop)

    add_metric_synt = addition_metric(target_vector, substitute_vector, context_window_synt)
    bal_add_metric_synt = balanced_addition_metric(target_vector, substitute_vector, context_window_synt)

    mult_metric_synt = multiplication_metric(target_vector, substitute_vector, context_window_synt)
    bal_mult_metric_synt = balanced_multiplication_metric(target_vector, substitute_vector, context_window_synt)

    return (cosimilarity, eucledian_similarity,
            add_metric_window3, bal_add_metric_window3,
            mult_metric_window3, bal_mult_metric_window3,
            add_metric_window3_no_stop, bal_add_metric_window3_no_stop,
            mult_metric_window3_no_stop, bal_mult_metric_window3_no_stop,
            add_metric_window5, bal_add_metric_window5,
            mult_metric_window5, bal_mult_metric_window5,
            add_metric_window5_no_stop, bal_add_metric_window5_no_stop,
            mult_metric_window5_no_stop, bal_mult_metric_window5_no_stop,
            add_metric_synt, bal_add_metric_synt,
            mult_metric_synt, bal_mult_metric_synt,
            path_similarity_min, path_similarity_mean, path_similarity_std, path_similarity_max,
            wup_similarity_min, wup_similarity_mean, wup_similarity_std, wup_similarity_max,
            lch_similarity_min, lch_similarity_mean, lch_similarity_std, lch_similarity_max)

def make_decision(doc, model_type, scaler, classifier, pos_dict, level, target_lemma, target_text, target_pos, target_position,
                  substitute_lemma, substitute_pos, model=None, bert_score=None):
    # return True
    metrics = compute_all_necessary_metrics(target_lemma=target_lemma, target_text=target_text, target_position=target_position,
                                            substitute_lemma=substitute_lemma, doc=doc, model_type=model_type, model=model)
    target_multiword, substitute_multiword = target_lemma.count('_') > 2, substitute_lemma.count('_') > 2
    data = [LEVEL_NUMBERS.get(level), pos_dict.get(target_pos), target_multiword, pos_dict.get(substitute_pos), substitute_multiword] + scaler.transform([metrics]).tolist()[0]
    if model_type == 'bert':
        data = [LEVEL_NUMBERS.get(level), pos_dict.get(target_pos), target_multiword, pos_dict.get(substitute_pos), substitute_multiword, bert_score] + scaler.transform([metrics]).tolist()[0]
    predict = classifier.predict(data)
    return bool(predict)
```
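To make the context metrics concrete, here is a worked toy example with 2-d stand-ins for word vectors (values invented for illustration). It mirrors addition_metric and multiplication_metric above, including the latter's sum of the positive target cosine and the product of positive context cosines:

```python
import numpy as np
from math import pow

def cos(x, y):
    return float(np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y)))

target     = np.array([1.0, 0.0])
substitute = np.array([0.8, 0.6])    # unit length; cos(substitute, target) = 0.8
context    = [np.array([0.0, 1.0])]  # one context token; cos(substitute, ctx) = 0.6

# addition_metric: mean of the target cosine and all context cosines
add = (cos(substitute, target) + sum(cos(substitute, c) for c in context)) / (len(context) + 1)
assert abs(add - 0.7) < 1e-9

# multiplication_metric: positive cosines, product over context, (n+1)-th root
positive = lambda x, y: (cos(x, y) + 1) / 2  # as in compute_positive_cos
mult = pow(positive(substitute, target) + np.prod([positive(substitute, c) for c in context]),
           1 / (len(context) + 1))
assert abs(mult - pow(0.9 + 0.8, 0.5)) < 1e-9  # about 1.304
```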
utilities_language_w2v/esp_main_workflow_w2v.py CHANGED

```diff
@@ -1,30 +1,24 @@
 import datetime
 from io import StringIO
+from typing import Union
 from random import sample
 from collections import defaultdict
 from streamlit import progress as st_progress
 from streamlit.elements import WIDGETS as ST_WIDGETS
-from utilities_language_general.esp_constants import st
-from utilities_language_w2v.esp_sentence_w2v import TASK
-from utilities_language_w2v.esp_sentence_w2v import SENTENCE
-from utilities_language_general.esp_constants import load_w2v
-from utilities_language_general.esp_utils import prepare_tasks
 from streamlit.runtime.uploaded_file_manager import UploadedFile
 import utilities_language_general.esp_constants as esp_constants
-from …
-from utilities_language_general.…
-from utilities_language_general.esp_constants import …
-…
-from utilities_language_general.esp_utils import compute_frequency_dict
-from utilities_language_general.esp_constants import BAD_USER_TARGET_WORDS
+from utilities_language_w2v.esp_sentence_w2v import TASK, SENTENCE
+from utilities_language_general.esp_utils import prepare_tasks, prepare_target_words, compute_frequency_dict
+from utilities_language_general.esp_constants import st, load_w2v, load_classifiers, nlp, summarization, BAD_USER_TARGET_WORDS, MINIMUM_SETS
+
 
 
 def main_workflow(
-        file: UploadedFile…
+        file: Union[UploadedFile, None],
         text: str,
-        logs…
-        progress…
-        progress_d…
+        logs,
+        progress,
+        progress_d,
         level: str,
         tw_mode_automatic_mode: str,
         target_words: str,
@@ -68,15 +62,15 @@ def main_workflow(
     elif text != '':
         current_text = text
     else:
-        …
+        st.warning('Вы и текст не вставили, и файл не выбрали 😢')
         current_text = ''
-        …
+        st.stop()
 
     # Process target words
     if tw_mode_automatic_mode == 'Самостоятельно':
         if target_words == '':
-            …
-            …
+            st.warning('Вы не ввели целевые слова')
+            st.stop()
         # Cannot make up paradigm, so only USER_TARGET_WORDS is used
         USER_TARGET_WORDS = prepare_target_words(target_words)
         tw_mode_automatic_mode = False
@@ -107,27 +101,8 @@ def main_workflow(
     progress.progress(15)
 
     # Choose necessary language minimum according to user's input
-    if level == 'A1':
-        target_minimum = esp_constants.a1_target_set
-        distractor_minimum = esp_constants.a1_distractor_set
-    elif level == 'A2':
-        target_minimum = esp_constants.a2_target_set
-        distractor_minimum = esp_constants.a2_distractor_set
-    elif level == 'B1':
-        target_minimum = esp_constants.b1_target_set
-        distractor_minimum = esp_constants.b1_distractor_set
-    elif level == 'B2':
-        target_minimum = esp_constants.b2_target_set
-        distractor_minimum = esp_constants.b2_distractor_set
-    elif level == 'C1':
-        target_minimum = esp_constants.c1_target_set
-        distractor_minimum = esp_constants.c1_distractor_set
-    elif level == 'C2':
-        target_minimum = esp_constants.c2_target_set
-        distractor_minimum = esp_constants.c2_distractor_set
-    elif level == 'Без уровня':
-        target_minimum = None
-        distractor_minimum = None
+    if level:
+        target_minimum, distractor_minimum = MINIMUM_SETS[level]
     else:
         target_minimum = None
         distractor_minimum = None
@@ -137,9 +112,11 @@ def main_workflow(
     # Define which model is used for distractor generation
     logs.update(label='Загружаем языковые модели и другие данные', state='running')
     if model_name == 'Модель-1':
-        mask_filler = load_w2v(…)
+        mask_filler = load_w2v('model1')
+        pos_dict, scaler, classifier = load_classifiers('model1')
     else:
-        mask_filler = load_w2v(…)
+        mask_filler = load_w2v('model2')
+        pos_dict, scaler, classifier = load_classifiers('model1')
 
     # Start generation process
     workflow = [SENTENCE(original=sent.strip(), n_sentence=num, max_num_distractors=num_distractors)
@@ -197,6 +174,9 @@ def main_workflow(
 
     for sentence in workflow:
         sentence.attach_distractors_to_target_word(model=mask_filler,
+                                                   scaler=scaler,
+                                                   classifier=classifier,
+                                                   pos_dict=pos_dict,
                                                    global_distractors=GLOBAL_DISTRACTORS,
                                                    distractor_minimum=distractor_minimum,
                                                    level_name=level,
@@ -235,8 +215,8 @@ def main_workflow(
         NUMBER_TASKS = 10
     else:
         NUMBER_TASKS = len(RESULT_TASKS)
-    RESULT_TASKS_in_summary = filter(lambda task: task.in_summary, RESULT_TASKS)
-    RESULT_TASTS_not_in_summary = filter(lambda task: not task.in_summary, RESULT_TASKS)
+    RESULT_TASKS_in_summary = list(filter(lambda task: task.in_summary, RESULT_TASKS))
+    RESULT_TASTS_not_in_summary = list(filter(lambda task: not task.in_summary, RESULT_TASKS))
    if len(RESULT_TASKS_in_summary) >= NUMBER_TASKS:
         RESULT_TASKS = RESULT_TASKS_in_summary
     else:
```
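The last hunk fixes a genuine bug: filter() returns a lazy iterator in Python 3, so the subsequent len(RESULT_TASKS_in_summary) would raise TypeError. A short demonstration of why the list() wrapper is needed:

```python
tasks = [1, 2, 3]
lazy = filter(lambda t: t > 1, tasks)
# len(lazy) raises TypeError: object of type 'filter' has no len()
eager = list(filter(lambda t: t > 1, tasks))
assert len(eager) == 2
```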
utilities_language_w2v/esp_sentence_w2v.py CHANGED

```diff
@@ -1,13 +1,9 @@
 import string
-from random import random
-from random import sample
-from utilities_language_general.esp_constants import nlp
+from random import random, sample
 from utilities_language_general.morphology import inflect
-from utilities_language_general.…
-from utilities_language_general.…
-…
-from utilities_language_general.esp_constants import BAD_USER_TARGET_WORDS
-from utilities_language_general.esp_utils import get_distractors_from_model
+from utilities_language_general.esp_constants import nlp, PHRASES, BAD_USER_TARGET_WORDS
+from utilities_language_general.esp_utils import check_token, fix_irregular_lemma, get_distractors_from_model
+
 
 
 class SENTENCE:
@@ -46,6 +42,7 @@ class SENTENCE:
             if not previous_was_phrase:
                 self.sentence_phrases.append(self.sentence_lemma_pos[i][1])
                 previous_was_phrase = False
+        self.sentence_phrases.append(self.sentence_lemma_pos[-1][1])
 
     def search_target_words_automatically(self, model, target_minimum: set, frequency_dict: dict = None, summary:list=None):
         for token in self.sentence_phrases:
@@ -152,18 +149,15 @@ class SENTENCE:
         self.search_user_target_words(model=model, user_target_words=user_target_words,
                                       frequency_dict=frequency_dict, summary=summary)
 
-    def attach_distractors_to_target_word(self, model, global_distractors, distractor_minimum, level_name,
-                                          max_frequency, logs, progress):
+    def attach_distractors_to_target_word(self, model, scaler, classifier, pos_dict, global_distractors, distractor_minimum, level_name, max_frequency, logs, progress):
         n_target_words = len(self.target_words)
         bad_target_words = []
         for i, target_word in enumerate(self.target_words):
-            distractors = get_distractors_from_model(model,
-                                                     …
-                                                     global_distractors=global_distractors,
-                                                     distractor_minimum=distractor_minimum,
-                                                     max_num_distractors=self.max_num_distractors)
+            distractors = get_distractors_from_model(doc=self.parsed, model=model, scaler=scaler, classifier=classifier, pos_dict=pos_dict,
+                                                     target_text=target_word['original_text'], lemma=target_word['lemma'], pos=target_word['pos'], gender=target_word['gender'],
+                                                     lemma_index=target_word['position_in_sentence'], global_distractors=global_distractors,
+                                                     distractor_minimum=distractor_minimum, level_name=level_name, max_num_distractors=self.max_num_distractors)
             if distractors is None or target_word['frequency_in_text'] > max_frequency:
-                target_word['distractors'] = distractors
                 bad_target_words.append(target_word)
             target_word['distractors'] = distractors
             target_word['distractors_number'] = len(distractors) if distractors is not None else 0
```