a-v-bely committed · Commit 92aa5ff · Parent(s): 192a825
upd code
utilities_language_bert/rus_sentence_bert.py
CHANGED
@@ -21,25 +21,7 @@ class SENTENCE:
         self.sentence_lemma_pos.append((lemma_pos, token))
 
     def bind_phrases(self):
-
-        for i in range(len(self.sentence_lemma_pos) - 1):
-            phrase_candidate = f'{self.sentence_lemma_pos[i][0]}_{self.sentence_lemma_pos[i + 1][0]}'
-            if phrase_candidate in PHRASES and not previous_was_phrase:
-                # phrase is {phrase: {original_token1: spacy.token, original_token2: spacy.token}}
-                phrase = [
-                    f'{self.sentence_lemma_pos[i][0]}_{self.sentence_lemma_pos[i + 1][0]}',
-                    {
-                        'original_token1': self.sentence_lemma_pos[i][1],
-                        'original_token2': self.sentence_lemma_pos[i + 1][1]
-                    }
-                ]
-                self.sentence_phrases.append(phrase)
-                previous_was_phrase = True
-            else:
-                if not previous_was_phrase:
-                    self.sentence_phrases.append(self.sentence_lemma_pos[i][1])
-                previous_was_phrase = False
-        self.sentence_phrases.append(self.sentence_lemma_pos[-1][1])
+        self.sentence_phrases = self.parsed
 
     def search_target_words_automatically(self, target_minimum: set, frequency_dict: dict = None, summary: list=None):
         for token in self.sentence_phrases:
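Note: as shown, the removed bind_phrases body reads previous_was_phrase before ever assigning it (the initialization may have been lost in an earlier commit), so the loop would raise NameError on its first iteration; the commit sidesteps this by aliasing the already-parsed tokens. For reference, a self-contained sketch of the greedy bigram binding the loop was attempting, with the missing flag initialization added (this helper is hypothetical; PHRASES membership is modeled by the `phrases` argument):

    # Hypothetical, self-contained version of the removed greedy bigram binding.
    # `sentence_lemma_pos` is a list of (lemma_pos, token) pairs. Unlike the
    # removed method, `previous_was_phrase` is initialized before the loop.
    def bind_phrases(sentence_lemma_pos, phrases):
        sentence_phrases = []
        previous_was_phrase = False
        for i in range(len(sentence_lemma_pos) - 1):
            candidate = f'{sentence_lemma_pos[i][0]}_{sentence_lemma_pos[i + 1][0]}'
            if candidate in phrases and not previous_was_phrase:
                # Keep the joined lemma_pos key together with both original tokens.
                sentence_phrases.append([candidate, {
                    'original_token1': sentence_lemma_pos[i][1],
                    'original_token2': sentence_lemma_pos[i + 1][1],
                }])
                previous_was_phrase = True
            else:
                if not previous_was_phrase:
                    sentence_phrases.append(sentence_lemma_pos[i][1])
                previous_was_phrase = False
        # Mirrors the removed code: the final token is always appended.
        sentence_phrases.append(sentence_lemma_pos[-1][1])
        return sentence_phrases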
utilities_language_general/rus_constants.py
CHANGED
@@ -109,22 +109,22 @@ BAD_USER_TARGET_WORDS = []
 COMBINE_POS = {
     'simple':
         {
-            'A1': {'VERB': ['AUX']},
-            'A2': {'VERB': ['AUX']},
-            'B1': {'VERB': ['AUX']},
-            'B2': {'VERB': ['AUX']},
-            'C1': {'VERB': ['AUX']},
-            'C2': {'VERB': ['AUX']},
-            'Без уровня': {'VERB': ['AUX']}
+            'A1': {'VERB': ['AUX'], '': ['VERB'], 'ADV':['ADJ'], 'ADJ': ['ADV'], },
+            'A2': {'VERB': ['AUX'], '': ['VERB'], 'ADV':['ADJ'], 'ADJ': ['ADV'], },
+            'B1': {'VERB': ['AUX'], '': ['VERB'], 'ADV':['ADJ'], 'ADJ': ['ADV'], },
+            'B2': {'VERB': ['AUX'], '': ['VERB'], },
+            'C1': {'VERB': ['AUX'], '': ['VERB'], },
+            'C2': {'VERB': ['AUX'], '': ['VERB'], },
+            'Без уровня': {'VERB': ['AUX'], '': ['VERB'], 'ADV':['ADJ'], 'ADJ': ['ADV'], }
         },
     'phrase':
         {
-            'A1': {'VERB': ['AUX']},
-            'A2': {'VERB': ['AUX']},
-            'B1': {'VERB': ['AUX']},
-            'B2': {'VERB': ['AUX']},
-            'C1': {'VERB': ['AUX']},
-            'C2': {'VERB': ['AUX']},
-            'Без уровня': {'VERB': ['AUX']}
+            'A1': {'VERB': ['AUX'], '': ['VERB'], 'ADV':['ADJ'], 'ADJ': ['ADV'], },
+            'A2': {'VERB': ['AUX'], '': ['VERB'], 'ADV':['ADJ'], 'ADJ': ['ADV'], },
+            'B1': {'VERB': ['AUX'], '': ['VERB'], 'ADV':['ADJ'], 'ADJ': ['ADV'], },
+            'B2': {'VERB': ['AUX'], '': ['VERB'], },
+            'C1': {'VERB': ['AUX'], '': ['VERB'], },
+            'C2': {'VERB': ['AUX'], '': ['VERB'], },
+            'Без уровня': {'VERB': ['AUX'], '': ['VERB'], 'ADV':['ADJ'], 'ADJ': ['ADV'], }
         },
 }
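The update widens the POS-combination table: every level now lets an empty-POS token combine with VERB, and A1, A2, B1, and 'Без уровня' ("no level") additionally allow ADV/ADJ pairings. A minimal sketch of how such a table can be consulted (the may_combine helper is hypothetical, not part of the repo):

    # Hypothetical helper showing a typical lookup against a COMBINE_POS-style
    # table: may a token with `neighbor_pos` merge with a target of `target_pos`
    # for the given mode ('simple' or 'phrase') and CEFR level?
    def may_combine(combine_pos: dict, mode: str, level: str,
                    target_pos: str, neighbor_pos: str) -> bool:
        allowed = combine_pos.get(mode, {}).get(level, {})
        return neighbor_pos in allowed.get(target_pos, [])

    # With the updated table:
    # may_combine(COMBINE_POS, 'simple', 'A1', 'VERB', 'AUX') -> True
    # may_combine(COMBINE_POS, 'simple', 'B2', 'ADV', 'ADJ') -> False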
utilities_language_general/rus_utils.py
CHANGED
@@ -99,14 +99,13 @@ def make_inflection(text: str, pos: str or list, tags: set, level: str) -> str o
         return None
     else:
         word_form = morph.parse(text)[0].inflect(tags)
-
-        if pos == 'VERB' and 'impf' in tags and level in ('A1', 'A2'):
+        if word_form is None and pos == 'VERB' and 'impf' in tags and level in ('A1', 'A2'):
             tags.discard('impf')
             tags.add('perf')
             word_form = morph.parse(text)[0].inflect(tags)
             if word_form is not None:
                 return word_form.word
-        elif pos == 'VERB' and 'perf' in tags and level in ('A1', 'A2'):
+        elif word_form is None and pos == 'VERB' and 'perf' in tags and level in ('A1', 'A2'):
             tags.discard('perf')
             tags.add('impf')
             word_form = morph.parse(text)[0].inflect(tags)
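Both fallback branches are now guarded by word_form is None, so the verb-aspect swap for A1/A2 runs only when the first inflection attempt fails, instead of unconditionally overwriting a successful one. A minimal sketch of the same pattern with pymorphy2 (the helper name is hypothetical; morph mirrors the module's analyzer):

    import pymorphy2

    morph = pymorphy2.MorphAnalyzer()

    # Hypothetical helper mirroring the fixed control flow: retry with the
    # opposite verb aspect only when the first inflection attempt returns None.
    def inflect_with_aspect_fallback(text: str, tags: set, level: str):
        word_form = morph.parse(text)[0].inflect(tags)
        if word_form is None and 'impf' in tags and level in ('A1', 'A2'):
            word_form = morph.parse(text)[0].inflect((tags - {'impf'}) | {'perf'})
        elif word_form is None and 'perf' in tags and level in ('A1', 'A2'):
            word_form = morph.parse(text)[0].inflect((tags - {'perf'}) | {'impf'})
        return word_form.word if word_form is not None else None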
@@ -209,7 +208,7 @@ def check_token_bert(token, current_minimum: set = None, stop_words=stop_list,
 
 def get_distractors_from_model(doc, model, scaler, classifier, pos_dict:dict, target_text:str, lemma: str, pos: str, gender: str,
                                lemma_index:int, global_distractors: set, distractor_minimum: set, level_name: str, max_num_distractors: int,
-                               max_length_ratio=5, min_edit_distance_ratio=0.
+                               max_length_ratio=5, min_edit_distance_ratio=0.4):
     distractors = []
     query = lemma if '_' in lemma else f'{lemma}_{pos}'
     lemma = '_'.join(lemma.split('_')[::2])
@@ -240,8 +239,7 @@ def get_distractors_from_model(doc, model, scaler, classifier, pos_dict:dict, ta
                 and (candidate_gender == gender and level_name in ('B1', 'B2', 'C1', 'C2') or level_name in ('A1', 'A2'))
                 and length_ratio <= max_length_ratio
                 and distractor_lemma not in global_distractors
-                and edit_distance(lemma, distractor_lemma) / ((len(lemma) + len(distractor_lemma)) / 2)
-                    min_edit_distance_ratio)
+                and edit_distance(lemma, distractor_lemma) / ((len(lemma) + len(distractor_lemma)) / 2) >= min_edit_distance_ratio)
             if condition:
                 if distractor_minimum is not None:
                     if distractor_lemma in distractor_minimum:
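This hunk restores the truncated comparison: the length-normalized edit distance must now be at least min_edit_distance_ratio (0.4 after this commit). A small sketch of that metric, assuming edit_distance is NLTK's Levenshtein implementation (an assumption; the import is not shown in this diff):

    from nltk.metrics import edit_distance  # assumed to match the repo's edit_distance

    # Length-normalized edit distance used by the restored condition: Levenshtein
    # distance divided by the mean lemma length. Candidates below the threshold
    # are considered too similar to the target and dropped.
    def edit_distance_ratio(lemma: str, distractor_lemma: str) -> float:
        return edit_distance(lemma, distractor_lemma) / ((len(lemma) + len(distractor_lemma)) / 2)

    # edit_distance_ratio('дом', 'том') ~= 0.33 -> rejected at threshold 0.4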
@@ -290,7 +288,7 @@ def get_distractors_from_model(doc, model, scaler, classifier, pos_dict:dict, ta
 
 def get_distractors_from_model_bert(model, scaler, classifier, pos_dict:dict, level_name: str, lemma: str, pos: str, gender: str,
                                     text_with_masked_task: str, global_distractors: set, distractor_minimum: set,
-                                    max_num_distractors: int, max_length_ratio=5, min_edit_distance_ratio=0.
+                                    max_num_distractors: int, max_length_ratio=5, min_edit_distance_ratio=0.4):
     _distractors = []
     try:
         if distractor_minimum:
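get_distractors_from_model_bert receives text_with_masked_task, i.e. the exercise text with the target replaced by a mask token. Purely for illustration (this is not necessarily the Space's actual setup, and the model name below is an assumption), such a string can be scored with a transformers fill-mask pipeline before the candidates are filtered by conditions like the one in the next hunk:

    from transformers import pipeline

    # Generic illustration only: a masked Russian sentence is scored by a
    # fill-mask pipeline; each candidate token comes back with a probability.
    fill_mask = pipeline('fill-mask', model='DeepPavlov/rubert-base-cased')
    for candidate in fill_mask('Он [MASK] в школу.', top_k=10):
        print(candidate['token_str'], round(candidate['score'], 3))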
@@ -329,7 +327,7 @@ def get_distractors_from_model_bert(model, scaler, classifier, pos_dict:dict, le
                     and (length_ratio <= max_length_ratio)  # May be changed if case of phrases
                     and (distractor_lemma not in global_distractors)
                     and (edit_distance(lemma, distractor_lemma)  # May be changed if case of phrases
-                         / ((len(lemma) + len(distractor_lemma)) / 2)
+                         / ((len(lemma) + len(distractor_lemma)) / 2) >= min_edit_distance_ratio))
                 if condition:
                     if distractor_minimum is not None:
                         if distractor_lemma in distractor_minimum: