a-v-bely committed
Commit 92aa5ff · 1 Parent(s): 192a825
utilities_language_bert/rus_sentence_bert.py CHANGED
@@ -21,25 +21,7 @@ class SENTENCE:
             self.sentence_lemma_pos.append((lemma_pos, token))
 
     def bind_phrases(self):
-        previous_was_phrase = False
-        for i in range(len(self.sentence_lemma_pos) - 1):
-            phrase_candidate = f'{self.sentence_lemma_pos[i][0]}_{self.sentence_lemma_pos[i + 1][0]}'
-            if phrase_candidate in PHRASES and not previous_was_phrase:
-                # phrase is {phrase: {original_token1: spacy.token, original_token2: spacy.token}}
-                phrase = [
-                    f'{self.sentence_lemma_pos[i][0]}_{self.sentence_lemma_pos[i + 1][0]}',
-                    {
-                        'original_token1': self.sentence_lemma_pos[i][1],
-                        'original_token2': self.sentence_lemma_pos[i + 1][1]
-                    }
-                ]
-                self.sentence_phrases.append(phrase)
-                previous_was_phrase = True
-            else:
-                if not previous_was_phrase:
-                    self.sentence_phrases.append(self.sentence_lemma_pos[i][1])
-                previous_was_phrase = False
-        self.sentence_phrases.append(self.sentence_lemma_pos[-1][1])
+        self.sentence_phrases = self.parsed
 
     def search_target_words_automatically(self, target_minimum: set, frequency_dict: dict = None, summary: list=None):
        for token in self.sentence_phrases:
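Note: this commit replaces the bigram phrase-binding pass with a plain passthrough (`self.sentence_phrases = self.parsed`), so known collocations are no longer merged into single items before target-word search. For reference, here is a self-contained sketch of the removed behaviour; the `PHRASES` entry and the toy input are hypothetical, not taken from the repo.

# Self-contained sketch of the removed bind_phrases pass (hypothetical data).
PHRASES = {'принимать_VERB_участие_NOUN'}  # hypothetical collocation entry

def bind_phrases(lemma_pos_pairs, phrases):
    """Merge adjacent (lemma_pos, token) pairs whose joined key is a known phrase."""
    bound = []
    previous_was_phrase = False
    for i in range(len(lemma_pos_pairs) - 1):
        candidate = f'{lemma_pos_pairs[i][0]}_{lemma_pos_pairs[i + 1][0]}'
        if candidate in phrases and not previous_was_phrase:
            bound.append([candidate, {'original_token1': lemma_pos_pairs[i][1],
                                      'original_token2': lemma_pos_pairs[i + 1][1]}])
            previous_was_phrase = True
        else:
            if not previous_was_phrase:
                bound.append(lemma_pos_pairs[i][1])
            previous_was_phrase = False
    bound.append(lemma_pos_pairs[-1][1])  # the last token can never open a new pair
    return bound

pairs = [('принимать_VERB', 'принимает'), ('участие_NOUN', 'участие'), ('в_ADP', 'в')]
print(bind_phrases(pairs, PHRASES))
# [['принимать_VERB_участие_NOUN', {'original_token1': 'принимает', 'original_token2': 'участие'}], 'в']

On the toy input, the first two tokens are merged into one phrase item and the trailing token is kept as-is, which is the shape `search_target_words_automatically` used to receive.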
utilities_language_general/rus_constants.py CHANGED
@@ -109,22 +109,22 @@ BAD_USER_TARGET_WORDS = []
 COMBINE_POS = {
     'simple':
         {
-            'A1': {'VERB': ['AUX']},
-            'A2': {'VERB': ['AUX']},
-            'B1': {'VERB': ['AUX']},
-            'B2': {'VERB': ['AUX']},
-            'C1': {'VERB': ['AUX']},
-            'C2': {'VERB': ['AUX']},
-            'Без уровня': {'VERB': ['AUX']}
+            'A1': {'VERB': ['AUX'], '': ['VERB'], 'ADV':['ADJ'], 'ADJ': ['ADV'], },
+            'A2': {'VERB': ['AUX'], '': ['VERB'], 'ADV':['ADJ'], 'ADJ': ['ADV'], },
+            'B1': {'VERB': ['AUX'], '': ['VERB'], 'ADV':['ADJ'], 'ADJ': ['ADV'], },
+            'B2': {'VERB': ['AUX'], '': ['VERB'], },
+            'C1': {'VERB': ['AUX'], '': ['VERB'], },
+            'C2': {'VERB': ['AUX'], '': ['VERB'], },
+            'Без уровня': {'VERB': ['AUX'], '': ['VERB'], 'ADV':['ADJ'], 'ADJ': ['ADV'], }
         },
     'phrase':
         {
-            'A1': {'VERB': ['AUX']},
-            'A2': {'VERB': ['AUX']},
-            'B1': {'VERB': ['AUX']},
-            'B2': {'VERB': ['AUX']},
-            'C1': {'VERB': ['AUX']},
-            'C2': {'VERB': ['AUX']},
-            'Без уровня': {'VERB': ['AUX']}
+            'A1': {'VERB': ['AUX'], '': ['VERB'], 'ADV':['ADJ'], 'ADJ': ['ADV'], },
+            'A2': {'VERB': ['AUX'], '': ['VERB'], 'ADV':['ADJ'], 'ADJ': ['ADV'], },
+            'B1': {'VERB': ['AUX'], '': ['VERB'], 'ADV':['ADJ'], 'ADJ': ['ADV'], },
+            'B2': {'VERB': ['AUX'], '': ['VERB'], },
+            'C1': {'VERB': ['AUX'], '': ['VERB'], },
+            'C2': {'VERB': ['AUX'], '': ['VERB'], },
+            'Без уровня': {'VERB': ['AUX'], '': ['VERB'], 'ADV':['ADJ'], 'ADJ': ['ADV'], }
         },
     }
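Note: COMBINE_POS appears to whitelist which part-of-speech tags may substitute for one another per task type and CEFR level; this commit adds '' -> VERB at every level and makes ADV/ADJ interchangeable up to B1 (and for 'Без уровня', i.e. "no level"). The lookup code is not part of this diff, so the helper below is an assumption about how such a table might be consulted, not the project's API.

# Hypothetical consumer of COMBINE_POS; the real call sites are not in this diff.
COMBINE_POS = {
    'simple': {
        'A1': {'VERB': ['AUX'], '': ['VERB'], 'ADV': ['ADJ'], 'ADJ': ['ADV']},
        'B2': {'VERB': ['AUX'], '': ['VERB']},
    },
}

def pos_compatible(task_type, level, target_pos, candidate_pos):
    """True if candidate_pos matches target_pos or is whitelisted for it at this level."""
    allowed = COMBINE_POS.get(task_type, {}).get(level, {})
    return candidate_pos == target_pos or candidate_pos in allowed.get(target_pos, [])

assert pos_compatible('simple', 'A1', 'ADJ', 'ADV')      # relaxed at A1 after this commit
assert not pos_compatible('simple', 'B2', 'ADJ', 'ADV')  # still strict at B2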
utilities_language_general/rus_utils.py CHANGED
@@ -99,14 +99,13 @@ def make_inflection(text: str, pos: str or list, tags: set, level: str) -> str or None:
             return None
     else:
         word_form = morph.parse(text)[0].inflect(tags)
-        rnd = randint(0,5)
-        if pos == 'VERB' and 'impf' in tags and level in ('A1', 'A2'):
+        if word_form is None and pos == 'VERB' and 'impf' in tags and level in ('A1', 'A2'):
             tags.discard('impf')
             tags.add('perf')
             word_form = morph.parse(text)[0].inflect(tags)
             if word_form is not None:
                 return word_form.word
-        elif pos == 'VERB' and 'perf' in tags and level in ('A1', 'A2'):
+        elif word_form is None and pos == 'VERB' and 'perf' in tags and level in ('A1', 'A2'):
             tags.discard('perf')
             tags.add('impf')
             word_form = morph.parse(text)[0].inflect(tags)
@@ -209,7 +208,7 @@ def check_token_bert(token, current_minimum: set = None, stop_words=stop_list,
 
 def get_distractors_from_model(doc, model, scaler, classifier, pos_dict:dict, target_text:str, lemma: str, pos: str, gender: str,
                                lemma_index:int, global_distractors: set, distractor_minimum: set, level_name: str, max_num_distractors: int,
-                               max_length_ratio=5, min_edit_distance_ratio=0.5):
+                               max_length_ratio=5, min_edit_distance_ratio=0.4):
     distractors = []
     query = lemma if '_' in lemma else f'{lemma}_{pos}'
     lemma = '_'.join(lemma.split('_')[::2])
@@ -240,8 +239,7 @@ def get_distractors_from_model(doc, model, scaler, classifier, pos_dict:dict, target_text:str, lemma: str, pos: str, gender: str,
                      and (candidate_gender == gender and level_name in ('B1', 'B2', 'C1', 'C2') or level_name in ('A1', 'A2'))
                      and length_ratio <= max_length_ratio
                      and distractor_lemma not in global_distractors
-                     and edit_distance(lemma, distractor_lemma) / ((len(lemma) + len(distractor_lemma)) / 2) >
-                     min_edit_distance_ratio)
+                     and edit_distance(lemma, distractor_lemma) / ((len(lemma) + len(distractor_lemma)) / 2) >= min_edit_distance_ratio)
     if condition:
         if distractor_minimum is not None:
             if distractor_lemma in distractor_minimum:
@@ -290,7 +288,7 @@ def get_distractors_from_model(doc, model, scaler, classifier, pos_dict:dict, target_text:str, lemma: str, pos: str, gender: str,
 
 def get_distractors_from_model_bert(model, scaler, classifier, pos_dict:dict, level_name: str, lemma: str, pos: str, gender: str,
                                     text_with_masked_task: str, global_distractors: set, distractor_minimum: set,
-                                    max_num_distractors: int, max_length_ratio=5, min_edit_distance_ratio=0.5):
+                                    max_num_distractors: int, max_length_ratio=5, min_edit_distance_ratio=0.4):
     _distractors = []
     try:
         if distractor_minimum:
@@ -329,7 +327,7 @@ def get_distractors_from_model_bert(model, scaler, classifier, pos_dict:dict, level_name: str, lemma: str, pos: str, gender: str,
                      and (length_ratio <= max_length_ratio)  # May be changed if case of phrases
                      and (distractor_lemma not in global_distractors)
                      and (edit_distance(lemma, distractor_lemma)  # May be changed if case of phrases
-                          / ((len(lemma) + len(distractor_lemma)) / 2) > min_edit_distance_ratio))
+                          / ((len(lemma) + len(distractor_lemma)) / 2) >= min_edit_distance_ratio))
     if condition:
         if distractor_minimum is not None:
             if distractor_lemma in distractor_minimum:
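Note: the make_inflection hunk does two things: it deletes an unused `rnd = randint(0,5)` and, more importantly, gates the aspect-flipping fallback on `word_form is None`. Previously the A1/A2 verb branch discarded 'impf'/'perf' and re-inflected even when the first `inflect` call had already succeeded. Below is a condensed sketch of the corrected control flow; `inflect` here is a toy stub standing in for `morph.parse(text)[0].inflect`, not the real pymorphy2 API.

# Toy stub in place of morph.parse(text)[0].inflect; returns None on failure.
def inflect(text, tags):
    forms = {('читать', frozenset({'perf', 'past'})): 'прочитал'}  # hypothetical lexicon
    return forms.get((text, frozenset(tags)))

def make_inflection(text, pos, tags, level):
    word_form = inflect(text, tags)
    # After this commit the aspect flip runs only if the first attempt failed.
    if word_form is None and pos == 'VERB' and 'impf' in tags and level in ('A1', 'A2'):
        tags.discard('impf')
        tags.add('perf')
        word_form = inflect(text, tags)
    return word_form

print(make_inflection('читать', 'VERB', {'impf', 'past'}, 'A1'))  # 'прочитал' via fallback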
 
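Note: the remaining hunks loosen the distractor filter in both `get_distractors_from_model` and its BERT counterpart: the default `min_edit_distance_ratio` drops from 0.5 to 0.4 and the comparison becomes inclusive (`>=`), so slightly more similar lemmas now pass. The metric is Levenshtein distance normalised by the mean of the two lemma lengths; assuming `edit_distance` comes from nltk (the import is not shown in this diff), the check in isolation:

from nltk import edit_distance  # assumed source of edit_distance

def far_enough(lemma, distractor, min_ratio=0.4):
    """Reject distractors whose normalised edit distance to the target is too small."""
    ratio = edit_distance(lemma, distractor) / ((len(lemma) + len(distractor)) / 2)
    return ratio >= min_ratio  # '>=' and 0.4 after this commit; '>' and 0.5 before

print(far_enough('книга', 'книжка'))   # 2 / 5.5 ≈ 0.36 -> False (too similar)
print(far_enough('книга', 'тетрадь'))  # 6 / 6.0 = 1.0 -> True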