from typing import Optional

from nltk import edit_distance

from utilities.utils import answer_letter
from utilities_language_general.esp_constants import nlp, FIX_LEMMA, SIMILARITY_VALUES, SIMILARITY_VALUES_bert


def prepare_target_words(target_words):
    """Normalize a comma-separated string of target words and return the unique entries."""
    target_words = (target_words.replace(' ,', ',')
                    .replace(',', ', ')
                    .replace('  ', ' ')  # collapse double spaces left by the normalization above
                    .split(', '))
    return list(set(target_words))
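
# Example (illustrative; order is not guaranteed because of the set() round-trip):
#     >>> sorted(prepare_target_words('casa ,perro, gato, casa'))
#     ['casa', 'gato', 'perro']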


def compute_frequency_dict(text: str) -> dict:
    """
    Compute frequency dictionary of given text and return it sorted in descending order.

    :param text: given text as string variable
    :return: frequency dictionary {word: frequency} sorted in descending order
    """
    freq_dict = {}
    for token in nlp(text):
        lemma = token.lemma_
        if lemma.isalpha():
            freq_dict[lemma] = freq_dict.get(lemma, 0) + 1
    # Sort by frequency, descending, so the result matches the docstring.
    return dict(sorted(freq_dict.items(), key=lambda item: item[1], reverse=True))
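
# Example (illustrative; exact lemmas depend on the loaded spaCy model):
#     >>> compute_frequency_dict('El gato duerme y el perro duerme.')
#     {'dormir': 2, 'el': 2, ...}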


def get_tags(token: str):
    """Return the morphological features of a single-token string as a dict."""
    return nlp(token)[0].morph.to_dict()
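
# Example (illustrative; feature names follow Universal Dependencies, and the exact
# values depend on the loaded spaCy model):
#     >>> get_tags('casa').get('Gender')
#     'Fem'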


def fix_irregular_lemma(lemma, fixed_lemmas=FIX_LEMMA):
    """Map an irregular lemma to its canonical form via the FIX_LEMMA lookup table."""
    for key, value in fixed_lemmas.items():
        if lemma in value:
            return key
    return lemma
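
# Example (hypothetical entries; the real table lives in FIX_LEMMA):
#     with fixed_lemmas = {'ir': {'fue', 'iba'}}, fix_irregular_lemma('fue') -> 'ir',
#     while an unknown lemma is returned unchanged.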


def check_token(token, lemma_pos, model, current_minimum: set = None, check_allowed_pos: set = None,
                check_allowed_dep: set = None) -> bool:
    """Decide whether a spaCy token is a usable gap target for the word-vector model."""
    not_allowed_pos = {'PROPN', 'PUNCT', 'NUM'}
    not_allowed_dep = {'cop', }  # 'ROOT'
    if lemma_pos == 'auto':
        lemma_pos = f'{token.lemma_}_{token.pos_}'
    if not token.text.isalpha():
        return False
    if current_minimum is not None and token.lemma_ not in current_minimum:
        return False
    if not model.has_index_for(lemma_pos):
        return False
    if token.is_oov or token.is_stop:
        return False
    # POS and dependency each pass either the explicit allow-list (when given)
    # or the default deny-list.
    pos_ok = (token.pos_ in check_allowed_pos if check_allowed_pos is not None
              else token.pos_ not in not_allowed_pos)
    dep_ok = (token.dep_ in check_allowed_dep if check_allowed_dep is not None
              else token.dep_ not in not_allowed_dep)
    return pos_ok and dep_ok
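
# Example (sketch; assumes the shared `nlp` pipeline and a gensim KeyedVectors
# `model` whose keys look like 'gato_NOUN'):
#     token = nlp('Los gatos duermen.')[1]
#     check_token(token, 'auto', model)  # True only if 'gato_NOUN' is in the vocabulary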


def check_token_bert(token, current_minimum: set = None, check_allowed_pos: set = None,
                     check_allowed_dep: set = None) -> bool:
    """Same filter as check_token, but without the word-vector vocabulary lookup."""
    not_allowed_pos = {'PROPN', 'PUNCT', 'NUM'}
    not_allowed_synt_dep = {'cop', }  # 'ROOT'
    if not token.text.isalpha():
        return False
    if current_minimum is not None and token.lemma_ not in current_minimum:
        return False
    tags = get_tags(token.text)  # call once instead of twice
    if token.is_stop or tags is None:
        return False
    pos_ok = (token.pos_ in check_allowed_pos if check_allowed_pos is not None
              else token.pos_ not in not_allowed_pos)
    dep_ok = (token.dep_ in check_allowed_dep if check_allowed_dep is not None
              else token.dep_ not in not_allowed_synt_dep)
    return pos_ok and dep_ok


def get_distractors_from_model(model, lemma: str, pos: str, gender: Optional[str], global_distractors: set,
                               distractor_minimum: set, level_name: str, max_num_distractors: int,
                               max_length_ratio=5, min_edit_distance_ratio=0.5):
    """Collect distractors for a target lemma from a word2vec-style model, or None if too few survive."""
    distractors = []
    query = lemma if '_' in lemma else f'{lemma}_{pos}'
    # Keep the even-indexed parts: 'dar_VERB_paso_NOUN' -> 'dar_paso'.
    lemma = '_'.join(lemma.split('_')[::2])
    if model.has_index_for(query):
        candidates = model.most_similar(query, topn=max_num_distractors + 100)
    else:
        if query.count('_') == 1:
            return None
        # Unknown multi-word query: average the vectors of its 'lemma_POS' parts.
        query_raw_list = query.split('_')
        query_parts = ['_'.join(query_raw_list[i:i + 2]) for i in range(0, len(query_raw_list), 2)]
        query_vector = model.get_mean_vector(query_parts)
        candidates = model.similar_by_vector(query_vector, topn=max_num_distractors + 100)
    for candidate in candidates:
        if candidate[0].count('_') == 1:
            # Single-word candidate of the form 'lemma_POS'.
            distractor_lemma, distractor_pos = candidate[0].split('_')
            distractor_similarity = candidate[1]
            candidate_gender = get_tags(distractor_lemma).get('Gender')
            length_ratio = abs(len(lemma) - len(distractor_lemma))
            condition = ((distractor_pos == pos
                          or (distractor_pos in ('VERB', 'ADJ', 'phrase') and pos in ('VERB', 'ADJ', 'phrase')))
                         and distractor_lemma != lemma
                         and distractor_similarity < SIMILARITY_VALUES[level_name]
                         and candidate_gender == gender
                         and length_ratio <= max_length_ratio
                         and distractor_lemma not in global_distractors
                         and edit_distance(lemma, distractor_lemma)
                         / ((len(lemma) + len(distractor_lemma)) / 2) > min_edit_distance_ratio)
            if condition and (distractor_minimum is None or distractor_lemma in distractor_minimum):
                distractors.append((distractor_lemma, distractor_similarity))
                global_distractors.add(distractor_lemma)
        else:
            # Two-word candidate of the form 'lemma1_POS1_lemma2_POS2'; skip keys
            # without exactly three underscores, which would break the unpacking below.
            if candidate[0].count('_') != 3 or pos in ('NOUN', 'ADJ', 'NUM'):
                continue
            d1_lemma, d1_pos, d2_lemma, d2_pos = candidate[0].split('_')
            distractor_lemma = f'{d1_lemma}_{d2_lemma}'
            distractor_similarity = candidate[1]
            condition = (((d1_pos == pos or d2_pos == pos)
                          or (d1_pos in ('VERB', 'AUX', 'SCONJ', 'ADP')
                              and pos in ('phrase', 'VERB', 'AUX', 'SCONJ', 'ADP'))
                          or (d2_pos in ('VERB', 'AUX', 'SCONJ', 'ADP')
                              and pos in ('phrase', 'VERB', 'AUX', 'SCONJ', 'ADP')))
                         and candidate[0] != lemma
                         and distractor_lemma != lemma
                         and distractor_similarity < SIMILARITY_VALUES[level_name]
                         and distractor_lemma not in global_distractors)
            if condition and (distractor_minimum is None
                              or distractor_lemma in distractor_minimum
                              or (d1_lemma in distractor_minimum and d2_lemma in distractor_minimum)):
                distractors.append((candidate[0], distractor_similarity))
                global_distractors.add(distractor_lemma)
    # Require at most four distractors, whatever the caller asked for.
    max_num_distractors = min(4, max_num_distractors)
    return distractors if len(distractors) >= max_num_distractors else None
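
# Example (sketch; assumes a gensim KeyedVectors `model` with 'lemma_POS' keys and a
# SIMILARITY_VALUES entry for the chosen level; the 'A1' level name is an assumption):
#     seen = set()
#     get_distractors_from_model(model, 'casa', 'NOUN', 'Fem', seen,
#                                distractor_minimum=None, level_name='A1',
#                                max_num_distractors=4)
#     # -> [('piso', 0.61), ...] once at least four candidates pass the filters, else None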


def get_distractors_from_model_bert(model, text_with_masked_task: str, lemma: str, pos: str, gender: Optional[str],
                                    global_distractors: set, distractor_minimum: set, level_name: str,
                                    max_num_distractors: int, max_length_ratio=5, min_edit_distance_ratio=0.5):
    """Collect distractors for a masked gap from a fill-mask pipeline, or None if too few survive."""
    _distractors = []
    try:
        bert_candidates = model(text_with_masked_task, top_k=max_num_distractors + 100)
        # With several masks in the input the pipeline returns a list of lists;
        # keep the predictions for the first mask in that case.
        if bert_candidates and isinstance(bert_candidates[0], list):
            bert_candidates = bert_candidates[0]
        candidates = []
        for candidate in bert_candidates:
            if candidate['token_str'].isalpha():
                candidate_morph = nlp(candidate['token_str'])[0]
                candidates.append((f'{candidate_morph.lemma_}_{candidate_morph.pos_}', candidate['score']))
    except KeyError:
        return None
    for candidate_distractor in candidates:
        if '_' in candidate_distractor[0]:
            distractor_lemma, distractor_pos = candidate_distractor[0].split('_')
        else:
            candidate_morph = nlp(candidate_distractor[0])[0]
            distractor_lemma, distractor_pos = candidate_morph.lemma_, candidate_morph.pos_
        distractor_similarity = candidate_distractor[1]
        candidate_gender = get_tags(distractor_lemma).get('Gender')
        length_ratio = abs(len(lemma) - len(distractor_lemma))
        if (((distractor_pos == pos)
             or (pos in ('VERB', 'ADJ', 'phrase') and distractor_pos in ('VERB', 'ADJ', 'phrase')))
                and distractor_lemma != lemma
                and (len(_distractors) < max_num_distractors + 100)
                and (distractor_similarity < SIMILARITY_VALUES_bert[level_name])
                and (candidate_gender == gender)
                and (length_ratio <= max_length_ratio)  # may need adjustment for phrases
                and (distractor_lemma not in global_distractors)
                and (edit_distance(lemma, distractor_lemma)  # may need adjustment for phrases
                     / ((len(lemma) + len(distractor_lemma)) / 2) > min_edit_distance_ratio)):
            if distractor_minimum is None or distractor_lemma in distractor_minimum:
                _distractors.append((distractor_lemma, candidate_distractor[1]))
                # Register the lemma so later gaps do not reuse it.
                global_distractors.add(distractor_lemma)
    num_distractors = min(4, max_num_distractors)
    return _distractors if len(_distractors) >= num_distractors else None
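
# Example (sketch; the checkpoint name is only an example of a Spanish fill-mask model,
# and the 'A1' level name is an assumption):
#     from transformers import pipeline
#     fill_mask = pipeline('fill-mask', model='dccuchile/bert-base-spanish-wwm-cased')
#     seen = set()
#     get_distractors_from_model_bert(fill_mask, 'Vivo en una [MASK] pequeña.', 'casa', 'NOUN',
#                                     'Fem', seen, distractor_minimum=None,
#                                     level_name='A1', max_num_distractors=4)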


def prepare_tasks(input_variants):
    """Render numbered tasks for students, tasks with answers for teachers, and a bare answer key."""
    raw_tasks = []
    raw_keys_only = []
    student_lines = []
    teacher_lines = []
    key_lines = []
    for num, item in enumerate(input_variants, start=1):
        item = item[0]
        answer = item[0]
        variants = '\t'.join(i.lower() for i in item[1])
        current_answer_letter = answer_letter(answer=answer, variants=[i.lower() for i in item[1]])
        raw_tasks.append((num, variants))
        raw_keys_only.append((num, current_answer_letter.split(' ')[0]))
        student_lines.append(f'{num}.\t{variants}')
        # The output labels are Russian: 'Ответ' means 'Answer', 'Варианты' means 'Options'.
        teacher_lines.append(f'{num}.\tОтвет: {current_answer_letter}\n\tВарианты: {variants}')
        key_lines.append(f'{num}.\tОтвет: {current_answer_letter}')
    return {'TASKS_STUDENT': ''.join(f'{line}\n' for line in student_lines),
            'TASKS_TEACHER': ''.join(f'{line}\n' for line in teacher_lines),
            'KEYS_ONLY': ''.join(f'{line}\n' for line in key_lines),
            'RAW_TASKS': raw_tasks, 'RAW_KEYS_ONLY': raw_keys_only}
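
# Example (illustrative; each input item wraps an (answer, variants) pair in a
# one-element tuple, and the answer lettering comes from utilities.utils.answer_letter):
#     tasks = prepare_tasks([(('casa', ['casa', 'piso', 'techo']),)])
#     tasks['TASKS_STUDENT']  # '1.\tcasa\tpiso\ttecho\n'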