from typing import Optional

from nltk import edit_distance

from utilities.utils import answer_letter
from utilities_language_general.esp_constants import nlp, FIX_LEMMA, SIMILARITY_VALUES, SIMILARITY_VALUES_bert


def prepare_target_words(target_words):
    """Normalize a comma-separated string of target words and return the unique entries."""
    target_words = (target_words.replace(' ,', ',')
                    .replace(',', ', ')
                    .replace('  ', ' ')  # collapse double spaces left by the normalization above
                    .split(', '))
    return list(set(target_words))
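
# Example (illustrative; order is not guaranteed because of the set() round-trip):
#     >>> sorted(prepare_target_words('casa ,perro, gato, casa'))
#     ['casa', 'gato', 'perro']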


def compute_frequency_dict(text: str) -> dict:
    """
    Compute frequency dictionary of given text and return it sorted in descending order.

    :param text: given text as string variable
    :return: frequency dictionary {word: frequency} sorted in descending order
    """
    freq_dict = {}
    for token in nlp(text):
        lemma = token.lemma_
        if lemma.isalpha():
            freq_dict[lemma] = freq_dict.get(lemma, 0) + 1
    # Sort by frequency, descending, so the result matches the docstring.
    return dict(sorted(freq_dict.items(), key=lambda item: item[1], reverse=True))
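
# Example (illustrative; exact lemmas depend on the loaded spaCy model):
#     >>> compute_frequency_dict('El gato duerme y el perro duerme.')
#     {'dormir': 2, 'el': 2, ...}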


def get_tags(token: str):
    """Return the morphological features of a single-token string as a dict."""
    return nlp(token)[0].morph.to_dict()
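
# Example (illustrative; feature names follow Universal Dependencies, and the exact
# values depend on the loaded spaCy model):
#     >>> get_tags('casa').get('Gender')
#     'Fem'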


def fix_irregular_lemma(lemma, fixed_lemmas=FIX_LEMMA):
    """Map an irregular lemma to its canonical form via the FIX_LEMMA lookup table."""
    for key, value in fixed_lemmas.items():
        if lemma in value:
            return key
    return lemma
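
# Example (hypothetical entries; the real table lives in FIX_LEMMA):
#     with fixed_lemmas = {'ir': {'fue', 'iba'}}, fix_irregular_lemma('fue') -> 'ir',
#     while an unknown lemma is returned unchanged.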


def check_token(token, lemma_pos, model, current_minimum: set = None, check_allowed_pos: set = None,
                check_allowed_dep: set = None) -> bool:
    """Decide whether a spaCy token is a usable gap target for the word-vector model."""
    not_allowed_pos = {'PROPN', 'PUNCT', 'NUM'}
    not_allowed_dep = {'cop', }  # 'ROOT'
    if lemma_pos == 'auto':
        lemma_pos = f'{token.lemma_}_{token.pos_}'
    if not token.text.isalpha():
        return False
    if current_minimum is not None and token.lemma_ not in current_minimum:
        return False
    if not model.has_index_for(lemma_pos):
        return False
    if token.is_oov or token.is_stop:
        return False
    # POS and dependency each pass either the explicit allow-list (when given)
    # or the default deny-list.
    pos_ok = (token.pos_ in check_allowed_pos if check_allowed_pos is not None
              else token.pos_ not in not_allowed_pos)
    dep_ok = (token.dep_ in check_allowed_dep if check_allowed_dep is not None
              else token.dep_ not in not_allowed_dep)
    return pos_ok and dep_ok
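
# Example (sketch; assumes the shared `nlp` pipeline and a gensim KeyedVectors
# `model` whose keys look like 'gato_NOUN'):
#     token = nlp('Los gatos duermen.')[1]
#     check_token(token, 'auto', model)  # True only if 'gato_NOUN' is in the vocabulary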


def check_token_bert(token, current_minimum: set = None, check_allowed_pos: set = None,
                     check_allowed_dep: set = None) -> bool:
    """Same filter as check_token, but without the word-vector vocabulary lookup."""
    not_allowed_pos = {'PROPN', 'PUNCT', 'NUM'}
    not_allowed_synt_dep = {'cop', }  # 'ROOT'
    if not token.text.isalpha():
        return False
    if current_minimum is not None and token.lemma_ not in current_minimum:
        return False
    tags = get_tags(token.text)  # call once instead of twice
    if token.is_stop or tags is None:
        return False
    pos_ok = (token.pos_ in check_allowed_pos if check_allowed_pos is not None
              else token.pos_ not in not_allowed_pos)
    dep_ok = (token.dep_ in check_allowed_dep if check_allowed_dep is not None
              else token.dep_ not in not_allowed_synt_dep)
    return pos_ok and dep_ok


def get_distractors_from_model(model, lemma: str, pos: str, gender: Optional[str], global_distractors: set,
                               distractor_minimum: set, level_name: str, max_num_distractors: int,
                               max_length_ratio=5, min_edit_distance_ratio=0.5):
    """Collect distractors for a target lemma from a word2vec-style model, or None if too few survive."""
    distractors = []
    query = lemma if '_' in lemma else f'{lemma}_{pos}'
    # Keep the even-indexed parts: 'dar_VERB_paso_NOUN' -> 'dar_paso'.
    lemma = '_'.join(lemma.split('_')[::2])
    if model.has_index_for(query):
        candidates = model.most_similar(query, topn=max_num_distractors + 100)
    else:
        if query.count('_') == 1:
            return None
        # Unknown multi-word query: average the vectors of its 'lemma_POS' parts.
        query_raw_list = query.split('_')
        query_parts = ['_'.join(query_raw_list[i:i + 2]) for i in range(0, len(query_raw_list), 2)]
        query_vector = model.get_mean_vector(query_parts)
        candidates = model.similar_by_vector(query_vector, topn=max_num_distractors + 100)
    for candidate in candidates:
        if candidate[0].count('_') == 1:
            # Single-word candidate of the form 'lemma_POS'.
            distractor_lemma, distractor_pos = candidate[0].split('_')
            distractor_similarity = candidate[1]
            candidate_gender = get_tags(distractor_lemma).get('Gender')
            length_ratio = abs(len(lemma) - len(distractor_lemma))
            condition = ((distractor_pos == pos
                          or (distractor_pos in ('VERB', 'ADJ', 'phrase') and pos in ('VERB', 'ADJ', 'phrase')))
                         and distractor_lemma != lemma
                         and distractor_similarity < SIMILARITY_VALUES[level_name]
                         and candidate_gender == gender
                         and length_ratio <= max_length_ratio
                         and distractor_lemma not in global_distractors
                         and edit_distance(lemma, distractor_lemma)
                         / ((len(lemma) + len(distractor_lemma)) / 2) > min_edit_distance_ratio)
            if condition and (distractor_minimum is None or distractor_lemma in distractor_minimum):
                distractors.append((distractor_lemma, distractor_similarity))
                global_distractors.add(distractor_lemma)
        else:
            # Two-word candidate of the form 'lemma1_POS1_lemma2_POS2'; skip keys
            # without exactly three underscores, which would break the unpacking below.
            if candidate[0].count('_') != 3 or pos in ('NOUN', 'ADJ', 'NUM'):
                continue
            d1_lemma, d1_pos, d2_lemma, d2_pos = candidate[0].split('_')
            distractor_lemma = f'{d1_lemma}_{d2_lemma}'
            distractor_similarity = candidate[1]
            condition = (((d1_pos == pos or d2_pos == pos)
                          or (d1_pos in ('VERB', 'AUX', 'SCONJ', 'ADP')
                              and pos in ('phrase', 'VERB', 'AUX', 'SCONJ', 'ADP'))
                          or (d2_pos in ('VERB', 'AUX', 'SCONJ', 'ADP')
                              and pos in ('phrase', 'VERB', 'AUX', 'SCONJ', 'ADP')))
                         and candidate[0] != lemma
                         and distractor_lemma != lemma
                         and distractor_similarity < SIMILARITY_VALUES[level_name]
                         and distractor_lemma not in global_distractors)
            if condition and (distractor_minimum is None
                              or distractor_lemma in distractor_minimum
                              or (d1_lemma in distractor_minimum and d2_lemma in distractor_minimum)):
                distractors.append((candidate[0], distractor_similarity))
                global_distractors.add(distractor_lemma)
    # Require at most four distractors, whatever the caller asked for.
    max_num_distractors = min(4, max_num_distractors)
    return distractors if len(distractors) >= max_num_distractors else None
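
# Example (sketch; assumes a gensim KeyedVectors `model` with 'lemma_POS' keys and a
# SIMILARITY_VALUES entry for the chosen level; the 'A1' level name is an assumption):
#     seen = set()
#     get_distractors_from_model(model, 'casa', 'NOUN', 'Fem', seen,
#                                distractor_minimum=None, level_name='A1',
#                                max_num_distractors=4)
#     # -> [('piso', 0.61), ...] once at least four candidates pass the filters, else None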


def get_distractors_from_model_bert(model, text_with_masked_task: str, lemma: str, pos: str, gender: Optional[str],
                                    global_distractors: set, distractor_minimum: set, level_name: str,
                                    max_num_distractors: int, max_length_ratio=5, min_edit_distance_ratio=0.5):
    """Collect distractors for a masked gap from a fill-mask pipeline, or None if too few survive."""
    _distractors = []
    try:
        bert_candidates = model(text_with_masked_task, top_k=max_num_distractors + 100)
        # With several masks in the input the pipeline returns a list of lists;
        # keep the predictions for the first mask in that case.
        if bert_candidates and isinstance(bert_candidates[0], list):
            bert_candidates = bert_candidates[0]
        candidates = []
        for candidate in bert_candidates:
            if candidate['token_str'].isalpha():
                candidate_morph = nlp(candidate['token_str'])[0]
                candidates.append((f'{candidate_morph.lemma_}_{candidate_morph.pos_}', candidate['score']))
    except KeyError:
        return None
    for candidate_distractor in candidates:
        if '_' in candidate_distractor[0]:
            distractor_lemma, distractor_pos = candidate_distractor[0].split('_')
        else:
            candidate_morph = nlp(candidate_distractor[0])[0]
            distractor_lemma, distractor_pos = candidate_morph.lemma_, candidate_morph.pos_
        distractor_similarity = candidate_distractor[1]
        candidate_gender = get_tags(distractor_lemma).get('Gender')
        length_ratio = abs(len(lemma) - len(distractor_lemma))
        if (((distractor_pos == pos)
             or (pos in ('VERB', 'ADJ', 'phrase') and distractor_pos in ('VERB', 'ADJ', 'phrase')))
                and distractor_lemma != lemma
                and (len(_distractors) < max_num_distractors + 100)
                and (distractor_similarity < SIMILARITY_VALUES_bert[level_name])
                and (candidate_gender == gender)
                and (length_ratio <= max_length_ratio)  # may need adjustment for phrases
                and (distractor_lemma not in global_distractors)
                and (edit_distance(lemma, distractor_lemma)  # may need adjustment for phrases
                     / ((len(lemma) + len(distractor_lemma)) / 2) > min_edit_distance_ratio)):
            if distractor_minimum is None or distractor_lemma in distractor_minimum:
                _distractors.append((distractor_lemma, candidate_distractor[1]))
                # Register the lemma so later gaps do not reuse it.
                global_distractors.add(distractor_lemma)
    num_distractors = min(4, max_num_distractors)
    return _distractors if len(_distractors) >= num_distractors else None
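
# Example (sketch; the checkpoint name is only an example of a Spanish fill-mask model,
# and the 'A1' level name is an assumption):
#     from transformers import pipeline
#     fill_mask = pipeline('fill-mask', model='dccuchile/bert-base-spanish-wwm-cased')
#     seen = set()
#     get_distractors_from_model_bert(fill_mask, 'Vivo en una [MASK] pequeña.', 'casa', 'NOUN',
#                                     'Fem', seen, distractor_minimum=None,
#                                     level_name='A1', max_num_distractors=4)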


def prepare_tasks(input_variants):
    """Render numbered tasks for students, tasks with answers for teachers, and a bare answer key."""
    raw_tasks = []
    raw_keys_only = []
    student_lines = []
    teacher_lines = []
    key_lines = []
    for num, item in enumerate(input_variants, start=1):
        item = item[0]
        answer = item[0]
        variants = '\t'.join(i.lower() for i in item[1])
        current_answer_letter = answer_letter(answer=answer, variants=[i.lower() for i in item[1]])
        raw_tasks.append((num, variants))
        raw_keys_only.append((num, current_answer_letter.split(' ')[0]))
        student_lines.append(f'{num}.\t{variants}')
        # The output labels are Russian: 'Ответ' means 'Answer', 'Варианты' means 'Options'.
        teacher_lines.append(f'{num}.\tОтвет: {current_answer_letter}\n\tВарианты: {variants}')
        key_lines.append(f'{num}.\tОтвет: {current_answer_letter}')
    return {'TASKS_STUDENT': ''.join(f'{line}\n' for line in student_lines),
            'TASKS_TEACHER': ''.join(f'{line}\n' for line in teacher_lines),
            'KEYS_ONLY': ''.join(f'{line}\n' for line in key_lines),
            'RAW_TASKS': raw_tasks, 'RAW_KEYS_ONLY': raw_keys_only}
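
# Example (illustrative; each input item wraps an (answer, variants) pair in a
# one-element tuple, and the answer lettering comes from utilities.utils.answer_letter):
#     tasks = prepare_tasks([(('casa', ['casa', 'piso', 'techo']),)])
#     tasks['TASKS_STUDENT']  # '1.\tcasa\tpiso\ttecho\n'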