# annotator_demo/category_parser.py
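"""
Assembles short "facts" (phrases) around target words of a given category.

Target words are located by lemma against the category's words.txt list, and a
fact is built by walking the sentence's dependency parse according to the rules
in the category's instructions.json.
"""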
import json
import os
import preproc
from collections import Counter, deque
import numpy as np
# Finds the target word's tokens by lemma and returns their end offsets (token.stop)
def find_need_word_by_lemma(tokens, word):
    res = []
    for token in tokens:
        if token.lemma == word:
            res.append(token.stop)
    return res
# Returns (id, rel) of the token whose end offset matches `stop`
def find_first(tokens, stop):
    for token in tokens:
        if token.stop == stop:
            return (token.id, token.rel)
    return None  # no token ends at this offset
# Finds a word related to the token `id`, following one item of an instruction
def find_related_word(scenario, id, r1, r2, rel_ids, rels, head_ids, words):
    if scenario == 0:  # the 0th element of the pair is known: search among dependents of `id`
        for i in rel_ids.get(id, []):
            if rels[i] == r2:
                # for 'advmod' dependents keep only the negation particle 'не'
                if r2 == 'advmod' and words.get(i) != 'не':
                    continue
                return i
        return '1_0'  # sentinel: no related word found
    elif scenario == 1:  # the 1st element of the pair is known: check the head of `id`
        for i in r1:
            if rels.get(head_ids[id]) == i:
                return head_ids[id]
        return '1_0'
# Assembles a fact around the target word identified by its end offset `stop`
def construct_fact(tokens, stop, category):
    words = dict()
    head_ids = dict()
    rels = dict()
    rel_ids = dict()
    instructions = get_insructions(category)
    # index the sentence: id -> text, id -> head id, head id -> dependent ids, id -> relation
    for token in tokens:
        words[token.id] = token.text
        head_ids[token.id] = token.head_id
        rel_ids.setdefault(token.head_id, []).append(token.id)
        rels[token.id] = token.rel
    fact = deque()
    first_word = find_first(tokens, stop)  # (id, rel) of the target token
    id = first_word[0]
    fact.append(words[first_word[0]])
    breaker = False
    if instructions.get(first_word[1]):
        for instruction in instructions[first_word[1]]:
            for i in instruction:
                related_word = find_related_word(i[2], id, i[0], i[1], rel_ids, rels, head_ids, words)
                if related_word == '1_0' and instruction.index(i) == 1:
                    # the second step failed: abandon this instruction
                    break
                elif related_word == '1_0':
                    pass
                elif i[2] == 0:
                    fact.appendleft(words[related_word])
                else:
                    fact.appendleft(words[related_word])
                    # scenario 1 moves the anchor up to the head token
                    id = related_word
                if instruction.index(i) == len(instruction) - 1:
                    breaker = True
            if breaker:
                break
        if len(fact) == len(set(fact)):
            # keep only facts of more than one word
            if len(fact) > 1:
                return ' '.join(fact)
        else:
            # a duplicate word means the target was pulled in twice: drop the leftmost copy
            fact.popleft()
            return ' '.join(fact)
    # implicitly returns None when no usable fact could be built
# A sentence may contain several related target words, so this function checks
# whether the new fact is just a shortened copy of the previous one
# (returns True when new_fact contains words not present in previous_fact)
def cheker_fact(previous_fact, new_fact):
    previous_fact = set(previous_fact.split())
    new_fact = set(new_fact.split())
    return len(previous_fact & new_fact) != len(new_fact)
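# Note on the data layout (inferred from construct_fact; not otherwise documented
# here): each category directory next to this script is expected to contain
# words.txt (target lemmas, one per line) and instructions.json, which appears to
# map a dependency relation of the target token to a list of instructions, each
# instruction being a list of [r1, r2, scenario] items as consumed by
# find_related_word.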
SCRIPT_DIR = os.path.dirname(__file__)
def get_insructions(category):
    # loads the category's fact-building rules from <category>/instructions.json
    with open(f'{SCRIPT_DIR}/{category}/instructions.json', encoding='utf8') as f:
        return json.load(f)
def get_category_words(category):
    # loads the category's target-word lemmas, one per line
    with open(f'{SCRIPT_DIR}/{category}/words.txt', encoding='utf8') as f:
        return set(f.read().split('\n'))
def get_morfology_from_fact(fact, sent_tokens):
    '''
    Extracts the part of speech and morphological features of every word in the fact
    '''
    res = []
    for word in fact.split(' '):
        for token in sent_tokens:
            if word == token.text:
                res.append([token.pos, token.feats])
                break
    return res
def get_facts(tokens, category):
    # collects [lemma, fact, morphology] triples for every target word found in the text
    facts = []
    category_words = get_category_words(category)
    for sent in tokens:
        sent_tokens = preproc.get_sent_tokens(sent)
        set_lemmas = preproc.get_set_sent_lemmas(sent)
        # target lemmas of this category that occur in the sentence
        res = set_lemmas & category_words
        for w in res:
            for stop in find_need_word_by_lemma(sent_tokens, w):
                fact = construct_fact(sent_tokens, stop, category)
                if fact:
                    morphology = get_morfology_from_fact(fact, sent_tokens)
                    facts.append([w, fact, morphology])
    return facts
def get_mentioned_words(tokens, category):
    # counts how often each target lemma of the category is mentioned in the text
    lemmas = preproc.get_all_lemmas(tokens)
    res = set(lemmas) & get_category_words(category)
    if res:
        return Counter([lemma for lemma in lemmas if lemma in res])
    else:
        return Counter()
def get_most_mentioned_words(mentioned_words):
    # assumes .sum() merges per-document Counters (e.g. a pandas Series of Counter
    # objects); a plain Counter from get_mentioned_words has no .sum() method
    return mentioned_words.sum().most_common(3)
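# Minimal usage sketch (illustrative only). Assumptions: `tokens` is an iterable
# of parsed sentences in the format expected by preproc.get_sent_tokens and
# preproc.get_set_sent_lemmas (produced elsewhere in the pipeline), and a category
# directory such as the hypothetical 'disease', containing words.txt and
# instructions.json, exists next to this script.
#
#     facts = get_facts(tokens, 'disease')
#     for lemma, fact, morphology in facts:
#         print(lemma, '->', fact)
#     print(get_mentioned_words(tokens, 'disease').most_common(3))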