import json
import os
import preproc
from collections import Counter, deque
import numpy as np


# Finds the target word by lemma and returns the end offsets of the matching tokens
def find_need_word_by_lemma(tokens, word):
    res = []
    for token in tokens:
        if token.lemma == word:
            res.append(token.stop)
    return res


# Returns the (id, rel) pair of the token with the given end offset
def find_first(tokens, stop):
    for token in tokens:
        if token.stop == stop:
            return (token.id, token.rel)


# Finds a word syntactically related to the token `id`.
# An instruction step is [r1, r2, scenario]:
#   scenario == 0 -- element 0 of the (head, dependent) pair is known:
#                    look among the dependents of `id` for the relation r2;
#   scenario == 1 -- element 1 of the pair is known:
#                    take the head of `id` if its relation is one of r1.
# Returns the sentinel '1_0' when no related word is found.
def find_related_word(scenario, id, r1, r2, rel_ids, rels, head_ids, words):
    if scenario == 0:
        for i in rel_ids.get(id, []):
            if rels[i] == r2:
                # Keep an adverbial modifier only if it is the negation 'не'
                if r2 == 'advmod' and words.get(i) != 'не':
                    continue
                return i
    elif scenario == 1:
        for rel in r1:
            if rels.get(head_ids[id]) == rel:
                return head_ids[id]
    return '1_0'


# Builds a fact around the target word, identified by its end offset `stop`
def construct_fact(tokens, stop, category):
    words = dict()
    head_ids = dict()
    rels = dict()
    rel_ids = dict()
    instructions = get_instructions(category)

    # Index the sentence: token text, head, relation and dependents by token id
    for token in tokens:
        words[token.id] = token.text
        head_ids[token.id] = token.head_id
        if rel_ids.get(token.head_id):
            rel_ids[token.head_id].append(token.id)
        else:
            rel_ids[token.head_id] = [token.id]
        rels[token.id] = token.rel

    fact = deque()
    first_word = find_first(tokens, stop)
    id = first_word[0]
    fact.append(words[first_word[0]])
    breaker = False

    # Instructions are keyed by the relation of the target token; each instruction
    # is a chain of steps that is walked until it breaks or is completed.
    if instructions.get(first_word[1]):
        for instruction in instructions[first_word[1]]:
            for step_index, step in enumerate(instruction):
                related_word = find_related_word(step[2], id, step[0], step[1],
                                                 rel_ids, rels, head_ids, words)
                if related_word == '1_0' and step_index == 1:
                    break
                elif related_word == '1_0':
                    pass
                elif step[2] == 0:
                    fact.appendleft(words[related_word])
                else:
                    fact.appendleft(words[related_word])
                    id = related_word
                if step_index == len(instruction) - 1:
                    breaker = True
            if breaker:
                break
        if len(fact) == len(set(fact)):
            if len(fact) > 1:
                return ' '.join(fact)
        else:
            # A duplicate word means the chain looped back: drop the leftmost word
            fact.popleft()
            return ' '.join(fact)


# Because one sentence can contain several target words that may be linked, this
# function checks that a new fact is not just a shortened copy of the previous one
# (returns True when the new fact contains words missing from the previous fact)
def cheker_fact(previous_fact, new_fact):
    previous_fact = set(previous_fact.split(' '))
    new_fact = set(new_fact.split())
    return len(previous_fact & new_fact) != len(new_fact)


SCRIPT_DIR = os.path.dirname(__file__)


# Loads the extraction instructions for a category
def get_instructions(category):
    with open(f'{SCRIPT_DIR}/{category}/instructions.json') as f:
        return json.load(f)


# Loads the set of target words for a category
def get_category_words(category):
    with open(f'{SCRIPT_DIR}/{category}/words.txt', encoding='utf8') as f:
        return set(f.read().split('\n'))


def get_morfology_from_fact(fact, sent_tokens):
    '''
    Extracts the part of speech and morphological features of each word in the fact
    '''
    res = []
    for word in fact.split(' '):
        for token in sent_tokens:
            if word == token.text:
                res.append([token.pos, token.feats])
                break
    return res


# Extracts facts for the given category from the tokenized text
def get_facts(tokens, category):
    facts = []
    category_words = get_category_words(category)
    for sent in tokens:
        sent_tokens = preproc.get_sent_tokens(sent)
        set_lemmas = preproc.get_set_sent_lemmas(sent)
        res = set_lemmas & category_words
        if res:
            for w in res:
                for stop in find_need_word_by_lemma(sent_tokens, w):
                    fact = construct_fact(sent_tokens, stop, category)
                    if fact:
                        morphology = get_morfology_from_fact(fact, sent_tokens)
                        facts.append([w, fact, morphology])
    return facts


# Counts how often the category words are mentioned in the text
def get_mentioned_words(tokens, category):
    lemmas = preproc.get_all_lemmas(tokens)
    res = set(lemmas) & get_category_words(category)
    if res:
        return Counter([lemma for lemma in lemmas if lemma in res])
    else:
        return Counter()


# Expects an object whose .sum() yields a Counter (e.g. a pandas Series of
# per-document Counters); summing merges the counts, then the three most
# frequent words are returned
def get_most_mentioned_words(mentioned_words):
    return mentioned_words.sum().most_common(3)
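

# Minimal usage sketch of get_facts / get_mentioned_words, under stated assumptions:
# `preproc.get_tokens` is a hypothetical helper (not defined in this module) that
# turns raw text into the per-sentence structures expected by get_sent_tokens /
# get_set_sent_lemmas, and 'health' is a hypothetical category directory that
# contains instructions.json and words.txt. Adjust both to the real preproc API
# and category layout.
if __name__ == '__main__':
    category = 'health'  # hypothetical category directory
    sample_text = 'Пациент жалуется на сильную головную боль.'  # "The patient complains of a severe headache."

    # Hypothetical call: replace with whatever preproc actually provides for
    # tokenization, lemmatization and syntax parsing of the raw text.
    tokens = preproc.get_tokens(sample_text)

    # Each fact is [lemma, fact text, morphology of the fact's words]
    for lemma, fact, morphology in get_facts(tokens, category):
        print(lemma, '->', fact, morphology)

    # Per-text counts of category-word mentions
    print(get_mentioned_words(tokens, category).most_common(3))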