|
import json |
|
import os |
|
import preproc |
|
from collections import Counter, deque |
|
import numpy as np |
|
|
|
|
|
def find_need_word_by_lemma(tokens, word):
    """Return the ``stop`` value of every token whose lemma equals *word*.

    Args:
        tokens: Iterable of token objects exposing ``lemma`` and ``stop``.
        word: Lemma to look for.

    Returns:
        A list of ``token.stop`` values, in the order the matching tokens
        appear in *tokens*; empty when nothing matches.
    """
    return [tok.stop for tok in tokens if tok.lemma == word]
|
|
|
|
|
def find_first(tokens, stop):
    """Locate the first token whose ``stop`` attribute equals *stop*.

    Args:
        tokens: Iterable of token objects exposing ``stop``, ``id`` and ``rel``.
        stop: Value to compare against ``token.stop``.

    Returns:
        The pair ``(token.id, token.rel)`` of the first match, or ``None``
        when no token matches.
    """
    hits = ((tok.id, tok.rel) for tok in tokens if tok.stop == stop)
    return next(hits, None)
|
|
|
|
|
|
|
def find_related_word(scenario, id, r1, r2, rel_ids, rels, head_ids, words):
    """Find the id of a token related to token *id*, or ``'1_0'`` if none.

    Two lookup modes are supported, selected by *scenario*:

    * ``0`` - search the dependants of *id* (``rel_ids[id]``) for the first
      one whose relation equals *r2*. When *r2* is ``'advmod'`` only a
      dependant whose text is ``'не'`` is accepted.
    * ``1`` - return the head of *id* (``head_ids[id]``) if the head's own
      relation equals any element of *r1*.

    Any other *scenario* value yields ``'1_0'``.

    Args:
        scenario: Lookup mode (``0`` or ``1``).
        id: Id of the token to start from.
        r1: Iterable of acceptable relations of the head (scenario 1).
        r2: Required relation of the dependant (scenario 0).
        rel_ids: Mapping ``head id -> list of dependant ids``.
        rels: Mapping ``token id -> relation``.
        head_ids: Mapping ``token id -> head id``.
        words: Mapping ``token id -> token text``.

    Returns:
        The id of the related token, or the sentinel string ``'1_0'`` when
        nothing suitable is found.

    Raises:
        KeyError: In scenario 1, when *id* is missing from *head_ids* and
            *r1* is non-empty.
    """
    not_found = '1_0'

    if scenario == 0:
        try:
            for child in rel_ids.get(id):
                if rels[child] != r2:
                    continue
                # For adverbial modifiers only the negation particle counts.
                if r2 == 'advmod' and words.get(child) != 'не':
                    continue
                return child
        except (KeyError, TypeError):
            # TypeError: *id* has no dependants (``rel_ids.get`` gave None).
            # KeyError: a dependant has no recorded relation.
            # Anything else (e.g. KeyboardInterrupt) must propagate.
            return not_found
    elif scenario == 1:
        for candidate in r1:
            head = head_ids[id]
            if rels.get(head) == candidate:
                return head

    return not_found
|
|
|
|
|
def construct_fact(tokens, stop, category):
    """Build a short phrase ("fact") around the token identified by *stop*.

    The token whose ``stop`` equals *stop* is the seed of the phrase. The
    instructions loaded for *category* are looked up by the seed's relation;
    each instruction is a sequence of steps, and each step is indexable as
    ``(r1, r2, scenario)`` and handed to ``find_related_word``. Every word a
    step finds is prepended to the phrase. For a step whose scenario is not
    ``0`` the found token also becomes the new starting point for the
    following steps.

    An instruction is abandoned (and the next one tried) when its second
    step finds nothing; the first instruction that reaches its last step
    ends the search.

    Args:
        tokens: Iterable of token objects exposing ``id``, ``text``,
            ``head_id``, ``rel`` and ``stop``. It is traversed more than
            once, so it must be re-iterable.
        stop: ``stop`` value of the seed token.
        category: Category name passed to ``get_insructions``.

    Returns:
        The space-joined phrase, or ``None`` when the seed token is not
        found, when there are no instructions for the seed's relation, or
        when the phrase ends up being the seed word alone.
    """
    not_found = '1_0'  # sentinel returned by find_related_word

    instructions = get_insructions(category)

    words = {}
    head_ids = {}
    rels = {}
    rel_ids = {}
    for token in tokens:
        words[token.id] = token.text
        head_ids[token.id] = token.head_id
        rels[token.id] = token.rel
        rel_ids.setdefault(token.head_id, []).append(token.id)

    first_word = find_first(tokens, stop)
    if first_word is None:
        # No token carries this ``stop`` value: there is nothing to build on.
        return None
    current_id, first_rel = first_word

    fact = deque([words[current_id]])

    relation_instructions = instructions.get(first_rel)
    if not relation_instructions:
        return None

    finished = False
    for instruction in relation_instructions:
        last_position = len(instruction) - 1
        # enumerate() gives the true position even if two steps are equal,
        # which list.index() would not.
        for position, step in enumerate(instruction):
            scenario = step[2]
            related_word = find_related_word(
                scenario, current_id, step[0], step[1],
                rel_ids, rels, head_ids, words,
            )

            if related_word == not_found:
                if position == 1:
                    # Second step failed: give up on this instruction.
                    break
            elif scenario == 0:
                fact.appendleft(words[related_word])
            else:
                fact.appendleft(words[related_word])
                current_id = related_word

            if position == last_position:
                finished = True

        if finished:
            break

    if len(fact) != len(set(fact)):
        # A word was collected twice: drop the most recently prepended one.
        fact.popleft()
        return ' '.join(fact)
    if len(fact) > 1:
        return ' '.join(fact)
    return None
|
|
|
|
|
|
|
def cheker_fact(previous_fact, new_fact):
    """Tell whether *new_fact* contains a word absent from *previous_fact*.

    *previous_fact* is split on single space characters only, while
    *new_fact* is split on any run of whitespace.

    Args:
        previous_fact: Phrase already known.
        new_fact: Phrase being compared against it.

    Returns:
        ``True`` if at least one word of *new_fact* does not occur in
        *previous_fact*, ``False`` otherwise.
    """
    known_words = set(previous_fact.split(' '))
    candidate_words = set(new_fact.split())
    return not candidate_words.issubset(known_words)
|
|
|
|
|
# Absolute directory of this module; per-category resources live beneath it.
# abspath() is applied first because a relative ``__file__`` would make
# dirname() return '' and turn '{SCRIPT_DIR}/...' into a path at the
# filesystem root.
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
|
|
|
def get_insructions(category):
    """Load the parsed ``instructions.json`` of *category*.

    The file is read from ``{SCRIPT_DIR}/{category}/instructions.json`` and
    decoded as UTF-8 explicitly, so the result does not depend on the
    platform's default encoding.

    Args:
        category: Name of the category sub-directory.

    Returns:
        Whatever ``json.load`` produces for the file's contents.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    with open(f'{SCRIPT_DIR}/{category}/instructions.json', encoding='utf8') as f:
        return json.load(f)
|
|
|
|
|
def get_category_words(category):
    """Read the word list of *category* as a set of strings.

    The file ``{SCRIPT_DIR}/{category}/words.txt`` is read as UTF-8 and
    split on ``'\\n'``; a trailing newline therefore contributes an empty
    string to the result.

    Args:
        category: Name of the category sub-directory.

    Returns:
        A new ``set`` with one entry per distinct line of the file.

    Raises:
        OSError: If the file cannot be opened.
    """
    # The context manager closes the handle deterministically.
    with open(f'{SCRIPT_DIR}/{category}/words.txt', encoding='utf8') as f:
        return set(f.read().split('\n'))
|
|
|
|
|
def get_facts(tokens, category):
    """Collect every fact that can be built from the given sentences.

    For each sentence, the lemmas it shares with the category word list are
    determined; for every token carrying such a lemma ``construct_fact`` is
    called, and each truthy result is kept.

    Args:
        tokens: Iterable of sentence objects accepted by
            ``preproc.get_sent_tokens`` and ``preproc.get_set_sent_lemmas``.
        category: Category name used to load the word list and instructions.

    Returns:
        A list of fact strings; empty when *tokens* yields no sentences or
        no fact could be built.
    """
    facts = []
    # The word list does not depend on the sentence, so it is read once.
    # It is loaded lazily so that empty input still touches no file.
    category_words = None

    for sent in tokens:
        sent_tokens = preproc.get_sent_tokens(sent)
        sent_lemmas = preproc.get_set_sent_lemmas(sent)
        if category_words is None:
            category_words = get_category_words(category)

        for lemma in sent_lemmas & category_words:
            for stop in find_need_word_by_lemma(sent_tokens, lemma):
                fact = construct_fact(sent_tokens, stop, category)
                if fact:
                    facts.append(fact)

    return facts
|
|
|
|
|
def get_mentioned_words(tokens, category):
    """Count how often each category word occurs among the lemmas of *tokens*.

    Args:
        tokens: Input passed unchanged to ``preproc.get_all_lemmas``.
        category: Category name whose word list is used as the filter.

    Returns:
        A ``Counter`` mapping each lemma that is also a category word to its
        number of occurrences; an empty ``Counter`` when there is no overlap.
    """
    lemmas = preproc.get_all_lemmas(tokens)
    matched = set(lemmas) & get_category_words(category)
    if not matched:
        return Counter()
    return Counter(lemma for lemma in lemmas if lemma in matched)
|
|
|
def get_most_mentioned_words(mentioned_words, top_n=3):
    """Return the most frequent entries of the summed *mentioned_words*.

    Args:
        mentioned_words: Object whose ``sum()`` method returns something with
            a ``most_common`` method.
            NOTE(review): presumably a pandas Series / NumPy object array of
            ``Counter`` instances - confirm against the caller.
        top_n: How many entries to return. Defaults to 3, the value that was
            previously hard-coded.

    Returns:
        The result of ``most_common(top_n)`` on the summed value.
    """
    combined = mentioned_words.sum()
    return combined.most_common(top_n)
|
|