# Page header captured together with the file: "Spaces: Runtime error".
# File size: 5,167 bytes.
# Commits shown in the viewer's gutter: 876403c, a8dc9d8, 7d61140.
import json
import os
import preproc
from collections import Counter, deque
import numpy as np
# Находит токен целевого слова по концу слова
def find_need_word_by_lemma(tokens, word):
res = []
for token in tokens:
if token.lemma == word:
res.append(token.stop)
return res
def find_first(tokens, stop):
for token in tokens:
if token.stop == stop:
return (token.id, token.rel)
# Находит связанное слово
def find_related_word(scenario, id, r1, r2, rel_ids, rels, head_ids, words):
if scenario == 0: # Нам известен 0 элемент в кортеже
try:
for i in rel_ids.get(id):
if rels[i] == r2:
if (r2 == 'advmod') & (words.get(i) != 'не'):
continue
return i
except:
return '1_0'
elif scenario == 1: # Нам известен 1 элемент в кортеже
for i in r1:
if rels.get(head_ids[id]) == i:
return head_ids[id]
return '1_0'
# Собирает факт по целевому слову
def construct_fact(tokens, stop, category):
words = dict()
head_ids = dict()
rels = dict()
rel_ids = dict()
instructions = get_insructions(category)
for token in tokens:
words[token.id] = token.text
head_ids[token.id] = token.head_id
if rel_ids.get(token.head_id):
rel_ids[token.head_id].append(token.id)
else:
rel_ids[token.head_id] = [token.id]
rels[token.id] = token.rel
fact = deque()
first_word = find_first(tokens, stop)
id = first_word[0]
fact.append(words[first_word[0]])
breaker = False
if instructions.get(first_word[1]):
for instruction in instructions[first_word[1]]:
for i in instruction:
related_word = find_related_word(i[2], id, i[0], i[1], rel_ids, rels, head_ids, words)
#print(related_word)
if (related_word == '1_0') & (instruction.index(i) == 1):
break
elif related_word == '1_0':
pass
elif i[2] == 0:
fact.appendleft(words[related_word])
else:
fact.appendleft(words[related_word])
id = related_word
if instruction.index(i) == len(instruction) - 1:
breaker = True
if breaker:
break
if len(fact) == len(set(fact)):
if len(fact) > 1:
return ' '.join(fact)
else: pass
else:
fact.popleft()
return ' '.join(fact)
else: pass
# Поскольку в одном предложении могут быть несколько целевых слов, которые могут быть связаны, эта функция провеярет, не являются факт сокращённой копией прошлого факта
def cheker_fact(previous_fact, new_fact):
previous_fact = set(previous_fact.split(' '))
new_fact = set(new_fact.split())
if len(previous_fact & new_fact) != len(new_fact):
return True
else:
return False
SCRIPT_DIR = os.path.dirname(__file__)
def get_insructions(category):
with open(f'{SCRIPT_DIR}/{category}/instructions.json') as f:
return json.load(f)
def get_category_words(category):
return set(open(f'{SCRIPT_DIR}/{category}/words.txt', encoding='utf8').read().split('\n'))
def get_morfology_from_fact(fact, sent_tokens):
'''
Вычленяет часть речи и морфологические свойства слова из факта
'''
res = []
for word in fact.split(' '):
for token in sent_tokens:
if word == token.text:
res.append([token.pos, token.feats])
break
return res
def get_facts(tokens, category):
facts = []
for sent in tokens:
sent_tokens = preproc.get_sent_tokens(sent)
set_lemmas = preproc.get_set_sent_lemmas(sent)
res = set_lemmas & get_category_words(category)
if res:
for w in res:
for word in find_need_word_by_lemma(sent_tokens, w):
fact = construct_fact(sent_tokens, word, category)
if fact:
# facts.append(fact)
morthology = get_morfology_from_fact(fact, sent_tokens)
# facts.append([w, fact])
facts.append([w, fact, morthology])
return facts
def get_mentioned_words(tokens, category):
lemmas = preproc.get_all_lemmas(tokens)
res = set(lemmas) & get_category_words(category)
if res:
return Counter([lemma for lemma in lemmas if lemma in res])
else:
return Counter()
def get_most_mentioned_words(mentioned_words):
return mentioned_words.sum().most_common(3)
|