Spaces:
Runtime error
Runtime error
Upload 11 files
Browse files- app.py +43 -45
- category_parser.py +16 -1
- date_parser.py +32 -25
- diary.py +3 -3
- network_builder.py +52 -3
- word_transformations.py +23 -1
app.py
CHANGED
@@ -10,10 +10,10 @@ st.title('Автоматический аннотатор')
|
|
10 |
|
11 |
# st.markdown("Скопируйте текст дневика в это поле или выберите для теста один из подготовленных отрывков.")
|
12 |
|
13 |
-
txt = st.text_area('Скопируйте текст
|
14 |
|
15 |
option = st.selectbox(
|
16 |
-
'Или выберите один из
|
17 |
('Выбрать...', 'Анатолий Василивицкий', 'Мария Германова'))
|
18 |
|
19 |
if option == 'Анатолий Василивицкий':
|
@@ -26,50 +26,48 @@ elif option == 'Мария Германова':
|
|
26 |
|
27 |
# diary = st.text_area('Текст дневника')
|
28 |
if st.button('Обработать') and txt != '':
|
29 |
-
|
30 |
-
# st.dataframe(df)
|
31 |
-
# for_chart = sp.data_for_sentiment_chart(df).set_index('n_date')
|
32 |
-
# st.markdown('### График сентимента по записям дневника (тест)')
|
33 |
-
# st.line_chart(data=for_chart)
|
34 |
-
# st.experimental_memo.clear()
|
35 |
-
graph = nb.build_graph(df)
|
36 |
-
|
37 |
-
GENDER = wt.get_gender(df['tokens'])
|
38 |
-
|
39 |
-
st.markdown(f'**Аннотация этого дневника:** {nb.annotation(graph, GENDER)}')
|
40 |
-
|
41 |
-
textnet = Network( height='400px',
|
42 |
-
width='100%',
|
43 |
-
bgcolor='white',
|
44 |
-
font_color='black'
|
45 |
-
)
|
46 |
-
|
47 |
-
textnet.from_nx(graph)
|
48 |
-
|
49 |
-
textnet.repulsion(
|
50 |
-
node_distance=420,
|
51 |
-
central_gravity=0.33,
|
52 |
-
spring_length=110,
|
53 |
-
spring_strength=0.10,
|
54 |
-
damping=0.95
|
55 |
-
)
|
56 |
-
|
57 |
try:
|
58 |
-
|
59 |
-
|
60 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
61 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
62 |
except:
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
components.html(HtmlFile.read(), height=435)
|
68 |
-
|
69 |
-
# if st.button('Обработать'):
|
70 |
-
# df = d.analyze(diary)
|
71 |
-
# st.dataframe(df)
|
72 |
-
# for_chart = sp.data_for_sentiment_chart(df).set_index('n_date')
|
73 |
-
# st.markdown('### График сентимента по записям дневника (тест)')
|
74 |
-
# st.line_chart(data=for_chart)
|
75 |
|
|
|
10 |
|
11 |
# st.markdown("Скопируйте текст дневика в это поле или выберите для теста один из подготовленных отрывков.")
|
12 |
|
13 |
+
txt = st.text_area('Скопируйте текст дневника в это поле', height=100)
|
14 |
|
15 |
option = st.selectbox(
|
16 |
+
'Или выберите один из пробных текстов дневников:',
|
17 |
('Выбрать...', 'Анатолий Василивицкий', 'Мария Германова'))
|
18 |
|
19 |
if option == 'Анатолий Василивицкий':
|
|
|
26 |
|
27 |
# diary = st.text_area('Текст дневника')
|
28 |
if st.button('Обработать') and txt != '':
|
29 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
30 |
try:
|
31 |
+
df = d.analyze(txt)
|
32 |
+
graph = nb.build_graph(df)
|
33 |
+
|
34 |
+
GENDER = wt.get_gender(df['tokens'])
|
35 |
+
locations = df['locations']
|
36 |
+
|
37 |
+
st.markdown(f'**Аннотация этого дневника:** {nb.annotation(graph, GENDER, locations)}')
|
38 |
+
|
39 |
+
textnet = Network( height='400px',
|
40 |
+
width='100%',
|
41 |
+
bgcolor='white',
|
42 |
+
font_color='black'
|
43 |
+
)
|
44 |
+
|
45 |
+
textnet.from_nx(graph)
|
46 |
|
47 |
+
textnet.repulsion(
|
48 |
+
node_distance=420,
|
49 |
+
central_gravity=0.33,
|
50 |
+
spring_length=110,
|
51 |
+
spring_strength=0.10,
|
52 |
+
damping=0.95
|
53 |
+
)
|
54 |
+
|
55 |
+
try:
|
56 |
+
path = '/tmp'
|
57 |
+
textnet.save_graph(f'{path}/pyvis_graph.html')
|
58 |
+
HtmlFile = open(f'{path}/pyvis_graph.html', 'r', encoding='utf-8')
|
59 |
+
|
60 |
+
except:
|
61 |
+
path = '/html_files'
|
62 |
+
textnet.save_graph(f'{path}/pyvis_graph.html')
|
63 |
+
HtmlFile = open(f'{path}/pyvis_graph.html', 'r', encoding='utf-8')
|
64 |
+
|
65 |
+
st.markdown('### «Граф знания» этого дневника')
|
66 |
+
st.caption(':blue[Синим цветом] выделены узлы, связанные с одной датированной дневниковой записью, розовым — общий сентимент записи, :red[красным] — найденное утверждение, :green[зелёным] — места и локации, фиолетовым — занятия.')
|
67 |
+
st.caption('Чтобы увеличить граф и посмотреть лейблы узлов, установите курсор в нужном месте и проскорольте вниз.')
|
68 |
+
components.html(HtmlFile.read(), height=435)
|
69 |
except:
|
70 |
+
st.warning('Вставьте текст дневника, который начинается с даты!', icon="⚠️")
|
71 |
+
|
72 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
73 |
|
category_parser.py
CHANGED
@@ -115,6 +115,19 @@ def get_category_words(category):
|
|
115 |
return set(open(f'{SCRIPT_DIR}/{category}/words.txt', encoding='utf8').read().split('\n'))
|
116 |
|
117 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
118 |
def get_facts(tokens, category):
|
119 |
facts = []
|
120 |
for sent in tokens:
|
@@ -127,7 +140,9 @@ def get_facts(tokens, category):
|
|
127 |
fact = construct_fact(sent_tokens, word, category)
|
128 |
if fact:
|
129 |
# facts.append(fact)
|
130 |
-
|
|
|
|
|
131 |
return facts
|
132 |
|
133 |
|
|
|
115 |
return set(open(f'{SCRIPT_DIR}/{category}/words.txt', encoding='utf8').read().split('\n'))
|
116 |
|
117 |
|
118 |
+
def get_morfology_from_fact(fact, sent_tokens):
    '''
    Extract the part of speech and the morphological features of every
    word of a fact.

    fact: the fact text; it is split on single spaces.
    sent_tokens: iterable of token objects exposing ``text``, ``pos`` and
        ``feats`` attributes.

    Returns a list with exactly one ``[pos, feats]`` pair per word of
    ``fact.split(' ')``, in word order. A word is described by the first
    token whose ``text`` equals it. A word with no matching token gets the
    placeholder ``[None, {}]`` so that the result stays index-aligned with
    the words of the fact (consumers address the pairs by word position).
    '''
    # Keep only the first token for each distinct text: that is the token
    # a left-to-right scan of the sentence would find for a given word.
    first_token_by_text = {}
    for token in sent_tokens:
        first_token_by_text.setdefault(token.text, token)

    res = []
    for word in fact.split(' '):
        token = first_token_by_text.get(word)
        if token is None:
            # No token for this word: keep the slot so indices do not shift.
            res.append([None, {}])
        else:
            res.append([token.pos, token.feats])
    return res
|
129 |
+
|
130 |
+
|
131 |
def get_facts(tokens, category):
|
132 |
facts = []
|
133 |
for sent in tokens:
|
|
|
140 |
fact = construct_fact(sent_tokens, word, category)
|
141 |
if fact:
|
142 |
# facts.append(fact)
|
143 |
+
morthology = get_morfology_from_fact(fact, sent_tokens)
|
144 |
+
# facts.append([w, fact])
|
145 |
+
facts.append([w, fact, morthology])
|
146 |
return facts
|
147 |
|
148 |
|
date_parser.py
CHANGED
@@ -309,33 +309,40 @@ DATE_RANGE = or_(
|
|
309 |
DateRange
|
310 |
)
|
311 |
|
312 |
-
@st.experimental_memo
|
313 |
def date_extractor_for_diary(text):
|
314 |
-
res = {
|
315 |
-
'date_start' : [],
|
316 |
-
'date_stop' : [],
|
317 |
-
'text' : []
|
318 |
-
}
|
319 |
-
entry = ''
|
320 |
-
for paragraph in text.split('\n'):
|
321 |
-
parser = Parser(DATE_RANGE)
|
322 |
-
for match in parser.findall(paragraph):
|
323 |
-
record = match.fact.normalized
|
324 |
-
if record.spans[0].start in range (0, 3):
|
325 |
-
start = record.get_start_date
|
326 |
-
stop = record.get_stop_date
|
327 |
-
res['date_start'].append(start)
|
328 |
-
res['date_stop'].append(stop)
|
329 |
-
if entry != '':
|
330 |
-
res['text'].append(entry)
|
331 |
-
entry = ''
|
332 |
-
break
|
333 |
-
entry += paragraph
|
334 |
-
entry += '\n'
|
335 |
-
if entry != '':
|
336 |
-
res['text'].append(entry)
|
337 |
|
338 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
339 |
|
340 |
def normalize_dates(start, stop):
|
341 |
if start == stop:
|
|
|
309 |
DateRange
|
310 |
)
|
311 |
|
|
|
312 |
def date_extractor_for_diary(text):
    '''
    Split a diary text into dated entries.

    The text is processed paragraph by paragraph (split on newlines).
    A paragraph opens a new entry when one of its DATE_RANGE matches has
    ``spans[0].start`` equal to 0, 1 or 2, i.e. the date sits at the very
    beginning of the paragraph (presumably a character offset - confirm).
    The text accumulated so far is then stored as the previous entry and
    the start/stop dates of the match are recorded for the new one.

    Returns a pandas DataFrame with the columns ``date_start``,
    ``date_stop`` and ``text``; rows whose text is shorter than
    10 characters are dropped. If the extraction fails, the function falls
    back to a DataFrame built from whatever was collected, without the
    length filter.
    '''
    res = {
        'date_start': [],
        'date_stop': [],
        'text': [],
    }
    try:
        # The grammar does not change between paragraphs, so the parser is
        # built once instead of once per paragraph.
        parser = Parser(DATE_RANGE)
        entry = ''
        for paragraph in text.split('\n'):
            for match in parser.findall(paragraph):
                record = match.fact.normalized
                if record.spans[0].start in range(0, 3):
                    start = record.get_start_date
                    stop = record.get_stop_date
                    # Close the previous entry before registering the
                    # dates of the one that starts here.
                    if entry != '':
                        res['text'].append(entry)
                        entry = ''
                    res['date_start'].append(start)
                    res['date_stop'].append(stop)
                    break
            # The dated paragraph itself is part of the entry text.
            entry += paragraph
            entry += '\n'
        if entry != '':
            res['text'].append(entry)

        df = pd.DataFrame(res)
        return df.drop(df.loc[df['text'].str.len() < 10].index)
    except Exception:
        # NOTE(review): when the three lists have different lengths (for
        # example the text does not begin with a date, which leaves one
        # extra element in 'text'), this constructor raises ValueError as
        # well, so the error still reaches the caller. Kept as is because
        # callers may rely on that - confirm before changing.
        return pd.DataFrame(res)
|
346 |
|
347 |
def normalize_dates(start, stop):
|
348 |
if start == stop:
|
diary.py
CHANGED
@@ -12,7 +12,7 @@ import word_transformations as wt
|
|
12 |
def analyze(text):
|
13 |
# Разделение текста на датированные куски
|
14 |
diary = dp.date_extractor_for_diary(text)
|
15 |
-
|
16 |
# Очистка текста дневника
|
17 |
diary = preproc.text_preproc(diary)
|
18 |
|
@@ -23,7 +23,7 @@ def analyze(text):
|
|
23 |
diary['locations'] = diary['tokens'].apply(lambda tokens: cp.get_facts(tokens, 'locations'))
|
24 |
|
25 |
# Определение сентимента по записям
|
26 |
-
|
27 |
-
|
28 |
|
29 |
return diary
|
|
|
12 |
def analyze(text):
|
13 |
# Разделение текста на датированные куски
|
14 |
diary = dp.date_extractor_for_diary(text)
|
15 |
+
|
16 |
# Очистка текста дневника
|
17 |
diary = preproc.text_preproc(diary)
|
18 |
|
|
|
23 |
diary['locations'] = diary['tokens'].apply(lambda tokens: cp.get_facts(tokens, 'locations'))
|
24 |
|
25 |
# Определение сентимента по записям
|
26 |
+
diary['sent'] = diary['tokens'].apply(lambda tokens: sp.get_overall_sentiment(tokens))
|
27 |
+
diary['sent_index'] = diary['sent'].apply(lambda sent: sp.get_sentiment_index(sent))
|
28 |
|
29 |
return diary
|
network_builder.py
CHANGED
@@ -1,6 +1,8 @@
|
|
1 |
import networkx as nx
|
2 |
from collections import Counter
|
3 |
import word_transformations as wt
|
|
|
|
|
4 |
|
5 |
|
6 |
def build_graph(df):
|
@@ -20,6 +22,10 @@ def build_graph(df):
|
|
20 |
G.add_edge(row['date_start'], fact[1])
|
21 |
G.add_edge(fact[0], fact[1])
|
22 |
|
|
|
|
|
|
|
|
|
23 |
return G
|
24 |
|
25 |
|
@@ -60,14 +66,57 @@ def facts_for_annotation(G, gender, most_places):
|
|
60 |
res.append((date, fact))
|
61 |
return res
|
62 |
|
|
|
|
|
|
|
|
|
|
|
|
|
63 |
|
64 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
65 |
dates = dates_of_Diary_writing(G)
|
66 |
most_places = most_visited_places(G)
|
67 |
facts = facts_for_annotation(G, gender, most_places)
|
|
|
68 |
|
69 |
-
facts = ', '.join([f"{fact[1].lower()} ({fact[0]})" for fact in facts])
|
|
|
70 |
|
71 |
-
annotation = f'{wt.get_noun(gender).title()} этого дневника {wt.gender_transformer("вести", gender)} его с {dates[0]} по {dates[1]}. Наиболее часто {wt.get_pronoun(gender)} {wt.gender_transformer("описывал", gender)} {wt.inflector(most_places[0], "accs")}, {wt.inflector(most_places[1], "accs")} и {wt.inflector(most_places[2], "accs")}.\n\n
|
72 |
|
73 |
return annotation
|
|
|
1 |
import networkx as nx
|
2 |
from collections import Counter
|
3 |
import word_transformations as wt
|
4 |
+
import sentiment_parser as sp
|
5 |
+
# import app
|
6 |
|
7 |
|
8 |
def build_graph(df):
|
|
|
22 |
G.add_edge(row['date_start'], fact[1])
|
23 |
G.add_edge(fact[0], fact[1])
|
24 |
|
25 |
+
# Добавление связей даты записи с сентиментом
|
26 |
+
for index, row in df[['date_start', 'sent_index']].iterrows():
|
27 |
+
G.add_node(sp.get_most_sentiment([row['sent_index']]), group="Sentiment", color = "pink")
|
28 |
+
G.add_edge(row['date_start'], sp.get_most_sentiment([row['sent_index']]))
|
29 |
return G
|
30 |
|
31 |
|
|
|
66 |
res.append((date, fact))
|
67 |
return res
|
68 |
|
69 |
+
def sentiment_of_date(G):
    '''
    Group graph nodes by the sentiment node they point to.

    G: a directed graph exposing ``has_node`` and ``predecessors``
        (presumably the networkx graph produced by build_graph, where each
        entry date is linked to its sentiment node - confirm that the
        sentiment labels are exactly the three used below).

    Returns a dict with the keys 'positive', 'negative' and 'neutral'.
    Each value is the list of predecessors of the node with that name, or
    an empty list when the graph has no such node (asking a graph for the
    predecessors of a missing node is an error, and a diary does not have
    to contain entries of every sentiment).
    '''
    return {
        label: list(G.predecessors(label)) if G.has_node(label) else []
        for label in ('positive', 'negative', 'neutral')
    }
|
75 |
|
76 |
+
|
77 |
+
def constuct_fact_for_annotation(facts, sentiment, gender, locations):
    '''
    Assemble the annotation text from the selected facts.

    facts: sequence of items whose element 0 is a date and element 1 is
        the fact text.
    sentiment: dict with the keys 'positive', 'negative' and 'neutral',
        each holding the dates of the entries with that sentiment.
    gender: passed through to the word_transformations helpers.
    locations: passed to wt.transform_fact as the source of fact records.

    Returns a string. Facts dated on positive entries are listed after the
    first prompt; facts dated on negative and on neutral entries are listed
    together after the second prompt. Either part is omitted when it has no
    facts, so the result may be empty.
    '''
    prompts = [
        f'В записях с преимущественно положительной тональностью {wt.get_noun(gender)} {wt.gender_transformer("писал", gender)} как {wt.get_pronoun(gender)}',
        f'Также в дневнике описывается, как {wt.get_pronoun(gender)}',
    ]

    def describe(dates):
        # Render every fact recorded on one of the given dates.
        described = []
        for date in dates or ():
            for fact in facts:
                if date != fact[0]:
                    continue
                transformed = wt.transform_fact(locations, fact[1], gender)
                if transformed is None:
                    # transform_fact found no record for this fact: use the
                    # fact text as it is instead of failing on None.
                    transformed = fact[1]
                described.append(f"{transformed.lower()} ({fact[0]})")
        return described

    positive_facts = describe(sentiment['positive'])
    # Negative and neutral entries share the second, neutral-sounding prompt.
    other_facts = describe(sentiment['negative']) + describe(sentiment['neutral'])

    text = ''
    if positive_facts:
        text += f'{prompts[0]} {", ".join(positive_facts)}.'
    if other_facts:
        text += f'\n\n{prompts[1]} {", ".join(other_facts)}.'
    return text
|
110 |
+
|
111 |
+
def annotation(G, gender, locations):
    '''
    Compose the text annotation of a diary from its graph.

    G: the diary graph, passed to the helper functions that extract the
        dates, the most visited places, the facts and the sentiment.
    gender: passed to the word_transformations helpers to pick word forms.
    locations: forwarded to constuct_fact_for_annotation.

    Returns the annotation string: a sentence with the first two elements
    of the dates, a sentence with the first three most visited places in
    the "accs" form, then, after a blank line, the text built from the
    facts.
    '''
    dates = dates_of_Diary_writing(G)
    most_places = most_visited_places(G)
    facts = facts_for_annotation(G, gender, most_places)
    sentiment = sentiment_of_date(G)

    # The pieces are computed in the order in which they appear in the text.
    author = wt.get_noun(gender).title()
    kept = wt.gender_transformer("вести", gender)
    first_date = dates[0]
    last_date = dates[1]
    pronoun = wt.get_pronoun(gender)
    described = wt.gender_transformer("описывал", gender)
    place_1, place_2, place_3 = [
        wt.inflector(most_places[i], "accs") for i in range(3)
    ]
    details = constuct_fact_for_annotation(facts, sentiment, gender, locations)

    period = f'{author} этого дневника {kept} его с {first_date} по {last_date}.'
    places = f' Наиболее часто {pronoun} {described} {place_1}, {place_2} и {place_3}.'
    return f'{period}{places}\n\n{details}'
|
word_transformations.py
CHANGED
@@ -84,4 +84,26 @@ def get_fact_to_annotation(fact, gender, most_mentioned_word):
|
|
84 |
return False
|
85 |
if form.normal_form in ['она', 'он']:
|
86 |
return False
|
87 |
-
return flag
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
84 |
return False
|
85 |
if form.normal_form in ['она', 'он']:
|
86 |
return False
|
87 |
+
return flag
|
88 |
+
|
89 |
+
def transform_fact(tokens, fact, gender):
    '''
    Rewrite a fact written in the first person into the third person.

    tokens: the column of fact records - an iterable of entries, each
        entry an iterable of records whose element 1 is the fact text and
        element 2 is the list of ``[pos, feats]`` pairs of its words.
    fact: the fact text to transform.
    gender: passed to gender_transformer when a verb is replaced.

    The first record whose text equals ``fact`` is used. Every word marked
    as a first-person verb is replaced with gender_transformer(word,
    gender), and the pronoun 'я' is removed (only its last occurrence when
    it appears several times). Words without morphological data are left
    untouched.

    Returns the transformed text, or ``fact`` unchanged when no record
    matches it.
    '''
    for entry in tokens:
        for record in entry:
            if record[1] != fact:
                continue
            morphology = record[2]
            words = fact.split(' ')
            pronoun_index = None
            for i, word in enumerate(words):
                # The morphology list can be shorter than the word list
                # when some words had no matching token.
                if i < len(morphology):
                    pos, feats = morphology[i]
                else:
                    pos, feats = None, None
                # first-person verb
                if pos == 'VERB' and feats and feats.get('Person') == '1':
                    words[i] = gender_transformer(word, gender)
                if words[i].lower() == 'я':
                    pronoun_index = i
            if pronoun_index is not None:
                del words[pronoun_index]
            return ' '.join(words)
    # Unknown fact: hand the text back so callers always receive a string.
    return fact
|