Upload 13 files
- app.py +38 -4
- category_parser.py +2 -1
- date_parser.py +2 -2
- diary.py +6 -5
- network_builder.py +21 -0
- requirements.txt +7 -3
- sentiment_parser.py +10 -3
app.py
CHANGED
@@ -1,21 +1,55 @@
 import streamlit as st
 import diary as d
 import sentiment_parser as sp
+import network_builder as nb
+from pyvis.network import Network
+import streamlit.components.v1 as components
 # import altair as alt
 
 st.title('Автоматический аннотатор')
 
 st.markdown("Скопируйте текст дневника в это поле или выберите для теста один из подготовленных отрывков.")
 
+
 with open('test.txt', 'r') as f:
     TEST = f.read()
 # diary = st.text_area('Текст дневника')
 if st.button('Быстрая обработка на тестовом тексте '):
     df = d.analyze(TEST)
-    st.dataframe(df)
-    for_chart = sp.data_for_sentiment_chart(df).set_index('n_date')
-    st.markdown('### График сентимента по записям дневника (тест)')
-    st.line_chart(data=for_chart)
+    # st.dataframe(df)
+    # for_chart = sp.data_for_sentiment_chart(df).set_index('n_date')
+    # st.markdown('### График сентимента по записям дневника (тест)')
+    # st.line_chart(data=for_chart)
+    # st.experimental_memo.clear()
+    graph = nb.build_graph(df)
+
+    textnet = Network(height='400px',
+                      width='100%',
+                      bgcolor='white',
+                      font_color='black'
+                      )
+
+    textnet.from_nx(graph)
+
+    textnet.repulsion(
+        node_distance=420,
+        central_gravity=0.33,
+        spring_length=110,
+        spring_strength=0.10,
+        damping=0.95
+    )
+
+    try:
+        path = '/tmp'
+        textnet.save_graph(f'{path}/pyvis_graph.html')
+        HtmlFile = open(f'{path}/pyvis_graph.html', 'r', encoding='utf-8')
+
+    except:
+        path = '/html_files'
+        textnet.save_graph(f'{path}/pyvis_graph.html')
+        HtmlFile = open(f'{path}/pyvis_graph.html', 'r', encoding='utf-8')
+
+    components.html(HtmlFile.read(), height=435)
 
 # if st.button('Обработать'):
 #     df = d.analyze(diary)
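
Note on the rendering flow above: pyvis does not ship a Streamlit component, so the commit serializes the graph to an HTML file and embeds it with components.html; the try/except only switches the output directory (writable /tmp on the hosted runtime, a local /html_files fallback). Below is a minimal self-contained sketch of the same save-then-embed pattern using a temporary directory; the tempfile variant is an editorial suggestion, not part of the commit.

import os
import tempfile

import networkx as nx
import streamlit.components.v1 as components
from pyvis.network import Network

net = Network(height='400px', width='100%')
net.from_nx(nx.path_graph(5))  # any networkx graph works here

with tempfile.TemporaryDirectory() as tmp:
    html_path = os.path.join(tmp, 'pyvis_graph.html')
    net.save_graph(html_path)  # pyvis writes a standalone HTML document
    with open(html_path, 'r', encoding='utf-8') as f:
        components.html(f.read(), height=435)
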
category_parser.py
CHANGED
@@ -126,7 +126,8 @@ def get_facts(tokens, category):
             for word in find_need_word_by_lemma(sent_tokens, w):
                 fact = construct_fact(sent_tokens, word, category)
                 if fact:
-                    facts.append(fact)
+                    # facts.append(fact)
+                    facts.append([w, fact])
     return facts
 
 
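
Note: with this change get_facts returns [lemma, fact] pairs rather than bare facts, which is exactly the shape network_builder.py indexes as fact[0] (the matched category word) and fact[1] (the constructed fact). Illustrative shape only; the values are hypothetical:

# Hypothetical return value of get_facts(tokens, 'locations'):
facts = [['москва', 'поехали в Москву'], ['дача', 'на даче']]
word, fact = facts[0]  # word -> green "Category_word" node, fact -> red "Fact" node
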
date_parser.py
CHANGED
@@ -9,7 +9,7 @@ from yargy.predicates import (
     dictionary, normalized,
 )
 import pandas as pd
-
+import streamlit as st
 
 DateRange = fact(
     'DateRange',
@@ -309,7 +309,7 @@ DATE_RANGE = or_(
     DateRange
 )
 
-
+@st.experimental_memo
 def date_extractor_for_diary(text):
     res = {
         'date_start' : [],
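
Note: @st.experimental_memo caches date_extractor_for_diary by its text argument, so re-running the Streamlit script on the same input skips the yargy parse. A small sketch of the caching semantics; slow_parse is a hypothetical stand-in, and in Streamlit 1.18+ the decorator was renamed st.cache_data:

import streamlit as st

@st.experimental_memo  # available under this name in streamlit 1.17.0
def slow_parse(text: str) -> dict:
    # stand-in for the yargy extraction; the body runs once per distinct text
    return {'n_chars': len(text)}

slow_parse('тот же текст')  # computed
slow_parse('тот же текст')  # served from the cache, body not re-executed
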
diary.py
CHANGED
@@ -2,11 +2,12 @@ import date_parser as dp
 import preproc
 import category_parser as cp
 import sentiment_parser as sp
-
+import streamlit as st
 
 # def get_gender(tokens):
 #     r = [token.feats['Gender'] for sent in tokenizing(text) for token in sent if (token.feats.get('Gender') and token.feats.get('Voice')) ]
 
+# @st.experimental_memo
 def analyze(text):
     # Split the text into dated chunks
     diary = dp.date_extractor_for_diary(text)
@@ -17,12 +18,12 @@ def analyze(text):
     # Tokenize the diary text by sentence
     diary['tokens'] = diary['text'].apply(lambda text: preproc.tokenizing(text))
 
-    # Extract facts from the text
+    # # Extract facts from the text
     diary['loc_facts'] = diary['tokens'].apply(lambda tokens: cp.get_facts(tokens, 'locations'))
-    diary['loc_words'] = diary['tokens'].apply(lambda tokens: cp.get_mentioned_words(tokens, 'locations'))
+    # diary['loc_words'] = diary['tokens'].apply(lambda tokens: cp.get_mentioned_words(tokens, 'locations'))
 
     # Determine sentiment per entry
-    diary['sent'] = diary['tokens'].apply(lambda tokens: sp.get_overall_sentiment(tokens))
-    diary['sent_index'] = diary['sent'].apply(lambda sent: sp.get_sentiment_index(sent))
+    # diary['sent'] = diary['tokens'].apply(lambda tokens: sp.get_overall_sentiment(tokens))
+    # diary['sent_index'] = diary['sent'].apply(lambda sent: sp.get_sentiment_index(sent))
 
     return diary
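
Note: after this commit analyze still splits, tokenizes, and extracts location facts, but the sentiment columns are commented out, so the returned DataFrame feeds straight into the graph view. A sketch of the intended call chain, assuming test.txt is present as in app.py:

import diary as d
import network_builder as nb

with open('test.txt', 'r', encoding='utf-8') as f:
    df = d.analyze(f.read())  # columns include 'date_start', 'text', 'tokens', 'loc_facts'
G = nb.build_graph(df)
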
network_builder.py
ADDED
@@ -0,0 +1,21 @@
+import networkx as nx
+
+
+def build_graph(df):
+    G = nx.Graph()
+
+    # Link consecutive dates
+    for previous, current in zip(df['date_start'], df['date_start'][1:]):
+        G.add_node(previous, group="Date", color="blue")
+        G.add_node(current, group="Date", color="blue")
+        G.add_edge(previous, current)
+
+    # Add date-fact and fact-word edges
+    for index, row in df[['date_start', 'loc_facts']].iterrows():
+        for fact in row['loc_facts']:
+            G.add_node(fact[0], group="Category_word", color="green")
+            G.add_node(fact[1], group="Fact", color="red")
+            G.add_edge(row['date_start'], fact[1])
+            G.add_edge(fact[0], fact[1])
+
+    return G
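
Note: build_graph chains consecutive dates, then attaches each fact to its date and each category word to its fact. A toy run with hypothetical data matching the shapes the module expects (one row per dated entry, loc_facts as [category_word, fact] pairs):

import pandas as pd
import network_builder as nb

df = pd.DataFrame({
    'date_start': ['22 июня 1941', '23 июня 1941'],
    'loc_facts': [[['москва', 'в Москве']], []],
})
G = nb.build_graph(df)
# The two date nodes (blue) form a chain; 'москва' (green) and 'в Москве' (red)
# hang off the first date via date->fact and word->fact edges.
print(G.nodes(data=True), G.edges())
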
requirements.txt
CHANGED
@@ -1,4 +1,8 @@
-natasha
+natasha
 streamlit==1.17.0
-
-
+networkx==2.8.6
+numpy==1.23.3
+pandas==1.4.4
+altair<5
+pyvis==0.2.1
+pymorphy2
sentiment_parser.py
CHANGED
@@ -3,6 +3,7 @@ import os
 import preproc, date_parser
 from collections import Counter
 import pandas as pd
+import streamlit as st
 
 SCRIPT_DIR = os.path.dirname(__file__)
 
@@ -10,13 +11,17 @@ def sentiment_verbs():
     with open(f'{SCRIPT_DIR}/sentiment/verbs.json', 'r', encoding='utf-8') as file:
         return json.load(file)
 
+
 def sentiment_nouns():
     with open(f'{SCRIPT_DIR}/sentiment/emo_clean.json', 'r', encoding='utf-8') as file:
         return json.load(file)
 
+VERBS = sentiment_verbs()
+NOUNS = sentiment_nouns()
+
+
 def get_sentiment_from_verbs(lemmas):
     res = []
-    VERBS = sentiment_verbs()
     matching = set(lemmas) & set(VERBS.keys())
     lemmas_dict = Counter(lemmas)
     if matching:
@@ -26,10 +31,9 @@ def get_sentiment_from_verbs(lemmas):
         res.extend(s)
         return Counter(res)
     else: return Counter()
-
+
 def get_sentiment_from_nouns(lemmas):
     res = []
-    NOUNS = sentiment_nouns()
     matching = set(lemmas) & set(NOUNS.keys())
     lemmas_dict = Counter(lemmas)
     if matching:
@@ -46,9 +50,11 @@ def get_overall_sentiment(tokens):
     nouns = get_sentiment_from_nouns(lemmas)
     return verbs + nouns
 
+
 def get_sentiment_index(sentiments):
     return sentiments['positive'] - sentiments['negative']
 
+
 def get_most_sentiment(sentiment_index):
     sentiments = []
     for index in sentiment_index:
@@ -61,6 +67,7 @@ def get_most_sentiment(sentiment_index):
     sentiments = Counter(sentiments)
     return sentiments.most_common(1)[0][0]
 
+@st.experimental_memo
 def data_for_sentiment_chart(df):
     df = df.copy()
     df['n_date'] = df.apply(lambda row:
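
Note: the sentiment lexicons are now read once at import time (module-level VERBS and NOUNS) instead of on every call, and the chart data is memoized. The index arithmetic relies on Counter defaulting missing keys to 0, so an entry with no sentiment hits still yields a valid index:

from collections import Counter

sentiments = Counter({'positive': 3, 'negative': 1})
print(sentiments['positive'] - sentiments['negative'])  # 2
print(Counter()['positive'] - Counter()['negative'])    # 0, no KeyError
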