senyukhin committed
Commit 7d61140
1 Parent(s): b702b00

Upload 13 files

Files changed (7)
  1. app.py +38 -4
  2. category_parser.py +2 -1
  3. date_parser.py +2 -2
  4. diary.py +6 -5
  5. network_builder.py +21 -0
  6. requirements.txt +7 -3
  7. sentiment_parser.py +10 -3
app.py CHANGED
@@ -1,21 +1,55 @@
 import streamlit as st
 import diary as d
 import sentiment_parser as sp
+import network_builder as nb
+from pyvis.network import Network
+import streamlit.components.v1 as components
 # import altair as alt
 
 st.title('Автоматический аннотатор')
 
 st.markdown("Скопируйте текст дневика в это поле или выберите для теста один из подготовленных отрывков.")
 
+
 with open('test.txt', 'r') as f:
     TEST = f.read()
 # diary = st.text_area('Текст дневника')
 if st.button('Быстрая обработка на тестовом тексте '):
     df = d.analyze(TEST)
-    st.dataframe(df)
-    for_chart = sp.data_for_sentiment_chart(df).set_index('n_date')
-    st.markdown('### График сентимента по записям дневника (тест)')
-    st.line_chart(data=for_chart)
+    # st.dataframe(df)
+    # for_chart = sp.data_for_sentiment_chart(df).set_index('n_date')
+    # st.markdown('### График сентимента по записям дневника (тест)')
+    # st.line_chart(data=for_chart)
+    # st.experimental_memo.clear()
+    graph = nb.build_graph(df)
+
+    textnet = Network(height='400px',
+                      width='100%',
+                      bgcolor='white',
+                      font_color='black'
+                      )
+
+    textnet.from_nx(graph)
+
+    textnet.repulsion(
+        node_distance=420,
+        central_gravity=0.33,
+        spring_length=110,
+        spring_strength=0.10,
+        damping=0.95
+    )
+
+    try:
+        path = '/tmp'
+        textnet.save_graph(f'{path}/pyvis_graph.html')
+        HtmlFile = open(f'{path}/pyvis_graph.html', 'r', encoding='utf-8')
+
+    except:
+        path = '/html_files'
+        textnet.save_graph(f'{path}/pyvis_graph.html')
+        HtmlFile = open(f'{path}/pyvis_graph.html', 'r', encoding='utf-8')
+
+    components.html(HtmlFile.read(), height=435)
 
 # if st.button('Обработать'):
 #     df = d.analyze(diary)
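Note on the new rendering code above: pyvis has no native Streamlit widget, so the app serialises the Network to an HTML file and embeds that markup with streamlit.components.v1.html; the try/except switches between /tmp and /html_files, presumably to cope with differing write permissions between local and hosted runs. A minimal sketch of the same pattern, not from the repository, using a temporary directory and a toy graph as stand-ins:

# Sketch only: render a pyvis Network inside a Streamlit app.
# The temporary directory and the toy path_graph are illustrative assumptions.
import os
import tempfile
import networkx as nx
import streamlit.components.v1 as components
from pyvis.network import Network

graph = nx.path_graph(3)                      # stand-in for nb.build_graph(df)

net = Network(height='400px', width='100%', bgcolor='white', font_color='black')
net.from_nx(graph)

with tempfile.TemporaryDirectory() as tmp:
    html_path = os.path.join(tmp, 'pyvis_graph.html')
    net.save_graph(html_path)                 # writes a self-contained HTML file
    with open(html_path, 'r', encoding='utf-8') as f:
        components.html(f.read(), height=435) # embed the markup in the page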
category_parser.py CHANGED
@@ -126,7 +126,8 @@ def get_facts(tokens, category):
         for word in find_need_word_by_lemma(sent_tokens, w):
             fact = construct_fact(sent_tokens, word, category)
             if fact:
-                facts.append(fact)
+                # facts.append(fact)
+                facts.append([w, fact])
     return facts
 
 
date_parser.py CHANGED
@@ -9,7 +9,7 @@ from yargy.predicates import (
     dictionary, normalized,
 )
 import pandas as pd
-
+import streamlit as st
 
 DateRange = fact(
     'DateRange',
@@ -309,7 +309,7 @@ DATE_RANGE = or_(
     DateRange
 )
 
-
+@st.experimental_memo
 def date_extractor_for_diary(text):
     res = {
         'date_start' : [],
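The decorator added above, @st.experimental_memo (the caching API in Streamlit 1.17, later renamed st.cache_data), caches the function's return value keyed on its arguments, so rerunning the app on the same text skips the yargy date parsing. A rough illustration with a hypothetical slow function:

# Illustration only: experimental_memo returns the cached result on reruns
# with identical arguments; slow_parse is a made-up stand-in.
import time
import streamlit as st

@st.experimental_memo
def slow_parse(text):
    time.sleep(2)               # pretend this is expensive parsing
    return len(text.split())

st.write(slow_parse('22 июня 1941 года ...'))   # slow once, instant on later reruns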
diary.py CHANGED
@@ -2,11 +2,12 @@ import date_parser as dp
 import preproc
 import category_parser as cp
 import sentiment_parser as sp
-
+import streamlit as st
 
 # def get_gender(tokens):
 #     r = [token.feats['Gender'] for sent in tokenizing(text) for token in sent if (token.feats.get('Gender') and token.feats.get('Voice')) ]
 
+# @st.experimental_memo
 def analyze(text):
     # Разделение текста на датированные куски
     diary = dp.date_extractor_for_diary(text)
@@ -17,12 +18,12 @@ def analyze(text):
     # Токенизация текста дневника по предложениям
     diary['tokens'] = diary['text'].apply(lambda text: preproc.tokenizing(text))
 
-    # Выделение фактов из текста
+    # # Выделение фактов из текста
     diary['loc_facts'] = diary['tokens'].apply(lambda tokens: cp.get_facts(tokens, 'locations'))
-    diary['loc_words'] = diary['tokens'].apply(lambda tokens: cp.get_mentioned_words(tokens, 'locations'))
+    # diary['loc_words'] = diary['tokens'].apply(lambda tokens: cp.get_mentioned_words(tokens, 'locations'))
 
     # Определение сентимента по записям
-    diary['sent'] = diary['tokens'].apply(lambda tokens: sp.get_overall_sentiment(tokens))
-    diary['sent_index'] = diary['sent'].apply(lambda sent: sp.get_sentiment_index(sent))
+    # diary['sent'] = diary['tokens'].apply(lambda tokens: sp.get_overall_sentiment(tokens))
+    # diary['sent_index'] = diary['sent'].apply(lambda sent: sp.get_sentiment_index(sent))
 
     return diary
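After this change analyze() splits the text into dated fragments, tokenises them and extracts location facts; the loc_words and sentiment columns are commented out. A hedged usage sketch, assuming the invented two-entry sample below matches the date format the parser expects:

# Sketch only: invoke the trimmed pipeline on invented text; the sample and the
# exact output are assumptions, not repository fixtures.
import diary as d

sample = '22 июня 1941 года. Мы выехали из Москвы.\n23 июня 1941 года. Прибыли в Тулу.'
df = d.analyze(sample)
print(df.columns.tolist())   # expected to include 'date_start', 'text', 'tokens', 'loc_facts'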
network_builder.py ADDED
@@ -0,0 +1,21 @@
+import networkx as nx
+
+
+def build_graph(df):
+    G = nx.Graph()
+
+    # Связывание дат
+    for previous, current in zip(df['date_start'], df['date_start'][1:]):
+        G.add_node(previous, group="Date", color = "blue")
+        G.add_node(current, group="Date", color = "blue")
+        G.add_edge(previous, current)
+
+    # Добавление связей дат и фактов, фактов и слов
+    for index, row in df[['date_start', 'loc_facts']].iterrows():
+        for fact in row['loc_facts']:
+            G.add_node(fact[0], group="Category_word", color = "green")
+            G.add_node(fact[1], group="Fact", color = "red")
+            G.add_edge(row['date_start'], fact[1])
+            G.add_edge(fact[0], fact[1])
+
+    return G
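The new module turns the analysis dataframe into a networkx graph: consecutive date_start values are chained into a timeline of blue "Date" nodes, and each location fact hangs off its date, with the matched lemma (fact[0], green "Category_word") linked to the extracted fact text (fact[1], red "Fact"); this is why get_facts in category_parser.py now appends [w, fact] pairs instead of bare facts. A small self-contained sketch with invented data:

# Toy example of the structure build_graph produces; the dates and facts are invented.
import pandas as pd
import network_builder as nb

df = pd.DataFrame({
    'date_start': ['22 июня 1941', '23 июня 1941'],
    'loc_facts': [[['москва', 'из Москвы']], []],   # [lemma, fact] pairs per entry
})

G = nb.build_graph(df)
print(G.nodes(data=True))   # two linked Date nodes plus 'москва' and 'из Москвы'
print(G.edges())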
requirements.txt CHANGED
@@ -1,4 +1,8 @@
-natasha==1.5.0
+natasha
 streamlit==1.17.0
-pandas
-altair<5
+networkx==2.8.6
+numpy==1.23.3
+pandas==1.4.4
+altair<5
+pyvis==0.2.1
+pymorphy2
sentiment_parser.py CHANGED
@@ -3,6 +3,7 @@ import os
 import preproc, date_parser
 from collections import Counter
 import pandas as pd
+import streamlit as st
 
 SCRIPT_DIR = os.path.dirname(__file__)
 
@@ -10,13 +11,17 @@ def sentiment_verbs():
     with open(f'{SCRIPT_DIR}/sentiment/verbs.json', 'r', encoding='utf-8') as file:
         return json.load(file)
 
+
 def sentiment_nouns():
     with open(f'{SCRIPT_DIR}/sentiment/emo_clean.json', 'r', encoding='utf-8') as file:
         return json.load(file)
 
+VERBS = sentiment_verbs()
+NOUNS = sentiment_nouns()
+
+
 def get_sentiment_from_verbs(lemmas):
     res = []
-    VERBS = sentiment_verbs()
     matching = set(lemmas) & set(VERBS.keys())
     lemmas_dict = Counter(lemmas)
     if matching:
@@ -26,10 +31,9 @@ def get_sentiment_from_verbs(lemmas):
             res.extend(s)
         return Counter(res)
     else: return Counter()
-
+
 def get_sentiment_from_nouns(lemmas):
     res = []
-    NOUNS = sentiment_nouns()
     matching = set(lemmas) & set(NOUNS.keys())
     lemmas_dict = Counter(lemmas)
     if matching:
@@ -46,9 +50,11 @@ def get_overall_sentiment(tokens):
     nouns = get_sentiment_from_nouns(lemmas)
     return verbs + nouns
 
+
 def get_sentiment_index(sentiments):
     return sentiments['positive'] - sentiments['negative']
 
+
 def get_most_sentiment(sentiment_index):
     sentiments = []
     for index in sentiment_index:
@@ -61,6 +67,7 @@ def get_most_sentiment(sentiment_index):
     sentiments = Counter(sentiments)
     return sentiments.most_common(1)[0][0]
 
+@st.experimental_memo
 def data_for_sentiment_chart(df):
     df = df.copy()
     df['n_date'] = df.apply(lambda row:
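Two things change above: the sentiment dictionaries are loaded once at import time into the module-level VERBS and NOUNS constants instead of being re-read from JSON on every call, and data_for_sentiment_chart gets the same @st.experimental_memo caching as the date extractor. The sentiment index itself remains a plain positive-minus-negative count over Counter objects, for example:

# Illustration only: behaviour of the Counter-based sentiment index.
from collections import Counter
import sentiment_parser as sp

entry = Counter({'positive': 3, 'negative': 1})
print(sp.get_sentiment_index(entry))        # 3 - 1 = 2
print(sp.get_sentiment_index(Counter()))    # missing keys count as 0, so prints 0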