dand199 committed (verified)
Commit 2efb8ad · 1 Parent(s): e5f44ad

Upload app.py

Files changed (1):
  1. app.py +235 -0

app.py ADDED
# Load libraries
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import cleantext
import re
import ast
import streamlit as st
from itables import init_notebook_mode, show
import spacy
from spacy.lang.en import English
#import spacy_cleaner
#from spacy_cleaner.processing import removers, replacers, mutators
#from spacy import displacy
#import asent
from dotenv import load_dotenv
from subprocess import Popen
import scrapy
from scrapy import Selector
import json
import requests

md_intro = '''# Sentiment Dashboard
Main libraries used: Scrapy, SpaCy, PyTorch, Mercury

Data source: CNN, BBC, CNBC
'''
md_sumstats = '''## News Sentiment Summary
'''
md_table = '''## News Sentiment Report
'''
md_notes = '''## Notes and Thoughts:
A lexicon-based approach may conflate named entities with genuine sentiment, because brand names can contain positive words (say, BeautifulSoup).

Hence, running named entity recognition before sentiment analysis helps to improve accuracy.

An RNN-based approach can also overcome lexicon issues, but it takes more resources.
'''

# Load today's merged news file; skip malformed rows rather than failing outright
dat_name = 'E:/Project/NLP/news_sentiment_analytics/news_db/merged_news_data_' + datetime.today().strftime('%Y-%m-%d') + '.csv'
news = pd.read_csv(dat_name, on_bad_lines='skip')
news[['pos_sent', 'neg_sent']] = news[['pos_sent', 'neg_sent']].fillna('')
news['clean_content'] = news.clean_content.apply(lambda x: ast.literal_eval(x))
news = news.fillna(value='')
news['clean_date'] = pd.to_datetime(news['clean_date'], dayfirst=True)
# Assumption: the merged CSV may already carry per-row extraction dates; only
# stamp today's date when the column is missing, so the Yesterday / 2 Days Ago
# filters below have something to match.
if 'date_extracted' not in news.columns:
    news['date_extracted'] = datetime.today().strftime('%Y-%m-%d')

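# Expected columns in the merged CSV (as used throughout this file): title, url,
# clean_date, clean_content, pos_sent, neg_sent, arti_score, rnn_arti_score,
# rnn_pos_sent, rnn_neg_sent, and optionally date_extracted.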
50
+ def news_stats(news, method_selection, range_selection):
51
+ overall_sentiment = 0
52
+ news_count = 0
53
+ news['chosen_score'] = np.where((method_selection == 'Lexicon') | (method_selection is None),
54
+ news['arti_score'], news['rnn_arti_score'])
55
+ if range_selection == '1 day' or range_selection is None:
56
+ overall_sentiment = news[news.date_extracted == datetime.today().strftime('%Y-%m-%d')].chosen_score.mean()
57
+ news_count = news[news.date_extracted == datetime.today().strftime('%Y-%m-%d')].title.count()
58
+ elif range_selection == '3 days':
59
+ overall_sentiment = news.chosen_score.mean()
60
+ news_count = news.title.count()
61
+ return overall_sentiment, news_count
62
+
63
+
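# Build the report table for one extraction date, renaming raw columns to
# display labels and wrapping long text for the Streamlit dataframe.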
def news_table(news, date_selection, method_selection):
    # Map the dropdown choice to a concrete extraction date
    if date_selection == 'Today' or date_selection is None:
        date_selected = datetime.today().strftime('%Y-%m-%d')
    elif date_selection == 'Yesterday':
        date_selected = (datetime.today() - timedelta(days=1)).strftime('%Y-%m-%d')
    elif date_selection == '2 Days Ago':
        date_selected = (datetime.today() - timedelta(days=2)).strftime('%Y-%m-%d')

    if method_selection == 'Lexicon' or method_selection is None:
        clean_news = news.loc[news.date_extracted == date_selected, ['title', 'arti_score', 'pos_sent', 'neg_sent', 'clean_date', 'url']]
        clean_news = clean_news.rename(columns={'title': 'Title', 'url': 'URL', 'clean_date': 'Date',
                                                'arti_score': 'Sentiment Score',
                                                'pos_sent': 'Most Positive Sentence',
                                                'neg_sent': 'Least Positive Sentence'})
    elif method_selection == 'Transformer':
        clean_news = news.loc[news.date_extracted == date_selected, ['title', 'url', 'clean_date', 'rnn_arti_score', 'rnn_pos_sent', 'rnn_neg_sent']]
        clean_news = clean_news.rename(columns={'title': 'Title', 'url': 'URL', 'clean_date': 'Date',
                                                'rnn_arti_score': 'Sentiment Score',
                                                'rnn_pos_sent': 'Most Positive Sentence',
                                                'rnn_neg_sent': 'Least Positive Sentence'})

    # Formatting for table display
    clean_news = clean_news.sort_values('Date', ascending=False).reset_index(drop=True)
    clean_news.loc[:, 'Title'] = clean_news['Title'].str.wrap(width=40)
    clean_news.loc[:, 'URL'] = clean_news['URL'].str.wrap(width=65)
    clean_news.loc[:, 'Most Positive Sentence'] = clean_news['Most Positive Sentence'].str.wrap(width=65)
    clean_news.loc[:, 'Least Positive Sentence'] = clean_news['Least Positive Sentence'].str.wrap(width=65)

    return clean_news

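# BBC article pages embed their content as JSON inside a <script> tag; the
# parser below walks props -> pageProps -> page -> <first key> -> contents and
# concatenates the text of every paragraph block.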
def bbc_json_extract(bbc_script):
    json_data = json.loads(bbc_script)
    res = ''
    news_key = list(json_data['props']['pageProps']['page'].keys())[0]
    for item in json_data['props']['pageProps']['page'][news_key]['contents']:
        if item['type'] == 'text':
            for block in item['model']['blocks']:
                if block['type'] == 'paragraph':
                    #res = res + ''.join(block['model']['text']).strip()
                    res = res + block['model']['text'] + ' '
    return res

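# Fetch an article and pull the body text with a site-specific extractor
# (BBC via the embedded JSON, CNN and CNBC via paragraph XPath selectors).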
def link_extract(link):
    extracted_content = ''
    if link is not None and link != '':
        if 'https://' in link or 'http://' in link:
            clean_link = link
        else:
            clean_link = 'https://' + link
        # Use .text so Selector receives a str rather than raw bytes
        html = requests.get(clean_link).text
        sel = Selector(text=html)
        if 'www.bbc.com' in clean_link:
            raw_content = sel.xpath('//body//script//text()').extract()[0]
            extracted_content = bbc_json_extract(raw_content)
        elif 'edition.cnn.com' in clean_link:
            extracted_content = ''.join(sel.xpath('//p[contains(@class, "paragraph") and not(@class="footer__disclaimer-text") and not(@class="footer__copyright-text")]//text()').getall()).strip()
        elif 'www.cnbc.com' in clean_link:
            extracted_content = ''.join(sel.xpath('//div[@class="ArticleBody-articleBody"]//p//text()').getall()).strip()
    return extracted_content

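# Split scraped article text into clean sentences using spaCy's rule-based
# sentencizer (no statistical model required).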
def sentence_breakdown(string):
    # Transform scraped data into a list of separate sentences
    sentences = []
    if string != "":
        clean_string = cleantext.clean(string, extra_spaces=True)
        for ch in [r'\r', r'\n', r'\t', r'\xa0', '\r', '\n', '\t', '\xa0']:
            if ch in clean_string:
                clean_string = clean_string.replace(ch, '')
        nlp = English()
        nlp.add_pipe('sentencizer')
        doc = nlp(clean_string)
        sentences = [sent.text.strip() for sent in doc.sents]
    return sentences

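# Score a single sentence with a fine-tuned DistilBERT classifier from the
# Hugging Face Hub; the predicted label is mapped to a signed score.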
def rnn_sentence_sentiment(sent):
    from transformers import TextClassificationPipeline, AutoTokenizer, AutoModelForSequenceClassification

    tokenizer = AutoTokenizer.from_pretrained("Venkatesh4342/distilbert-helpdesk-sentence-sentiment")
    model = AutoModelForSequenceClassification.from_pretrained("Venkatesh4342/distilbert-helpdesk-sentence-sentiment")
    pipe = TextClassificationPipeline(model=model, tokenizer=tokenizer)
    prediction = pipe(sent, top_k=1)

    # Assign a signed score: positive sentiment gets a positive score, negative
    # sentiment a negative score, and neutral (or any other label) 0
    if prediction[0]['label'] == 'Positive':
        res = prediction[0]['score']
    elif prediction[0]['label'] == 'Negative':
        res = -prediction[0]['score']
    else:
        res = 0

    return res

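# A possible optimisation (a sketch, not part of the original app): the
# tokenizer and model above are re-loaded for every sentence. Caching the
# pipeline once per session with st.cache_resource would avoid that; the
# helper name load_sentiment_pipe below is hypothetical.
#
# @st.cache_resource
# def load_sentiment_pipe():
#     from transformers import TextClassificationPipeline, AutoTokenizer, AutoModelForSequenceClassification
#     tokenizer = AutoTokenizer.from_pretrained("Venkatesh4342/distilbert-helpdesk-sentence-sentiment")
#     model = AutoModelForSequenceClassification.from_pretrained("Venkatesh4342/distilbert-helpdesk-sentence-sentiment")
#     return TextClassificationPipeline(model=model, tokenizer=tokenizer)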
def article_sentiment(arti):
    # Input is a list of strings/sentences
    scores = dict()
    for sent in arti:
        scores[sent] = rnn_sentence_sentiment(sent)

    # Average the non-neutral sentence scores, then pick the three highest- and
    # three lowest-scoring sentences
    scores_list = list(scores.values())
    arti_score = np.mean([score for score in scores_list if score != 0])
    pos_sents = sorted(scores, key=scores.get, reverse=True)[:3]
    neg_sents = sorted(scores, key=scores.get, reverse=False)[:3]
    return round(arti_score, 3), pos_sents, neg_sents

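# Example (hypothetical input):
#   article_sentiment(["Stocks rallied strongly.", "Profits fell sharply."])
#   -> (mean signed score, three most positive sentences, three most negative sentences)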
st.markdown(md_intro)
tab_news, tab_pred = st.tabs(["News Report", "Sentiment Prediction"])
with tab_news:
    st.markdown(md_sumstats)
    method_col, range_col = st.columns(2)
    with method_col:
        method_selection = st.selectbox("Select Method", ('Lexicon', 'Transformer'))
    with range_col:
        range_selection = st.selectbox("Statistics Range", ('1 day', '3 days'))
    overall_sentiment, news_count = news_stats(news, method_selection, range_selection)
    senti_col, count_col = st.columns(2)
    senti_col.metric("Overall Sentiment", str(round(overall_sentiment, 3)))
    count_col.metric("Number of Articles", str(news_count))
    st.markdown(md_table)
    date_selection = st.selectbox("Extraction Date", ('Today', 'Yesterday', '2 Days Ago'))
    clean_news = news_table(news, date_selection, method_selection)
    st.dataframe(data=clean_news,
                 column_config={"Title": st.column_config.Column(width=250),
                                "Most Positive Sentence": st.column_config.Column(width=400),
                                "Least Positive Sentence": st.column_config.Column(width=400),
                                "Date": st.column_config.DateColumn(format="DD-MM-YYYY"),
                                "URL": st.column_config.LinkColumn("App URL", width=400)
                                })
    st.markdown(md_notes)

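# Prediction tab: accept a CNN/BBC/CNBC link, scrape the article, split it into
# sentences, and score it with the transformer pipeline defined above.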
with tab_pred:
    st.write("This is a sentiment prediction module. Please enter your news link into the textbox. Sentiment prediction will be returned shortly!")
    newslink = st.chat_input(placeholder="Please input CNN/BBC/CNBC link")
    if newslink:
        placeholder = st.empty()
        placeholder.text("Running ...")
        extracted_content = link_extract(newslink)
        cleaned_content = sentence_breakdown(extracted_content)
        arti_score, user_pos_sents, user_neg_sents = article_sentiment(cleaned_content)
        placeholder.empty()

        st.markdown(f'### Article sentiment score is: {arti_score}')
        st.markdown("### Three most positive sentences are: ")
        st.markdown(f"{user_pos_sents[0]}")
        st.markdown(f"{user_pos_sents[1]}")
        st.markdown(f"{user_pos_sents[2]}")
        st.markdown("### Three most negative sentences are: ")
        st.markdown(f"{user_neg_sents[0]}")
        st.markdown(f"{user_neg_sents[1]}")
        st.markdown(f"{user_neg_sents[2]}")
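# Run locally with: streamlit run app.py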