Browse files
@@ -0,0 +1,235 @@
1 |
# Load libraries
2 |
import pandas as pd
3 |
import numpy as np
4 |
from datetime import datetime
5 |
import cleantext
6 |
import re
7 |
import ast
8 |
import streamlit as st
9 |
from itables import init_notebook_mode, show
10 |
import spacy
11 |
from spacy.lang.en import English
12 |
#import spacy_cleaner
13 |
#from spacy_cleaner.processing import removers, replacers, mutators
14 |
#from spacy import displacy
15 |
#import asent
16 |
from dotenv import load_dotenv
17 |
from subprocess import Popen
18 |
import scrapy
19 |
from scrapy import Selector
20 |
import json
21 |
import requests
22 |
23 |
24 |
# Markdown text blocks for the dashboard sections.
# NOTE(review): the closing ''' delimiters were missing from the reviewed text
# and have been restored on the otherwise empty lines; the wording itself is
# unchanged.
md_intro = '''# Sentiment Dashboard
Main libraries used: Scrapy, SpaCy, PyTorch, Mercury

Data source: CNN, BBC, CNBC
'''

md_sumstats = '''## News Sentiment Summary
'''

md_table = '''## News Sentiment Report
'''

md_notes = '''## Notes and Thoughts:
Lexicon-based approach may confuse named entities and actual sentiment, because brand names may have positive words, say BeautifulSoup.

Hence, implementing named entity recognition before sentiment analysis helps to improve accuracy.

Using RNN-based approach can also overcome lexicon issues, but it also takes more resources.
'''
40 |
# Load the merged news extract whose file name carries today's date
# (merged_news_data_YYYY-MM-DD.csv).
# NOTE(review): the expression in front of '%Y-%m-%d') was missing from the
# reviewed text; datetime.today().strftime(...) is the reconstruction.
dat_name = 'E:/Project/NLP/news_sentiment_analytics/news_db/merged_news_data_' + datetime.today().strftime('%Y-%m-%d') + '.csv'
news = pd.read_csv(dat_name, on_bad_lines='skip')
news[['pos_sent', 'neg_sent']] = news[['pos_sent', 'neg_sent']].fillna('')
# clean_content holds the text of a Python literal; parse it back into an object.
news['clean_content'] = news.clean_content.apply(ast.literal_eval)
news = news.fillna(value='')
news['clean_date'] = pd.to_datetime(news['clean_date'], dayfirst=True)
# date_extracted is compared against '%Y-%m-%d' strings by news_stats and
# news_table, so it is normalised to that format here.
# NOTE(review): the right-hand side of this assignment was also missing from
# the reviewed text. Parsing the existing column is an assumption (as is the
# absence of dayfirst=True) - confirm against the CSV produced by the scraper.
news['date_extracted'] = pd.to_datetime(news['date_extracted']).dt.strftime('%Y-%m-%d')
47 |
48 |
49 |
# Calculate summary
50 |
def news_stats(news, method_selection, range_selection):
    """Return the mean sentiment score and article count for a date range.

    Args:
        news: DataFrame with ``title``, ``date_extracted``, ``arti_score`` and
            ``rnn_arti_score`` columns. It is not modified.
        method_selection: ``'Lexicon'`` or ``None`` averages ``arti_score``;
            any other value averages ``rnn_arti_score``.
        range_selection: ``'1 day'`` or ``None`` keeps only rows whose
            ``date_extracted`` equals today's date formatted as
            ``'%Y-%m-%d'``; ``'3 days'`` uses every row.

    Returns:
        ``(overall_sentiment, news_count)``: the mean of the chosen score and
        the number of non-null titles in the selected rows, or ``(0, 0)`` for
        an unrecognised ``range_selection``.
    """
    if method_selection == 'Lexicon' or method_selection is None:
        score_column = 'arti_score'
    else:
        score_column = 'rnn_arti_score'

    if range_selection == '1 day' or range_selection is None:
        # NOTE(review): the expression compared with date_extracted was
        # missing from the reviewed text; today's date is the reconstruction.
        today = datetime.today().strftime('%Y-%m-%d')
        selected = news[news.date_extracted == today]
    elif range_selection == '3 days':
        selected = news
    else:
        return 0, 0

    # Read the score column directly instead of writing a helper column into
    # the caller's DataFrame.
    return selected[score_column].mean(), selected.title.count()
62 |
63 |
64 |
def news_table(news, date_selection, method_selection):
    """Build the display table of articles extracted on the selected day.

    Args:
        news: DataFrame with ``title``, ``url``, ``clean_date`` and
            ``date_extracted`` columns plus the score/sentence columns of the
            chosen method.
        date_selection: ``'Today'`` (or ``None``), ``'Yesterday'`` or
            ``'2 Days Ago'``.
        method_selection: ``'Lexicon'`` (or ``None``) or ``'Transformer'``.

    Returns:
        A new DataFrame with display column names, sorted by ``Date``
        descending, with the text columns wrapped to a fixed width.

    Raises:
        ValueError: If either selection is not one of the values above.
    """
    from datetime import timedelta  # local: the module only imports ``datetime``

    # NOTE(review): the three date expressions were missing from the reviewed
    # text (all branches read ``date_selected ='%Y-%m-%d')``); offsets of
    # 0/1/2 days from today are reconstructed from the option labels.
    days_back = {None: 0, 'Today': 0, 'Yesterday': 1, '2 Days Ago': 2}
    if date_selection not in days_back:
        raise ValueError(f'Unknown date selection: {date_selection!r}')
    target_day = datetime.today() - timedelta(days=days_back[date_selection])
    date_selected = target_day.strftime('%Y-%m-%d')

    # Source column -> display name. The key order is the column order of the
    # returned table and differs between the two methods, as it did before.
    if method_selection == 'Lexicon' or method_selection is None:
        column_names = {'title': 'Title',
                        'arti_score': 'Sentiment Score',
                        'pos_sent': 'Most Positive Sentence',
                        'neg_sent': 'Least Positive Sentence',
                        'clean_date': 'Date',
                        'url': 'URL'}
    elif method_selection == 'Transformer':
        column_names = {'title': 'Title',
                        'url': 'URL',
                        'clean_date': 'Date',
                        'rnn_arti_score': 'Sentiment Score',
                        'rnn_pos_sent': 'Most Positive Sentence',
                        'rnn_neg_sent': 'Least Positive Sentence'}
    else:
        raise ValueError(f'Unknown method selection: {method_selection!r}')

    clean_news = (
        news.loc[news.date_extracted == date_selected, list(column_names)]
        .rename(columns=column_names)
        .sort_values('Date', ascending=False)
        .reset_index(drop=True)
    )

    # Formatting for table display
    wrap_widths = {'Title': 40,
                   'URL': 65,
                   'Most Positive Sentence': 65,
                   'Least Positive Sentence': 65}
    for column, width in wrap_widths.items():
        clean_news[column] = clean_news[column].str.wrap(width=width)

    return clean_news
94 |
95 |
96 |
def bbc_json_extract(bbc_script):
    """Pull the paragraph text out of a BBC page's embedded JSON.

    Args:
        bbc_script: JSON text whose ``props.pageProps.page`` object holds the
            article under its first key.

    Returns:
        Every paragraph of every ``'text'`` content item, in document order,
        each followed by a single space (``''`` when there are none).

    Raises:
        json.JSONDecodeError: If ``bbc_script`` is not valid JSON.
        KeyError, IndexError: If the expected structure is missing.
    """
    page = json.loads(bbc_script)['props']['pageProps']['page']
    # The article sits under the first key of the page object.
    news_key = list(page)[0]

    paragraphs = []
    for item in page[news_key]['contents']:
        if item['type'] != 'text':
            continue
        for block in item['model']['blocks']:
            if block['type'] == 'paragraph':
                paragraphs.append(block['model']['text'] + ' ')
    # Join once instead of growing the string inside the loops.
    return ''.join(paragraphs)
107 |
108 |
109 |
def link_extract(link):
    """Download a news article and return its body text.

    Args:
        link: Article URL. ``https://`` is prepended when no scheme is given.

    Returns:
        The extracted article text, or ``''`` when ``link`` is empty or
        ``None``, when the URL matches none of the supported sites, or when a
        BBC page contains no script text.
    """
    # Use the argument itself; the body previously read the global
    # ``newslink`` instead of its own parameter.
    link = (link or '').strip()
    if not link:
        return ''

    # Only add a scheme when one is missing.
    if link.startswith(('https://', 'http://')):
        clean_link = link
    else:
        clean_link = 'https://' + link

    # A timeout keeps an unresponsive site from blocking the app indefinitely.
    html = requests.get(clean_link, timeout=30).content
    sel = Selector(text=html)

    # NOTE(review): the three site literals were empty strings in the reviewed
    # text, which makes the first test always true. The BBC -> CNN -> CNBC
    # order is reconstructed from the extraction logic and the
    # "CNN/BBC/CNBC" input prompt - confirm the exact substrings.
    if 'bbc.com' in clean_link or 'bbc.co.uk' in clean_link:
        scripts = sel.xpath('//body//script//text()').extract()
        if not scripts:
            return ''
        return bbc_json_extract(scripts[0])
    if 'cnn.com' in clean_link:
        return ''.join(sel.xpath('//p[contains(@class, "paragraph") and not(@class="footer__disclaimer-text") and not(@class="footer__copyright-text")]//text()').getall()).strip()
    if 'cnbc.com' in clean_link:
        return ''.join(sel.xpath('//div[@class="ArticleBody-articleBody"]//p//text()').getall()).strip()
    return ''
126 |
127 |
128 |
def sentence_breakdown(string):
    """Transform scraped text into a list of separate sentences.

    Args:
        string: Raw article text.

    Returns:
        A list of stripped sentence strings; ``[]`` for empty or ``None``
        input, so callers always receive a list.
    """
    if not string:
        return []

    clean_string = cleantext.clean(string, extra_spaces=True)
    # Remove both escaped sequences (a literal backslash followed by a letter)
    # and the real control / no-break-space characters.
    for ch in (r'\r', r'\n', r'\t', r'\xa0', '\r', '\n', '\t', '\xa0'):
        clean_string = clean_string.replace(ch, '')

    nlp = English()
    # A blank English pipeline sets no sentence boundaries, so ``doc.sents``
    # needs the rule-based sentencizer to be added first.
    nlp.add_pipe('sentencizer')
    doc = nlp(clean_string)
    return [sent.text.strip() for sent in doc.sents]
141 |
142 |
143 |
_SENTIMENT_MODEL_NAME = "Venkatesh4342/distilbert-helpdesk-sentence-sentiment"
_sentiment_pipeline = None


def _load_sentiment_pipeline():
    """Build the text-classification pipeline once and reuse it afterwards."""
    global _sentiment_pipeline
    if _sentiment_pipeline is None:
        from transformers import (
            AutoModelForSequenceClassification,
            AutoTokenizer,
            TextClassificationPipeline,
        )

        tokenizer = AutoTokenizer.from_pretrained(_SENTIMENT_MODEL_NAME)
        model = AutoModelForSequenceClassification.from_pretrained(_SENTIMENT_MODEL_NAME)
        _sentiment_pipeline = TextClassificationPipeline(model=model, tokenizer=tokenizer)
    return _sentiment_pipeline


def rnn_sentence_sentiment(sent, pipe=None):
    """Score one sentence with the transformer sentiment classifier.

    Args:
        sent: Sentence text.
        pipe: Optional classifier called as ``pipe(sent, top_k=1)`` and
            returning a list whose first item has ``'label'`` and ``'score'``
            keys. Defaults to the module's cached pipeline, so the tokenizer
            and model are no longer reloaded for every sentence.

    Returns:
        The top score for a ``'Positive'`` label, its negation for
        ``'Negative'``, and ``0`` for ``'Neutral'``.

    Raises:
        ValueError: If the classifier returns any other label.
    """
    if pipe is None:
        pipe = _load_sentiment_pipeline()
    top = pipe(sent, top_k=1)[0]

    # assign score to sentiment, positive sentiment has positive score,
    # negative sentiment has negative score, neutral has 0
    label = top['label']
    if label == 'Positive':
        return top['score']
    if label == 'Negative':
        return -top['score']
    if label == 'Neutral':
        return 0
    # Previously an unexpected label left the result variable unassigned.
    raise ValueError(f'Unexpected sentiment label: {label!r}')
160 |
161 |
162 |
def article_sentiment(arti, sentence_scorer=None):
    """Score an article from the sentiment of its sentences.

    Args:
        arti: Iterable of sentence strings. Identical sentences share one
            entry, so each distinct sentence counts once.
        sentence_scorer: Optional callable mapping a sentence to a signed
            score. Defaults to ``rnn_sentence_sentiment``.

    Returns:
        ``(arti_score, pos_sents, neg_sents)``: the mean of the non-zero
        sentence scores rounded to 3 decimals (``0.0`` when there are none),
        the three highest-scoring sentences and the three lowest-scoring
        sentences.
    """
    if sentence_scorer is None:
        sentence_scorer = rnn_sentence_sentiment

    scores = {}
    for sent in arti:
        scores[sent] = sentence_scorer(sent)

    # Neutral (zero) scores are left out of the average. With nothing left to
    # average, report 0.0 rather than the nan that np.mean([]) produces.
    non_neutral = [score for score in scores.values() if score != 0]
    arti_score = round(float(np.mean(non_neutral)), 3) if non_neutral else 0.0

    pos_sents = sorted(scores, key=scores.get, reverse=True)[:3]
    neg_sents = sorted(scores, key=scores.get)[:3]
    return arti_score, pos_sents, neg_sents
173 |
174 |
175 |
176 |
tab_news, tab_pred = st.tabs(["News Report", "Sentiment Prediction"])
with tab_news:
    # Summary metrics for the chosen scoring method and statistics range.
    method_col, range_col = st.columns(2)
    with method_col:
        method_selection = st.selectbox("Select Method", ('Lexicon', 'Transformer'))
    with range_col:
        range_selection = st.selectbox("Statistics Range", ('1 day', '3 days'))
    overall_sentiment, news_count = news_stats(news, method_selection, range_selection)
    senti_col, count_col = st.columns(2)
    senti_col.metric("Overall Sentiment", str(round(overall_sentiment, 3)))
    count_col.metric("Number of News", str(news_count))

    # Per-article table for one extraction date.
    date_selection = st.selectbox("Extraction Date", ('Today', 'Yesterday', '2 Days Ago'))
    clean_news = news_table(news, date_selection, method_selection)

    column_config = {
        "Title": st.column_config.Column(width=250),
        "Most Positive Sentence": st.column_config.Column(width=400),
        "Least Positive Sentence": st.column_config.Column(width=400),
        "Date": st.column_config.DateColumn(format="DD-MM-YYYY"),
        "URL": st.column_config.LinkColumn("App URL", width=400),
    }
    # NOTE(review): the closing brace of the dict and whatever followed it
    # were missing from the reviewed text. Rendering the prepared table with
    # this configuration is the assumed intent - confirm against the original.
    st.dataframe(clean_news, column_config=column_config)
200 |
with tab_pred:
    st.write("This is a sentiment prediction module.\nPlease enter your news link into the textbox.\nSentiment prediction will be returned shortly!")
    newslink = st.chat_input(placeholder="Please input CNN/BBC/CNBC link")
    if newslink:
        # Show a progress message while the article is fetched and scored.
        placeholder = st.empty()
        placeholder.text("Running ...")
        extracted_content = link_extract(newslink)
        cleaned_content = sentence_breakdown(extracted_content)
        arti_score, user_pos_sents, user_neg_sents = article_sentiment(cleaned_content)
        # Clear the progress message once the result is ready.
        placeholder.empty()

        st.markdown(f'### Article sentiment score is: {arti_score}')
        # NOTE(review): the lines under both headings were missing from the
        # reviewed text; listing the returned sentences is the assumed
        # intent - confirm the original presentation.
        st.markdown("### Three most positive sentences are: ")
        for sentence in user_pos_sents:
            st.markdown(f"- {sentence}")
        st.markdown("### Three most negative sentences are: ")
        for sentence in user_neg_sents:
            st.markdown(f"- {sentence}")