# NOTE(review): removed Hugging Face Spaces page-scrape artifacts that preceded
# this line ("Spaces:", "Runtime error", file-size line, commit hashes, and a
# line-number gutter) — they were not part of the source and broke Python syntax.
# Load libaries
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import cleantext
import re
import ast
import streamlit as st
import spacy
from spacy.lang.en import English
from dotenv import load_dotenv
from subprocess import Popen
import scrapy
from scrapy import Selector
import json
import requests
# Markdown copy for the page: intro, section headers, and closing notes.
md_intro = '''# Business News Sentiment Dashboard
The dashboard has 2 tabs:
- News sentiment report: reports the sentiment of business news from past few days
- Sentiment prediction: receives a news link and outputs sentiment results
Main libraries used: scrapy, SpaCy, PyTorch, transformers, streamlit
News scope: CNN, BBC, CNBC (other business news sources don't have free access)
Time scope: up to 3 days (from yesterday to 3 days ago), based on free tier's available data source
'''
md_sumstats = '''## News Sentiment Summary
'''
md_table = '''## News Sentiment Report
'''
md_notes = '''## Notes and Thoughts:
Lexicon-based approach may confuse named entities and actual sentiment, because brand names may have positive words, say BeautifulSoup.
Hence, implementing named entity recognition before sentiment analysis helps to improve accuracy.
Using RNN-based approach can also overcome lexicon issues, but it also takes more resources.
## References:
https://edition.cnn.com/business
https://www.bbc.com/business
https://www.cnbc.com/business/
https://huggingface.co/mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis
https://huggingface.co/Venkatesh4342/distilbert-helpdesk-sentence-sentiment
https://kennethenevoldsen.github.io/asent/introduction.html
'''
# Load the most recent scraped-news CSV; the file is stamped with yesterday's date.
dat_name = './news_db/merged_news_data_' + (datetime.today() - timedelta(days=1)).strftime('%Y-%m-%d') + '.csv'
news = pd.read_csv(dat_name, on_bad_lines='skip')
# Sentence columns must be strings (str.wrap is applied later in news_table).
news[['pos_sent', 'neg_sent']] = news[['pos_sent', 'neg_sent']].fillna('')
# clean_content is stored as the repr of a Python list; parse it back safely.
news['clean_content'] = news.clean_content.apply(lambda x: ast.literal_eval(x))
news = news.fillna(value = '')
# Source dates are day-first strings; normalize to datetime for sorting/display.
news['clean_date'] = pd.to_datetime(news['clean_date'], dayfirst=True)
# Calculate summary
def news_stats(news, method_selection, range_selection):
    """Compute the overall sentiment score and article count for the UI summary.

    Parameters
    ----------
    news : pandas.DataFrame
        Must contain 'arti_score', 'rnn_arti_score', 'date_extracted' and 'title'.
    method_selection : str or None
        'Lexicon' (or None) selects 'arti_score'; anything else selects
        'rnn_arti_score' (the transformer score).
    range_selection : str or None
        '1 day' (or None) restricts to rows extracted yesterday; '3 days'
        uses every row in the frame.

    Returns
    -------
    tuple
        (mean chosen score, article count); (0, 0) for an unknown range.
    """
    overall_sentiment = 0
    news_count = 0
    # Keep the original side effect: the chosen score column is stored on the frame.
    news['chosen_score'] = np.where((method_selection == 'Lexicon') | (method_selection is None),
                                    news['arti_score'], news['rnn_arti_score'])
    if range_selection == '1 day' or range_selection is None:
        # Compute yesterday's stamp and filter once (was recomputed per statistic).
        yesterday = (datetime.today() - timedelta(days=1)).strftime('%Y-%m-%d')
        day_news = news[news.date_extracted == yesterday]
        overall_sentiment = day_news.chosen_score.mean()
        news_count = day_news.title.count()
    elif range_selection == '3 days':
        overall_sentiment = news.chosen_score.mean()
        news_count = news.title.count()
    return overall_sentiment, news_count
def news_table(news, date_selection, method_selection):
    """Build the report table for one extraction date and one scoring method.

    Parameters
    ----------
    news : pandas.DataFrame
        Scraped-news frame with title/url/clean_date/date_extracted plus the
        lexicon ('arti_score', 'pos_sent', 'neg_sent') and transformer
        ('rnn_arti_score', 'rnn_pos_sent', 'rnn_neg_sent') columns.
    date_selection : str or None
        'Yesterday' / '2 Days Ago' / '3 Days Ago'; unknown values fall back
        to 'Yesterday' (previously they raised NameError on date_selected).
    method_selection : str or None
        'Transformer' selects the RNN columns; anything else (including None)
        selects the lexicon columns (previously an unexpected value raised
        NameError on clean_news).

    Returns
    -------
    pandas.DataFrame
        Display-ready table, newest first, long text columns wrapped.
    """
    # Map the UI label to a day offset; default to yesterday for safety.
    day_offsets = {'Yesterday': 1, '2 Days Ago': 2, '3 Days Ago': 3}
    offset = day_offsets.get(date_selection, 1)
    date_selected = (datetime.today() - timedelta(days=offset)).strftime('%Y-%m-%d')
    if method_selection == 'Transformer':
        cols = ['title', 'rnn_arti_score', 'rnn_pos_sent', 'rnn_neg_sent', 'clean_date', 'url']
        renames = {'title': 'Title', 'url': 'URL', 'clean_date': 'Date',
                   'rnn_arti_score': 'Sentiment Score',
                   'rnn_pos_sent': 'Most Positive Sentence',
                   'rnn_neg_sent': 'Least Positive Sentence'}
    else:
        cols = ['title', 'arti_score', 'pos_sent', 'neg_sent', 'clean_date', 'url']
        renames = {'title': 'Title', 'url': 'URL', 'clean_date': 'Date',
                   'arti_score': 'Sentiment Score',
                   'pos_sent': 'Most Positive Sentence',
                   'neg_sent': 'Least Positive Sentence'}
    clean_news = news.loc[news.date_extracted == date_selected, cols].rename(columns=renames)
    # Formatting for table display: newest first, long text wrapped for readability.
    clean_news = clean_news.sort_values('Date', ascending=False).reset_index(drop=True)
    clean_news.loc[:, 'Title'] = clean_news['Title'].str.wrap(width=40)
    for wide_col in ('URL', 'Most Positive Sentence', 'Least Positive Sentence'):
        clean_news.loc[:, wide_col] = clean_news[wide_col].str.wrap(width=65)
    return clean_news
def bbc_json_extract(bbc_script):
    """Extract article text from a BBC page's embedded JSON payload.

    bbc_script is the raw JSON string found in the page's <script> tag.
    Paragraph blocks are concatenated in order, each followed by a single
    trailing space; non-text contents and non-paragraph blocks are skipped.
    """
    payload = json.loads(bbc_script)
    page = payload['props']['pageProps']['page']
    # The article sits under the page's first key (route-specific name).
    article = page[list(page)[0]]
    paragraphs = []
    for content in article['contents']:
        if content['type'] != 'text':
            continue
        for text_block in content['model']['blocks']:
            if text_block['type'] == 'paragraph':
                paragraphs.append(text_block['model']['text'] + ' ')
    return ''.join(paragraphs)
def link_extract(link):
    """Fetch a news article URL and return its extracted body text.

    Supported hosts: BBC, CNN, CNBC — each has its own extraction rule.
    Scheme-less links get 'https://' prepended. Returns '' for None/empty
    input or unsupported hosts.

    BUGFIX: the original body read the module-level global `newslink`
    instead of the `link` parameter, so the function only worked when
    called from the Streamlit chat handler; it now uses its own argument.
    """
    extracted_content = ''
    if link is not None and link != '':
        if 'https://' in link or 'http://' in link:
            clean_link = link
        else:
            clean_link = 'https://' + link
        html = requests.get(clean_link).content
        sel = Selector(text = html)
        if 'www.bbc.com' in clean_link:
            # BBC ships the article as JSON inside the first <script> tag.
            raw_content = sel.xpath('//body//script//text()').extract()[0]
            extracted_content = bbc_json_extract(raw_content)
        elif 'edition.cnn.com' in clean_link:
            # CNN: paragraph tags, excluding footer disclaimer/copyright.
            extracted_content = ''.join(sel.xpath('//p[contains(@class, "paragraph") and not(@class="footer__disclaimer-text") and not(@class="footer__copyright-text")]//text()').getall()).strip()
        elif 'www.cnbc.com' in clean_link:
            # CNBC: all paragraphs inside the article body container.
            extracted_content = ''.join(sel.xpath('//div[@class="ArticleBody-articleBody"]//p//text()').getall()).strip()
    return extracted_content
def sentence_breakdown(string):
    """Split scraped article text into a list of clean sentences.

    Whitespace is normalized with cleantext, literal and escaped control
    sequences are stripped, and a lightweight spaCy pipeline (rule-based
    sentencizer only) performs sentence boundary detection.

    BUGFIX: an empty input now returns [] — the original returned "" so the
    return type was inconsistent (str vs list). Iteration, len() and
    truthiness behave identically, so callers are unaffected.
    """
    if string == "":
        return []
    clean_string = cleantext.clean(string, extra_spaces = True)
    # Strip both escaped (r'\n') and literal ('\n') control sequences —
    # scraped text can contain either form.
    for ch in [r'\r', r'\n', r'\t', r'\xa0', '\r', '\n', '\t', '\xa0']:
        if ch in clean_string:
            clean_string = clean_string.replace(ch, '')
    nlp = English()
    nlp.add_pipe('sentencizer')
    doc = nlp(clean_string)
    return [sent.text.strip() for sent in doc.sents]
def rnn_sentence_sentiment(sent):
    """Score one sentence with the helpdesk DistilBERT sentiment classifier.

    Returns the model confidence signed by label: positive score for
    'Positive', negative for 'Negative', and 0 for 'Neutral' — and,
    defensively, 0 for any unexpected label (the original left `res`
    unbound in that case, raising NameError).

    PERF: the tokenizer, model weights and pipeline are built once and
    cached as a function attribute. The original re-downloaded/re-loaded
    the model on EVERY call, which dominated runtime when scoring an
    article sentence by sentence.
    """
    from transformers import TextClassificationPipeline, AutoTokenizer, AutoModelForSequenceClassification
    pipe = getattr(rnn_sentence_sentiment, '_pipe', None)
    if pipe is None:
        tokenizer = AutoTokenizer.from_pretrained("Venkatesh4342/distilbert-helpdesk-sentence-sentiment")
        model = AutoModelForSequenceClassification.from_pretrained("Venkatesh4342/distilbert-helpdesk-sentence-sentiment")
        pipe = TextClassificationPipeline(model=model, tokenizer=tokenizer)
        rnn_sentence_sentiment._pipe = pipe
    prediction = pipe(sent, top_k=1)
    label = prediction[0]['label']
    if label == 'Positive':
        return prediction[0]['score']
    if label == 'Negative':
        return -prediction[0]['score']
    return 0
def article_sentiment(arti):
    """Aggregate per-sentence sentiment into an article-level summary.

    arti is a list of sentence strings. Returns a 3-tuple:
    (mean of the non-zero sentence scores rounded to 3 decimals,
     the three most positive sentences, the three most negative).
    """
    scores = {sentence: rnn_sentence_sentiment(sentence) for sentence in arti}
    # Neutral (zero) sentences are excluded from the article average.
    nonzero_scores = [value for value in scores.values() if value != 0]
    arti_score = np.mean(nonzero_scores)
    pos_sents = sorted(scores, key=scores.get, reverse=True)[:3]
    neg_sents = sorted(scores, key=scores.get, reverse=False)[:3]
    return round(arti_score, 3), pos_sents, neg_sents
st.markdown(md_intro)
# Two-tab layout: interactive prediction and the daily news report.
tab_pred, tab_news = st.tabs(["Sentiment Prediction", "News Report"])
with tab_pred:
    st.write("This is a sentiment prediction module.\nPlease enter your news link into the textbox.\nSentiment prediction will be returned shortly!.\nExample link: https://www.bbc.com/news/technology-68818113")
    # chat_input returns None until the user submits a link.
    newslink = st.chat_input(placeholder="Please input CNN/BBC/CNBC link")
    if newslink:
        # Show a temporary status line while scraping and scoring run.
        placeholder = st.empty()
        placeholder.text("Running ...")
        extracted_content = link_extract(newslink)
        cleaned_content = sentence_breakdown(extracted_content)
        arti_score, user_pos_sents, user_neg_sents = article_sentiment(cleaned_content)
        placeholder.empty()
    # Render results only after a link has been processed.
    if newslink:
        st.markdown(f'### Article sentiment score is: {arti_score}')
        st.markdown("### Three most positive sentences are: ")
        st.markdown(f"{user_pos_sents[0]}")
        st.markdown('''
        ''')
        st.markdown(f'''{user_pos_sents[1]}
        ''')
        st.markdown(f'''{user_pos_sents[2]}
        ''')
        st.markdown("### Three most negative sentences are: ")
        st.markdown(f'''{user_neg_sents[0]}
        ''')
        st.markdown(f'''{user_neg_sents[1]}
        ''')
        st.markdown(f"{user_neg_sents[2]}")
with tab_news:
    st.markdown(md_sumstats)
    # Side-by-side selectors drive the summary metrics.
    method_col, range_col = st.columns(2)
    with method_col:
        method_selection = st.selectbox("Select Method", ('Lexicon', 'Transformer'))
    with range_col:
        range_selection = st.selectbox("Statistics Range", ('1 day', '3 days'))
    overall_sentiment, news_count = news_stats(news, method_selection, range_selection)
    senti_col, count_col = st.columns(2)
    senti_col.metric("Overall Sentiment", str(round(overall_sentiment, 3)))
    count_col.metric("Number of News", str(news_count))
    st.markdown(md_table)
    date_selection = st.selectbox("Extraction Date", ('Yesterday', '2 Days Ago', '3 Days Ago'))
    clean_news = news_table(news, date_selection, method_selection)
    # Wider columns for the wrapped text; the URL column renders as links.
    st.dataframe(data=clean_news,
                 column_config={"Title": st.column_config.Column(width=250),
                                "Most Positive Sentence": st.column_config.Column(width=400),
                                "Least Positive Sentence": st.column_config.Column(width=400),
                                "Date": st.column_config.DateColumn(format="DD-MM-YYYY"),
                                "URL": st.column_config.LinkColumn("App URL", width=400)
                                })
    # BUGFIX: removed the stray trailing ' |' scrape artifact that followed
    # this call — it was a syntax error in the original file.
    st.markdown(md_notes)