File size: 10,935 Bytes
2efb8ad
 
 
ba5a9c0
2efb8ad
 
 
 
 
 
 
 
 
 
 
 
 
 
6e8b3c0
 
 
 
 
77dc9c6
2efb8ad
2c25125
 
 
2efb8ad
 
 
 
 
 
 
 
 
 
 
6e8b3c0
 
 
 
 
 
 
 
 
 
 
 
 
2efb8ad
77dc9c6
2efb8ad
 
 
 
 
 
 
 
 
 
 
 
 
 
2c25125
 
2efb8ad
 
 
 
 
 
 
2c25125
 
2efb8ad
2c25125
 
 
2efb8ad
 
 
 
 
 
 
 
 
8330215
2efb8ad
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a0f0687
2efb8ad
8d02c7b
2efb8ad
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8d02c7b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
# Load libraries
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import cleantext 
import re
import ast
import streamlit as st
import spacy
from spacy.lang.en import English 
from dotenv import load_dotenv
from subprocess import Popen
import scrapy
from scrapy import Selector
import json
import requests


# Static Markdown blocks rendered by the Streamlit page further down this file.

# Page introduction; rendered first, before the tabs are created.
md_intro = '''# Business News Sentiment Dashboard
The dashboard has 2 tabs:
- News sentiment report: reports the sentiment of business news from past few days
- Sentiment prediction: receives a news link and outputs sentiment results

Main libraries used: scrapy, SpaCy, PyTorch, transformers, streamlit

News scope: CNN, BBC, CNBC (other business news sources don't have free access)

Time scope: up to 3 days (from yesterday to 3 days ago), based on free tier's available data source
'''
# Heading shown at the top of the news report tab, above the summary metrics.
md_sumstats = '''## News Sentiment Summary
'''
# Heading shown in the news report tab, above the per-article table.
md_table = '''## News Sentiment Report
'''
# Closing notes and reference links; rendered last in the news report tab.
md_notes = '''## Notes and Thoughts: 
Lexicon-based approach may confuse named entities and actual sentiment, because brand names may have positive words, say BeautifulSoup. 

Hence, implementing named entity recognition before sentiment analysis helps to improve accuracy.

Using RNN-based approach can also overcome lexicon issues, but it also takes more resources.

## References:
https://edition.cnn.com/business

https://www.bbc.com/business 

https://www.cnbc.com/business/ 

https://huggingface.co/mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis

https://huggingface.co/Venkatesh4342/distilbert-helpdesk-sentence-sentiment 

https://kennethenevoldsen.github.io/asent/introduction.html 
'''
# Load the merged news file whose name carries yesterday's date.
yesterday_stamp = (datetime.today() - timedelta(days=1)).strftime('%Y-%m-%d')
dat_name = f'./news_db/merged_news_data_{yesterday_stamp}.csv'
news = pd.read_csv(dat_name, on_bad_lines='skip')

# Blank out missing lexicon sentences first, then parse the stored content.
sentence_columns = ['pos_sent', 'neg_sent']
news[sentence_columns] = news[sentence_columns].fillna('')
# clean_content holds Python-literal text; evaluate it back into an object.
news['clean_content'] = news['clean_content'].apply(ast.literal_eval)
# Any value still missing anywhere in the frame becomes an empty string.
news = news.fillna(value='')
news['clean_date'] = pd.to_datetime(news['clean_date'], dayfirst=True)


# Calculate summary
def news_stats(news, method_selection, range_selection):
    """Return the mean sentiment score and the article count for a time range.

    Args:
        news: DataFrame with ``arti_score``, ``rnn_arti_score``,
            ``date_extracted`` and ``title`` columns. A ``chosen_score``
            column is written onto this frame as a side effect.
        method_selection: ``'Lexicon'`` or ``None`` selects ``arti_score``;
            any other value selects ``rnn_arti_score``.
        range_selection: ``'1 day'`` or ``None`` restricts the statistics to
            rows extracted yesterday; ``'3 days'`` uses every row.

    Returns:
        ``(overall_sentiment, news_count)``. Both are ``0`` when
        ``range_selection`` is not one of the recognised values.
    """
    use_lexicon = method_selection == 'Lexicon' or method_selection is None
    news['chosen_score'] = np.where(use_lexicon, news['arti_score'], news['rnn_arti_score'])

    if range_selection == '3 days':
        return news.chosen_score.mean(), news.title.count()

    if range_selection == '1 day' or range_selection is None:
        yesterday = (datetime.today() - timedelta(days=1)).strftime('%Y-%m-%d')
        latest = news[news.date_extracted == yesterday]
        return latest.chosen_score.mean(), latest.title.count()

    # Unrecognised range: fall back to the neutral defaults.
    return 0, 0
    

def news_table(news, date_selection, method_selection):
    """Build the display table of articles extracted on one selected day.

    Args:
        news: DataFrame with ``date_extracted``, ``title``, ``clean_date``,
            ``url`` and the score/sentence columns of the chosen method
            (``arti_score``/``pos_sent``/``neg_sent`` for the lexicon method,
            the same names prefixed with ``rnn_`` for the transformer method).
        date_selection: ``'Yesterday'``, ``'2 Days Ago'`` or ``'3 Days Ago'``;
            ``None`` means yesterday.
        method_selection: ``'Lexicon'`` or ``'Transformer'``; ``None`` means
            lexicon.

    Returns:
        A new DataFrame with display column names, sorted by ``Date``
        descending, with the text columns wrapped for table display.

    Raises:
        ValueError: if either selection is not one of the values listed
            above (previously this surfaced as an ``UnboundLocalError``).
    """
    day_offsets = {None: 1, 'Yesterday': 1, '2 Days Ago': 2, '3 Days Ago': 3}
    column_prefixes = {None: '', 'Lexicon': '', 'Transformer': 'rnn_'}

    if date_selection not in day_offsets:
        raise ValueError(f'Unknown date selection: {date_selection!r}')
    if method_selection not in column_prefixes:
        raise ValueError(f'Unknown method selection: {method_selection!r}')

    date_selected = (datetime.today() - timedelta(days=day_offsets[date_selection])).strftime('%Y-%m-%d')
    prefix = column_prefixes[method_selection]

    # Source column -> display name; the dict order is the table column order.
    display_names = {
        'title': 'Title',
        f'{prefix}arti_score': 'Sentiment Score',
        f'{prefix}pos_sent': 'Most Positive Sentence',
        f'{prefix}neg_sent': 'Least Positive Sentence',
        'clean_date': 'Date',
        'url': 'URL',
    }
    clean_news = news.loc[news.date_extracted == date_selected, list(display_names)]
    clean_news = clean_news.rename(columns=display_names)

    # Formatting for table display
    clean_news = clean_news.sort_values('Date', ascending=False).reset_index(drop=True)
    wrap_widths = {
        'Title': 40,
        'URL': 65,
        'Most Positive Sentence': 65,
        'Least Positive Sentence': 65,
    }
    for column, width in wrap_widths.items():
        clean_news[column] = clean_news[column].str.wrap(width=width)

    return clean_news


def bbc_json_extract(bbc_script):
    """Pull the paragraph text out of a BBC page's embedded JSON document.

    Args:
        bbc_script: JSON text holding the article under
            ``props -> pageProps -> page -> <first key> -> contents``.

    Returns:
        Every paragraph of every ``text`` content item, in document order,
        each followed by a single space (so a non-empty result ends with a
        space). Empty string when there are no paragraphs.
    """
    page = json.loads(bbc_script)['props']['pageProps']['page']
    # The article entry is whatever key comes first in the page mapping.
    article = page[list(page.keys())[0]]
    paragraphs = [
        block['model']['text'] + ' '
        for item in article['contents']
        if item['type'] == 'text'
        for block in item['model']['blocks']
        if block['type'] == 'paragraph'
    ]
    return ''.join(paragraphs)


def link_extract(link):
    """Download a news article and return its body text.

    Args:
        link: Article URL, with or without a scheme. ``https://`` is
            prepended when the link does not start with ``http://`` or
            ``https://``.

    Returns:
        The extracted article text for BBC (``www.bbc.com``), CNN
        (``edition.cnn.com``) and CNBC (``www.cnbc.com``) links. Empty string
        when ``link`` is empty/``None`` or the site is not one of those three.

    Raises:
        requests.RequestException: if the page cannot be fetched, including
            when the request times out.
    """
    # Use the argument itself; the previous body silently read the global
    # ``newslink`` and ignored whatever the caller passed in.
    if not link:
        return ''

    # Prefix test rather than substring test, so a URL that merely contains
    # "http://" somewhere in its query string still gets a scheme added.
    if link.startswith(('https://', 'http://')):
        clean_link = link
    else:
        clean_link = 'https://' + link

    # A timeout keeps an unresponsive server from hanging the app forever.
    html = requests.get(clean_link, timeout=30).content
    sel = Selector(text=html)

    extracted_content = ''
    if 'www.bbc.com' in clean_link:
        # The first script text node under <body> is parsed as the page JSON.
        raw_content = sel.xpath('//body//script//text()').extract()[0]
        extracted_content = bbc_json_extract(raw_content)
    elif 'edition.cnn.com' in clean_link:
        extracted_content = ''.join(sel.xpath('//p[contains(@class, "paragraph") and not(@class="footer__disclaimer-text") and not(@class="footer__copyright-text")]//text()').getall()).strip()
    elif 'www.cnbc.com' in clean_link:
        extracted_content = ''.join(sel.xpath('//div[@class="ArticleBody-articleBody"]//p//text()').getall()).strip()
    return extracted_content


def sentence_breakdown(string):
    """Split scraped article text into a list of stripped sentences.

    Args:
        string: Raw article text.

    Returns:
        A list of sentence strings, or the empty string ``""`` (not a list)
        when ``string`` is ``""``.
    """
    if string == "":
        return ""

    cleaned = cleantext.clean(string, extra_spaces=True)
    # Drop both the escaped spellings (backslash + letter) and the real
    # control / non-breaking-space characters.
    for marker in (r'\r', r'\n', r'\t', r'\xa0', '\r', '\n', '\t', '\xa0'):
        cleaned = cleaned.replace(marker, '')

    # Blank English pipeline with only the rule-based sentence splitter added.
    segmenter = English()
    segmenter.add_pipe('sentencizer')
    return [span.text.strip() for span in segmenter(cleaned).sents]


_SENTIMENT_MODEL_NAME = "Venkatesh4342/distilbert-helpdesk-sentence-sentiment"
# Built on first use by _get_sentiment_pipeline() and reused afterwards.
_sentiment_pipeline = None


def _get_sentiment_pipeline():
    """Return the shared sentence classifier, loading it on first use."""
    global _sentiment_pipeline
    if _sentiment_pipeline is None:
        # Imported lazily so the dependency is only loaded when it is needed.
        from transformers import TextClassificationPipeline, AutoTokenizer, AutoModelForSequenceClassification

        tokenizer = AutoTokenizer.from_pretrained(_SENTIMENT_MODEL_NAME)
        model = AutoModelForSequenceClassification.from_pretrained(_SENTIMENT_MODEL_NAME)
        _sentiment_pipeline = TextClassificationPipeline(model=model, tokenizer=tokenizer)
    return _sentiment_pipeline


def rnn_sentence_sentiment(sent, pipe=None):
    """Score one sentence with the transformer sentiment classifier.

    Args:
        sent: The sentence text to classify.
        pipe: Optional classifier callable, invoked as ``pipe(sent, top_k=1)``
            and expected to return a list whose first element is a dict with
            ``'label'`` and ``'score'`` keys. When omitted, the shared
            pipeline is used; it is loaded once instead of being rebuilt
            for every sentence.

    Returns:
        The prediction score for a ``'Positive'`` label, the negated score
        for ``'Negative'``, and ``0`` for ``'Neutral'``.

    Raises:
        ValueError: if the classifier returns any other label (previously
            this surfaced as an ``UnboundLocalError``).
    """
    if pipe is None:
        pipe = _get_sentiment_pipeline()
    top = pipe(sent, top_k=1)[0]
    label, confidence = top['label'], top['score']

    # Signed score: positive sentiment > 0, negative < 0, neutral == 0.
    if label == 'Positive':
        return confidence
    if label == 'Negative':
        return -confidence
    if label == 'Neutral':
        return 0
    raise ValueError(f'Unexpected sentiment label: {label!r}')


def article_sentiment(arti):
    """Score an article from the sentiment of its individual sentences.

    Args:
        arti: Iterable of sentence strings. Repeated sentences are scored
            and ranked once, because results are keyed by sentence text.

    Returns:
        ``(arti_score, pos_sents, neg_sents)``: the mean of the non-zero
        sentence scores rounded to 3 decimals, the (up to) three
        highest-scoring sentences, and the (up to) three lowest-scoring
        sentences. ``arti_score`` is ``0.0`` when there are no sentences or
        every sentence scored ``0``; the previous code took the mean of an
        empty list there, which yields NaN and a RuntimeWarning.
    """
    scores = {sent: rnn_sentence_sentiment(sent) for sent in arti}

    # Zero (neutral) scores are left out so they do not dilute the average.
    non_neutral = [score for score in scores.values() if score != 0]
    arti_score = round(np.mean(non_neutral), 3) if non_neutral else 0.0

    pos_sents = sorted(scores, key=scores.get, reverse=True)[:3]
    neg_sents = sorted(scores, key=scores.get)[:3]
    return arti_score, pos_sents, neg_sents


# Page header followed by the two tabs of the dashboard.
st.markdown(md_intro)
tab_pred, tab_news = st.tabs(["Sentiment Prediction", "News Report"])
with tab_pred:
    st.write("This is a sentiment prediction module.\nPlease enter your news link into the textbox.\nSentiment prediction will be returned shortly!.\nExample link: https://www.bbc.com/news/technology-68818113")
    newslink = st.chat_input(placeholder="Please input CNN/BBC/CNBC link")
    if newslink:
        # Temporary status line shown while the article is fetched and scored.
        placeholder = st.empty()
        placeholder.text("Running ...")
        try:
            extracted_content = link_extract(newslink)
            cleaned_content = sentence_breakdown(extracted_content)
            # Only score when some text came back; an unsupported or empty
            # page yields nothing to rank.
            if cleaned_content:
                arti_score, user_pos_sents, user_neg_sents = article_sentiment(cleaned_content)
        finally:
            # Clear the status line even when fetching or scoring fails.
            placeholder.empty()

        if cleaned_content:
            st.markdown(f'### Article sentiment score is: {arti_score}')
            # Render however many sentences are available (at most three);
            # fixed [0]/[1]/[2] indexing raised IndexError on short articles.
            st.markdown("### Three most positive sentences are: ")
            for sentence in user_pos_sents:
                st.markdown(f"{sentence}")
            st.markdown("### Three most negative sentences are: ")
            for sentence in user_neg_sents:
                st.markdown(f"{sentence}")
        else:
            st.warning("No article text could be extracted from this link. Please check that it is a CNN, BBC or CNBC article.")


with tab_news:
    st.markdown(md_sumstats)

    # Summary controls: scoring method on the left, statistics range on the right.
    method_col, range_col = st.columns(2)
    method_selection = method_col.selectbox("Select Method", ('Lexicon', 'Transformer'))
    range_selection = range_col.selectbox("Statistics Range", ('1 day', '3 days'))

    # Headline metrics for the chosen method and range.
    overall_sentiment, news_count = news_stats(news, method_selection, range_selection)
    senti_col, count_col = st.columns(2)
    with senti_col:
        st.metric("Overall Sentiment", str(round(overall_sentiment, 3)))
    with count_col:
        st.metric("Number of News", str(news_count))

    # Per-article table for a single extraction date.
    st.markdown(md_table)
    date_selection = st.selectbox("Extraction Date", ('Yesterday', '2 Days Ago', '3 Days Ago'))
    clean_news = news_table(news, date_selection, method_selection)
    table_columns = {
        "Title": st.column_config.Column(width=250),
        "Most Positive Sentence": st.column_config.Column(width=400),
        "Least Positive Sentence": st.column_config.Column(width=400),
        "Date": st.column_config.DateColumn(format="DD-MM-YYYY"),
        "URL": st.column_config.LinkColumn("App URL", width=400),
    }
    st.dataframe(data=clean_news, column_config=table_columns)

    st.markdown(md_notes)