File size: 11,500 Bytes
004fea0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5e803c8
004fea0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
# Import baseline dependencies
import time
from datetime import date

import pandas as pd
import pandas_datareader as data
import requests
import streamlit as st
from bs4 import BeautifulSoup
from plotly import graph_objs as go
from prophet import Prophet
from prophet.plot import plot_plotly
# summarisation (Pegasus) and sentiment analysis (BERT) models
from transformers import (BertForSequenceClassification, BertTokenizer,
                          PegasusTokenizer, TFPegasusForConditionalGeneration,
                          pipeline)

# Use Streamlit's wide page layout for the whole app.
# NOTE(review): Streamlit's documentation describes set_page_config as having
# to run before other Streamlit commands, so keep this above any other st.*
# call in the script - confirm against the installed Streamlit version.
st.set_page_config(layout='wide')


@st.cache(allow_output_mutation=True, show_spinner=False)
def get_summarisation_model():
    """Load the financial Pegasus summarisation model and its tokenizer.

    Both objects come from the same Hugging Face checkpoint name. The
    function is wrapped in ``st.cache`` with ``allow_output_mutation=True``.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    checkpoint = "human-centered-summarization/financial-summarization-pegasus"

    # The tokenizer is loaded first, then the TensorFlow Pegasus model
    tokenizer = PegasusTokenizer.from_pretrained(checkpoint)
    model = TFPegasusForConditionalGeneration.from_pretrained(checkpoint)

    return model, tokenizer


@st.cache(allow_output_mutation=True, show_spinner=False)
def get_sentiment_pepeline():
    """Build the FinancialBERT sentiment-analysis pipeline.

    The (misspelt) function name is kept as is because the script entry
    point calls it under this name.

    Returns:
        The object produced by ``pipeline("sentiment-analysis", ...)`` using
        the FinancialBERT classifier (3 labels) and its tokenizer.
    """
    checkpoint = "ahmedrachid/FinancialBERT-Sentiment-Analysis"

    tokenizer = BertTokenizer.from_pretrained(checkpoint)
    classifier = BertForSequenceClassification.from_pretrained(
        checkpoint, num_labels=3)

    return pipeline("sentiment-analysis",
                    model=classifier, tokenizer=tokenizer)


@st.cache(show_spinner=False, suppress_st_warning=True)
def search_urls(ticker, num, date):
    """Search Google News for Yahoo Finance articles about *ticker*.

    Args:
        ticker: Text sent as the ``as_epq`` (exact phrase) and ``as_occt``
            query parameters.
        num: Number of search results requested per page.
        date: Timeframe label. "Past week" and "Past day" restrict the
            search; any other value leaves the timeframe unrestricted.

    Returns:
        List of ``href`` values taken from the result anchors (``<a>`` tags
        with CSS class ``WlydOe``). Empty when the page contains none.

    Raises:
        requests.RequestException: If the request fails or no response
            arrives within the timeout. Letting this propagate means the
            failure is not stored by ``st.cache`` as an empty result.
    """
    # Query parameter reference:
    # https://developers.google.com/custom-search/docs/xml_results_appendices#interfaceLanguages

    # Timeframe label -> "as_qdr" value
    # (d = past 24h, h = past hour, w = past week, m = past month)
    timeframes = {"Past week": "w", "Past day": "d"}

    # Request headers and parameters
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36",
    }

    params = {
        "as_sitesearch": "finance.yahoo.com",  # we only want results from Yahoo Finance
        "hl": "en",  # language of the interface
        "gl": "us",  # country of the search
        "tbm": "nws",  # news results
        "lr": "lang_en",  # language filter
        "as_epq": ticker,  # search query
        # NOTE(review): as_occt is sent the ticker itself; confirm this is
        # the intended value for that parameter.
        "as_occt": ticker,
        "num": num,  # number of search results per page
    }
    if date in timeframes:
        params["as_qdr"] = timeframes[date]

    # base URL
    url = "https://www.google.com/search"

    # A timeout keeps the app from hanging forever on an unresponsive server
    r = requests.get(url, headers=headers, params=params,
                     cookies={'CONSENT': 'YES+'}, timeout=30)
    # Pause after the request (presumably to avoid hammering Google)
    time.sleep(5)
    st.write("Searched URL:")
    st.write(r.url)  # debugging
    soup = BeautifulSoup(r.text, "html.parser")
    atags = soup.find_all("a", "WlydOe")

    # Skip anchors without an href instead of failing with KeyError
    hrefs = [link["href"] for link in atags if link.has_attr("href")]

    return hrefs


@st.cache(show_spinner=False)
def search_scrape(urls, max_words=350):
    """Download every URL and extract title, posting date and article text.

    Args:
        urls: Iterable of article URLs to fetch.
        max_words: Maximum number of space-separated words kept per article.
            The text is cut short to avoid input-length problems with the
            summarisation model.

    Returns:
        A ``(titles, post_dates, articles)`` tuple of lists. Each list gets
        exactly one entry per URL, in input order. A missing title or date
        is reported as "N/A".

    Raises:
        requests.RequestException: If a request fails or no response
            arrives within the timeout.
    """
    articles = []
    titles = []
    post_dates = []

    for url in urls:
        # A timeout keeps one unresponsive page from blocking the whole run
        r = requests.get(url, timeout=30)
        # Pause between requests (presumably to stay polite to the server)
        time.sleep(5)
        soup = BeautifulSoup(r.text, "html.parser")

        # title, with a placeholder when it is missing
        title = soup.find("header", "caas-title-wrapper")
        titles.append(title.text if title is not None else "N/A")

        # posting date of the article, with a placeholder when it is missing
        posted = soup.find("time", "caas-attr-meta-time")
        post_dates.append(posted.text if posted is not None else "N/A")

        # article content: text of every "caas-body" container on the page
        bodies = soup.find_all("div", "caas-body")
        text = " ".join(body.text for body in bodies)

        # keep only the first max_words words
        articles.append(" ".join(text.split(" ")[:max_words]))

    return titles, post_dates, articles


@st.cache(show_spinner=False)
def summarise_articles(sum_model, sum_tokenizer, articles):
    """Summarise each article with the fine-tuned Pegasus model.

    Args:
        sum_model: Model exposing ``generate``.
        sum_tokenizer: Tokenizer matching *sum_model*; it is called on the
            text and its ``decode`` method turns the output back into text.
        articles: Iterable of article texts.

    Returns:
        List with one summary string per article, in input order.
    """
    summaries = []
    for article in articles:

        # source
        # https://huggingface.co/human-centered-summarization/financial-summarization-pegasus
        input_ids = sum_tokenizer(
            article, return_tensors="tf").input_ids
        # "num_beams" (not "num_beans") is the keyword that turns on beam search
        output = sum_model.generate(
            input_ids, max_length=55, num_beams=5, early_stopping=True)
        summary = sum_tokenizer.decode(
            output[0], skip_special_tokens=True)
        summaries.append(summary)

    return summaries


@st.cache(show_spinner=False)
def create_output_array(titles, post_dates, summarised_articles, sentiment_scores, raw_urls):
    """Combine the per-article results into one row per article.

    Args:
        titles: Article titles.
        post_dates: Article posting dates.
        summarised_articles: Article summaries; its length decides how many
            rows are produced.
        sentiment_scores: Per-article mappings with "label" and "score" keys.
        raw_urls: Article URLs.

    Returns:
        List of rows ``[title, date, summary, label, confidence, url]``.
        The label is capitalised and the score is formatted as a whole
        percentage.
    """
    return [
        [
            titles[pos],
            post_dates[pos],
            summary,
            sentiment_scores[pos]["label"].capitalize(),
            f"{sentiment_scores[pos]['score']:.0%}",
            raw_urls[pos],
        ]
        for pos, summary in enumerate(summarised_articles)
    ]


@st.cache(show_spinner=False)
def convert_df(df):
    """Serialise *df* to CSV text and return it encoded as UTF-8 bytes."""
    csv_text = df.to_csv()
    return csv_text.encode("utf-8")

# ------------------------------------------------------------------------------


@st.cache(show_spinner=False)
def load_data(ticker, start, end):
    """Fetch price data for *ticker* from the "yahoo" data source.

    Args:
        ticker: Symbol to download.
        start: Start of the requested range.
        end: End of the requested range.

    Returns:
        The downloaded DataFrame with its index moved into a regular column.
    """
    prices = data.DataReader(ticker, "yahoo", start, end)
    # Turn the index into an ordinary column
    # (presumably the "Date" column that callers read)
    return prices.reset_index()


@st.cache(show_spinner=False)
def predict(df, period):
    """Fit a Prophet model on the closing prices and forecast ahead.

    Args:
        df: DataFrame with "Date" and "Close" columns.
        period: Number of future periods to forecast, passed to
            ``make_future_dataframe`` (the caller passes years * 365).

    Returns:
        A ``(model, forecast)`` tuple: the fitted Prophet model and the
        DataFrame returned by its ``predict`` call.
    """
    # Prophet expects the columns to be called "ds" and "y"
    training = df[["Date", "Close"]].rename(
        columns={"Date": "ds", "Close": "y"})

    forecaster = Prophet()
    forecaster.fit(training)

    horizon = forecaster.make_future_dataframe(periods=period)
    return forecaster, forecaster.predict(horizon)


def main_page():
    """Render the "Financial News Analysis" page.

    Collects a ticker, an article count and a timeframe. When the search
    button is pressed it scrapes the articles, summarises them, scores their
    sentiment, shows the result as a table and offers it as a CSV download.

    Relies on the module-level ``sum_model``, ``sum_tokenizer`` and
    ``sentiment_pipeline`` objects created by the script entry point.
    """
    # Sidebar description
    st.sidebar.markdown("## Financial News Analysis")
    st.sidebar.write(
        "Scrape, auto summarise and calculate sentiment for stock and crypto news.")

    # User input widgets
    ticker = st.text_input("Ticker:", "TSLA")
    article_count = st.number_input("Number of articles:", 5, 15, 10)
    timeframe = st.selectbox(
        "Timeline:", ["Past week", "Past day"])

    search_clicked = st.button("Search")

    st.info("Please do not spam the search button")
    st.markdown("---")

    # Nothing else to do until the button is pressed
    if not search_clicked:
        return

    with st.spinner("Processing articles, please wait..."):
        # Run the search and collect the links of all articles
        raw_urls = search_urls(ticker, article_count, timeframe)

        # No links at all (e.g., blocked by Google's server): report and stop
        if not raw_urls:
            st.error("Please wait a few minutes before trying again")
            return

        # Scrape title, posting date and article content from every URL
        titles, post_dates, articles = search_scrape(raw_urls)

        # Summarise all articles
        summarised_articles = summarise_articles(
            sum_model, sum_tokenizer, articles)

        # Score the sentiment of every summary
        # source
        # https://huggingface.co/ahmedrachid/FinancialBERT-Sentiment-Analysis
        sentiment_scores = sentiment_pipeline(summarised_articles)

        # Assemble the rows and wrap them in a dataframe
        rows = create_output_array(
            titles, post_dates, summarised_articles, sentiment_scores, raw_urls)
        table = pd.DataFrame(
            rows,
            columns=["Title", "Date", "Summary",
                     "Label", "Confidence", "URL"])

        # Show the table
        st.dataframe(table)

        # Offer the same data as a CSV download
        csv_bytes = convert_df(table)
        st.download_button(
            "Save data to CSV", csv_bytes, "assetsummaries.csv", "text/csv", key="download-csv")


def page2():
    """Render the "Stock Trend Forecasting" page.

    Loads price data for the chosen ticker from 2010-01-01 until today, then
    shows summary statistics and a closing-price chart with 100- and 200-row
    moving averages. When the predict button is pressed it also shows the
    Prophet forecast table, forecast plot and component plots.
    """
    # Sidebar description
    st.sidebar.markdown("## Stock Trend Forecasting")
    st.sidebar.write(
        "A simple dashboard for stock trend forecasting and analysis.")

    # Date range of the downloaded data
    first_day = "2010-01-01"
    last_day = date.today().isoformat()

    # Ticker selection, then data download from Yahoo Finance
    ticker = st.text_input("Ticker:", "AAPL")
    prices = load_data(ticker, first_day, last_day)

    # Forecast horizon: whole years converted to a number of periods
    n_years = st.number_input("Years of prediction:", 1, 4, 1)
    period = n_years * 365

    # Button that starts the prediction
    predict_clicked = st.button("Predict")

    st.markdown("---")

    # The adjusted close column is not used anywhere below
    prices = prices.drop(columns=["Adj Close"])

    # Exploratory analysis
    st.subheader("Exploratory analysis")
    st.write(prices.describe())

    # Raw closing data with 100 and 200 days MA (for simple analysis)
    st.subheader("Closing data, MA100 and MA200")

    closing = prices["Close"]
    ma100 = closing.rolling(100).mean()
    ma200 = closing.rolling(200).mean()

    fig = go.Figure()
    fig.update_layout(margin={"l": 0, "r": 0, "b": 0, "t": 50, "pad": 4})
    for series, label in (
            (prices['Close'], "stock_close"),
            (ma100, "ma100"),
            (ma200, "ma200")):
        fig.add_trace(go.Scatter(x=prices["Date"], y=series, name=label))
    fig.layout.update(xaxis_rangeslider_visible=True)
    st.plotly_chart(fig, use_container_width=True)

    # Forecasting only runs once the button is pressed
    if not predict_clicked:
        return

    with st.spinner("Please wait..."):
        model, forecast = predict(prices, period)

        st.markdown("---")
        st.subheader("Forecast data")
        st.write(forecast.tail())

        st.subheader(f"Forecast plot for {n_years} years")

        forecast_fig = plot_plotly(model, forecast)
        forecast_fig.update_layout(
            margin={"l": 0, "r": 0, "b": 0, "t": 0, "pad": 4})
        st.plotly_chart(forecast_fig, use_container_width=True)

        st.subheader("Forecast components")
        components_fig = model.plot_components(forecast)
        st.write(components_fig)


if __name__ == "__main__":

    # Load both models up front. These three names are module-level globals
    # that main_page() reads, so they must keep exactly these names.
    with st.spinner("Loading all models..."):
        sum_model, sum_tokenizer = get_summarisation_model()
        sentiment_pipeline = get_sentiment_pepeline()

    # Page title shown in the selector -> function that renders the page
    pages = {
        "Financial News Analysis": main_page,
        "Stock Trend Forecasting": page2
    }

    st.sidebar.markdown("# Financial Researcher")

    # Let the user choose which page to display
    chosen_title = st.sidebar.selectbox(
        "Select a page", pages.keys())

    st.sidebar.markdown("---")

    # Render the chosen page
    render_page = pages[chosen_title]
    render_page()