r1391819 committed
Commit 004fea0
1 Parent(s): 455de29
Files changed (2):
  1. app.py +376 -0
  2. requirements.txt +13 -0
app.py ADDED
@@ -0,0 +1,376 @@
# Import baseline dependencies
import time
from datetime import date

import pandas as pd
import pandas_datareader as data
import requests
import streamlit as st
from bs4 import BeautifulSoup
from plotly import graph_objs as go
from prophet import Prophet
from prophet.plot import plot_plotly
# summarisation (Pegasus) and sentiment analysis (BERT) models
from transformers import (BertForSequenceClassification, BertTokenizer,
                          PegasusTokenizer, TFPegasusForConditionalGeneration,
                          pipeline)

# Setting streamlit page config to wide
st.set_page_config(layout='wide')


# Setup summarisation model
@st.cache(allow_output_mutation=True, show_spinner=False)
def get_summarisation_model():
    sum_model_name = "human-centered-summarization/financial-summarization-pegasus"
    sum_tokenizer = PegasusTokenizer.from_pretrained(sum_model_name)
    sum_model = TFPegasusForConditionalGeneration.from_pretrained(
        sum_model_name)

    # returning model and tokenizer
    return sum_model, sum_tokenizer
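
# Illustrative usage sketch (the input string below is hypothetical, not
# part of the original app):
#
#   model, tokenizer = get_summarisation_model()
#   ids = tokenizer("Tesla shares rose after strong quarterly earnings.",
#                   return_tensors="tf").input_ids
#   out = model.generate(ids, max_length=55, num_beams=5, early_stopping=True)
#   print(tokenizer.decode(out[0], skip_special_tokens=True))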


# Setup sentiment analysis model
@st.cache(allow_output_mutation=True, show_spinner=False)
def get_sentiment_pipeline():
    sen_model_name = "ahmedrachid/FinancialBERT-Sentiment-Analysis"
    sen_tokenizer = BertTokenizer.from_pretrained(sen_model_name)
    sen_model = BertForSequenceClassification.from_pretrained(
        sen_model_name, num_labels=3)
    sentiment_nlp = pipeline("sentiment-analysis",
                             model=sen_model, tokenizer=sen_tokenizer)

    # returning sentiment pipeline
    return sentiment_nlp
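
# For reference, the pipeline returns one dict per input, e.g.
# [{"label": "positive", "score": 0.98}] (illustrative values). This is
# the structure that create_output_array() indexes into below.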


# Get all links from Google News
@st.cache(show_spinner=False, suppress_st_warning=True)
def search_urls(ticker, num, date):

    # https://developers.google.com/custom-search/docs/xml_results_appendices#interfaceLanguages

    # Request headers and parameters
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36",
    }

    params = {
        "as_sitesearch": "finance.yahoo.com",  # we only want results from Yahoo Finance
        "hl": "en",  # language of the interface
        "gl": "us",  # country of the search
        "tbm": "nws",  # news results
        "lr": "lang_en"  # language filter
    }

    # base URL
    url = "https://www.google.com/search"

    # search query
    params["as_epq"] = ticker
    params["as_occt"] = ticker
    # number of search results per page
    params["num"] = num

    # articles timeframe
    # d = past 24h, h = past hour, w = past week, m = past month
    if date == "Past week":
        params["as_qdr"] = "w"
    elif date == "Past day":
        params["as_qdr"] = "d"

    r = requests.get(url, headers=headers, params=params,
                     cookies={'CONSENT': 'YES+'})
    time.sleep(5)
    st.write("Searched URL:")
    st.write(r.url)  # debugging
    soup = BeautifulSoup(r.text, "html.parser")
    atags = soup.find_all("a", "WlydOe")
    hrefs = [link["href"] for link in atags]

    return hrefs


# Extract the title, date, and content of the article from all given URLs
@st.cache(show_spinner=False)
def search_scrape(urls):
    articles = []
    titles = []
    post_dates = []

    for url in urls:
        r = requests.get(url)
        time.sleep(5)
        soup = BeautifulSoup(r.text, "html.parser")

        # title
        title = soup.find("header", "caas-title-wrapper")
        # handling missing titles
        if title is not None:
            titles.append(title.text)
        else:
            titles.append("N/A")

        # posting date of the article
        post_date = soup.find("time", "caas-attr-meta-time")
        # handling missing dates
        if post_date is not None:
            post_dates.append(post_date.text)
        else:
            post_dates.append("N/A")

        # article content
        # all the paragraphs within the article
        paragraphs = soup.find_all("div", "caas-body")
        text = [paragraph.text for paragraph in paragraphs]
        # extract only the first 350 words (needed to avoid input-length
        # limits in the summarisation model)
        words = " ".join(text).split(" ")[:350]
        article = " ".join(words)
        articles.append(article)

    return titles, post_dates, articles
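
# Sketch (an assumption, not original behaviour): the per-article
# requests.get(url) above sends no headers and no timeout, so Yahoo may
# block or hang it. A more defensive call could reuse the User-Agent
# string from search_urls(), e.g.
#
#   r = requests.get(url, headers={"User-Agent": "..."}, timeout=10)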


# Summarise all given articles using a fine-tuned Pegasus Transformers model
@st.cache(show_spinner=False)
def summarise_articles(sum_model, sum_tokenizer, articles):
    summaries = []
    for article in articles:

        # source
        # https://huggingface.co/human-centered-summarization/financial-summarization-pegasus
        input_ids = sum_tokenizer(
            article, return_tensors="tf").input_ids
        output = sum_model.generate(
            input_ids, max_length=55, num_beams=5, early_stopping=True)
        summary = sum_tokenizer.decode(
            output[0], skip_special_tokens=True)
        summaries.append(summary)

    return summaries
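
# Sketch: rather than pre-truncating articles to 350 words in
# search_scrape(), the tokenizer can enforce the model's input limit
# itself (an alternative approach, not the author's original; the
# 512-token limit is an assumption about this checkpoint):
#
#   input_ids = sum_tokenizer(article, return_tensors="tf",
#                             truncation=True, max_length=512).input_ids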


# Join all data into rows
@st.cache(show_spinner=False)
def create_output_array(titles, post_dates, summarised_articles, sentiment_scores, raw_urls):
    output_array = []
    for idx in range(len(summarised_articles)):
        row = [
            titles[idx],
            post_dates[idx],
            summarised_articles[idx],
            sentiment_scores[idx]["label"].capitalize(),
            "{:.0%}".format(sentiment_scores[idx]["score"]),
            raw_urls[idx]
        ]
        output_array.append(row)

    return output_array


# Convert dataframe to .csv file
@st.cache(show_spinner=False)
def convert_df(df):
    return df.to_csv().encode("utf-8")


# ------------------------------------------------------------------------------


# Load data from Yahoo Finance
@st.cache(show_spinner=False)
def load_data(ticker, start, end):
    df = data.DataReader(ticker, "yahoo", start, end)
    df.reset_index(inplace=True)
    return df
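
# Note: pandas-datareader's "yahoo" source has been unreliable since Yahoo
# changed its endpoints. An illustrative fallback (yfinance is an
# assumption; it is not in requirements.txt):
#
#   import yfinance as yf
#
#   def load_data(ticker, start, end):
#       df = yf.download(ticker, start=start, end=end)
#       df.reset_index(inplace=True)
#       return df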


# Predict stock trend for N years using Prophet
@st.cache(show_spinner=False)
def predict(df, period):

    df_train = df[["Date", "Close"]]
    df_train = df_train.rename(columns={"Date": "ds", "Close": "y"})

    model = Prophet()

    model.fit(df_train)
    future = model.make_future_dataframe(periods=period)
    forecast = model.predict(future)

    return model, forecast
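
# Sketch: the fitted model can be sanity-checked with Prophet's built-in
# diagnostics (the window sizes below are illustrative, not part of the
# original app):
#
#   from prophet.diagnostics import cross_validation, performance_metrics
#   df_cv = cross_validation(model, initial="730 days",
#                            period="180 days", horizon="365 days")
#   print(performance_metrics(df_cv).head())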


def main_page():

    # Financial News Analysis feature

    # Streamlit text
    st.sidebar.markdown("## Financial News Analysis")
    st.sidebar.write(
        "Scrape, auto summarise and calculate sentiment for stock and crypto news.")

    # User input
    ticker = st.text_input("Ticker:", "TSLA")
    num = st.number_input("Number of articles:", 5, 15, 10)
    date = st.selectbox(
        "Timeline:", ["Past week", "Past day"])

    search = st.button("Search")

    st.info("Please do not spam the search button")
    st.markdown("---")

    # If the button is pressed
    if search:

        with st.spinner("Processing articles, please wait..."):
            # Search query and return all article links
            raw_urls = search_urls(ticker, num, date)

            # If anything went wrong (e.g., blocked by Google's server), stop the app
            if not raw_urls:
                st.error("Please wait a few minutes before trying again")
            else:

                # Scrape the title, posting date and article content from all the URLs
                titles, post_dates, articles = search_scrape(raw_urls)

                # Summarise all articles
                summarised_articles = summarise_articles(
                    sum_model, sum_tokenizer, articles)

                # Calculate sentiment for all articles
                # source
                # https://huggingface.co/ahmedrachid/FinancialBERT-Sentiment-Analysis
                sentiment_scores = sentiment_pipeline(summarised_articles)

                # Create dataframe
                output_array = create_output_array(
                    titles, post_dates, summarised_articles, sentiment_scores, raw_urls)
                cols = ["Title", "Date", "Summary",
                        "Label", "Confidence", "URL"]
                df = pd.DataFrame(output_array, columns=cols)

                # Visualise dataframe
                st.dataframe(df)

                # Convert dataframe to CSV and let the user download it
                csv_file = convert_df(df)

                # Download CSV
                st.download_button(
                    "Save data to CSV", csv_file, "assetsummaries.csv", "text/csv", key="download-csv")


def page2():

    # Stock Trend Forecasting feature

    # Streamlit text
    st.sidebar.markdown("## Stock Trend Forecasting")
    st.sidebar.write(
        "A simple dashboard for stock trend forecasting and analysis.")

    # Start and end date of the data
    start = "2010-01-01"
    end = date.today().strftime("%Y-%m-%d")

    # Ticker selection
    ticker = st.text_input("Ticker:", "AAPL")
    # Loading data from Yahoo Finance
    df = load_data(ticker, start, end)

    # Period selection
    n_years = st.number_input("Years of prediction:", 1, 4, 1)
    period = n_years * 365

    # Start prediction button
    init = st.button("Predict")

    st.markdown("---")

    # Visualisation
    # Dropping the Adj Close column
    df = df.drop(["Adj Close"], axis=1)

    # Exploratory analysis
    st.subheader("Exploratory analysis")
    st.write(df.describe())

    # Plot raw closing data with 100- and 200-day MAs (for simple analysis)
    st.subheader("Closing data, MA100 and MA200")

    ma100 = df.Close.rolling(100).mean()
    ma200 = df.Close.rolling(200).mean()

    fig = go.Figure()
    fig.update_layout(
        margin=dict(
            l=0,
            r=0,
            b=0,
            t=50,
            pad=4
        )
    )
    fig.add_trace(go.Scatter(x=df["Date"],
                             y=df['Close'], name="stock_close"))
    fig.add_trace(go.Scatter(x=df["Date"], y=ma100, name="ma100"))
    fig.add_trace(go.Scatter(x=df["Date"], y=ma200, name="ma200"))
    fig.layout.update(xaxis_rangeslider_visible=True)
    st.plotly_chart(fig, use_container_width=True)

    # If the button is pressed, start forecasting
    if init:
        with st.spinner("Please wait..."):
            model, forecast = predict(df, period)

        st.markdown("---")
        st.subheader("Forecast data")
        st.write(forecast.tail())

        st.subheader(f"Forecast plot for {n_years} years")

        fig = plot_plotly(model, forecast)
        fig.update_layout(
            margin=dict(
                l=0,
                r=0,
                b=0,
                t=0,
                pad=4
            )
        )
        st.plotly_chart(fig, use_container_width=True)

        st.subheader("Forecast components")
        fig = model.plot_components(forecast)
        st.write(fig)


if __name__ == "__main__":

    with st.spinner("Loading all models..."):
        # Creating summariser and sentiment models
        sum_model, sum_tokenizer = get_summarisation_model()
        sentiment_pipeline = get_sentiment_pipeline()

    page_names_to_funcs = {
        "Financial News Analysis": main_page,
        "Stock Trend Forecasting": page2
    }

    st.sidebar.markdown("# Financial Researcher")

    selected_page = st.sidebar.selectbox(
        "Select a page", page_names_to_funcs.keys())

    st.sidebar.markdown("---")

    page_names_to_funcs[selected_page]()
requirements.txt ADDED
@@ -0,0 +1,13 @@
pandas==1.4.2
DateTime==4.7
numpy==1.22.3
streamlit==1.12.2
plotly==5.10.0
prophet==1.1.1
pandas-datareader==0.10.0
requests==2.27.1
beautifulsoup4==4.11.1
transformers==4.21.3
sentencepiece==0.1.97
tensorflow==2.8.0
torch==1.11.0