Spaces:
No application file
No application file
Updated_2
Browse files- historical_news.ipynb +17 -1
- historical_stock.ipynb +140 -1
- news_articles.csv +0 -0
historical_news.ipynb
CHANGED
@@ -115,12 +115,28 @@
|
|
115 |
},
|
116 |
{
|
117 |
"cell_type": "code",
|
118 |
-
"execution_count":
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
119 |
"metadata": {},
|
120 |
"outputs": [],
|
121 |
"source": [
|
122 |
"df.to_csv('news_articles.csv', index=False)\n"
|
123 |
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
124 |
}
|
125 |
],
|
126 |
"metadata": {
|
|
|
115 |
},
|
116 |
{
|
117 |
"cell_type": "code",
|
118 |
+
"execution_count": 11,
|
119 |
+
"metadata": {},
|
120 |
+
"outputs": [],
|
121 |
+
"source": [
|
122 |
+
"df= df.sort_index(ascending=False)"
|
123 |
+
]
|
124 |
+
},
|
125 |
+
{
|
126 |
+
"cell_type": "code",
|
127 |
+
"execution_count": 13,
|
128 |
"metadata": {},
|
129 |
"outputs": [],
|
130 |
"source": [
|
131 |
"df.to_csv('news_articles.csv', index=False)\n"
|
132 |
]
|
133 |
+
},
|
134 |
+
{
|
135 |
+
"cell_type": "code",
|
136 |
+
"execution_count": null,
|
137 |
+
"metadata": {},
|
138 |
+
"outputs": [],
|
139 |
+
"source": []
|
140 |
}
|
141 |
],
|
142 |
"metadata": {
|
historical_stock.ipynb
CHANGED
@@ -12,7 +12,15 @@
|
|
12 |
"import pandas as pd\n",
|
13 |
"import hopsworks\n",
|
14 |
"import re \n",
|
15 |
-
"import modal \n"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
16 |
]
|
17 |
},
|
18 |
{
|
@@ -46,6 +54,137 @@
|
|
46 |
"print(data.head())"
|
47 |
]
|
48 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
49 |
{
|
50 |
"cell_type": "code",
|
51 |
"execution_count": null,
|
|
|
12 |
"import pandas as pd\n",
|
13 |
"import hopsworks\n",
|
14 |
"import re \n",
|
15 |
+
"import modal \n",
|
16 |
+
"# preprocessing\n",
|
17 |
+
"import requests\n",
|
18 |
+
"import pandas as pd\n",
|
19 |
+
"import json\n",
|
20 |
+
"#import pandas_market_calendars as mcal\n",
|
21 |
+
"import datetime\n",
|
22 |
+
"import numpy as np\n",
|
23 |
+
"from datetime import timedelta \n"
|
24 |
]
|
25 |
},
|
26 |
{
|
|
|
54 |
"print(data.head())"
|
55 |
]
|
56 |
},
|
57 |
+
{
|
58 |
+
"cell_type": "code",
|
59 |
+
"execution_count": null,
|
60 |
+
"metadata": {},
|
61 |
+
"outputs": [],
|
62 |
+
"source": [
|
63 |
+
"def create_tsla_history():\n",
|
64 |
+
"\n",
|
65 |
+
" start_date = datetime.datetime.strptime('2015-07-16',\"%Y-%m-%d\")\n",
|
66 |
+
" end_date = datetime.datetime.strptime('2023-01-05',\"%Y-%m-%d\")\n",
|
67 |
+
"\n",
|
68 |
+
" # Get the TSLA stock data from yfinance\n",
|
69 |
+
" tsla = yf.Ticker(\"TSLA\") #VEFAB.ST\n",
|
70 |
+
" # info = tsla.info\n",
|
71 |
+
"\n",
|
72 |
+
" # get historical market data\n",
|
73 |
+
" data = tsla.history(start=start_date, end=end_date)\n",
|
74 |
+
"\n",
|
75 |
+
" # drop some columns\n",
|
76 |
+
" tesla_df = data.drop(columns=['Dividends','Stock Splits'])\n",
|
77 |
+
" tesla_df.index = tesla_df.index.strftime('%Y-%m-%d')\n",
|
78 |
+
" \n",
|
79 |
+
" print('Number of business days included in data set: ',np.shape(tesla_df))\n",
|
80 |
+
"\n",
|
81 |
+
" # Create an array of all dates in the specified period\n",
|
82 |
+
" all_dates = np.array([start_date + datetime.timedelta(days=i) for i in range((end_date - start_date).days)])\n",
|
83 |
+
" all_dates = [d.strftime('%Y-%m-%d') for d in all_dates]\n",
|
84 |
+
"\n",
|
85 |
+
" # Use setdiff1d() to find the non-business days\n",
|
86 |
+
" isBusinessDay, _ = extract_business_day(start_date='2015-07-16',end_date='2023-01-04')\n",
|
87 |
+
" non_business_days = np.setdiff1d(all_dates, isBusinessDay)\n",
|
88 |
+
"\n",
|
89 |
+
" # Add nan-values to the non-business days\n",
|
90 |
+
" print('Add {} non business days with NaN-values'.format(len(non_business_days)))\n",
|
91 |
+
" for d in non_business_days:\n",
|
92 |
+
" tesla_df.loc[d,:] = [np.nan,np.nan,np.nan,np.nan,np.nan]\n",
|
93 |
+
"\n",
|
94 |
+
" # sort index (dates)\n",
|
95 |
+
" tesla_df = tesla_df.sort_index()\n",
|
96 |
+
" \n",
|
97 |
+
" # move \"date\"-index into its own column\n",
|
98 |
+
" tesla_df = tesla_df.reset_index()\n",
|
99 |
+
" \n",
|
100 |
+
" # Rename column 'Date' to 'date'\n",
|
101 |
+
" tesla_df = tesla_df.rename(columns={'Date': 'date'})\n",
|
102 |
+
" print('Final size of dataframe',np.shape(tesla_df))\n",
|
103 |
+
" \n",
|
104 |
+
" # Write the merged dataframe to a CSV file\n",
|
105 |
+
" start_date ='2015-07-16'\n",
|
106 |
+
" end_date = '2023-01-05'\n",
|
107 |
+
" save_path = \"data/stock/tesla_{}-{}.csv\".format(start_date,end_date)\n",
|
108 |
+
" \n",
|
109 |
+
" print('Save at :',save_path)\n",
|
110 |
+
" tesla_df.to_csv(save_path, index=False)\n",
|
111 |
+
" \n",
|
112 |
+
" return tesla_df"
|
113 |
+
]
|
114 |
+
},
|
115 |
+
{
|
116 |
+
"cell_type": "code",
|
117 |
+
"execution_count": null,
|
118 |
+
"metadata": {},
|
119 |
+
"outputs": [],
|
120 |
+
"source": [
|
121 |
+
"def extract_business_day(start_date,end_date):\n",
|
122 |
+
" \"\"\"\n",
|
123 |
+
" Given a start_date and end_date, find the NYSE trading days in that range.\n",
|
124 |
+
" \n",
|
125 |
+
" `Returns`:\n",
|
126 |
+
" \n",
|
127 |
+
" isBusinessDay: list of str (with all dates being business days)\n",
|
128 |
+
" is_open: numpy array of 1/0 flags (open/closed), one entry per calendar day from start_date to end_date\n",
|
129 |
+
" e.g. is_open = [1,0,...,1] means that start_date = open, day after start_date = closed, and end_date = open\n",
|
130 |
+
" \"\"\"\n",
|
131 |
+
" \n",
|
132 |
+
" # Save for later\n",
|
133 |
+
" end_date_save = end_date\n",
|
134 |
+
" \n",
|
135 |
+
" # Get the NYSE calendar\n",
|
136 |
+
" cal = mcal.get_calendar('NYSE')\n",
|
137 |
+
"\n",
|
138 |
+
" # Get the NYSE calendar's open and close times for the specified period\n",
|
139 |
+
" schedule = cal.schedule(start_date=start_date, end_date=end_date)\n",
|
140 |
+
" \n",
|
141 |
+
" # Only need a list of dates when it's open (not open and close times)\n",
|
142 |
+
" isBusinessDay = np.array(schedule.market_open.dt.strftime('%Y-%m-%d')) \n",
|
143 |
+
" \n",
|
144 |
+
" # Go over all days: \n",
|
145 |
+
" delta = datetime.timedelta(days=1)\n",
|
146 |
+
" start_date = datetime.datetime.strptime(start_date,\"%Y-%m-%d\") #datetime.date(2015, 7, 16)\n",
|
147 |
+
" end_date = datetime.datetime.strptime(end_date,\"%Y-%m-%d\") #datetime.date(2023, 1, 4)\n",
|
148 |
+
" \n",
|
149 |
+
" # Extract days from the timedelta object\n",
|
150 |
+
" num_days = (end_date - start_date).days + 1\n",
|
151 |
+
" \n",
|
152 |
+
" # Create boolean array for days being open (1) and closed (0) \n",
|
153 |
+
" is_open = np.zeros(num_days)\n",
|
154 |
+
" \n",
|
155 |
+
" # iterate over range of dates\n",
|
156 |
+
" current_BusinessDay = isBusinessDay[0]\n",
|
157 |
+
" count_dates = 0\n",
|
158 |
+
" next_BusinessDay = 0\n",
|
159 |
+
" \n",
|
160 |
+
" while (start_date <= end_date):\n",
|
161 |
+
" \n",
|
162 |
+
" if start_date.strftime('%Y-%m-%d') == current_BusinessDay:\n",
|
163 |
+
" is_open[count_dates] = True\n",
|
164 |
+
"\n",
|
165 |
+
" if current_BusinessDay == end_date_save or current_BusinessDay==isBusinessDay[-1]:\n",
|
166 |
+
" break\n",
|
167 |
+
" else:\n",
|
168 |
+
" next_BusinessDay += 1\n",
|
169 |
+
" current_BusinessDay = isBusinessDay[next_BusinessDay]\n",
|
170 |
+
" else:\n",
|
171 |
+
" is_open[count_dates] = False\n",
|
172 |
+
"\n",
|
173 |
+
" count_dates += 1 \n",
|
174 |
+
" start_date += delta\n",
|
175 |
+
" \n",
|
176 |
+
" print(np.shape(is_open))\n",
|
177 |
+
" \n",
|
178 |
+
" return isBusinessDay, is_open"
|
179 |
+
]
|
180 |
+
},
|
181 |
+
{
|
182 |
+
"cell_type": "code",
|
183 |
+
"execution_count": null,
|
184 |
+
"metadata": {},
|
185 |
+
"outputs": [],
|
186 |
+
"source": []
|
187 |
+
},
|
188 |
{
|
189 |
"cell_type": "code",
|
190 |
"execution_count": null,
|
news_articles.csv
CHANGED
The diff for this file is too large to render.
See raw diff
|
|