mtzeve committed on
Commit
26654a1
·
1 Parent(s): 1f66d66
historical_news.ipynb CHANGED
@@ -115,12 +115,28 @@
115
  },
116
  {
117
  "cell_type": "code",
118
- "execution_count": 8,
 
 
 
 
 
 
 
 
 
119
  "metadata": {},
120
  "outputs": [],
121
  "source": [
122
  "df.to_csv('news_articles.csv', index=False)\n"
123
  ]
 
 
 
 
 
 
 
124
  }
125
  ],
126
  "metadata": {
 
115
  },
116
  {
117
  "cell_type": "code",
118
+ "execution_count": 11,
119
+ "metadata": {},
120
+ "outputs": [],
121
+ "source": [
122
+ "df= df.sort_index(ascending=False)"
123
+ ]
124
+ },
125
+ {
126
+ "cell_type": "code",
127
+ "execution_count": 13,
128
  "metadata": {},
129
  "outputs": [],
130
  "source": [
131
  "df.to_csv('news_articles.csv', index=False)\n"
132
  ]
133
+ },
134
+ {
135
+ "cell_type": "code",
136
+ "execution_count": null,
137
+ "metadata": {},
138
+ "outputs": [],
139
+ "source": []
140
  }
141
  ],
142
  "metadata": {
historical_stock.ipynb CHANGED
@@ -12,7 +12,15 @@
12
  "import pandas as pd\n",
13
  "import hopsworks\n",
14
  "import re \n",
15
- "import modal \n"
 
 
 
 
 
 
 
 
16
  ]
17
  },
18
  {
@@ -46,6 +54,137 @@
46
  "print(data.head())"
47
  ]
48
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
  {
50
  "cell_type": "code",
51
  "execution_count": null,
 
12
  "import pandas as pd\n",
13
  "import hopsworks\n",
14
  "import re \n",
15
+ "import modal \n",
16
+ "#preprocessing\n",
17
+ "import requests\n",
18
+ "import pandas as pd\n",
19
+ "import json\n",
20
+ "#import pandas_market_calendars as mcal\n",
21
+ "import datetime\n",
22
+ "import numpy as np\n",
23
+ "from datetime import timedelta \n"
24
  ]
25
  },
26
  {
 
54
  "print(data.head())"
55
  ]
56
  },
57
+ {
58
+ "cell_type": "code",
59
+ "execution_count": null,
60
+ "metadata": {},
61
+ "outputs": [],
62
+ "source": [
63
+ "def create_tsla_history():\n",
64
+ "\n",
65
+ " start_date = datetime.datetime.strptime('2015-07-16',\"%Y-%m-%d\")\n",
66
+ " end_date = datetime.datetime.strptime('2023-01-05',\"%Y-%m-%d\")\n",
67
+ "\n",
68
+ " # Get the TSLA stock data from yfinance\n",
69
+ " tsla = yf.Ticker(\"TSLA\") #VEFAB.ST\n",
70
+ " # info = tsla.info\n",
71
+ "\n",
72
+ " # get historical market data\n",
73
+ " data = tsla.history(start=start_date, end=end_date)\n",
74
+ "\n",
75
+ " # drop some columns\n",
76
+ " tesla_df = data.drop(columns=['Dividends','Stock Splits'])\n",
77
+ " tesla_df.index = tesla_df.index.strftime('%Y-%m-%d')\n",
78
+ " \n",
79
+ " print('Number of business days included in data set: ',np.shape(tesla_df))\n",
80
+ "\n",
81
+ " # Create an array of all dates in the specified period\n",
82
+ " all_dates = np.array([start_date + datetime.timedelta(days=i) for i in range((end_date - start_date).days)])\n",
83
+ " all_dates = [d.strftime('%Y-%m-%d') for d in all_dates]\n",
84
+ "\n",
85
+ " # Use setdiff1d() to find the non-business days\n",
86
+ " isBusinessDay, _ = extract_business_day(start_date='2015-07-16',end_date='2023-01-04')\n",
87
+ " non_business_days = np.setdiff1d(all_dates, isBusinessDay)\n",
88
+ "\n",
89
+ " # Add nan-values to the non-business days\n",
90
+ " print('Add {} non business days with NaN-values'.format(len(non_business_days)))\n",
91
+ " for d in non_business_days:\n",
92
+ " tesla_df.loc[d,:] = [np.nan,np.nan,np.nan,np.nan,np.nan]\n",
93
+ "\n",
94
+ " # sort index (dates)\n",
95
+ " tesla_df = tesla_df.sort_index()\n",
96
+ " \n",
97
+ " # move \"date\"-index into its own column\n",
98
+ " tesla_df = tesla_df.reset_index()\n",
99
+ " \n",
100
+ " # Rename column 'Date' to 'date'\n",
101
+ " tesla_df = tesla_df.rename(columns={'Date': 'date'})\n",
102
+ " print('Final size of dataframe',np.shape(tesla_df))\n",
103
+ " \n",
104
+ " # Write the merged dataframe to a CSV file\n",
105
+ " start_date ='2015-07-16'\n",
106
+ " end_date = '2023-01-05'\n",
107
+ " save_path = \"data/stock/tesla_{}-{}.csv\".format(start_date,end_date)\n",
108
+ " \n",
109
+ " print('Save at :',save_path)\n",
110
+ " tesla_df.to_csv(save_path, index=False)\n",
111
+ " \n",
112
+ " return tesla_df"
113
+ ]
114
+ },
115
+ {
116
+ "cell_type": "code",
117
+ "execution_count": null,
118
+ "metadata": {},
119
+ "outputs": [],
120
+ "source": [
121
+ "def extract_business_day(start_date,end_date):\n",
122
+ " \"\"\"\n",
123
+ " Given a start_date and end_date (strings formatted as 'YYYY-MM-DD'), find the days in that range on which the NYSE is open.\n",
124
+ " \n",
125
+ " `Returns`:\n",
126
+ " \n",
127
+ " isBusinessDay: numpy array of str ('YYYY-MM-DD'), one entry per day the market is open\n",
128
+ " is_open: numpy array of 1/0 flags (1 = open, 0 = closed), one per calendar day from start_date to end_date inclusive\n",
129
+ " e.g. is_open = [1,0,...,1] means that start_date = open, day after start_date = closed, and end_date = open\n",
130
+ " \"\"\"\n",
131
+ " \n",
132
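+ " # Keep the original end_date string: end_date is converted to a datetime below, but the loop compares against the string\n",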
133
+ " end_date_save = end_date\n",
134
+ " \n",
135
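+ " # Get the NYSE calendar (NOTE: mcal is pandas_market_calendars; its import appears commented out in the imports cell, so re-enable it before calling this)\n",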
136
+ " cal = mcal.get_calendar('NYSE')\n",
137
+ "\n",
138
+ " # Get the NYSE calendar's open and close times for the specified period\n",
139
+ " schedule = cal.schedule(start_date=start_date, end_date=end_date)\n",
140
+ " \n",
141
+ " # Only need a list of dates when it's open (not open and close times)\n",
142
+ " isBusinessDay = np.array(schedule.market_open.dt.strftime('%Y-%m-%d')) \n",
143
+ " \n",
144
+ " # Go over all days: \n",
145
+ " delta = datetime.timedelta(days=1)\n",
146
+ " start_date = datetime.datetime.strptime(start_date,\"%Y-%m-%d\") #datetime.date(2015, 7, 16)\n",
147
+ " end_date = datetime.datetime.strptime(end_date,\"%Y-%m-%d\") #datetime.date(2023, 1, 4)\n",
148
+ " \n",
149
+ " # Extract days from the timedelta object\n",
150
+ " num_days = (end_date - start_date).days + 1\n",
151
+ " \n",
152
+ " # Create boolean array for days being open (1) and closed (0) \n",
153
+ " is_open = np.zeros(num_days)\n",
154
+ " \n",
155
+ " # iterate over range of dates\n",
156
+ " current_BusinessDay = isBusinessDay[0]\n",
157
+ " count_dates = 0\n",
158
+ " next_BusinessDay = 0\n",
159
+ " \n",
160
+ " while (start_date <= end_date):\n",
161
+ " \n",
162
+ " if start_date.strftime('%Y-%m-%d') == current_BusinessDay:\n",
163
+ " is_open[count_dates] = True\n",
164
+ "\n",
165
+ " if current_BusinessDay == end_date_save or current_BusinessDay==isBusinessDay[-1]:\n",
166
+ " break\n",
167
+ " else:\n",
168
+ " next_BusinessDay += 1\n",
169
+ " current_BusinessDay = isBusinessDay[next_BusinessDay]\n",
170
+ " else:\n",
171
+ " is_open[count_dates] = False\n",
172
+ "\n",
173
+ " count_dates += 1 \n",
174
+ " start_date += delta\n",
175
+ " \n",
176
+ " print(np.shape(is_open))\n",
177
+ " \n",
178
+ " return isBusinessDay, is_open"
179
+ ]
180
+ },
181
+ {
182
+ "cell_type": "code",
183
+ "execution_count": null,
184
+ "metadata": {},
185
+ "outputs": [],
186
+ "source": []
187
+ },
188
  {
189
  "cell_type": "code",
190
  "execution_count": null,
news_articles.csv CHANGED
The diff for this file is too large to render. See raw diff