mtzeve committed
Commit cf1f1a2 · 1 Parent(s): b6eae30

check commit
feature_engineering.py CHANGED
@@ -21,7 +21,7 @@ def getNews(api_key,endpoint,ticker,from_date,to_date,num=1000):
     response = requests.get(endpoint, params=params)
 
     # Print the response from the API
-    #print(response.json())
+    print(response.json())
 
     #Return a Pandas dataframe from the response
     return pd.DataFrame(response.json())
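Note on the hunk above: uncommenting the print makes getNews dump the full JSON payload on every call. If the print is only meant for debugging, one option is to gate it behind a flag. The sketch below is not part of this commit, and the query-parameter names are hypothetical, since the start of the function (where params is built) lies outside the hunk:

    import pandas as pd
    import requests

    def getNews_debug(api_key, endpoint, ticker, from_date, to_date, num=1000, verbose=False):
        # Hypothetical variant of getNews: the parameter names below are stand-ins,
        # because the part of feature_engineering.py that builds `params` is outside this hunk.
        params = {"ticker": ticker, "from": from_date, "to": to_date, "limit": num, "api_key": api_key}

        response = requests.get(endpoint, params=params)

        # Print the raw payload only when explicitly requested, instead of on every call
        if verbose:
            print(response.json())

        # Return a Pandas dataframe from the response, as the original function does
        return pd.DataFrame(response.json())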
news_experimenting.ipynb ADDED
@@ -0,0 +1,255 @@
+ {
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 31,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import requests\n",
+ "import pandas as pd\n",
+ "from datetime import datetime\n",
+ "from textblob import TextBlob"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 38,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def fetch_tesla_news(api_key, start_date, end_date):\n",
+ " url = \"https://api.marketaux.com/v1/news/all?symbols=TSLA&filter_entities=true&language=en&api_token=iy6rRX4oxFrouZocXr8JNpOzaxZLk3UvMfoMGxYs\" # Modify this based on the exact endpoint you need\n",
+ " headers = {\n",
+ " \"x-api-key\": api_key\n",
+ " }\n",
+ " # Since each page corresponds to a single request, limit the number of pages to 100\n",
+ " for page in range(1, 101): # start from page 1 to page 100\n",
+ " params = {\n",
+ " \"tickers\": \"TSLA\",\n",
+ " \"filter_entities\": \"true\",\n",
+ " \"language\": \"en\",\n",
+ " \"from\": start_date,\n",
+ " \"to\": end_date,\n",
+ " \"page\": page\n",
+ " }\n",
+ " \n",
+ " response = requests.get(url, headers=headers, params=params)\n",
+ " if response.status_code == 200:\n",
+ " return pd.json_normalize(response.json()['data'])\n",
+ " else:\n",
+ " raise Exception(f\"Failed to fetch data: {response.text}\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 39,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " uuid \\\n",
+ "0 daf76e3e-caea-4c92-a461-6b3132655788 \n",
+ "1 8dab10ca-5b23-465a-aa86-360bc987a774 \n",
+ "2 b8c381b9-4187-433e-ad15-cecc9d227b13 \n",
+ "\n",
+ " title \\\n",
+ "0 Stock market today: US futures climb as earnin... \n",
+ "1 5 things to know before the stock market opens... \n",
+ "2 Wall Street Breakfast Podcast: UNH: Personal D... \n",
+ "\n",
+ " description \\\n",
+ "0 The wait for Tesla results is on as investors ... \n",
+ "1 Here are the most important news items that in... \n",
+ "2 UnitedHealth confirms personal data compromise... \n",
+ "\n",
+ " keywords \\\n",
+ "0 \n",
+ "1 Investment strategy, Economy, Markets, Busines... \n",
+ "2 \n",
+ "\n",
+ " snippet \\\n",
+ "0 US stocks climbed on Tuesday, on track for fur... \n",
+ "1 In this article CPRI Follow your favorite stoc... \n",
+ "2 JHVEPhoto/iStock Editorial via Getty Images\\n\\... \n",
+ "\n",
+ " url \\\n",
+ "0 https://finance.yahoo.com/news/stock-market-to... \n",
+ "1 https://www.cnbc.com/2024/04/23/5-things-to-kn... \n",
+ "2 https://seekingalpha.com/article/4685243-wall-... \n",
+ "\n",
+ " image_url language \\\n",
+ "0 https://s.yimg.com/ny/api/res/1.2/mqjC0VUO61dY... en \n",
+ "1 https://image.cnbcfm.com/api/v1/image/10692170... en \n",
+ "2 https://static.seekingalpha.com/cdn/s3/uploads... en \n",
+ "\n",
+ " published_at source relevance_score \\\n",
+ "0 2024-04-23T11:22:53.000000Z finance.yahoo.com None \n",
+ "1 2024-04-23T11:16:00.000000Z cnbc.com None \n",
+ "2 2024-04-23T11:00:00.000000Z seekingalpha.com None \n",
+ "\n",
+ " entities \\\n",
+ "0 [{'symbol': 'TSLA', 'name': 'Tesla, Inc.', 'ex... \n",
+ "1 [{'symbol': 'TSLA', 'name': 'Tesla, Inc.', 'ex... \n",
+ "2 [{'symbol': 'TSLA', 'name': 'Tesla, Inc.', 'ex... \n",
+ "\n",
+ " similar \n",
+ "0 [{'uuid': '01cd65de-530d-407a-96c9-5b3359e98a0... \n",
+ "1 [] \n",
+ "2 [] \n"
+ ]
+ }
+ ],
+ "source": [
+ "api_key = \"iy6rRX4oxFrouZocXr8JNpOzaxZLk3UvMfoMGxYs\"\n",
+ "tesla_news_df = fetch_tesla_news(api_key, \"2017-01-01\", datetime.now().strftime('%Y-%m-%d'))\n",
+ "print(tesla_news_df.head())"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 30,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "uuid 0\n",
+ "title 0\n",
+ "description 0\n",
+ "keywords 0\n",
+ "snippet 0\n",
+ "url 0\n",
+ "image_url 0\n",
+ "language 0\n",
+ "published_at 0\n",
+ "source 0\n",
+ "relevance_score 3\n",
+ "entities 0\n",
+ "similar 0\n",
+ "dtype: int64\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Clean text data, Check for any missing values or inconsistencies in the data\n",
+ "tesla_news_df['description'] = tesla_news_df['description'].apply(lambda x: x.lower().replace('[^\\w\\s]', ''))\n",
+ "\n",
+ "# Check for any missing values\n",
+ "print(tesla_news_df.isnull().sum())\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 32,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " title sentiment\n",
+ "0 Wall Street Breakfast: What Moved Markets 0.197443\n",
+ "1 1 \"Magnificent Seven\" Stock With 1,234% Upside... 1.000000\n",
+ "2 Market Today: Tech Giants Reignite AI Craze, A... -0.024242\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Sentiment analysis on descriptions\n",
+ "tesla_news_df['sentiment'] = tesla_news_df['description'].apply(lambda text: TextBlob(text).sentiment.polarity)\n",
+ "\n",
+ "print(tesla_news_df[['title', 'sentiment']])\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 35,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Example: Counting the number of articles per day\n",
+ "tesla_news_df['published_at'] = pd.to_datetime(tesla_news_df['published_at']) # Convert to datetime\n",
+ "tesla_news_df['date'] = tesla_news_df['published_at'].dt.date\n",
+ "daily_news_count = tesla_news_df.groupby('date').size()\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 37,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "<class 'pandas.core.frame.DataFrame'>\n",
+ "RangeIndex: 3 entries, 0 to 2\n",
+ "Data columns (total 15 columns):\n",
+ " # Column Non-Null Count Dtype \n",
+ "--- ------ -------------- ----- \n",
+ " 0 uuid 3 non-null object \n",
+ " 1 title 3 non-null object \n",
+ " 2 description 3 non-null object \n",
+ " 3 keywords 3 non-null object \n",
+ " 4 snippet 3 non-null object \n",
+ " 5 url 3 non-null object \n",
+ " 6 image_url 3 non-null object \n",
+ " 7 language 3 non-null object \n",
+ " 8 published_at 3 non-null datetime64[ns, UTC]\n",
+ " 9 source 3 non-null object \n",
+ " 10 relevance_score 0 non-null object \n",
+ " 11 entities 3 non-null object \n",
+ " 12 similar 3 non-null object \n",
+ " 13 sentiment 3 non-null float64 \n",
+ " 14 date 3 non-null object \n",
+ "dtypes: datetime64[ns, UTC](1), float64(1), object(13)\n",
+ "memory usage: 492.0+ bytes\n"
+ ]
+ }
+ ],
+ "source": [
+ "tesla_news_df.info()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "base",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.11.4"
+ },
+ "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+ }
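Note on the fetch_tesla_news cell in the notebook above: the function returns inside the page loop, so despite the range(1, 101) only the first page is ever fetched, and the api_token hard-coded in the URL duplicates the x-api-key header. The sketch below is a suggested variant, not what the commit contains; it reuses the same Marketaux endpoint and the query parameters already present in the notebook, but accumulates pages into one DataFrame:

    import pandas as pd
    import requests

    def fetch_tesla_news_all_pages(api_key, start_date, end_date, max_pages=100):
        # Same endpoint as the notebook cell; the token is passed as a query
        # parameter instead of being hard-coded into the URL string.
        url = "https://api.marketaux.com/v1/news/all"
        frames = []
        for page in range(1, max_pages + 1):
            params = {
                "api_token": api_key,
                "symbols": "TSLA",
                "filter_entities": "true",
                "language": "en",
                "from": start_date,
                "to": end_date,
                "page": page,
            }
            response = requests.get(url, params=params)
            if response.status_code != 200:
                raise Exception(f"Failed to fetch data: {response.text}")
            data = response.json().get("data", [])
            if not data:  # stop once the API returns an empty page
                break
            frames.append(pd.json_normalize(data))
        # Concatenate all pages; return an empty DataFrame if nothing came back
        return pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()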
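A second note on the notebook: the cleaning cell calls x.lower().replace('[^\w\s]', '') on each description, but str.replace treats the pattern as a literal string, so no punctuation is actually removed. A regex-based sketch of the presumable intent (lowercase, then strip non-word characters) would be:

    import re

    def clean_description(text: str) -> str:
        # Lowercase, then drop every character that is not a word character or whitespace
        return re.sub(r"[^\w\s]", "", text.lower())

    # Equivalent vectorised form on the notebook's DataFrame column:
    # tesla_news_df['description'] = tesla_news_df['description'].str.lower().str.replace(r'[^\w\s]', '', regex=True)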
TSLA_news.csv → news_experimenting1.ipynb RENAMED
File without changes
tesla_articles.json ADDED
The diff for this file is too large to render. See raw diff