{
"cells": [
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"from dotenv import load_dotenv\n",
"import os \n",
"from alpha_vantage.timeseries import TimeSeries\n",
"import pandas as pd\n",
"import hopsworks\n",
"import re \n",
"import modal \n",
"#prepocessing\n",
"import requests\n",
"import pandas as pd\n",
"import json\n",
"import pandas_market_calendars as mcal\n",
"import datetime\n",
"import numpy as np\n",
"from datetime import datetime, timedelta\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" 1. open 2. high 3. low 4. close 5. volume\n",
"date \n",
"2024-05-02 182.86 184.60 176.0200 180.01 89148041.0\n",
"2024-05-01 182.00 185.86 179.0100 179.99 92829719.0\n",
"2024-04-30 186.98 190.95 182.8401 183.28 127031787.0\n",
"2024-04-29 188.42 198.87 184.5400 194.05 243869678.0\n",
"2024-04-26 168.85 172.12 166.3700 168.29 109815725.0\n"
]
}
],
"source": [
"load_dotenv()\n",
"\n",
"api_key = os.environ.get('stocks_api') # Replace this with your actual API key\n",
"ts = TimeSeries(key=api_key, output_format='pandas')\n",
"\n",
"# Fetch daily adjusted stock prices; adjust the symbol as needed\n",
"data, meta_data = ts.get_daily(symbol='TSLA', outputsize='full')\n",
"\n",
"print(data.head())"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" 1. open | \n",
" 2. high | \n",
" 3. low | \n",
" 4. close | \n",
" 5. volume | \n",
"
\n",
" \n",
" date | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" 2024-05-02 | \n",
" 182.86 | \n",
" 184.6000 | \n",
" 176.0200 | \n",
" 180.01 | \n",
" 89148041.0 | \n",
"
\n",
" \n",
" 2024-05-01 | \n",
" 182.00 | \n",
" 185.8600 | \n",
" 179.0100 | \n",
" 179.99 | \n",
" 92829719.0 | \n",
"
\n",
" \n",
" 2024-04-30 | \n",
" 186.98 | \n",
" 190.9500 | \n",
" 182.8401 | \n",
" 183.28 | \n",
" 127031787.0 | \n",
"
\n",
" \n",
" 2024-04-29 | \n",
" 188.42 | \n",
" 198.8700 | \n",
" 184.5400 | \n",
" 194.05 | \n",
" 243869678.0 | \n",
"
\n",
" \n",
" 2024-04-26 | \n",
" 168.85 | \n",
" 172.1200 | \n",
" 166.3700 | \n",
" 168.29 | \n",
" 109815725.0 | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 2010-07-06 | \n",
" 20.00 | \n",
" 20.0000 | \n",
" 15.8300 | \n",
" 16.11 | \n",
" 6866900.0 | \n",
"
\n",
" \n",
" 2010-07-02 | \n",
" 23.00 | \n",
" 23.1000 | \n",
" 18.7100 | \n",
" 19.20 | \n",
" 5139800.0 | \n",
"
\n",
" \n",
" 2010-07-01 | \n",
" 25.00 | \n",
" 25.9200 | \n",
" 20.2700 | \n",
" 21.96 | \n",
" 8218800.0 | \n",
"
\n",
" \n",
" 2010-06-30 | \n",
" 25.79 | \n",
" 30.4192 | \n",
" 23.3000 | \n",
" 23.83 | \n",
" 17187100.0 | \n",
"
\n",
" \n",
" 2010-06-29 | \n",
" 19.00 | \n",
" 25.0000 | \n",
" 17.5400 | \n",
" 23.89 | \n",
" 18766300.0 | \n",
"
\n",
" \n",
"
\n",
"
3485 rows × 5 columns
\n",
"
"
],
"text/plain": [
" 1. open 2. high 3. low 4. close 5. volume\n",
"date \n",
"2024-05-02 182.86 184.6000 176.0200 180.01 89148041.0\n",
"2024-05-01 182.00 185.8600 179.0100 179.99 92829719.0\n",
"2024-04-30 186.98 190.9500 182.8401 183.28 127031787.0\n",
"2024-04-29 188.42 198.8700 184.5400 194.05 243869678.0\n",
"2024-04-26 168.85 172.1200 166.3700 168.29 109815725.0\n",
"... ... ... ... ... ...\n",
"2010-07-06 20.00 20.0000 15.8300 16.11 6866900.0\n",
"2010-07-02 23.00 23.1000 18.7100 19.20 5139800.0\n",
"2010-07-01 25.00 25.9200 20.2700 21.96 8218800.0\n",
"2010-06-30 25.79 30.4192 23.3000 23.83 17187100.0\n",
"2010-06-29 19.00 25.0000 17.5400 23.89 18766300.0\n",
"\n",
"[3485 rows x 5 columns]"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"DatetimeIndex: 3485 entries, 2024-05-02 to 2010-06-29\n",
"Data columns (total 5 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 1. open 3485 non-null float64\n",
" 1 2. high 3485 non-null float64\n",
" 2 3. low 3485 non-null float64\n",
" 3 4. close 3485 non-null float64\n",
" 4 5. volume 3485 non-null float64\n",
"dtypes: float64(5)\n",
"memory usage: 163.4 KB\n"
]
}
],
"source": [
"data.info()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'1. Information': 'Daily Prices (open, high, low, close) and Volumes',\n",
" '2. Symbol': 'TSLA',\n",
" '3. Last Refreshed': '2024-05-02',\n",
" '4. Output Size': 'Full size',\n",
" '5. Time Zone': 'US/Eastern'}"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"meta_data"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"#Stock market:\n",
"def today_is_a_business_day(today):\n",
" # Get the NYSE calendar\n",
" cal = mcal.get_calendar('NYSE')\n",
" schedule = cal.schedule(start_date=today, end_date=today) # Get the NYSE calendar's open and close times for the specified period\n",
" try:\n",
" isBusinessDay = schedule.market_open.dt.strftime('%Y-%m-%d')\n",
" return True\n",
" except:\n",
" print('Today {} is not a business day'.format(today))\n",
" return False"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"def next_business_day(today):\n",
" \n",
" # Real tomorrow\n",
" tomorrow = (today + timedelta(days=1)).strftime(\"%Y-%m-%d\")\n",
" \n",
" # Get the NYSE calendar\n",
" cal = mcal.get_calendar('NYSE')\n",
"\n",
" found_next_business_day = False \n",
" while not found_next_business_day:\n",
" schedule = cal.schedule(start_date=tomorrow, end_date=tomorrow) # Get the NYSE calendar's open and close times for the specified period\n",
" try:\n",
" isBusinessDay = schedule.market_open.dt.strftime('%Y-%m-%d') # Only need a list of dates when it's open (not open and close times)\n",
" found_next_business_day = True\n",
" except:\n",
" print('The date {} is not a business day'.format(tomorrow))\n",
" tomorrow = (datetime.datetime.strptime(tomorrow,\"%Y-%m-%d\") + timedelta(days=1)).strftime(\"%Y-%m-%d\")\n",
" \n",
" return isBusinessDay.to_numpy()[0]"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"def extract_business_day(start_date,end_date):\n",
" \"\"\"\n",
" Given a start_date and end_date.\n",
" \n",
" `Returns`:\n",
" \n",
" isBusinessDay: list of str (with all dates being business days)\n",
" is_open: boolean list\n",
" e.g is_open = [1,0,...,1] means that start_date = open, day after start_date = closed, and end_date = open\n",
" \"\"\"\n",
" \n",
" # Save for later\n",
" end_date_save = end_date\n",
" \n",
" # Get the NYSE calendar\n",
" cal = mcal.get_calendar('NYSE')\n",
"\n",
" # Get the NYSE calendar's open and close times for the specified period\n",
" schedule = cal.schedule(start_date=start_date, end_date=end_date)\n",
" \n",
" # Only need a list of dates when it's open (not open and close times)\n",
" isBusinessDay = np.array(schedule.market_open.dt.strftime('%Y-%m-%d')) \n",
" \n",
" # Go over all days: \n",
" delta = datetime.timedelta(days=1)\n",
" start_date = datetime.datetime.strptime(start_date,\"%Y-%m-%d\") #datetime.date(2015, 7, 16)\n",
" end_date = datetime.datetime.strptime(end_date,\"%Y-%m-%d\") #datetime.date(2023, 1, 4)\n",
" \n",
" # Extract days from the timedelta object\n",
" num_days = (end_date - start_date).days + 1\n",
" \n",
" # Create boolean array for days being open (1) and closed (0) \n",
" is_open = np.zeros(num_days)\n",
" \n",
" # iterate over range of dates\n",
" current_BusinessDay = isBusinessDay[0]\n",
" count_dates = 0\n",
" next_BusinessDay = 0\n",
" \n",
" while (start_date <= end_date):\n",
" \n",
" if start_date.strftime('%Y-%m-%d') == current_BusinessDay:\n",
" is_open[count_dates] = True\n",
"\n",
" if current_BusinessDay == end_date_save or current_BusinessDay==isBusinessDay[-1]:\n",
" break\n",
" else:\n",
" next_BusinessDay += 1\n",
" current_BusinessDay = isBusinessDay[next_BusinessDay]\n",
" else:\n",
" is_open[count_dates] = False\n",
"\n",
" count_dates += 1 \n",
" start_date += delta\n",
" \n",
" print(np.shape(is_open))\n",
" \n",
" return isBusinessDay, is_open"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"def clean_column_name(name):\n",
" # Remove all non-letter characters\n",
" cleaned_name = re.sub(r'[^a-zA-Z]', '', name)\n",
" return cleaned_name"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"data.columns = [clean_column_name(col) for col in data.columns]"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" open | \n",
" high | \n",
" low | \n",
" close | \n",
" volume | \n",
"
\n",
" \n",
" date | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" 2024-05-02 | \n",
" 182.86 | \n",
" 184.60 | \n",
" 176.0200 | \n",
" 180.01 | \n",
" 89148041.0 | \n",
"
\n",
" \n",
" 2024-05-01 | \n",
" 182.00 | \n",
" 185.86 | \n",
" 179.0100 | \n",
" 179.99 | \n",
" 92829719.0 | \n",
"
\n",
" \n",
" 2024-04-30 | \n",
" 186.98 | \n",
" 190.95 | \n",
" 182.8401 | \n",
" 183.28 | \n",
" 127031787.0 | \n",
"
\n",
" \n",
" 2024-04-29 | \n",
" 188.42 | \n",
" 198.87 | \n",
" 184.5400 | \n",
" 194.05 | \n",
" 243869678.0 | \n",
"
\n",
" \n",
" 2024-04-26 | \n",
" 168.85 | \n",
" 172.12 | \n",
" 166.3700 | \n",
" 168.29 | \n",
" 109815725.0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" open high low close volume\n",
"date \n",
"2024-05-02 182.86 184.60 176.0200 180.01 89148041.0\n",
"2024-05-01 182.00 185.86 179.0100 179.99 92829719.0\n",
"2024-04-30 186.98 190.95 182.8401 183.28 127031787.0\n",
"2024-04-29 188.42 198.87 184.5400 194.05 243869678.0\n",
"2024-04-26 168.85 172.12 166.3700 168.29 109815725.0"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.head()"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"data.reset_index(inplace=True)\n"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" date | \n",
" open | \n",
" high | \n",
" low | \n",
" close | \n",
" volume | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 2024-05-02 | \n",
" 182.86 | \n",
" 184.60 | \n",
" 176.0200 | \n",
" 180.01 | \n",
" 89148041.0 | \n",
"
\n",
" \n",
" 1 | \n",
" 2024-05-01 | \n",
" 182.00 | \n",
" 185.86 | \n",
" 179.0100 | \n",
" 179.99 | \n",
" 92829719.0 | \n",
"
\n",
" \n",
" 2 | \n",
" 2024-04-30 | \n",
" 186.98 | \n",
" 190.95 | \n",
" 182.8401 | \n",
" 183.28 | \n",
" 127031787.0 | \n",
"
\n",
" \n",
" 3 | \n",
" 2024-04-29 | \n",
" 188.42 | \n",
" 198.87 | \n",
" 184.5400 | \n",
" 194.05 | \n",
" 243869678.0 | \n",
"
\n",
" \n",
" 4 | \n",
" 2024-04-26 | \n",
" 168.85 | \n",
" 172.12 | \n",
" 166.3700 | \n",
" 168.29 | \n",
" 109815725.0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" date open high low close volume\n",
"0 2024-05-02 182.86 184.60 176.0200 180.01 89148041.0\n",
"1 2024-05-01 182.00 185.86 179.0100 179.99 92829719.0\n",
"2 2024-04-30 186.98 190.95 182.8401 183.28 127031787.0\n",
"3 2024-04-29 188.42 198.87 184.5400 194.05 243869678.0\n",
"4 2024-04-26 168.85 172.12 166.3700 168.29 109815725.0"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.head()"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" date | \n",
" open | \n",
" high | \n",
" low | \n",
" close | \n",
" volume | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 2024-05-02 | \n",
" 182.86 | \n",
" 184.6000 | \n",
" 176.0200 | \n",
" 180.01 | \n",
" 89148041.0 | \n",
"
\n",
" \n",
" 1 | \n",
" 2024-05-01 | \n",
" 182.00 | \n",
" 185.8600 | \n",
" 179.0100 | \n",
" 179.99 | \n",
" 92829719.0 | \n",
"
\n",
" \n",
" 2 | \n",
" 2024-04-30 | \n",
" 186.98 | \n",
" 190.9500 | \n",
" 182.8401 | \n",
" 183.28 | \n",
" 127031787.0 | \n",
"
\n",
" \n",
" 3 | \n",
" 2024-04-29 | \n",
" 188.42 | \n",
" 198.8700 | \n",
" 184.5400 | \n",
" 194.05 | \n",
" 243869678.0 | \n",
"
\n",
" \n",
" 4 | \n",
" 2024-04-26 | \n",
" 168.85 | \n",
" 172.1200 | \n",
" 166.3700 | \n",
" 168.29 | \n",
" 109815725.0 | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 3480 | \n",
" 2010-07-06 | \n",
" 20.00 | \n",
" 20.0000 | \n",
" 15.8300 | \n",
" 16.11 | \n",
" 6866900.0 | \n",
"
\n",
" \n",
" 3481 | \n",
" 2010-07-02 | \n",
" 23.00 | \n",
" 23.1000 | \n",
" 18.7100 | \n",
" 19.20 | \n",
" 5139800.0 | \n",
"
\n",
" \n",
" 3482 | \n",
" 2010-07-01 | \n",
" 25.00 | \n",
" 25.9200 | \n",
" 20.2700 | \n",
" 21.96 | \n",
" 8218800.0 | \n",
"
\n",
" \n",
" 3483 | \n",
" 2010-06-30 | \n",
" 25.79 | \n",
" 30.4192 | \n",
" 23.3000 | \n",
" 23.83 | \n",
" 17187100.0 | \n",
"
\n",
" \n",
" 3484 | \n",
" 2010-06-29 | \n",
" 19.00 | \n",
" 25.0000 | \n",
" 17.5400 | \n",
" 23.89 | \n",
" 18766300.0 | \n",
"
\n",
" \n",
"
\n",
"
3485 rows × 6 columns
\n",
"
"
],
"text/plain": [
" date open high low close volume\n",
"0 2024-05-02 182.86 184.6000 176.0200 180.01 89148041.0\n",
"1 2024-05-01 182.00 185.8600 179.0100 179.99 92829719.0\n",
"2 2024-04-30 186.98 190.9500 182.8401 183.28 127031787.0\n",
"3 2024-04-29 188.42 198.8700 184.5400 194.05 243869678.0\n",
"4 2024-04-26 168.85 172.1200 166.3700 168.29 109815725.0\n",
"... ... ... ... ... ... ...\n",
"3480 2010-07-06 20.00 20.0000 15.8300 16.11 6866900.0\n",
"3481 2010-07-02 23.00 23.1000 18.7100 19.20 5139800.0\n",
"3482 2010-07-01 25.00 25.9200 20.2700 21.96 8218800.0\n",
"3483 2010-06-30 25.79 30.4192 23.3000 23.83 17187100.0\n",
"3484 2010-06-29 19.00 25.0000 17.5400 23.89 18766300.0\n",
"\n",
"[3485 rows x 6 columns]"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [],
"source": [
"# Define the date range you're interested in\n",
"yesterday =datetime.now()-timedelta(days=1)\n",
"two_years_back = yesterday - timedelta(days=684)"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {},
"outputs": [],
"source": [
"# Filter the DataFrame to this range\n",
"filtered_df = data[(data['date'] >= two_years_back) & (data['date'] <= yesterday)]"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" date | \n",
" open | \n",
" high | \n",
" low | \n",
" close | \n",
" volume | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 2024-05-02 | \n",
" 182.86 | \n",
" 184.60 | \n",
" 176.0200 | \n",
" 180.01 | \n",
" 89148041.0 | \n",
"
\n",
" \n",
" 1 | \n",
" 2024-05-01 | \n",
" 182.00 | \n",
" 185.86 | \n",
" 179.0100 | \n",
" 179.99 | \n",
" 92829719.0 | \n",
"
\n",
" \n",
" 2 | \n",
" 2024-04-30 | \n",
" 186.98 | \n",
" 190.95 | \n",
" 182.8401 | \n",
" 183.28 | \n",
" 127031787.0 | \n",
"
\n",
" \n",
" 3 | \n",
" 2024-04-29 | \n",
" 188.42 | \n",
" 198.87 | \n",
" 184.5400 | \n",
" 194.05 | \n",
" 243869678.0 | \n",
"
\n",
" \n",
" 4 | \n",
" 2024-04-26 | \n",
" 168.85 | \n",
" 172.12 | \n",
" 166.3700 | \n",
" 168.29 | \n",
" 109815725.0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" date open high low close volume\n",
"0 2024-05-02 182.86 184.60 176.0200 180.01 89148041.0\n",
"1 2024-05-01 182.00 185.86 179.0100 179.99 92829719.0\n",
"2 2024-04-30 186.98 190.95 182.8401 183.28 127031787.0\n",
"3 2024-04-29 188.42 198.87 184.5400 194.05 243869678.0\n",
"4 2024-04-26 168.85 172.12 166.3700 168.29 109815725.0"
]
},
"execution_count": 44,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"filtered_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2022-06-21 00:00:00\n",
"2024-05-02 00:00:00\n"
]
}
],
"source": [
"print(filtered_df['date'].min())\n",
"print(filtered_df['date'].max())"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(470, 6)"
]
},
"execution_count": 46,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"filtered_df.shape"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "base",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.4"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}