{ "cells": [ { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "from dotenv import load_dotenv\n", "import os \n", "from alpha_vantage.timeseries import TimeSeries\n", "import pandas as pd\n", "import hopsworks\n", "import re \n", "import modal \n", "#prepocessing\n", "import requests\n", "import pandas as pd\n", "import json\n", "import pandas_market_calendars as mcal\n", "import datetime\n", "import numpy as np\n", "from datetime import datetime, timedelta\n" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 1. open 2. high 3. low 4. close 5. volume\n", "date \n", "2024-05-02 182.86 184.60 176.0200 180.01 89148041.0\n", "2024-05-01 182.00 185.86 179.0100 179.99 92829719.0\n", "2024-04-30 186.98 190.95 182.8401 183.28 127031787.0\n", "2024-04-29 188.42 198.87 184.5400 194.05 243869678.0\n", "2024-04-26 168.85 172.12 166.3700 168.29 109815725.0\n" ] } ], "source": [ "load_dotenv()\n", "\n", "api_key = os.environ.get('stocks_api') # Replace this with your actual API key\n", "ts = TimeSeries(key=api_key, output_format='pandas')\n", "\n", "# Fetch daily adjusted stock prices; adjust the symbol as needed\n", "data, meta_data = ts.get_daily(symbol='TSLA', outputsize='full')\n", "\n", "print(data.head())" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
1. open2. high3. low4. close5. volume
date
2024-05-02182.86184.6000176.0200180.0189148041.0
2024-05-01182.00185.8600179.0100179.9992829719.0
2024-04-30186.98190.9500182.8401183.28127031787.0
2024-04-29188.42198.8700184.5400194.05243869678.0
2024-04-26168.85172.1200166.3700168.29109815725.0
..................
2010-07-0620.0020.000015.830016.116866900.0
2010-07-0223.0023.100018.710019.205139800.0
2010-07-0125.0025.920020.270021.968218800.0
2010-06-3025.7930.419223.300023.8317187100.0
2010-06-2919.0025.000017.540023.8918766300.0
\n", "

3485 rows × 5 columns

\n", "
" ], "text/plain": [ " 1. open 2. high 3. low 4. close 5. volume\n", "date \n", "2024-05-02 182.86 184.6000 176.0200 180.01 89148041.0\n", "2024-05-01 182.00 185.8600 179.0100 179.99 92829719.0\n", "2024-04-30 186.98 190.9500 182.8401 183.28 127031787.0\n", "2024-04-29 188.42 198.8700 184.5400 194.05 243869678.0\n", "2024-04-26 168.85 172.1200 166.3700 168.29 109815725.0\n", "... ... ... ... ... ...\n", "2010-07-06 20.00 20.0000 15.8300 16.11 6866900.0\n", "2010-07-02 23.00 23.1000 18.7100 19.20 5139800.0\n", "2010-07-01 25.00 25.9200 20.2700 21.96 8218800.0\n", "2010-06-30 25.79 30.4192 23.3000 23.83 17187100.0\n", "2010-06-29 19.00 25.0000 17.5400 23.89 18766300.0\n", "\n", "[3485 rows x 5 columns]" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "DatetimeIndex: 3485 entries, 2024-05-02 to 2010-06-29\n", "Data columns (total 5 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 1. open 3485 non-null float64\n", " 1 2. high 3485 non-null float64\n", " 2 3. low 3485 non-null float64\n", " 3 4. close 3485 non-null float64\n", " 4 5. volume 3485 non-null float64\n", "dtypes: float64(5)\n", "memory usage: 163.4 KB\n" ] } ], "source": [ "data.info()" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'1. Information': 'Daily Prices (open, high, low, close) and Volumes',\n", " '2. Symbol': 'TSLA',\n", " '3. Last Refreshed': '2024-05-02',\n", " '4. Output Size': 'Full size',\n", " '5. Time Zone': 'US/Eastern'}" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "meta_data" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "#Stock market:\n", "def today_is_a_business_day(today):\n", " # Get the NYSE calendar\n", " cal = mcal.get_calendar('NYSE')\n", " schedule = cal.schedule(start_date=today, end_date=today) # Get the NYSE calendar's open and close times for the specified period\n", " try:\n", " isBusinessDay = schedule.market_open.dt.strftime('%Y-%m-%d')\n", " return True\n", " except:\n", " print('Today {} is not a business day'.format(today))\n", " return False" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "def next_business_day(today):\n", " \n", " # Real tomorrow\n", " tomorrow = (today + timedelta(days=1)).strftime(\"%Y-%m-%d\")\n", " \n", " # Get the NYSE calendar\n", " cal = mcal.get_calendar('NYSE')\n", "\n", " found_next_business_day = False \n", " while not found_next_business_day:\n", " schedule = cal.schedule(start_date=tomorrow, end_date=tomorrow) # Get the NYSE calendar's open and close times for the specified period\n", " try:\n", " isBusinessDay = schedule.market_open.dt.strftime('%Y-%m-%d') # Only need a list of dates when it's open (not open and close times)\n", " found_next_business_day = True\n", " except:\n", " print('The date {} is not a business day'.format(tomorrow))\n", " tomorrow = (datetime.datetime.strptime(tomorrow,\"%Y-%m-%d\") + timedelta(days=1)).strftime(\"%Y-%m-%d\")\n", " \n", " return isBusinessDay.to_numpy()[0]" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "def extract_business_day(start_date,end_date):\n", " \"\"\"\n", " Given a start_date and end_date.\n", " \n", " `Returns`:\n", " \n", " isBusinessDay: list of str (with all dates being business days)\n", " is_open: boolean list\n", " e.g is_open = [1,0,...,1] means that start_date = open, day after start_date = closed, and end_date = open\n", " \"\"\"\n", " \n", " # Save for later\n", " end_date_save = end_date\n", " \n", " # Get the NYSE calendar\n", " cal = mcal.get_calendar('NYSE')\n", "\n", " # Get the NYSE calendar's open and close times for the specified period\n", " schedule = cal.schedule(start_date=start_date, end_date=end_date)\n", " \n", " # Only need a list of dates when it's open (not open and close times)\n", " isBusinessDay = np.array(schedule.market_open.dt.strftime('%Y-%m-%d')) \n", " \n", " # Go over all days: \n", " delta = datetime.timedelta(days=1)\n", " start_date = datetime.datetime.strptime(start_date,\"%Y-%m-%d\") #datetime.date(2015, 7, 16)\n", " end_date = datetime.datetime.strptime(end_date,\"%Y-%m-%d\") #datetime.date(2023, 1, 4)\n", " \n", " # Extract days from the timedelta object\n", " num_days = (end_date - start_date).days + 1\n", " \n", " # Create boolean array for days being open (1) and closed (0) \n", " is_open = np.zeros(num_days)\n", " \n", " # iterate over range of dates\n", " current_BusinessDay = isBusinessDay[0]\n", " count_dates = 0\n", " next_BusinessDay = 0\n", " \n", " while (start_date <= end_date):\n", " \n", " if start_date.strftime('%Y-%m-%d') == current_BusinessDay:\n", " is_open[count_dates] = True\n", "\n", " if current_BusinessDay == end_date_save or current_BusinessDay==isBusinessDay[-1]:\n", " break\n", " else:\n", " next_BusinessDay += 1\n", " current_BusinessDay = isBusinessDay[next_BusinessDay]\n", " else:\n", " is_open[count_dates] = False\n", "\n", " count_dates += 1 \n", " start_date += delta\n", " \n", " print(np.shape(is_open))\n", " \n", " return isBusinessDay, is_open" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "def clean_column_name(name):\n", " # Remove all non-letter characters\n", " cleaned_name = re.sub(r'[^a-zA-Z]', '', name)\n", " return cleaned_name" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "data.columns = [clean_column_name(col) for col in data.columns]" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
openhighlowclosevolume
date
2024-05-02182.86184.60176.0200180.0189148041.0
2024-05-01182.00185.86179.0100179.9992829719.0
2024-04-30186.98190.95182.8401183.28127031787.0
2024-04-29188.42198.87184.5400194.05243869678.0
2024-04-26168.85172.12166.3700168.29109815725.0
\n", "
" ], "text/plain": [ " open high low close volume\n", "date \n", "2024-05-02 182.86 184.60 176.0200 180.01 89148041.0\n", "2024-05-01 182.00 185.86 179.0100 179.99 92829719.0\n", "2024-04-30 186.98 190.95 182.8401 183.28 127031787.0\n", "2024-04-29 188.42 198.87 184.5400 194.05 243869678.0\n", "2024-04-26 168.85 172.12 166.3700 168.29 109815725.0" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.head()" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "data.reset_index(inplace=True)\n" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
dateopenhighlowclosevolume
02024-05-02182.86184.60176.0200180.0189148041.0
12024-05-01182.00185.86179.0100179.9992829719.0
22024-04-30186.98190.95182.8401183.28127031787.0
32024-04-29188.42198.87184.5400194.05243869678.0
42024-04-26168.85172.12166.3700168.29109815725.0
\n", "
" ], "text/plain": [ " date open high low close volume\n", "0 2024-05-02 182.86 184.60 176.0200 180.01 89148041.0\n", "1 2024-05-01 182.00 185.86 179.0100 179.99 92829719.0\n", "2 2024-04-30 186.98 190.95 182.8401 183.28 127031787.0\n", "3 2024-04-29 188.42 198.87 184.5400 194.05 243869678.0\n", "4 2024-04-26 168.85 172.12 166.3700 168.29 109815725.0" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.head()" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
dateopenhighlowclosevolume
02024-05-02182.86184.6000176.0200180.0189148041.0
12024-05-01182.00185.8600179.0100179.9992829719.0
22024-04-30186.98190.9500182.8401183.28127031787.0
32024-04-29188.42198.8700184.5400194.05243869678.0
42024-04-26168.85172.1200166.3700168.29109815725.0
.....................
34802010-07-0620.0020.000015.830016.116866900.0
34812010-07-0223.0023.100018.710019.205139800.0
34822010-07-0125.0025.920020.270021.968218800.0
34832010-06-3025.7930.419223.300023.8317187100.0
34842010-06-2919.0025.000017.540023.8918766300.0
\n", "

3485 rows × 6 columns

\n", "
" ], "text/plain": [ " date open high low close volume\n", "0 2024-05-02 182.86 184.6000 176.0200 180.01 89148041.0\n", "1 2024-05-01 182.00 185.8600 179.0100 179.99 92829719.0\n", "2 2024-04-30 186.98 190.9500 182.8401 183.28 127031787.0\n", "3 2024-04-29 188.42 198.8700 184.5400 194.05 243869678.0\n", "4 2024-04-26 168.85 172.1200 166.3700 168.29 109815725.0\n", "... ... ... ... ... ... ...\n", "3480 2010-07-06 20.00 20.0000 15.8300 16.11 6866900.0\n", "3481 2010-07-02 23.00 23.1000 18.7100 19.20 5139800.0\n", "3482 2010-07-01 25.00 25.9200 20.2700 21.96 8218800.0\n", "3483 2010-06-30 25.79 30.4192 23.3000 23.83 17187100.0\n", "3484 2010-06-29 19.00 25.0000 17.5400 23.89 18766300.0\n", "\n", "[3485 rows x 6 columns]" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data" ] }, { "cell_type": "code", "execution_count": 42, "metadata": {}, "outputs": [], "source": [ "# Define the date range you're interested in\n", "yesterday =datetime.now()-timedelta(days=1)\n", "two_years_back = yesterday - timedelta(days=684)" ] }, { "cell_type": "code", "execution_count": 43, "metadata": {}, "outputs": [], "source": [ "# Filter the DataFrame to this range\n", "filtered_df = data[(data['date'] >= two_years_back) & (data['date'] <= yesterday)]" ] }, { "cell_type": "code", "execution_count": 44, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
dateopenhighlowclosevolume
02024-05-02182.86184.60176.0200180.0189148041.0
12024-05-01182.00185.86179.0100179.9992829719.0
22024-04-30186.98190.95182.8401183.28127031787.0
32024-04-29188.42198.87184.5400194.05243869678.0
42024-04-26168.85172.12166.3700168.29109815725.0
\n", "
" ], "text/plain": [ " date open high low close volume\n", "0 2024-05-02 182.86 184.60 176.0200 180.01 89148041.0\n", "1 2024-05-01 182.00 185.86 179.0100 179.99 92829719.0\n", "2 2024-04-30 186.98 190.95 182.8401 183.28 127031787.0\n", "3 2024-04-29 188.42 198.87 184.5400 194.05 243869678.0\n", "4 2024-04-26 168.85 172.12 166.3700 168.29 109815725.0" ] }, "execution_count": 44, "metadata": {}, "output_type": "execute_result" } ], "source": [ "filtered_df.head()" ] }, { "cell_type": "code", "execution_count": 45, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "2022-06-21 00:00:00\n", "2024-05-02 00:00:00\n" ] } ], "source": [ "print(filtered_df['date'].min())\n", "print(filtered_df['date'].max())" ] }, { "cell_type": "code", "execution_count": 46, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(470, 6)" ] }, "execution_count": 46, "metadata": {}, "output_type": "execute_result" } ], "source": [ "filtered_df.shape" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "base", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.4" }, "orig_nbformat": 4 }, "nbformat": 4, "nbformat_minor": 2 }