Spaces:

mtzeve
/

stocks-prediction-app

No application file

App Files Files Community

mtzeve committed on May 1, 2024

Commit

26654a1

1 Parent(s): 1f66d66

Updated_2

Browse files

Files changed (3) hide show

historical_news.ipynb +17 -1
historical_stock.ipynb +140 -1
news_articles.csv +0 -0

historical_news.ipynb CHANGED Viewed

@@ -115,12 +115,28 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
    "metadata": {},
    "outputs": [],
    "source": [
     "df.to_csv('news_articles.csv', index=False)\n"
    ]
   }
  ],
  "metadata": {

   },
   {
    "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df= df.sort_index(ascending=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
    "metadata": {},
    "outputs": [],
    "source": [
     "df.to_csv('news_articles.csv', index=False)\n"
    ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
   }
  ],
  "metadata": {

historical_stock.ipynb CHANGED Viewed

@@ -12,7 +12,15 @@
     "import pandas as pd\n",
     "import hopsworks\n",
     "import re \n",
-    "import modal \n"
    ]
   },
   {
@@ -46,6 +54,137 @@
     "print(data.head())"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,

     "import pandas as pd\n",
     "import hopsworks\n",
     "import re \n",
+    "import modal \n",
+    "#preprocessing\n",
+    "import requests\n",
+    "import pandas as pd\n",
+    "import json\n",
+    "#import pandas_market_calendars as mcal\n",
+    "import datetime\n",
+    "import numpy as np\n",
+    "from datetime import timedelta \n"
    ]
   },
   {
     "print(data.head())"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def create_tsla_history():\n",
+    "\n",
+    "    start_date = datetime.datetime.strptime('2015-07-16',\"%Y-%m-%d\")\n",
+    "    end_date = datetime.datetime.strptime('2023-01-05',\"%Y-%m-%d\")\n",
+    "\n",
+    "    # Get the TSLA stock data from yfinance\n",
+    "    tsla = yf.Ticker(\"TSLA\") #VEFAB.ST\n",
+    "    # info = tsla.info\n",
+    "\n",
+    "    # get historical market data\n",
+    "    data = tsla.history(start=start_date, end=end_date)\n",
+    "\n",
+    "    # drop some columns\n",
+    "    tesla_df = data.drop(columns=['Dividends','Stock Splits'])\n",
+    "    tesla_df.index = tesla_df.index.strftime('%Y-%m-%d')\n",
+    "    \n",
+    "    print('Number of business days included in data set: ',np.shape(tesla_df))\n",
+    "\n",
+    "    # Create an array of all dates in the specified period\n",
+    "    all_dates = np.array([start_date + datetime.timedelta(days=i) for i in range((end_date - start_date).days)])\n",
+    "    all_dates = [d.strftime('%Y-%m-%d') for d in all_dates]\n",
+    "\n",
+    "    # Use setdiff1d() to find the non-business days\n",
+    "    isBusinessDay, _ = extract_business_day(start_date='2015-07-16',end_date='2023-01-04')\n",
+    "    non_business_days = np.setdiff1d(all_dates, isBusinessDay)\n",
+    "\n",
+    "    # Add nan-values to the non-business days\n",
+    "    print('Add {} non business days with NaN-values'.format(len(non_business_days)))\n",
+    "    for d in non_business_days:\n",
+    "        tesla_df.loc[d,:] = [np.nan,np.nan,np.nan,np.nan,np.nan]\n",
+    "\n",
+    "    # sort index (dates)\n",
+    "    tesla_df = tesla_df.sort_index()\n",
+    " \n",
+    "    # move \"date\"-index into its own column\n",
+    "    tesla_df = tesla_df.reset_index()\n",
+    "    \n",
+    "    # Rename column 'Date' to 'date'\n",
+    "    tesla_df = tesla_df.rename(columns={'Date': 'date'})\n",
+    "    print('Final size of dataframe',np.shape(tesla_df))\n",
+    "    \n",
+    "    # Write the merged dataframe to a CSV file\n",
+    "    start_date ='2015-07-16'\n",
+    "    end_date = '2023-01-05'\n",
+    "    save_path = \"data/stock/tesla_{}-{}.csv\".format(start_date,end_date)\n",
+    "    \n",
+    "    print('Save at :',save_path)\n",
+    "    tesla_df.to_csv(save_path, index=False)\n",
+    "    \n",
+    "    return tesla_df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def extract_business_day(start_date,end_date):\n",
+    "    \"\"\"\n",
+    "    Given a start_date and end_date ('%Y-%m-%d' strings), find the NYSE business days in that period.\n",
+    "    \n",
+    "    `Returns`:\n",
+    "    \n",
+    "    isBusinessDay: array of str (all dates in the period that are business days)\n",
+    "    is_open: array of 0/1 flags, one per calendar day from start_date to end_date\n",
+    "        e.g. is_open = [1,0,...,1] means that start_date = open, day after start_date = closed, and end_date = open\n",
+    "    \"\"\"\n",
+    "    \n",
+    "    # Save for later\n",
+    "    end_date_save = end_date\n",
+    "    \n",
+    "    # Get the NYSE calendar\n",
+    "    cal = mcal.get_calendar('NYSE')\n",
+    "\n",
+    "    # Get the NYSE calendar's open and close times for the specified period\n",
+    "    schedule = cal.schedule(start_date=start_date, end_date=end_date)\n",
+    "    \n",
+    "    # Only need a list of dates when it's open (not open and close times)\n",
+    "    isBusinessDay = np.array(schedule.market_open.dt.strftime('%Y-%m-%d')) \n",
+    "    \n",
+    "    # Go over all days: \n",
+    "    delta = datetime.timedelta(days=1)\n",
+    "    start_date = datetime.datetime.strptime(start_date,\"%Y-%m-%d\") #datetime.date(2015, 7, 16)\n",
+    "    end_date = datetime.datetime.strptime(end_date,\"%Y-%m-%d\") #datetime.date(2023, 1, 4)\n",
+    "    \n",
+    "    # Extract days from the timedelta object\n",
+    "    num_days = (end_date - start_date).days + 1\n",
+    "    \n",
+    "    # Create boolean array for days being open (1) and closed (0) \n",
+    "    is_open = np.zeros(num_days)\n",
+    "    \n",
+    "    # iterate over range of dates\n",
+    "    current_BusinessDay = isBusinessDay[0]\n",
+    "    count_dates = 0\n",
+    "    next_BusinessDay = 0\n",
+    "    \n",
+    "    while (start_date <= end_date):\n",
+    "    \n",
+    "        if start_date.strftime('%Y-%m-%d') == current_BusinessDay:\n",
+    "            is_open[count_dates] = True\n",
+    "\n",
+    "            if current_BusinessDay == end_date_save or current_BusinessDay==isBusinessDay[-1]:\n",
+    "                break\n",
+    "            else:\n",
+    "                next_BusinessDay += 1\n",
+    "                current_BusinessDay = isBusinessDay[next_BusinessDay]\n",
+    "        else:\n",
+    "            is_open[count_dates] = False\n",
+    "\n",
+    "        count_dates += 1   \n",
+    "        start_date += delta\n",
+    "        \n",
+    "    print(np.shape(is_open))\n",
+    "        \n",
+    "    return isBusinessDay, is_open"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
   {
    "cell_type": "code",
    "execution_count": null,

news_articles.csv CHANGED Viewed

The diff for this file is too large to render. See raw diff