{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#Importing necessary libraries\n",
    "from dotenv import load_dotenv\n",
    "from datetime import datetime, timedelta\n",
    "import requests\n",
    "import os\n",
    "import time\n",
    "import pandas as pd \n",
    "from SML.news_preprocessing import process_news_articles    #Importing everything from 'news_preprocessing'\n",
    "from SML.news_preprocessing import exponential_moving_average\n",
    "load_dotenv()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Fetched 50 articles from 2022-05-14 to 2022-07-03\n",
      "Fetched 50 articles from 2022-07-04 to 2022-08-23\n",
      "Fetched 50 articles from 2022-08-24 to 2022-10-13\n",
      "Fetched 50 articles from 2022-10-14 to 2022-12-03\n",
      "Fetched 50 articles from 2022-12-04 to 2023-01-23\n",
      "Rate limit reached. Waiting to retry...\n",
      "Fetched 50 articles from 2023-01-24 to 2023-03-15\n",
      "Fetched 50 articles from 2023-03-16 to 2023-05-05\n",
      "Fetched 50 articles from 2023-05-06 to 2023-06-25\n",
      "Fetched 50 articles from 2023-06-26 to 2023-08-15\n",
      "Fetched 50 articles from 2023-08-16 to 2023-10-05\n",
      "Rate limit reached. Waiting to retry...\n",
      "Fetched 50 articles from 2023-10-06 to 2023-11-25\n",
      "Fetched 50 articles from 2023-11-26 to 2024-01-15\n",
      "Fetched 50 articles from 2024-01-16 to 2024-03-06\n",
      "Fetched 50 articles from 2024-03-07 to 2024-04-26\n",
      "Fetched 50 articles from 2024-04-27 to 2024-05-13\n",
      "Total articles fetched: 750\n"
     ]
    }
   ],
   "source": [
    "#Defining a function for fetching news\n",
    "\n",
    "def fetch_news(api_key, ticker, start_date, end_date):\n",
    "    base_url = os.environ.get(\"endpointnewsp\")\n",
    "    headers = {\"Authorization\": f\"Bearer {api_key}\"}\n",
    "    all_news = []\n",
    "    \n",
    "    current_date = start_date\n",
    "\n",
    "    while current_date <= end_date:\n",
    "        batch_end_date = current_date + timedelta(days=50)\n",
    "        if batch_end_date > end_date:\n",
    "            batch_end_date = end_date\n",
    "\n",
    "        params = {\n",
    "            \"ticker\": ticker,\n",
    "            \"published_utc.gte\": current_date.strftime('%Y-%m-%d'),\n",
    "            \"published_utc.lte\": batch_end_date.strftime('%Y-%m-%d'),\n",
    "            \"limit\": 50,\n",
    "            \"sort\": \"published_utc\"\n",
    "        }\n",
    "\n",
    "        try:\n",
    "            response = requests.get(base_url, headers=headers, params=params)\n",
    "            if response.status_code == 200:\n",
    "                data = response.json()\n",
    "                articles = data.get('results', [])\n",
    "                \n",
    "                # Creating a DataFrame from articles\n",
    "                df = pd.DataFrame(articles)\n",
    "                \n",
    "                # Adding primary_key column if ticker is found\n",
    "                df['ticker'] = df['tickers'].apply(lambda x: ticker if ticker in x else None)\n",
    "                \n",
    "                all_news.append(df)  # Append DataFrame to the list\n",
    "                print(f\"Fetched {len(articles)} articles from {current_date.strftime('%Y-%m-%d')} to {batch_end_date.strftime('%Y-%m-%d')}\")\n",
    "                current_date = batch_end_date + timedelta(days=1)\n",
    "            elif response.status_code == 429:\n",
    "                print(\"Rate limit reached. Waiting to retry...\")\n",
    "                time.sleep(60)  # Wait for 60 seconds or as recommended by the API\n",
    "                continue  # Retry the current request\n",
    "            else:\n",
    "                print(f\"Failed to fetch data: {response.status_code}, {response.text}\")\n",
    "                break\n",
    "        except Exception as e:\n",
    "            print(f\"An error occurred: {e}\")\n",
    "            break\n",
    "\n",
    "    return pd.concat(all_news, ignore_index=True)\n",
    "\n",
    "#Usage\n",
    "api_key = os.environ.get('newsp_api')\n",
    "ticker = 'TSLA'\n",
    "end_date = datetime.now() - timedelta(days=1)  # Yesterday's date\n",
    "start_date = end_date - timedelta(days=365 * 2)\n",
    "news_articles = fetch_news(api_key, ticker, start_date, end_date)\n",
    "print(f\"Total articles fetched: {len(news_articles)}\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Process the news articles\n",
    "df = process_news_articles(news_articles)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 74 entries, 0 to 73\n",
      "Data columns (total 3 columns):\n",
      " #   Column     Non-Null Count  Dtype  \n",
      "---  ------     --------------  -----  \n",
      " 0   date       74 non-null     object \n",
      " 1   ticker     74 non-null     object \n",
      " 2   sentiment  74 non-null     float64\n",
      "dtypes: float64(1), object(2)\n",
      "memory usage: 1.9+ KB\n"
     ]
    }
   ],
   "source": [
    "df.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>date</th>\n",
       "      <th>ticker</th>\n",
       "      <th>sentiment</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2022-06-29</td>\n",
       "      <td>TSLA</td>\n",
       "      <td>0.076381</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2022-06-30</td>\n",
       "      <td>TSLA</td>\n",
       "      <td>0.084328</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2022-07-01</td>\n",
       "      <td>TSLA</td>\n",
       "      <td>0.178838</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>2022-07-02</td>\n",
       "      <td>TSLA</td>\n",
       "      <td>0.037667</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2022-07-03</td>\n",
       "      <td>TSLA</td>\n",
       "      <td>-0.375000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         date ticker  sentiment\n",
       "0  2022-06-29   TSLA   0.076381\n",
       "1  2022-06-30   TSLA   0.084328\n",
       "2  2022-07-01   TSLA   0.178838\n",
       "3  2022-07-02   TSLA   0.037667\n",
       "4  2022-07-03   TSLA  -0.375000"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "df= df.sort_index(ascending=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Putting the news articles into a csv\n",
    "df.to_csv('news_articles.csv', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_processed = exponential_moving_average(df, window=7)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_processed.to_csv('news_articles_ema.csv', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>date</th>\n",
       "      <th>ticker</th>\n",
       "      <th>sentiment</th>\n",
       "      <th>exp_mean_7_days</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>73</th>\n",
       "      <td>2024-05-13</td>\n",
       "      <td>TSLA</td>\n",
       "      <td>0.115443</td>\n",
       "      <td>0.115443</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>72</th>\n",
       "      <td>2024-05-12</td>\n",
       "      <td>TSLA</td>\n",
       "      <td>0.037500</td>\n",
       "      <td>0.095957</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>71</th>\n",
       "      <td>2024-05-11</td>\n",
       "      <td>TSLA</td>\n",
       "      <td>0.100000</td>\n",
       "      <td>0.096968</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>70</th>\n",
       "      <td>2024-05-10</td>\n",
       "      <td>TSLA</td>\n",
       "      <td>0.069650</td>\n",
       "      <td>0.090138</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>69</th>\n",
       "      <td>2024-05-09</td>\n",
       "      <td>TSLA</td>\n",
       "      <td>-0.031250</td>\n",
       "      <td>0.059791</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          date ticker  sentiment  exp_mean_7_days\n",
       "73  2024-05-13   TSLA   0.115443         0.115443\n",
       "72  2024-05-12   TSLA   0.037500         0.095957\n",
       "71  2024-05-11   TSLA   0.100000         0.096968\n",
       "70  2024-05-10   TSLA   0.069650         0.090138\n",
       "69  2024-05-09   TSLA  -0.031250         0.059791"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_processed.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>date</th>\n",
       "      <th>ticker</th>\n",
       "      <th>sentiment</th>\n",
       "      <th>exp_mean_7_days</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2022-07-03</td>\n",
       "      <td>TSLA</td>\n",
       "      <td>-0.375000</td>\n",
       "      <td>-0.004703</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>2022-07-02</td>\n",
       "      <td>TSLA</td>\n",
       "      <td>0.037667</td>\n",
       "      <td>0.005889</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2022-07-01</td>\n",
       "      <td>TSLA</td>\n",
       "      <td>0.178838</td>\n",
       "      <td>0.049127</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2022-06-30</td>\n",
       "      <td>TSLA</td>\n",
       "      <td>0.084328</td>\n",
       "      <td>0.057927</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2022-06-29</td>\n",
       "      <td>TSLA</td>\n",
       "      <td>0.076381</td>\n",
       "      <td>0.062540</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         date ticker  sentiment  exp_mean_7_days\n",
       "4  2022-07-03   TSLA  -0.375000        -0.004703\n",
       "3  2022-07-02   TSLA   0.037667         0.005889\n",
       "2  2022-07-01   TSLA   0.178838         0.049127\n",
       "1  2022-06-30   TSLA   0.084328         0.057927\n",
       "0  2022-06-29   TSLA   0.076381         0.062540"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_processed.tail()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2022-06-29\n",
      "2024-05-13\n"
     ]
    }
   ],
   "source": [
    "print(df_processed['date'].min())\n",
    "print(df_processed['date'].max())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "684 days, 0:00:00\n"
     ]
    }
   ],
   "source": [
    "print(df_processed['date'].max() - df_processed['date'].min()) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(74, 4)"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_processed.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "duplicates = df_processed[df_processed.duplicated('date')]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(0, 4)"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "duplicates.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>date</th>\n",
       "      <th>ticker</th>\n",
       "      <th>sentiment</th>\n",
       "      <th>exp_mean_7_days</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>73</th>\n",
       "      <td>2024-05-13</td>\n",
       "      <td>TSLA</td>\n",
       "      <td>0.115443</td>\n",
       "      <td>0.115443</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>72</th>\n",
       "      <td>2024-05-12</td>\n",
       "      <td>TSLA</td>\n",
       "      <td>0.037500</td>\n",
       "      <td>0.095957</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>71</th>\n",
       "      <td>2024-05-11</td>\n",
       "      <td>TSLA</td>\n",
       "      <td>0.100000</td>\n",
       "      <td>0.096968</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>70</th>\n",
       "      <td>2024-05-10</td>\n",
       "      <td>TSLA</td>\n",
       "      <td>0.069650</td>\n",
       "      <td>0.090138</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>69</th>\n",
       "      <td>2024-05-09</td>\n",
       "      <td>TSLA</td>\n",
       "      <td>-0.031250</td>\n",
       "      <td>0.059791</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          date ticker  sentiment  exp_mean_7_days\n",
       "73  2024-05-13   TSLA   0.115443         0.115443\n",
       "72  2024-05-12   TSLA   0.037500         0.095957\n",
       "71  2024-05-11   TSLA   0.100000         0.096968\n",
       "70  2024-05-10   TSLA   0.069650         0.090138\n",
       "69  2024-05-09   TSLA  -0.031250         0.059791"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_processed.head()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "base",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.9"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}