{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from dotenv import load_dotenv\n",
    "from datetime import datetime, timedelta\n",
    "import requests\n",
    "import os\n",
    "import time\n",
    "import pandas as pd \n",
    "from news_preprocessing import *"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "load_dotenv()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Fetched 50 articles from 2022-05-06 to 2022-06-25\n",
      "Fetched 50 articles from 2022-06-26 to 2022-08-15\n",
      "Fetched 50 articles from 2022-08-16 to 2022-10-05\n",
      "Fetched 50 articles from 2022-10-06 to 2022-11-25\n",
      "Fetched 50 articles from 2022-11-26 to 2023-01-15\n",
      "Rate limit reached. Waiting to retry...\n",
      "Fetched 50 articles from 2023-01-16 to 2023-03-07\n",
      "Fetched 50 articles from 2023-03-08 to 2023-04-27\n",
      "Fetched 50 articles from 2023-04-28 to 2023-06-17\n",
      "Fetched 50 articles from 2023-06-18 to 2023-08-07\n",
      "Fetched 50 articles from 2023-08-08 to 2023-09-27\n",
      "Rate limit reached. Waiting to retry...\n",
      "Fetched 50 articles from 2023-09-28 to 2023-11-17\n",
      "Fetched 50 articles from 2023-11-18 to 2024-01-07\n",
      "Fetched 50 articles from 2024-01-08 to 2024-02-27\n",
      "Fetched 50 articles from 2024-02-28 to 2024-04-18\n",
      "Fetched 50 articles from 2024-04-19 to 2024-05-05\n",
      "Total articles fetched: 750\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "import requests\n",
    "from datetime import datetime, timedelta\n",
    "import pandas as pd\n",
    "\n",
    "def fetch_news(api_key, ticker, start_date, end_date):\n",
    "    base_url = os.environ.get(\"endpointnewsp\")\n",
    "    headers = {\"Authorization\": f\"Bearer {api_key}\"}\n",
    "    all_news = []\n",
    "    \n",
    "    current_date = start_date\n",
    "\n",
    "    while current_date <= end_date:\n",
    "        batch_end_date = current_date + timedelta(days=50)\n",
    "        if batch_end_date > end_date:\n",
    "            batch_end_date = end_date\n",
    "\n",
    "        params = {\n",
    "            \"ticker\": ticker,\n",
    "            \"published_utc.gte\": current_date.strftime('%Y-%m-%d'),\n",
    "            \"published_utc.lte\": batch_end_date.strftime('%Y-%m-%d'),\n",
    "            \"limit\": 50,\n",
    "            \"sort\": \"published_utc\"\n",
    "        }\n",
    "\n",
    "        try:\n",
    "            response = requests.get(base_url, headers=headers, params=params)\n",
    "            if response.status_code == 200:\n",
    "                data = response.json()\n",
    "                articles = data.get('results', [])\n",
    "                \n",
    "                # Create DataFrame from articles\n",
    "                df = pd.DataFrame(articles)\n",
    "                \n",
    "                # Add primary_key column if ticker is found\n",
    "                df['ticker'] = df['tickers'].apply(lambda x: ticker if ticker in x else None)\n",
    "                \n",
    "                all_news.append(df)  # Append DataFrame to the list\n",
    "                print(f\"Fetched {len(articles)} articles from {current_date.strftime('%Y-%m-%d')} to {batch_end_date.strftime('%Y-%m-%d')}\")\n",
    "                current_date = batch_end_date + timedelta(days=1)\n",
    "            elif response.status_code == 429:\n",
    "                print(\"Rate limit reached. Waiting to retry...\")\n",
    "                time.sleep(60)  # Wait for 60 seconds or as recommended by the API\n",
    "                continue  # Retry the current request\n",
    "            else:\n",
    "                print(f\"Failed to fetch data: {response.status_code}, {response.text}\")\n",
    "                break\n",
    "        except Exception as e:\n",
    "            print(f\"An error occurred: {e}\")\n",
    "            break\n",
    "\n",
    "    return pd.concat(all_news, ignore_index=True)\n",
    "\n",
    "# Example usage\n",
    "api_key = os.environ.get('newsp_api')\n",
    "ticker = 'TSLA'\n",
    "end_date = datetime.now() - timedelta(days=1)  # Yesterday's date\n",
    "start_date = end_date - timedelta(days=365 * 2)\n",
    "news_articles = fetch_news(api_key, ticker, start_date, end_date)\n",
    "print(f\"Total articles fetched: {len(news_articles)}\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Process the news articles\n",
    "df = process_news_articles(news_articles)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 66 entries, 0 to 65\n",
      "Data columns (total 3 columns):\n",
      " #   Column     Non-Null Count  Dtype  \n",
      "---  ------     --------------  -----  \n",
      " 0   date       66 non-null     object \n",
      " 1   ticker     66 non-null     object \n",
      " 2   sentiment  66 non-null     float64\n",
      "dtypes: float64(1), object(2)\n",
      "memory usage: 1.7+ KB\n"
     ]
    }
   ],
   "source": [
    "df.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>date</th>\n",
       "      <th>ticker</th>\n",
       "      <th>sentiment</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2022-06-23</td>\n",
       "      <td>TSLA</td>\n",
       "      <td>0.091056</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2022-06-24</td>\n",
       "      <td>TSLA</td>\n",
       "      <td>0.059212</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2022-06-25</td>\n",
       "      <td>TSLA</td>\n",
       "      <td>0.250000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>2022-08-11</td>\n",
       "      <td>TSLA</td>\n",
       "      <td>0.171968</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2022-08-12</td>\n",
       "      <td>TSLA</td>\n",
       "      <td>0.035351</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         date ticker  sentiment\n",
       "0  2022-06-23   TSLA   0.091056\n",
       "1  2022-06-24   TSLA   0.059212\n",
       "2  2022-06-25   TSLA   0.250000\n",
       "3  2022-08-11   TSLA   0.171968\n",
       "4  2022-08-12   TSLA   0.035351"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "df= df.sort_index(ascending=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.to_csv('news_articles.csv', index=False)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_processed = exponential_moving_average(df, window=7)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_processed.to_csv('news_articles_ema.csv', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>date</th>\n",
       "      <th>ticker</th>\n",
       "      <th>sentiment</th>\n",
       "      <th>exp_mean_7_days</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>65</th>\n",
       "      <td>2024-05-05</td>\n",
       "      <td>TSLA</td>\n",
       "      <td>0.036190</td>\n",
       "      <td>0.036190</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>64</th>\n",
       "      <td>2024-05-04</td>\n",
       "      <td>TSLA</td>\n",
       "      <td>0.062665</td>\n",
       "      <td>0.042809</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>63</th>\n",
       "      <td>2024-05-03</td>\n",
       "      <td>TSLA</td>\n",
       "      <td>0.027798</td>\n",
       "      <td>0.039056</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>62</th>\n",
       "      <td>2024-05-02</td>\n",
       "      <td>TSLA</td>\n",
       "      <td>0.001443</td>\n",
       "      <td>0.029653</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>61</th>\n",
       "      <td>2024-05-01</td>\n",
       "      <td>TSLA</td>\n",
       "      <td>0.162742</td>\n",
       "      <td>0.062925</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          date ticker  sentiment  exp_mean_7_days\n",
       "65  2024-05-05   TSLA   0.036190         0.036190\n",
       "64  2024-05-04   TSLA   0.062665         0.042809\n",
       "63  2024-05-03   TSLA   0.027798         0.039056\n",
       "62  2024-05-02   TSLA   0.001443         0.029653\n",
       "61  2024-05-01   TSLA   0.162742         0.062925"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_processed.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>date</th>\n",
       "      <th>ticker</th>\n",
       "      <th>sentiment</th>\n",
       "      <th>exp_mean_7_days</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2022-08-12</td>\n",
       "      <td>TSLA</td>\n",
       "      <td>0.035351</td>\n",
       "      <td>0.053681</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>2022-08-11</td>\n",
       "      <td>TSLA</td>\n",
       "      <td>0.171968</td>\n",
       "      <td>0.083253</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2022-06-25</td>\n",
       "      <td>TSLA</td>\n",
       "      <td>0.250000</td>\n",
       "      <td>0.124940</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2022-06-24</td>\n",
       "      <td>TSLA</td>\n",
       "      <td>0.059212</td>\n",
       "      <td>0.108508</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2022-06-23</td>\n",
       "      <td>TSLA</td>\n",
       "      <td>0.091056</td>\n",
       "      <td>0.104145</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         date ticker  sentiment  exp_mean_7_days\n",
       "4  2022-08-12   TSLA   0.035351         0.053681\n",
       "3  2022-08-11   TSLA   0.171968         0.083253\n",
       "2  2022-06-25   TSLA   0.250000         0.124940\n",
       "1  2022-06-24   TSLA   0.059212         0.108508\n",
       "0  2022-06-23   TSLA   0.091056         0.104145"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_processed.tail()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2022-06-23\n",
      "2024-05-05\n"
     ]
    }
   ],
   "source": [
    "print(df_processed['date'].min())\n",
    "print(df_processed['date'].max())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "682 days, 0:00:00\n"
     ]
    }
   ],
   "source": [
    "print(df_processed['date'].max() - df_processed['date'].min()) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(66, 4)"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_processed.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "duplicates = df_processed[df_processed.duplicated('date')]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(0, 4)"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "duplicates.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>date</th>\n",
       "      <th>ticker</th>\n",
       "      <th>sentiment</th>\n",
       "      <th>exp_mean_7_days</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>65</th>\n",
       "      <td>2024-05-05</td>\n",
       "      <td>TSLA</td>\n",
       "      <td>0.036190</td>\n",
       "      <td>0.036190</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>64</th>\n",
       "      <td>2024-05-04</td>\n",
       "      <td>TSLA</td>\n",
       "      <td>0.062665</td>\n",
       "      <td>0.042809</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>63</th>\n",
       "      <td>2024-05-03</td>\n",
       "      <td>TSLA</td>\n",
       "      <td>0.027798</td>\n",
       "      <td>0.039056</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>62</th>\n",
       "      <td>2024-05-02</td>\n",
       "      <td>TSLA</td>\n",
       "      <td>0.001443</td>\n",
       "      <td>0.029653</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>61</th>\n",
       "      <td>2024-05-01</td>\n",
       "      <td>TSLA</td>\n",
       "      <td>0.162742</td>\n",
       "      <td>0.062925</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          date ticker  sentiment  exp_mean_7_days\n",
       "65  2024-05-05   TSLA   0.036190         0.036190\n",
       "64  2024-05-04   TSLA   0.062665         0.042809\n",
       "63  2024-05-03   TSLA   0.027798         0.039056\n",
       "62  2024-05-02   TSLA   0.001443         0.029653\n",
       "61  2024-05-01   TSLA   0.162742         0.062925"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_processed.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "base",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.9"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}