{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "from dotenv import load_dotenv\n", "from datetime import datetime, timedelta\n", "import requests\n", "import os\n", "import time\n", "import pandas as pd \n", "from news_preprocessing import *" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "True" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "load_dotenv()" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Fetched 50 articles from 2022-05-06 to 2022-06-25\n", "Fetched 50 articles from 2022-06-26 to 2022-08-15\n", "Fetched 50 articles from 2022-08-16 to 2022-10-05\n", "Fetched 50 articles from 2022-10-06 to 2022-11-25\n", "Fetched 50 articles from 2022-11-26 to 2023-01-15\n", "Rate limit reached. Waiting to retry...\n", "Fetched 50 articles from 2023-01-16 to 2023-03-07\n", "Fetched 50 articles from 2023-03-08 to 2023-04-27\n", "Fetched 50 articles from 2023-04-28 to 2023-06-17\n", "Fetched 50 articles from 2023-06-18 to 2023-08-07\n", "Fetched 50 articles from 2023-08-08 to 2023-09-27\n", "Rate limit reached. Waiting to retry...\n", "Fetched 50 articles from 2023-09-28 to 2023-11-17\n", "Fetched 50 articles from 2023-11-18 to 2024-01-07\n", "Fetched 50 articles from 2024-01-08 to 2024-02-27\n", "Fetched 50 articles from 2024-02-28 to 2024-04-18\n", "Fetched 50 articles from 2024-04-19 to 2024-05-05\n", "Total articles fetched: 750\n" ] } ], "source": [ "import os\n", "import requests\n", "from datetime import datetime, timedelta\n", "import pandas as pd\n", "\n", "def fetch_news(api_key, ticker, start_date, end_date):\n", " base_url = os.environ.get(\"endpointnewsp\")\n", " headers = {\"Authorization\": f\"Bearer {api_key}\"}\n", " all_news = []\n", " \n", " current_date = start_date\n", "\n", " while current_date <= end_date:\n", " batch_end_date = current_date + timedelta(days=50)\n", " if batch_end_date > end_date:\n", " batch_end_date = end_date\n", "\n", " params = {\n", " \"ticker\": ticker,\n", " \"published_utc.gte\": current_date.strftime('%Y-%m-%d'),\n", " \"published_utc.lte\": batch_end_date.strftime('%Y-%m-%d'),\n", " \"limit\": 50,\n", " \"sort\": \"published_utc\"\n", " }\n", "\n", " try:\n", " response = requests.get(base_url, headers=headers, params=params)\n", " if response.status_code == 200:\n", " data = response.json()\n", " articles = data.get('results', [])\n", " \n", " # Create DataFrame from articles\n", " df = pd.DataFrame(articles)\n", " \n", " # Add primary_key column if ticker is found\n", " df['ticker'] = df['tickers'].apply(lambda x: ticker if ticker in x else None)\n", " \n", " all_news.append(df) # Append DataFrame to the list\n", " print(f\"Fetched {len(articles)} articles from {current_date.strftime('%Y-%m-%d')} to {batch_end_date.strftime('%Y-%m-%d')}\")\n", " current_date = batch_end_date + timedelta(days=1)\n", " elif response.status_code == 429:\n", " print(\"Rate limit reached. Waiting to retry...\")\n", " time.sleep(60) # Wait for 60 seconds or as recommended by the API\n", " continue # Retry the current request\n", " else:\n", " print(f\"Failed to fetch data: {response.status_code}, {response.text}\")\n", " break\n", " except Exception as e:\n", " print(f\"An error occurred: {e}\")\n", " break\n", "\n", " return pd.concat(all_news, ignore_index=True)\n", "\n", "# Example usage\n", "api_key = os.environ.get('newsp_api')\n", "ticker = 'TSLA'\n", "end_date = datetime.now() - timedelta(days=1) # Yesterday's date\n", "start_date = end_date - timedelta(days=365 * 2)\n", "news_articles = fetch_news(api_key, ticker, start_date, end_date)\n", "print(f\"Total articles fetched: {len(news_articles)}\")\n" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "# Process the news articles\n", "df = process_news_articles(news_articles)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 66 entries, 0 to 65\n", "Data columns (total 3 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 date 66 non-null object \n", " 1 ticker 66 non-null object \n", " 2 sentiment 66 non-null float64\n", "dtypes: float64(1), object(2)\n", "memory usage: 1.7+ KB\n" ] } ], "source": [ "df.info()" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
datetickersentiment
02022-06-23TSLA0.091056
12022-06-24TSLA0.059212
22022-06-25TSLA0.250000
32022-08-11TSLA0.171968
42022-08-12TSLA0.035351
\n", "
" ], "text/plain": [ " date ticker sentiment\n", "0 2022-06-23 TSLA 0.091056\n", "1 2022-06-24 TSLA 0.059212\n", "2 2022-06-25 TSLA 0.250000\n", "3 2022-08-11 TSLA 0.171968\n", "4 2022-08-12 TSLA 0.035351" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head()" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "df= df.sort_index(ascending=False)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "df.to_csv('news_articles.csv', index=False)\n" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "df_processed = exponential_moving_average(df, window=7)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "df_processed.to_csv('news_articles_ema.csv', index=False)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
datetickersentimentexp_mean_7_days
652024-05-05TSLA0.0361900.036190
642024-05-04TSLA0.0626650.042809
632024-05-03TSLA0.0277980.039056
622024-05-02TSLA0.0014430.029653
612024-05-01TSLA0.1627420.062925
\n", "
" ], "text/plain": [ " date ticker sentiment exp_mean_7_days\n", "65 2024-05-05 TSLA 0.036190 0.036190\n", "64 2024-05-04 TSLA 0.062665 0.042809\n", "63 2024-05-03 TSLA 0.027798 0.039056\n", "62 2024-05-02 TSLA 0.001443 0.029653\n", "61 2024-05-01 TSLA 0.162742 0.062925" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_processed.head()" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
datetickersentimentexp_mean_7_days
42022-08-12TSLA0.0353510.053681
32022-08-11TSLA0.1719680.083253
22022-06-25TSLA0.2500000.124940
12022-06-24TSLA0.0592120.108508
02022-06-23TSLA0.0910560.104145
\n", "
" ], "text/plain": [ " date ticker sentiment exp_mean_7_days\n", "4 2022-08-12 TSLA 0.035351 0.053681\n", "3 2022-08-11 TSLA 0.171968 0.083253\n", "2 2022-06-25 TSLA 0.250000 0.124940\n", "1 2022-06-24 TSLA 0.059212 0.108508\n", "0 2022-06-23 TSLA 0.091056 0.104145" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_processed.tail()" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "2022-06-23\n", "2024-05-05\n" ] } ], "source": [ "print(df_processed['date'].min())\n", "print(df_processed['date'].max())" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "682 days, 0:00:00\n" ] } ], "source": [ "print(df_processed['date'].max() - df_processed['date'].min()) " ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(66, 4)" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_processed.shape" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "duplicates = df_processed[df_processed.duplicated('date')]" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(0, 4)" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "duplicates.shape" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
datetickersentimentexp_mean_7_days
652024-05-05TSLA0.0361900.036190
642024-05-04TSLA0.0626650.042809
632024-05-03TSLA0.0277980.039056
622024-05-02TSLA0.0014430.029653
612024-05-01TSLA0.1627420.062925
\n", "
" ], "text/plain": [ " date ticker sentiment exp_mean_7_days\n", "65 2024-05-05 TSLA 0.036190 0.036190\n", "64 2024-05-04 TSLA 0.062665 0.042809\n", "63 2024-05-03 TSLA 0.027798 0.039056\n", "62 2024-05-02 TSLA 0.001443 0.029653\n", "61 2024-05-01 TSLA 0.162742 0.062925" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_processed.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "base", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.9" }, "orig_nbformat": 4 }, "nbformat": 4, "nbformat_minor": 2 }