{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "True" ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#Importing necessary libraries\n", "from dotenv import load_dotenv\n", "from datetime import datetime, timedelta\n", "import requests\n", "import os\n", "import time\n", "import pandas as pd \n", "from SML.news_preprocessing import process_news_articles #Importing everything from 'news_preprocessing'\n", "from SML.news_preprocessing import exponential_moving_average\n", "load_dotenv()" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Fetched 50 articles from 2022-05-14 to 2022-07-03\n", "Fetched 50 articles from 2022-07-04 to 2022-08-23\n", "Fetched 50 articles from 2022-08-24 to 2022-10-13\n", "Fetched 50 articles from 2022-10-14 to 2022-12-03\n", "Fetched 50 articles from 2022-12-04 to 2023-01-23\n", "Rate limit reached. Waiting to retry...\n", "Fetched 50 articles from 2023-01-24 to 2023-03-15\n", "Fetched 50 articles from 2023-03-16 to 2023-05-05\n", "Fetched 50 articles from 2023-05-06 to 2023-06-25\n", "Fetched 50 articles from 2023-06-26 to 2023-08-15\n", "Fetched 50 articles from 2023-08-16 to 2023-10-05\n", "Rate limit reached. Waiting to retry...\n", "Fetched 50 articles from 2023-10-06 to 2023-11-25\n", "Fetched 50 articles from 2023-11-26 to 2024-01-15\n", "Fetched 50 articles from 2024-01-16 to 2024-03-06\n", "Fetched 50 articles from 2024-03-07 to 2024-04-26\n", "Fetched 50 articles from 2024-04-27 to 2024-05-13\n", "Total articles fetched: 750\n" ] } ], "source": [ "#Defining a function for fetching news\n", "\n", "def fetch_news(api_key, ticker, start_date, end_date):\n", " base_url = os.environ.get(\"endpointnewsp\")\n", " headers = {\"Authorization\": f\"Bearer {api_key}\"}\n", " all_news = []\n", " \n", " current_date = start_date\n", "\n", " while current_date <= end_date:\n", " batch_end_date = current_date + timedelta(days=50)\n", " if batch_end_date > end_date:\n", " batch_end_date = end_date\n", "\n", " params = {\n", " \"ticker\": ticker,\n", " \"published_utc.gte\": current_date.strftime('%Y-%m-%d'),\n", " \"published_utc.lte\": batch_end_date.strftime('%Y-%m-%d'),\n", " \"limit\": 50,\n", " \"sort\": \"published_utc\"\n", " }\n", "\n", " try:\n", " response = requests.get(base_url, headers=headers, params=params)\n", " if response.status_code == 200:\n", " data = response.json()\n", " articles = data.get('results', [])\n", " \n", " # Creating a DataFrame from articles\n", " df = pd.DataFrame(articles)\n", " \n", " # Adding primary_key column if ticker is found\n", " df['ticker'] = df['tickers'].apply(lambda x: ticker if ticker in x else None)\n", " \n", " all_news.append(df) # Append DataFrame to the list\n", " print(f\"Fetched {len(articles)} articles from {current_date.strftime('%Y-%m-%d')} to {batch_end_date.strftime('%Y-%m-%d')}\")\n", " current_date = batch_end_date + timedelta(days=1)\n", " elif response.status_code == 429:\n", " print(\"Rate limit reached. Waiting to retry...\")\n", " time.sleep(60) # Wait for 60 seconds or as recommended by the API\n", " continue # Retry the current request\n", " else:\n", " print(f\"Failed to fetch data: {response.status_code}, {response.text}\")\n", " break\n", " except Exception as e:\n", " print(f\"An error occurred: {e}\")\n", " break\n", "\n", " return pd.concat(all_news, ignore_index=True)\n", "\n", "#Usage\n", "api_key = os.environ.get('newsp_api')\n", "ticker = 'TSLA'\n", "end_date = datetime.now() - timedelta(days=1) # Yesterday's date\n", "start_date = end_date - timedelta(days=365 * 2)\n", "news_articles = fetch_news(api_key, ticker, start_date, end_date)\n", "print(f\"Total articles fetched: {len(news_articles)}\")\n" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "# Process the news articles\n", "df = process_news_articles(news_articles)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 74 entries, 0 to 73\n", "Data columns (total 3 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 date 74 non-null object \n", " 1 ticker 74 non-null object \n", " 2 sentiment 74 non-null float64\n", "dtypes: float64(1), object(2)\n", "memory usage: 1.9+ KB\n" ] } ], "source": [ "df.info()" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
datetickersentiment
02022-06-29TSLA0.076381
12022-06-30TSLA0.084328
22022-07-01TSLA0.178838
32022-07-02TSLA0.037667
42022-07-03TSLA-0.375000
\n", "
" ], "text/plain": [ " date ticker sentiment\n", "0 2022-06-29 TSLA 0.076381\n", "1 2022-06-30 TSLA 0.084328\n", "2 2022-07-01 TSLA 0.178838\n", "3 2022-07-02 TSLA 0.037667\n", "4 2022-07-03 TSLA -0.375000" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head()" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "df= df.sort_index(ascending=False)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "#Putting the news articles into a csv\n", "df.to_csv('news_articles.csv', index=False)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "df_processed = exponential_moving_average(df, window=7)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "df_processed.to_csv('news_articles_ema.csv', index=False)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
datetickersentimentexp_mean_7_days
732024-05-13TSLA0.1154430.115443
722024-05-12TSLA0.0375000.095957
712024-05-11TSLA0.1000000.096968
702024-05-10TSLA0.0696500.090138
692024-05-09TSLA-0.0312500.059791
\n", "
" ], "text/plain": [ " date ticker sentiment exp_mean_7_days\n", "73 2024-05-13 TSLA 0.115443 0.115443\n", "72 2024-05-12 TSLA 0.037500 0.095957\n", "71 2024-05-11 TSLA 0.100000 0.096968\n", "70 2024-05-10 TSLA 0.069650 0.090138\n", "69 2024-05-09 TSLA -0.031250 0.059791" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_processed.head()" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
datetickersentimentexp_mean_7_days
42022-07-03TSLA-0.375000-0.004703
32022-07-02TSLA0.0376670.005889
22022-07-01TSLA0.1788380.049127
12022-06-30TSLA0.0843280.057927
02022-06-29TSLA0.0763810.062540
\n", "
" ], "text/plain": [ " date ticker sentiment exp_mean_7_days\n", "4 2022-07-03 TSLA -0.375000 -0.004703\n", "3 2022-07-02 TSLA 0.037667 0.005889\n", "2 2022-07-01 TSLA 0.178838 0.049127\n", "1 2022-06-30 TSLA 0.084328 0.057927\n", "0 2022-06-29 TSLA 0.076381 0.062540" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_processed.tail()" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "2022-06-29\n", "2024-05-13\n" ] } ], "source": [ "print(df_processed['date'].min())\n", "print(df_processed['date'].max())" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "684 days, 0:00:00\n" ] } ], "source": [ "print(df_processed['date'].max() - df_processed['date'].min()) " ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(74, 4)" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_processed.shape" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "duplicates = df_processed[df_processed.duplicated('date')]" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(0, 4)" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "duplicates.shape" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
datetickersentimentexp_mean_7_days
732024-05-13TSLA0.1154430.115443
722024-05-12TSLA0.0375000.095957
712024-05-11TSLA0.1000000.096968
702024-05-10TSLA0.0696500.090138
692024-05-09TSLA-0.0312500.059791
\n", "
" ], "text/plain": [ " date ticker sentiment exp_mean_7_days\n", "73 2024-05-13 TSLA 0.115443 0.115443\n", "72 2024-05-12 TSLA 0.037500 0.095957\n", "71 2024-05-11 TSLA 0.100000 0.096968\n", "70 2024-05-10 TSLA 0.069650 0.090138\n", "69 2024-05-09 TSLA -0.031250 0.059791" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_processed.head()" ] } ], "metadata": { "kernelspec": { "display_name": "base", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.9" }, "orig_nbformat": 4 }, "nbformat": 4, "nbformat_minor": 2 }