In [1]:
from dotenv import load_dotenv
from datetime import datetime, timedelta
import requests
import os
import time
import pandas as pd 
from news_preprocessing import *

In [2]:
load_dotenv()

True

In [3]:
import os
import requests
from datetime import datetime, timedelta
import pandas as pd

def fetch_news(api_key, ticker, start_date, end_date):
    base_url = os.environ.get("endpointnewsp")
    headers = {"Authorization": f"Bearer {api_key}"}
    all_news = []
    
    current_date = start_date

    while current_date <= end_date:
        batch_end_date = current_date + timedelta(days=50)
        if batch_end_date > end_date:
            batch_end_date = end_date

        params = {
            "ticker": ticker,
            "published_utc.gte": current_date.strftime('%Y-%m-%d'),
            "published_utc.lte": batch_end_date.strftime('%Y-%m-%d'),
            "limit": 50,
            "sort": "published_utc"
        }

        try:
            response = requests.get(base_url, headers=headers, params=params)
            if response.status_code == 200:
                data = response.json()
                articles = data.get('results', [])
                
                # Create DataFrame from articles
                df = pd.DataFrame(articles)
                
                # Add primary_key column if ticker is found
                df['ticker'] = df['tickers'].apply(lambda x: ticker if ticker in x else None)
                
                all_news.append(df)  # Append DataFrame to the list
                print(f"Fetched {len(articles)} articles from {current_date.strftime('%Y-%m-%d')} to {batch_end_date.strftime('%Y-%m-%d')}")
                current_date = batch_end_date + timedelta(days=1)
            elif response.status_code == 429:
                print("Rate limit reached. Waiting to retry...")
                time.sleep(60)  # Wait for 60 seconds or as recommended by the API
                continue  # Retry the current request
            else:
                print(f"Failed to fetch data: {response.status_code}, {response.text}")
                break
        except Exception as e:
            print(f"An error occurred: {e}")
            break

    return pd.concat(all_news, ignore_index=True)

# Example usage
api_key = os.environ.get('newsp_api')
ticker = 'TSLA'
end_date = datetime.now() - timedelta(days=1)  # Yesterday's date
start_date = end_date - timedelta(days=365 * 2)
news_articles = fetch_news(api_key, ticker, start_date, end_date)
print(f"Total articles fetched: {len(news_articles)}")


Fetched 50 articles from 2022-05-06 to 2022-06-25
Fetched 50 articles from 2022-06-26 to 2022-08-15
Fetched 50 articles from 2022-08-16 to 2022-10-05
Fetched 50 articles from 2022-10-06 to 2022-11-25
Fetched 50 articles from 2022-11-26 to 2023-01-15
Rate limit reached. Waiting to retry...
Fetched 50 articles from 2023-01-16 to 2023-03-07
Fetched 50 articles from 2023-03-08 to 2023-04-27
Fetched 50 articles from 2023-04-28 to 2023-06-17
Fetched 50 articles from 2023-06-18 to 2023-08-07
Fetched 50 articles from 2023-08-08 to 2023-09-27
Rate limit reached. Waiting to retry...
Fetched 50 articles from 2023-09-28 to 2023-11-17
Fetched 50 articles from 2023-11-18 to 2024-01-07
Fetched 50 articles from 2024-01-08 to 2024-02-27
Fetched 50 articles from 2024-02-28 to 2024-04-18
Fetched 50 articles from 2024-04-19 to 2024-05-05
Total articles fetched: 750


In [4]:
# Process the news articles
df = process_news_articles(news_articles)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 66 entries, 0 to 65
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   date       66 non-null     object 
 1   ticker     66 non-null     object 
 2   sentiment  66 non-null     float64
dtypes: float64(1), object(2)
memory usage: 1.7+ KB


In [6]:
df.head()

Unnamed: 0,date,ticker,sentiment
0,2022-06-23,TSLA,0.091056
1,2022-06-24,TSLA,0.059212
2,2022-06-25,TSLA,0.25
3,2022-08-11,TSLA,0.171968
4,2022-08-12,TSLA,0.035351


In [7]:
df= df.sort_index(ascending=False)

In [8]:
df.to_csv('news_articles.csv', index=False)


In [9]:
df_processed = exponential_moving_average(df, window=7)

In [10]:
df_processed.to_csv('news_articles_ema.csv', index=False)

In [11]:
df_processed.head()

Unnamed: 0,date,ticker,sentiment,exp_mean_7_days
65,2024-05-05,TSLA,0.03619,0.03619
64,2024-05-04,TSLA,0.062665,0.042809
63,2024-05-03,TSLA,0.027798,0.039056
62,2024-05-02,TSLA,0.001443,0.029653
61,2024-05-01,TSLA,0.162742,0.062925


In [12]:
df_processed.tail()

Unnamed: 0,date,ticker,sentiment,exp_mean_7_days
4,2022-08-12,TSLA,0.035351,0.053681
3,2022-08-11,TSLA,0.171968,0.083253
2,2022-06-25,TSLA,0.25,0.12494
1,2022-06-24,TSLA,0.059212,0.108508
0,2022-06-23,TSLA,0.091056,0.104145


In [13]:
print(df_processed['date'].min())
print(df_processed['date'].max())

2022-06-23
2024-05-05


In [14]:
print(df_processed['date'].max() - df_processed['date'].min()) 

682 days, 0:00:00


In [15]:
df_processed.shape

(66, 4)

In [16]:
duplicates = df_processed[df_processed.duplicated('date')]

In [17]:
duplicates.shape

(0, 4)

In [18]:
df_processed.head()

Unnamed: 0,date,ticker,sentiment,exp_mean_7_days
65,2024-05-05,TSLA,0.03619,0.03619
64,2024-05-04,TSLA,0.062665,0.042809
63,2024-05-03,TSLA,0.027798,0.039056
62,2024-05-02,TSLA,0.001443,0.029653
61,2024-05-01,TSLA,0.162742,0.062925
