In [1]:
#Importing necessary libraries
from dotenv import load_dotenv
from datetime import datetime, timedelta
import requests
import os
import time
import pandas as pd 
from SML.news_preprocessing import process_news_articles    # Project-local helper applied to the fetched articles in a later cell
from SML.news_preprocessing import exponential_moving_average  # Project-local helper called later with window=7
# Load variables from a .env file; the cells below read the endpoint and API key through os.environ
load_dotenv()

True

In [2]:
#Defining a function for fetching news

def fetch_news(api_key, ticker, start_date, end_date, max_retries=5, retry_wait=60, timeout=30):
    """Fetch news articles for ``ticker`` by walking the date range window by window.

    The range ``start_date``..``end_date`` is split into consecutive windows that
    run from ``current_date`` to ``current_date + 50 days`` (clamped to
    ``end_date``). One GET request is sent per window to the URL stored in the
    ``endpointnewsp`` environment variable.

    NOTE(review): each request asks for at most 50 results (``limit``) and no
    follow-up page is requested, so a window holding more than 50 articles is
    presumably truncated -- confirm against the API's pagination rules.

    Parameters
    ----------
    api_key : str
        Token sent as ``Authorization: Bearer <api_key>``.
    ticker : str
        Symbol sent as the ``ticker`` query parameter and used to fill the
        added ``ticker`` column.
    start_date, end_date : datetime
        Bounds of the range; formatted as ``YYYY-MM-DD`` for the
        ``published_utc.gte`` / ``published_utc.lte`` query parameters.
    max_retries : int, optional
        How many consecutive HTTP 429 responses are retried for one window
        before the fetch stops. Previously retries were unbounded.
    retry_wait : float, optional
        Seconds to sleep before retrying after an HTTP 429.
    timeout : float, optional
        Per-request timeout in seconds passed to ``requests.get``.

    Returns
    -------
    pandas.DataFrame
        All fetched articles concatenated with a fresh index. The ``ticker``
        column holds ``ticker`` when it appears in the article's ``tickers``
        value and ``None`` otherwise. An empty DataFrame is returned when no
        article was fetched at all.

    Notes
    -----
    Errors are reported with ``print`` and end the loop early (best effort);
    whatever was collected up to that point is still returned.
    """
    base_url = os.environ.get("endpointnewsp")
    headers = {"Authorization": f"Bearer {api_key}"}
    all_news = []
    retries = 0  # consecutive 429 responses for the current window

    current_date = start_date

    while current_date <= end_date:
        # Clamp the last window so it never runs past end_date.
        batch_end_date = min(current_date + timedelta(days=50), end_date)

        params = {
            "ticker": ticker,
            "published_utc.gte": current_date.strftime('%Y-%m-%d'),
            "published_utc.lte": batch_end_date.strftime('%Y-%m-%d'),
            "limit": 50,
            "sort": "published_utc"
        }

        try:
            # A timeout keeps a stalled connection from hanging the notebook forever.
            response = requests.get(base_url, headers=headers, params=params, timeout=timeout)
            if response.status_code == 200:
                data = response.json()
                articles = data.get('results') or []

                # An empty window yields a DataFrame without a 'tickers' column;
                # skip it instead of failing and abandoning the remaining windows.
                if articles:
                    df = pd.DataFrame(articles)
                    # Tag each article with the requested ticker when it is listed for it.
                    df['ticker'] = df['tickers'].apply(lambda x: ticker if ticker in x else None)
                    all_news.append(df)

                print(f"Fetched {len(articles)} articles from {current_date.strftime('%Y-%m-%d')} to {batch_end_date.strftime('%Y-%m-%d')}")
                current_date = batch_end_date + timedelta(days=1)
                retries = 0
            elif response.status_code == 429:
                if retries >= max_retries:
                    print(f"Rate limit still in effect after {max_retries} retries. Stopping.")
                    break
                retries += 1
                print("Rate limit reached. Waiting to retry...")
                time.sleep(retry_wait)  # Wait before retrying the same window
                continue  # Retry the current request
            else:
                print(f"Failed to fetch data: {response.status_code}, {response.text}")
                break
        except Exception as e:
            print(f"An error occurred: {e}")
            break

    # pd.concat raises ValueError on an empty list, so handle "nothing fetched" explicitly.
    if not all_news:
        return pd.DataFrame()
    return pd.concat(all_news, ignore_index=True)

# Usage: fetch TSLA news for the 730 days ending yesterday
ticker = 'TSLA'
api_key = os.environ.get('newsp_api')
end_date = datetime.now() - timedelta(days=1)  # Yesterday, at the current time of day
start_date = end_date - timedelta(days=2 * 365)
news_articles = fetch_news(api_key, ticker, start_date, end_date)
print(f"Total articles fetched: {len(news_articles)}")


Fetched 50 articles from 2022-05-14 to 2022-07-03
Fetched 50 articles from 2022-07-04 to 2022-08-23
Fetched 50 articles from 2022-08-24 to 2022-10-13
Fetched 50 articles from 2022-10-14 to 2022-12-03
Fetched 50 articles from 2022-12-04 to 2023-01-23
Rate limit reached. Waiting to retry...
Fetched 50 articles from 2023-01-24 to 2023-03-15
Fetched 50 articles from 2023-03-16 to 2023-05-05
Fetched 50 articles from 2023-05-06 to 2023-06-25
Fetched 50 articles from 2023-06-26 to 2023-08-15
Fetched 50 articles from 2023-08-16 to 2023-10-05
Rate limit reached. Waiting to retry...
Fetched 50 articles from 2023-10-06 to 2023-11-25
Fetched 50 articles from 2023-11-26 to 2024-01-15
Fetched 50 articles from 2024-01-16 to 2024-03-06
Fetched 50 articles from 2024-03-07 to 2024-04-26
Fetched 50 articles from 2024-04-27 to 2024-05-13
Total articles fetched: 750


In [3]:
# Process the news articles with the project helper from SML.news_preprocessing.
# In the recorded run the result has one row per date with date / ticker / sentiment columns.
df = process_news_articles(news_articles)

In [4]:
# Show column dtypes, non-null counts and memory usage of the processed frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74 entries, 0 to 73
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   date       74 non-null     object 
 1   ticker     74 non-null     object 
 2   sentiment  74 non-null     float64
dtypes: float64(1), object(2)
memory usage: 1.9+ KB


In [5]:
# Preview the first five rows of the processed frame
df.head()

Unnamed: 0,date,ticker,sentiment
0,2022-06-29,TSLA,0.076381
1,2022-06-30,TSLA,0.084328
2,2022-07-01,TSLA,0.178838
3,2022-07-02,TSLA,0.037667
4,2022-07-03,TSLA,-0.375


In [6]:
# Reorder rows by index label, highest first. In the recorded run the index follows
# ascending dates, so this puts the most recent date at the top.
df= df.sort_index(ascending=False)

In [7]:
# Write the processed frame (date / ticker / sentiment in the recorded run) to
# news_articles.csv, without the index column
df.to_csv('news_articles.csv', index=False)

In [8]:
# Apply the project helper with window=7; the recorded output shows it adding an
# exp_mean_7_days column.
# NOTE(review): df was put in descending index order by the earlier sort_index cell, and
# the recorded output shows the first (newest) row's exp_mean_7_days equal to its own
# sentiment, so the average appears to be accumulated from the newest date toward the
# oldest. If these values feed a forecasting model, earlier dates would contain
# information from later ones -- confirm the intended row order against the
# implementation of exponential_moving_average.
df_processed = exponential_moving_average(df, window=7)

In [9]:
# Write the frame with the moving-average column to news_articles_ema.csv, without the index column
df_processed.to_csv('news_articles_ema.csv', index=False)

In [10]:
# Preview the first five rows (the most recent dates in the recorded run)
df_processed.head()

Unnamed: 0,date,ticker,sentiment,exp_mean_7_days
73,2024-05-13,TSLA,0.115443,0.115443
72,2024-05-12,TSLA,0.0375,0.095957
71,2024-05-11,TSLA,0.1,0.096968
70,2024-05-10,TSLA,0.06965,0.090138
69,2024-05-09,TSLA,-0.03125,0.059791


In [11]:
# Preview the last five rows (the oldest dates in the recorded run)
df_processed.tail()

Unnamed: 0,date,ticker,sentiment,exp_mean_7_days
4,2022-07-03,TSLA,-0.375,-0.004703
3,2022-07-02,TSLA,0.037667,0.005889
2,2022-07-01,TSLA,0.178838,0.049127
1,2022-06-30,TSLA,0.084328,0.057927
0,2022-06-29,TSLA,0.076381,0.06254


In [12]:
# Earliest date on the first line, latest date on the second
print(df_processed['date'].min(), df_processed['date'].max(), sep='\n')

2022-06-29
2024-05-13


In [13]:
# Length of the period between the earliest and the latest date in the data
print(df_processed['date'].max() - df_processed['date'].min()) 

684 days, 0:00:00


In [14]:
# (rows, columns) of the frame with the moving-average column
df_processed.shape

(74, 4)

In [15]:
# Keep only rows whose date already occurred in an earlier row
duplicates = df_processed.loc[df_processed.duplicated(subset='date')]

In [16]:
# Zero rows here means every date appears exactly once
duplicates.shape

(0, 4)

In [17]:
# Same preview as the earlier df_processed.head() cell; df_processed has not been
# reassigned since, so the output is unchanged
df_processed.head()

Unnamed: 0,date,ticker,sentiment,exp_mean_7_days
73,2024-05-13,TSLA,0.115443,0.115443
72,2024-05-12,TSLA,0.0375,0.095957
71,2024-05-11,TSLA,0.1,0.096968
70,2024-05-10,TSLA,0.06965,0.090138
69,2024-05-09,TSLA,-0.03125,0.059791
