## Load Environment

In [3]:
import os
from dotenv import find_dotenv, dotenv_values

# Parse the .env file into a mapping; dotenv_values does not touch os.environ.
env_values = dotenv_values(find_dotenv('.env'))
keys = list(env_values.items())  # kept so any later use of `keys` still works

# Prefer the entry that is actually named OPENAI_API_KEY. Fall back to the
# first entry (the previous behaviour) so existing .env files keep working,
# instead of silently depending on the order of lines in the file.
_openai_api_key = env_values.get('OPENAI_API_KEY') or (keys[0][1] if keys else None)
if not _openai_api_key:
    raise RuntimeError("OPENAI_API_KEY not found: add it to your .env file")

# Export to the process environment so libraries that read it from there can find it.
OPENAI_API_KEY = os.environ['OPENAI_API_KEY'] = _openai_api_key

## Preprocess S&P data

In [None]:
import pandas as pd

# Column labels supplied by hand: with `names=` given, every row of the
# tab-separated file is read as data.
sp500_columns = [
    "No.",
    "Ticker",
    "Company Name",
    "Market Cap",
    "Stock Price",
    "% Change",
    "Revenue",
]
s_and_p = pd.read_csv("./data/S&P.csv", sep='\t', names=sp500_columns)

# The ticker column is what the download loop further down iterates over.
tickers = s_and_p["Ticker"]

# Echo every ticker, one per line.
for symbol in tickers:
    print(symbol)

## Get JSON CIK data

In [None]:
import json

#Reference found here: https://www.kaggle.com/code/svendaj/extracting-data-from-sec-edgar-restful-apis

# Read the ticker/CIK mapping file. The JSON stores the column names under
# "fields" and the rows under "data".
with open("./data/company_tickers_exchange.json", "r") as mapping_file:
    CIK_dict = json.loads(mapping_file.read())

# Tabular view used by get_CIK for ticker lookups.
CIK_df = pd.DataFrame(data=CIK_dict["data"], columns=CIK_dict["fields"])

## Loop through S&P 500 Companies and retrieve docs

In [None]:
# Form types to download for every company.
report_types = ["10-K", "10-Q", "8-K", "SD", "11-K"]

for ticker in tickers:
    # Resolve the CIK once per company; without it nothing can be downloaded.
    try:
        cik = get_CIK(ticker)
    except Exception as exc:  # not a bare except: Ctrl-C must still stop the loop
        print(f"Skipping {ticker}: could not resolve CIK ({exc!r})")
        continue
    if cik is None:
        continue

    for report in report_types:
        # Guard each report separately: a company that lacks one form type
        # must not lose the remaining form types as well.
        try:
            get_financial_report(cik, report)
        except Exception as exc:
            print(f"Skipping {report} for {ticker}: {exc!r}")
 

In [None]:
from weasyprint import HTML
import os
import requests

def get_CIK(ticker) -> str:
    """This tool takes a company stock ticker as an argument and returns the CIK number. This is used when trying to query the EDGAR database for financial statements.

    Args:
        ticker: Ticker symbol to look up in the module-level ``CIK_df`` frame
            (matched exactly against its ``ticker`` column).

    Returns:
        The CIK as a string left-padded with zeros to 10 characters, or
        ``None`` when ``ticker`` is ``None``.

    Raises:
        ValueError: if ``CIK_df`` does not contain exactly one row for ``ticker``.
    """
    if ticker is None:
        return None

    matches = CIK_df.loc[CIK_df["ticker"] == ticker, "cik"]
    if len(matches) != 1:
        # Same exception type that Series.item() raised here before, but with
        # a message that names the offending ticker.
        raise ValueError(
            f"Expected exactly one CIK for ticker {ticker!r}, found {len(matches)}"
        )

    # Format once and reuse for both the log line and the return value.
    cik = str(matches.item()).zfill(10)
    print(f'Ticker: {ticker} CIK: {cik}')
    return cik
 
def get_financial_report(cik, report_type):
    """Fetch a company's filing of the given type from EDGAR and save it as a PDF.

    Reads the company's submissions index, takes the first listed filing whose
    form equals ``report_type`` and renders its primary document to
    ``./data/{report_type}/{primaryDocument}.pdf``. If that PDF already exists
    the document is not downloaded again.

    Args:
        cik: CIK string as produced by ``get_CIK`` (zero-padded to 10 characters).
        report_type: EDGAR form name, e.g. "10-K" or "10-Q".

    Returns:
        The path of the PDF on disk.

    Raises:
        IndexError: if no filing of ``report_type`` is listed for the company.
        requests.HTTPError: if EDGAR answers with an error status code.
    """
    header = {
        "User-Agent" : EMAIL
    }

    url = f"https://data.sec.gov/submissions/CIK{cik}.json"
    index_response = requests.get(url, headers=header, timeout=30)
    # Fail on 4xx/5xx here rather than with a JSON decode error on an error page.
    index_response.raise_for_status()
    company_filings = index_response.json()
    company_filings_df = pd.DataFrame(company_filings["filings"]["recent"])
    company_filings_df = company_filings_df[company_filings_df.form == report_type]

    if company_filings_df.empty:
        # Same exception type the unguarded `.values[0]` raised, with context.
        raise IndexError(f"No {report_type} filing listed for CIK {cik}")

    # NOTE(review): row 0 is presumably the newest filing - confirm the
    # ordering of "filings.recent" against the EDGAR documentation.
    access_number = company_filings_df.accessionNumber.values[0].replace("-", "")
    file_name = company_filings_df.primaryDocument.values[0]
    pdf_path = f'./data/{report_type}/{file_name}'+".pdf"
    if not os.path.exists(pdf_path):
        # write_pdf needs the target directory to exist.
        os.makedirs(os.path.dirname(pdf_path), exist_ok=True)
        url = f"https://www.sec.gov/Archives/edgar/data/{cik}/{access_number}/{file_name}"
        # Download the requested document and render it to PDF.
        document_response = requests.get(url, headers=header, timeout=30)
        document_response.raise_for_status()
        req_content = document_response.content.decode("utf-8")
        print(f'Creating {pdf_path}...')
        HTML(string=req_content, base_url="").write_pdf(pdf_path)
    return pdf_path

## Set up the function that indexes a PDF into the vectorstore

In [1]:
def vectorize(filepath, batch_size=32):
    """Load one PDF, split it into chunks and add them to the vectorstore.

    Uses the module-level ``text_splitter`` and ``vectorstore`` objects, so
    both must exist before this is called. After all chunks are added, the
    store is saved to ``./data/vectorstore`` and ``filepath`` is written to
    ``./data/status.txt`` as the last fully indexed file.

    Args:
        filepath: Path of the PDF to index.
        batch_size: Number of chunks passed to ``add_documents`` per call.
    """
    print(f"Loading PDF from path: {filepath}")
    pages = PyPDFLoader(filepath).load()
    print(f"Splitting PDF from path: {filepath}")
    split_documents = text_splitter.split_documents(pages)
    print("Indexing Files")
    for start in range(0, len(split_documents), batch_size):
        vectorstore.add_documents(split_documents[start:start + batch_size])
    # Persist only after the whole file is in, so the status marker below
    # never points at a half-indexed file.
    vectorstore.save_local("./data/vectorstore")
    # Context manager guarantees the marker is flushed and the handle closed.
    with open("./data/status.txt", "w") as status_file:
        status_file.write(f"{filepath}")

## Set Up Vectorstore

In [4]:
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
import os

# Directories whose PDFs get indexed.
# NOTE(review): "./data/8-K" is downloaded by the retrieval loop but is not
# listed here - confirm that is intentional.
base_paths = ["./data/10-K", "./data/10-Q", "./data/11-K", "./data/SD"]

embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 500,
    chunk_overlap = 50
)

# Check if a saved vectorstore exists. Test for the index file rather than the
# directory, so an empty directory left by an aborted first run is not
# mistaken for a usable store.
if os.path.exists(os.path.join("./data/vectorstore", "index.faiss")):
    # NOTE(review): allow_dangerous_deserialization unpickles data from disk;
    # acceptable only because this notebook wrote the file itself.
    vectorstore = FAISS.load_local("./data/vectorstore", embeddings, allow_dangerous_deserialization=True)
    print(f"./data/vectorstore found.")
    exists = 1
else:
    os.makedirs("./data/vectorstore", exist_ok=True)
    exists = 0

# Get all the files we want to vectorize.
# NOTE(review): resuming relies on os.listdir returning the same order on
# every run; that order is not guaranteed by the OS.
all_filepaths = []
for path in base_paths:
    if not os.path.isdir(path):
        print(f"Skipping missing directory: {path}")
        continue
    for file in os.listdir(path):
        all_filepaths.append(os.path.join(path, file))

# If the vectorstore exists, status.txt names the last fully indexed file;
# continue with the file after it.
if exists:
    with open("./data/status.txt", "r") as status_file:
        status_string = status_file.readlines()
    if not status_string or status_string[0] not in all_filepaths:
        raise ValueError(
            "./data/status.txt does not name a file under base_paths; cannot resume indexing"
        )
    idx = all_filepaths.index(status_string[0])
    all_filepaths = all_filepaths[idx+1::]
    for filepath in all_filepaths:
        vectorize(filepath)
# Otherwise, create the vectorstore from the first file that yields text, then
# use the vectorize function for every following file.
else:
    store_created = False
    for filepath in all_filepaths:
        if store_created:
            vectorize(filepath)
            continue
        print(f"Loading PDF from path: {filepath}")
        pages = PyPDFLoader(filepath).load()
        print(f"Splitting PDF from path: {filepath}")
        split_documents = text_splitter.split_documents(pages)
        if not split_documents:
            # A store cannot be created from zero documents; try the next file.
            print(f"No text extracted from {filepath}; skipping")
            continue
        print("Indexing Files")
        # Create the store from the first batch only, then append the rest.
        # Calling from_documents for every batch would replace the store each
        # time and keep just the final batch.
        vectorstore = FAISS.from_documents(split_documents[:32], embeddings)
        for i in range(32, len(split_documents), 32):
            vectorstore.add_documents(split_documents[i:i+32])
        vectorstore.save_local("./data/vectorstore")
        with open("./data/status.txt", "w") as status_file:
            status_file.write(f"{filepath}")
        store_created = True
 

./data/vectorstore found.
Loading PDF from path: ./data/10-K/rfmd-20240330.htm.pdf
Splitting PDF from path: ./data/10-K/rfmd-20240330.htm.pdf
Indexing Files
Loading PDF from path: ./data/10-K/br-20230630.htm.pdf
Splitting PDF from path: ./data/10-K/br-20230630.htm.pdf
Indexing Files
Loading PDF from path: ./data/10-K/nrg-20231231.htm.pdf
Splitting PDF from path: ./data/10-K/nrg-20231231.htm.pdf
Indexing Files
Loading PDF from path: ./data/10-K/syy-20230701.htm.pdf
Splitting PDF from path: ./data/10-K/syy-20230701.htm.pdf
Indexing Files
Loading PDF from path: ./data/10-K/trv-20231231.htm.pdf
Splitting PDF from path: ./data/10-K/trv-20231231.htm.pdf
Indexing Files
Loading PDF from path: ./data/10-K/ko-20231231.htm.pdf
Splitting PDF from path: ./data/10-K/ko-20231231.htm.pdf
Indexing Files
Loading PDF from path: ./data/10-K/ame-20231231.htm.pdf
Splitting PDF from path: ./data/10-K/ame-20231231.htm.pdf
Indexing Files
Loading PDF from path: ./data/10-K/l-20231231.htm.pdf
Splitting PDF from 