# Dependencies and Initial Setup

In [1]:
# Install (or upgrade, because of -U) the notebook's dependencies into the kernel's environment.
# NOTE(review): no versions are pinned, so a later re-run may resolve to different releases — consider pinning.
%pip install -U numpy langchain langchain-core langchain-community langchain-openai qdrant-client tiktoken pymupdf wandb

Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
import wandb
import getpass

# UNCOMMENT TO ENTER WANDB KEY INTERACTIVELY
# (prompts for the key without echoing it and exports it as WANDB_API_KEY)
# wandb_key = getpass.getpass("Weights and Biases API Key: ")
# os.environ["WANDB_API_KEY"] = wandb_key
# Set before wandb.init(); presumably lets wandb identify this notebook file — confirm against wandb docs.
os.environ["WANDB_NOTEBOOK_NAME"] = "./QA_PDF_LangChain.ipynb"
# Start a run in the "QA_PDF_LangChain" project; the run is closed by wandb.finish() in the last cell.
wandb.init(project="QA_PDF_LangChain")

[34m[1mwandb[0m: Currently logged in as: [33mymath[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [3]:
# UNCOMMENT TO ENTER OPENAI KEY INTERACTIVELY
# os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API Key:")

# Create Vector Store with Source Documents

In [4]:
import tiktoken

def tiktoken_len(text, model="gpt-3.5-turbo"):
    """Return the number of tokens `text` occupies under a model's tokenizer.

    Intended as a `length_function` for text splitters, so chunk sizes are
    measured in tokens rather than characters.

    Args:
        text: The string to measure.
        model: Name of the OpenAI model whose tiktoken encoding is used.
            Defaults to "gpt-3.5-turbo".

    Returns:
        The token count as an int (0 for an empty string).
    """
    encoding = tiktoken.encoding_for_model(model)
    # By default tiktoken raises ValueError when the text contains a
    # special-token marker such as "<|endoftext|>". Document text is arbitrary,
    # and we only need a length, so encode such markers as ordinary text.
    tokens = encoding.encode(text, disallowed_special=())
    return len(tokens)

In [5]:
# PyMuPDFLoader is provided by langchain-community (installed in the first cell);
# the legacy `langchain.document_loaders` path is a deprecated re-export.
from langchain_community.document_loaders import PyMuPDFLoader

# Source document: Meta Platforms' Form 10-K for fiscal year 2023.
# To read a local copy instead of downloading, swap in the commented line below.
# docs = PyMuPDFLoader("data/c7318154-f6ae-4866-89fa-f0c589f2ee3d.pdf").load()
docs = PyMuPDFLoader("https://d18rn0p25nwr6d.cloudfront.net/CIK-0001326801/c7318154-f6ae-4866-89fa-f0c589f2ee3d.pdf").load()

# Sanity check: how many Document objects were produced, and what the first one looks like.
print("Loaded", len(docs), "documents")
print(docs[0])

Loaded 147 documents
page_content='UNITED STATES\nSECURITIES AND EXCHANGE COMMISSION\nWashington, D.C.\xa020549\n__________________________\nFORM 10-K\n__________________________\n(Mark One)\n☒\xa0\xa0\xa0\xa0ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d)\xa0OF THE SECURITIES EXCHANGE ACT OF 1934\nFor the fiscal year ended December\xa031, 2023\nor\n☐\xa0\xa0\xa0\xa0TRANSITION REPORT PURSUANT TO SECTION 13 OR 15(d)\xa0OF THE SECURITIES EXCHANGE ACT OF 1934\nFor the transition period from\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0to\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\nCommission File Number:\xa0001-35551\n__________________________\nMeta Platforms, Inc.\n(Exact name of registrant as specified in its charter)\n__________________________\nDelaware\n20-1665019\n(State or other jurisdiction of incorporation or organization)\n(I.R.S. Employer Identification Number)\n1 Meta Way, Menlo Park, California 94025\n(Address of principal executive offices and Zip Code)\n(650)\xa0543-4

In [6]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Chunk length is measured with tiktoken_len, i.e. in tokens rather than
# characters: each chunk holds at most 200 tokens and neighbouring chunks
# share no overlap.
text_splitter = RecursiveCharacterTextSplitter(
    length_function=tiktoken_len,
    chunk_size=200,
    chunk_overlap=0,
)

# Break the loaded documents into retrieval-sized chunks.
split_chunks = text_splitter.split_documents(docs)

# Displayed as the cell's output: number of chunks produced.
len(split_chunks)

663

In [7]:
from langchain_openai.embeddings import OpenAIEmbeddings

# Embedding client that is handed to the vector store in the next cell.
# NOTE(review): presumably reads credentials from OPENAI_API_KEY — confirm it is set before running.
embedding_model = OpenAIEmbeddings(model="text-embedding-3-small")

In [8]:
from langchain_community.vectorstores import Qdrant

# Build a Qdrant collection from the split chunks, using embedding_model to vectorize them.
qdrant_vectorstore = Qdrant.from_documents(
    split_chunks,
    embedding_model,
    # ":memory:" keeps the collection in process memory, so it must be rebuilt on every kernel restart.
    location=":memory:",
    collection_name="Meta 10-k Filings",
)

# Retriever wrapper with default search settings; this is what the chain queries.
qdrant_retriever = qdrant_vectorstore.as_retriever()

# Create Chain

In [9]:
from langchain_core.prompts import ChatPromptTemplate

# Prompt template with two placeholders:
#   {context}  - filled with the retrieved documents
#   {question} - filled with the user's query
# The trailing instruction restricts the model to the supplied context and
# asks for "I don't know" when the answer is not known.
RAG_PROMPT = """
CONTEXT:
{context}

QUERY:
{question}

Use the provided context to answer the provided user query. Only use the provided context to answer the query. If you do not know the answer, respond with "I don't know".
"""

# Wrap the raw string as a chat prompt so it can be piped into the chat model.
rag_prompt = ChatPromptTemplate.from_template(RAG_PROMPT)

In [10]:
from langchain_openai import ChatOpenAI

# Chat model that generates the final answer from the filled-in RAG prompt.
# NOTE(review): sampling parameters are left at library defaults, so answers may vary between runs.
openai_chat_model = ChatOpenAI(model="gpt-3.5-turbo")

In [11]:
from operator import itemgetter
# RunnablePassthrough lives in langchain-core (installed in the first cell and
# already used for the prompt); `langchain.schema.runnable` is a deprecated alias.
from langchain_core.runnables import RunnablePassthrough

# RAG chain. Input: {"question": str}.
# Output: {"response": <chat model message>, "context": <retrieved documents>}.
retrieval_augmented_qa_chain = (
    # 1. Retrieve: send the question to the retriever and carry the question forward.
    {"context": itemgetter("question") | qdrant_retriever, "question": itemgetter("question")}
    # 2. Pass-through: re-assigns "context" to itself, leaving the dict unchanged.
    | RunnablePassthrough.assign(context=itemgetter("context"))
    # 3. Generate: fill the prompt and call the model, and return the retrieved
    #    context alongside the answer so callers can inspect the sources.
    | {"response": rag_prompt | openai_chat_model, "context": itemgetter("context")}
)

# Test Outputs

In [12]:
# Smoke test 1: a numeric lookup that should be answerable from the financial statements.
question_txt = "What was the total value of 'Cash and cash equivalents' as of December 31, 2023?"
response = retrieval_augmented_qa_chain.invoke({"question": question_txt})
# The chain returns a dict; the answer text is the `.content` of the "response" entry.
response["response"].content

"The total value of 'Cash and cash equivalents' as of December 31, 2023, was $65.40 billion."

In [13]:
# Smoke test 2: a list-style question about the members of the Board of Directors.
question_txt = "Who are Meta's 'Directors' (i.e., members of the Board of Directors)?"
response = retrieval_augmented_qa_chain.invoke({"question": question_txt})
# The chain returns a dict; the answer text is the `.content` of the "response" entry.
response["response"].content

'The Directors of Meta, as mentioned in the provided context, are Peggy Alford, Marc L. Andreessen, Andrew W. Houston, Nancy Killefer, Robert M. Kimmitt, Sheryl K. Sandberg, Tracey T. Travis, and Tony Xu.'

In [14]:
# Close the W&B run that was started by wandb.init(...) at the top of the notebook.
wandb.finish()