import os
from dotenv import load_dotenv
from langchain.embeddings import CacheBackedEmbeddings
from langchain.storage import LocalFileStore
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_community.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough, RunnableParallel
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from datasets import Dataset
from ragas.testset.generator import TestsetGenerator
from ragas.testset.evolutions import simple, reasoning, multi_context
from ragas import evaluate
from ragas.metrics import (
faithfulness,
answer_relevancy,
answer_correctness,
context_recall,
context_precision,
)
TEST_SIZE = 10  # Number of synthetic questions for the RAGAS test set.
CACHE_STORE = "data/cache/"  # Directory backing the local embedding cache.
# Load the environment variables that grant access to OpenAI, WandB, and other APIs.
load_dotenv()
# Enable LangChain tracing so runs are logged under the project below.
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_PROJECT"] = "midterm_chainlit"
# Set the embedding and chat completion models.
embedding_model = "text-embedding-3-small"
llm_model_name = "gpt-3.5-turbo"
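# CacheBackedEmbeddings stores every computed embedding in the local file store, so
# repeated runs reuse cached vectors instead of re-calling the OpenAI embeddings API
# for text that has already been embedded.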
def get_cached_embedder(cache_store_path: str, embedding_model: str):
    """Return an OpenAI embedder that caches its embeddings in a local file store."""
local_file_store = LocalFileStore(cache_store_path)
embeddings = OpenAIEmbeddings(
model=embedding_model,
)
return CacheBackedEmbeddings.from_bytes_store(
embeddings, local_file_store, namespace=embeddings.model
)
def get_documents(test: bool = False):
    """Load the source PDF, or return a short in-memory string when test=True."""
if test:
return "harrison worked at Kensho"
else:
# Load the document.
loader = PyMuPDFLoader(
"data/nvidia_filings.pdf",
)
return loader.load()
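# 400-character chunks with a 50-character overlap keep each piece small enough for
# focused retrieval while the overlap preserves context across chunk boundaries.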
def chunk_and_store(documents, cached_embedder):
    """Split the input into chunks, embed them into a FAISS store, and persist the index."""
splitter = RecursiveCharacterTextSplitter(
chunk_size=400,
chunk_overlap=50,
)
    if isinstance(documents, str):
chunks = splitter.split_text(documents)
vector_store = FAISS.from_texts(chunks, cached_embedder)
else:
# Split the document into chunks.
chunks = splitter.split_documents(documents)
# Store the embeddings.
vector_store = FAISS.from_documents(chunks, cached_embedder)
vector_store.save_local("data/vector_store.index")
print("Vector store index saved on disk.")
print(len(chunks))
return vector_store
def get_store():
    """Return the FAISS vector store, building and saving it first if it is missing."""
    if not os.path.exists("data/vector_store.index"):
        return chunk_and_store(get_documents(), get_cached_embedder(CACHE_STORE, embedding_model))
    print("Loading the vector store from the disk.")
    return FAISS.load_local(
        "data/vector_store.index",
        get_cached_embedder(CACHE_STORE, embedding_model),
        allow_dangerous_deserialization=True,
    )
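# The QA chain is composed with LCEL: the retriever and a pass-through of the raw
# question run in parallel to fill the prompt's {context} and {question} slots, the
# prompt is sent to the chat model, and the model output is parsed to a plain string.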
def get_chain(retriever):
    """Build the retrieval QA chain using the module-level prompt and primary_qa_llm."""
return (
RunnableParallel(context=retriever, question=RunnablePassthrough())
| prompt
| primary_qa_llm
| StrOutputParser()
)
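# Sketch (not part of the original pipeline): a hypothetical variant that returns the
# retrieved documents alongside the answer, which is convenient when the contexts are
# needed downstream, e.g. for the RAGAS evaluation block at the bottom of this file.
def get_chain_with_sources(retriever):
    """Hypothetical helper: return the retrieved context documents and the answer."""
    return RunnableParallel(context=retriever, question=RunnablePassthrough()) | {
        "context": lambda inputs: inputs["context"],
        "answer": prompt | primary_qa_llm | StrOutputParser(),
    }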
def retrieve_and_answer(questions: list, retriever):
    """Answer each question with the retrieval chain and return the answers in order."""
    chain = get_chain(retriever)
    answers = []
    for question in questions:
        answers.append(chain.invoke(question))
    return answers
# Load the source documents (the commented-out evaluation block below reuses them).
documents = get_documents(test=False)
# Define the test questions
question_1 = "Who is the E-VP, Operations - and how old are they?"
question_2 = "what is the gross carrying amount of Total Amortizable Intangible Assets for Jan 29, 2023?"
questions = [question_1, question_2]
# Define the retrieval prompt.
retrieval_prompt_template = """Answer the question based only on the following context. If you cannot answer the question with the context, please respond with 'I cannot answer the question with the context provided.':
Context: {context}
Question:
{question}
"""
prompt = ChatPromptTemplate.from_template(retrieval_prompt_template)
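# The template constrains the model to the retrieved context and gives it an explicit
# refusal phrase, which limits hallucinated answers when retrieval misses the facts.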
# Define the chat model used to generate answers.
primary_qa_llm = ChatOpenAI(
model_name=llm_model_name,
temperature=0.0,
streaming=True,
)
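# temperature=0.0 keeps answers deterministic; streaming=True lets a front end
# (e.g. a Chainlit UI) display tokens as they are generated.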
store = get_store()
retriever = store.as_retriever()
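# as_retriever() wraps the FAISS store as a similarity-search retriever; the number of
# chunks returned per query can be tuned via search_kwargs (e.g. {"k": 4}) if needed.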
chain = get_chain(retriever)
for answer in retrieve_and_answer(questions, retriever):
print(answer)
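# The commented-out block below is an optional RAGAS evaluation pass: it generates a
# synthetic test set from the documents, answers each generated question with the chain
# above, and scores the results on faithfulness, relevancy, correctness, and the
# context recall/precision metrics. Enable it when an evaluation run is wanted.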
#
# eval_documents = documents
#
# text_splitter = RecursiveCharacterTextSplitter(
# chunk_size=1500,
# chunk_overlap=400
# )
#
# eval_documents = text_splitter.split_documents(eval_documents)
#
# generator = TestsetGenerator.with_openai()
#
# test_set = generator.generate_with_langchain_docs(
# eval_documents,
# test_size=TEST_SIZE,
# distributions={simple: 1},
# )
#
# test_df = test_set.to_pandas()
# test_questions = test_df["question"].values.tolist()
# test_ground_truths = test_df["ground_truth"].values.tolist()
#
# ragas_answers = []
# ragas_contexts = []
#
# chain = get_chain(retriever)
# for question in test_questions:
#     ragas_answers.append(chain.invoke(question))
#     ragas_contexts.append([doc.page_content for doc in retriever.invoke(question)])
#
# response_dataset = Dataset.from_dict({
# "question": test_questions,
# "answer": ragas_answers,
# "contexts": ragas_contexts,
# "ground_truth": test_ground_truths
# })
#
# metrics = [
# faithfulness,
# answer_relevancy,
# context_recall,
# context_precision,
# answer_correctness,
# ]
#
# results = evaluate(response_dataset, metrics)
#
# results_df = results.to_pandas()
#
# print(results_df)