"""Gradio app that uses ChromaDB and OpenAI (via LangChain) to query PDF docs."""

import gradio as gr
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.retrievers import SVMRetriever
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
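
# NOTE: this script targets the legacy (pre-0.1) langchain package layout;
# OpenAIEmbeddings and ChatOpenAI read the OPENAI_API_KEY environment variable.
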
def load_data():
    # load every PDF under ./data (recursively)
    loader = DirectoryLoader("./data", glob="**/*.pdf", show_progress=True, loader_cls=PyPDFLoader)
    docs = loader.load()
    # replace newlines with spaces so PDF layout line breaks don't fragment sentences
    for doc in docs:
        doc.page_content = doc.page_content.replace("\n", " ")
    print(f"loaded {len(docs)} pages")
    # split the documents into overlapping chunks
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    all_splits = text_splitter.split_documents(docs)
    # construct the vector store over the chunks
    vectorstore = Chroma.from_documents(documents=all_splits, embedding=OpenAIEmbeddings())
    # https://python.langchain.com/docs/use_cases/question_answering.html#go-deeper-3
    svm_retriever = SVMRetriever.from_documents(all_splits, OpenAIEmbeddings())
    return svm_retriever, vectorstore
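
# build the retriever and vector store once at startup; every request reuses them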
svm_retriever, vectorstore = load_data()

def process_question(question, svm_retriever=svm_retriever, vectorstore=vectorstore):
    # the SVM retriever is only used to log how many chunks it finds;
    # the QA chain below retrieves through the Chroma vector store instead
    docs_svm = svm_retriever.get_relevant_documents(question)
    print(f"SVM retriever found {len(docs_svm)} relevant chunks")
    llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)
    qa_chain = RetrievalQA.from_chain_type(llm, retriever=vectorstore.as_retriever(), return_source_documents=True)
    result = qa_chain({"query": question})
    output = f"""
============RESULT==============

{result["result"]}

============SOURCES=============
"""
    # collect each source chunk as a block of lines
    lines = []
    source_docs = [(x.metadata["source"], x.page_content) for x in result["source_documents"]]
    for i, (source, content) in enumerate(source_docs):
        lines.append(f"* CHUNK: {i} *")
        lines.append(f"original doc: {source}")
        lines.append(content)
        lines.append("")  # blank line between chunks
    # join the blocks into a single multi-line string
    output += "\n".join(lines)
    return output
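
# wire the QA function into a simple text-in/text-out Gradio interface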
iface = gr.Interface(
    fn=process_question,  # the function to wrap
    inputs="text",  # the input type
    outputs="text",  # the output type
    examples=[
        ["What is the process of raising an incident?"],
        ["What is Cx0 program management?"],
        ["What is the process for identifying risks that can impact the desired outcomes of a project?"],
        ["What is the release management process?"],
    ],
)
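
# `python app.py` serves the app locally; passing share=True to launch()
# would additionally create a temporary public link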
if __name__ == "__main__":
    iface.launch()