"""Gradio app that uses ChromaDB and OpenAI (via LangChain) to query PDF docs."""

import gradio as gr
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.retrievers import SVMRetriever
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
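
# NOTE: this script targets the legacy (pre-0.1) langchain package layout;
# OpenAIEmbeddings and ChatOpenAI read the OPENAI_API_KEY environment variable.
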
def load_data():
    # load every PDF under ./data (recursively)
    loader = DirectoryLoader("./data", glob="**/*.pdf", show_progress=True, loader_cls=PyPDFLoader)
    docs = loader.load()
    # replace newlines with spaces so PDF layout line breaks don't fragment sentences
    for doc in docs:
        doc.page_content = doc.page_content.replace("\n", " ")
    print(f"loaded {len(docs)} pages")
    # split the documents into overlapping chunks
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    all_splits = text_splitter.split_documents(docs)
    # construct the vector store over the chunks
    vectorstore = Chroma.from_documents(documents=all_splits, embedding=OpenAIEmbeddings())
    # https://python.langchain.com/docs/use_cases/question_answering.html#go-deeper-3
    svm_retriever = SVMRetriever.from_documents(all_splits, OpenAIEmbeddings())
    return svm_retriever, vectorstore
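
# build the retriever and vector store once at startup; every request reuses them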
svm_retriever, vectorstore = load_data()

def process_question(question, svm_retriever=svm_retriever, vectorstore=vectorstore):
    # the SVM retriever is only used to log how many chunks it finds;
    # the QA chain below retrieves through the Chroma vector store instead
    docs_svm = svm_retriever.get_relevant_documents(question)
    print(f"SVM retriever found {len(docs_svm)} relevant chunks")
    llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)
    qa_chain = RetrievalQA.from_chain_type(llm, retriever=vectorstore.as_retriever(), return_source_documents=True)
    result = qa_chain({"query": question})
    output = f"""
============RESULT==============

{result["result"]}

============SOURCES=============
"""
    # collect each source chunk as a block of lines
    lines = []
    source_docs = [(x.metadata["source"], x.page_content) for x in result["source_documents"]]
    for i, (source, content) in enumerate(source_docs):
        lines.append(f"* CHUNK: {i} *")
        lines.append(f"original doc: {source}")
        lines.append(content)
        lines.append("")  # blank line between chunks
    # join the blocks into a single multi-line string
    output += "\n".join(lines)
    return output
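
# wire the QA function into a simple text-in/text-out Gradio interface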
iface = gr.Interface(
    fn=process_question,  # the function to wrap
    inputs="text",  # the input type
    outputs="text",  # the output type
    examples=[
        ["What is the process of raising an incident?"],
        ["What is Cx0 program management?"],
        ["What is the process for identifying risks that can impact the desired outcomes of a project?"],
        ["What is the release management process?"],
    ],
)
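
# `python app.py` serves the app locally; passing share=True to launch()
# would additionally create a temporary public link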
if __name__ == "__main__":
    iface.launch()