# Streamlit app: Randstad Digital Doc QA
# (recovered from a Hugging Face Space listing that reported "Runtime error")
import os

import streamlit as st
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma

# Chat model used for answering; keep the model name configured in one place.
model = "gpt-3.5-turbo"

st.set_page_config(
    page_title="Randstad Digital Doc QA", page_icon=":robot_face:", layout="wide"
)
st.header("Randstad Digital Doc QA :robot_face:")

# BUG FIX: os.environ["OPENAI_API_KEY"] raises KeyError when the variable is
# unset, so the warning below could never be shown. Use .get() so the check
# actually runs and the user sees guidance instead of a crash.
openai_api_key = os.environ.get("OPENAI_API_KEY")
if not openai_api_key:
    st.warning(
        "Enter your OpenAI API key in the sidebar. You can get a key at"
        " https://platform.openai.com/account/api-keys."
    )
@st.cache_resource(show_spinner=False)
def load_data():
    """Load all PDFs under ./data, chunk them, and build a Chroma vector store.

    Cached with st.cache_resource so the expensive load/embed step runs once
    per session instead of on every Streamlit rerun (previously the whole
    corpus was re-indexed on each user interaction).

    Returns:
        Chroma: vector store over 500-char chunks (50-char overlap), each
        chunk prefixed with its source file name to aid retrieval grounding.
    """
    with st.spinner(
        text="Loading and indexing the documents – hang tight! This should take 1-2 minutes."
    ):
        # Load every PDF in ./data (recursively).
        loader = DirectoryLoader(
            "./data", glob="**/*.pdf", show_progress=True, loader_cls=PyPDFLoader
        )
        docs = loader.load()
        # Replace all newlines with spaces so chunking is not biased by
        # PDF line breaks.
        for doc in docs:
            doc.page_content = doc.page_content.replace("\n", " ")
        # Split the documents into overlapping chunks.
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
        all_splits = text_splitter.split_documents(docs)
        # Prefix each chunk with its source file so answers can be grouped
        # per document.
        for doc in all_splits:
            file_name = doc.metadata["source"]
            doc.page_content = f"document: {file_name}\n{doc.page_content}"
        # Construct the vector store from the chunks.
        vectorstore = Chroma.from_documents(
            documents=all_splits, embedding=OpenAIEmbeddings()
        )
        # https://python.langchain.com/docs/use_cases/question_answering.html#go-deeper-3
        # svm_retriever = SVMRetriever.from_documents(all_splits, OpenAIEmbeddings())
        return vectorstore
vectorstore = load_data()

# Main question form.
with st.form(key="qa_form"):
    # Typo fix: "documenation" -> "documentation".
    query = st.text_area("Ask me anything about the documentation!")
    submit = st.form_submit_button("Submit")

# Canned example queries; clicking a button submits that query.
with st.expander("Examples"):
    with st.form(key="ex1"):
        ex1_query = "what is the process of raising an incident?"
        if st.form_submit_button(ex1_query):
            query = ex1_query
            submit = True
        ex2_query = "what is the release management process?"
        if st.form_submit_button(ex2_query):
            query = ex2_query
            submit = True
        ex3_query = "What is process for identifying risks that can impact the desired outcomes of a project?"
        if st.form_submit_button(ex3_query):
            query = ex3_query
            submit = True
        ex4_query = "What is the process?"
        if st.form_submit_button(ex4_query):
            query = ex4_query
            submit = True
        ex5_query = "What is Cx0 program management?"
        if st.form_submit_button(ex5_query):
            # BUG FIX: this button previously assigned ex4_query
            # (copy-paste error), so example 5 ran the wrong question.
            query = ex5_query
            submit = True

with st.expander("Advanced Options"):
    return_all_chunks = st.checkbox("Group answer per document")
def is_query_valid(query: str) -> bool:
    """Validate the user's question.

    Returns True for a non-empty query; otherwise surfaces an error in the
    UI and returns False.
    """
    if query:
        return True
    st.error("Please enter a question!")
    return False
if submit:
    if not is_query_valid(query):
        st.stop()

    with st.spinner(text="Thinking about an answer ..."):
        # Output columns: answer on the left, source chunks on the right.
        answer_col, sources_col = st.columns(2)

        # llm = get_llm(model=model, openai_api_key=openai_api_key, temperature=0)
        # Consistency fix: use the module-level `model` constant instead of
        # repeating the hard-coded model name.
        llm = ChatOpenAI(model_name=model, temperature=0)
        qa_chain = RetrievalQA.from_chain_type(
            llm=llm,
            chain_type="stuff",
            retriever=vectorstore.as_retriever(search_kwargs={"k": 6}),
            return_source_documents=True,
        )

        SYSTEM_MESSAGE = (
            "You are an internal document expert and you respond to the query"
            " in 1 to 5 sentences. If the answer is a list, write bullet points."
        )
        if return_all_chunks:
            # BUG FIX: the original concatenation produced
            # "...bullet points.Group the answer per document" with no
            # separating space or terminating period.
            SYSTEM_MESSAGE += " Group the answer per document."
        SYSTEM_MESSAGE += " \n\nQuery:\n"
        result = qa_chain({"query": f"{SYSTEM_MESSAGE}{query}"})

        with answer_col:
            st.markdown("#### Answer")
            st.markdown(result["result"])

        with sources_col:
            st.markdown("#### Sources")
            # Dead code removed: a `lines` list was built here but never used.
            for i, source_doc in enumerate(result["source_documents"], start=1):
                st.markdown(f"* CHUNK: {i}")
                st.markdown(f"original doc: {source_doc.metadata['source']}")
                st.markdown(f"{source_doc.page_content}")