import os

import streamlit as st
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import DirectoryLoader, PyPDFLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma

model = "gpt-3.5-turbo"

st.set_page_config(
    page_title="Randstad Digital Doc QA", page_icon=":robot_face:", layout="wide"
)
st.header("Randstad Digital Doc QA :robot_face:")

# Use .get() so a missing key shows the warning instead of raising a KeyError.
openai_api_key = os.environ.get("OPENAI_API_KEY", "")
if not openai_api_key:
    st.warning(
        "Set the OPENAI_API_KEY environment variable. You can get a key at"
        " https://platform.openai.com/account/api-keys."
    )


@st.cache_resource(show_spinner=False)
def load_data():
    """Load PDFs from ./data, split them into chunks, and index them in Chroma."""
    with st.spinner(
        text="Loading and indexing the documents – hang tight! This should take 1-2 minutes."
    ):
        # Load every PDF under ./data, recursively.
        loader = DirectoryLoader(
            "./data", glob="**/*.pdf", show_progress=True, loader_cls=PyPDFLoader
        )
        docs = loader.load()

        # Replace newlines with spaces so PDF line breaks don't cut through sentences.
        for doc in docs:
            doc.page_content = doc.page_content.replace("\n", " ")

        # Split the documents into overlapping chunks.
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
        all_splits = text_splitter.split_documents(docs)

        # Prefix each chunk with its source file so the model can attribute answers.
        for doc in all_splits:
            file_name = doc.metadata["source"]
            doc.page_content = f"document: {file_name}\n{doc.page_content}"

        # Construct the vector store. For alternative retrievers, see
        # https://python.langchain.com/docs/use_cases/question_answering.html#go-deeper-3
        # e.g. svm_retriever = SVMRetriever.from_documents(all_splits, OpenAIEmbeddings())
        vectorstore = Chroma.from_documents(
            documents=all_splits, embedding=OpenAIEmbeddings()
        )
        return vectorstore


vectorstore = load_data()

with st.form(key="qa_form"):
    query = st.text_area("Ask me anything about the documentation!")
    submit = st.form_submit_button("Submit")

with st.expander("Examples"):
    with st.form(key="examples"):
        # Each button submits its own example question.
        example_queries = [
            "what is the process of raising an incident?",
            "what is the release management process?",
            "What is the process for identifying risks that can impact the desired outcomes of a project?",
            "What is the process?",
            "What is Cx0 program management?",
        ]
        for example_query in example_queries:
            if st.form_submit_button(example_query):
                query = example_query
                submit = True

with st.expander("Advanced Options"):
    group_per_document = st.checkbox("Group answer per document")


def is_query_valid(query: str) -> bool:
    if not query:
        st.error("Please enter a question!")
        return False
    return True


if submit:
    if not is_query_valid(query):
        st.stop()

    with st.spinner(text="Thinking about an answer ..."):
        # Output columns: answer on the left, source chunks on the right.
        answer_col, sources_col = st.columns(2)

        llm = ChatOpenAI(model_name=model, temperature=0)
        qa_chain = RetrievalQA.from_chain_type(
            llm=llm,
            chain_type="stuff",
            retriever=vectorstore.as_retriever(search_kwargs={"k": 6}),
            return_source_documents=True,
        )

        system_message = (
            "You are an internal document expert and you respond to the query"
            " in 1 to 5 sentences. If the answer is a list, write bullet points."
        )
        if group_per_document:
            system_message += " Group the answer per document."
        system_message += "\n\nQuery:\n"

        result = qa_chain({"query": f"{system_message}{query}"})

        with answer_col:
            st.markdown("#### Answer")
            st.markdown(result["result"])

        with sources_col:
            st.markdown("#### Sources")
            source_docs = [
                (doc.metadata["source"], doc.page_content)
                for doc in result["source_documents"]
            ]
            for i, (source, content) in enumerate(source_docs, start=1):
                st.markdown(f"* CHUNK: {i}")
                st.markdown(f"original doc: {source}")
                st.markdown(content)
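
# How to run locally — a sketch, not part of the app logic. The package list is
# inferred from the imports above and the file name "app.py" is an assumption:
#   pip install streamlit langchain openai chromadb pypdf
#   export OPENAI_API_KEY=sk-...
#   streamlit run app.py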