# Streamlit app: Randstad Digital Doc QA
# (recovered from a Hugging Face Space listing that reported "Runtime error")
import os

import streamlit as st
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma

# Chat model used for answering; keep the model name configured in one place.
model = "gpt-3.5-turbo"

st.set_page_config(
    page_title="Randstad Digital Doc QA", page_icon=":robot_face:", layout="wide"
)
st.header("Randstad Digital Doc QA :robot_face:")

# BUG FIX: os.environ["OPENAI_API_KEY"] raises KeyError when the variable is
# unset, so the warning below could never be shown. Use .get() so the check
# actually runs and the user sees guidance instead of a crash.
openai_api_key = os.environ.get("OPENAI_API_KEY")
if not openai_api_key:
    st.warning(
        "Enter your OpenAI API key in the sidebar. You can get a key at"
        " https://platform.openai.com/account/api-keys."
    )
@st.cache_resource(show_spinner=False)
def load_data():
    """Load all PDFs under ./data, chunk them, and build a Chroma vector store.

    Cached with st.cache_resource so the expensive load/embed step runs once
    per session instead of on every Streamlit rerun (previously the whole
    corpus was re-indexed on each user interaction).

    Returns:
        Chroma: vector store over 500-char chunks (50-char overlap), each
        chunk prefixed with its source file name to aid retrieval grounding.
    """
    with st.spinner(
        text="Loading and indexing the documents – hang tight! This should take 1-2 minutes."
    ):
        # Load every PDF in ./data (recursively).
        loader = DirectoryLoader(
            "./data", glob="**/*.pdf", show_progress=True, loader_cls=PyPDFLoader
        )
        docs = loader.load()
        # Replace all newlines with spaces so chunking is not biased by
        # PDF line breaks.
        for doc in docs:
            doc.page_content = doc.page_content.replace("\n", " ")
        # Split the documents into overlapping chunks.
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
        all_splits = text_splitter.split_documents(docs)
        # Prefix each chunk with its source file so answers can be grouped
        # per document.
        for doc in all_splits:
            file_name = doc.metadata["source"]
            doc.page_content = f"document: {file_name}\n{doc.page_content}"
        # Construct the vector store from the chunks.
        vectorstore = Chroma.from_documents(
            documents=all_splits, embedding=OpenAIEmbeddings()
        )
        # https://python.langchain.com/docs/use_cases/question_answering.html#go-deeper-3
        # svm_retriever = SVMRetriever.from_documents(all_splits, OpenAIEmbeddings())
        return vectorstore
vectorstore = load_data()

# Main question form.
with st.form(key="qa_form"):
    # Typo fix: "documenation" -> "documentation".
    query = st.text_area("Ask me anything about the documentation!")
    submit = st.form_submit_button("Submit")

# Canned example queries; clicking a button submits that query.
with st.expander("Examples"):
    with st.form(key="ex1"):
        ex1_query = "what is the process of raising an incident?"
        if st.form_submit_button(ex1_query):
            query = ex1_query
            submit = True
        ex2_query = "what is the release management process?"
        if st.form_submit_button(ex2_query):
            query = ex2_query
            submit = True
        ex3_query = "What is process for identifying risks that can impact the desired outcomes of a project?"
        if st.form_submit_button(ex3_query):
            query = ex3_query
            submit = True
        ex4_query = "What is the process?"
        if st.form_submit_button(ex4_query):
            query = ex4_query
            submit = True
        ex5_query = "What is Cx0 program management?"
        if st.form_submit_button(ex5_query):
            # BUG FIX: this button previously assigned ex4_query
            # (copy-paste error), so example 5 ran the wrong question.
            query = ex5_query
            submit = True

with st.expander("Advanced Options"):
    return_all_chunks = st.checkbox("Group answer per document")
def is_query_valid(query: str) -> bool:
    """Validate the user's question.

    Returns True for a non-empty query; otherwise surfaces an error in the
    UI and returns False.
    """
    if query:
        return True
    st.error("Please enter a question!")
    return False
if submit:
    if not is_query_valid(query):
        st.stop()

    with st.spinner(text="Thinking about an answer ..."):
        # Output columns: answer on the left, source chunks on the right.
        answer_col, sources_col = st.columns(2)

        # llm = get_llm(model=model, openai_api_key=openai_api_key, temperature=0)
        # Consistency fix: use the module-level `model` constant instead of
        # repeating the hard-coded model name.
        llm = ChatOpenAI(model_name=model, temperature=0)
        qa_chain = RetrievalQA.from_chain_type(
            llm=llm,
            chain_type="stuff",
            retriever=vectorstore.as_retriever(search_kwargs={"k": 6}),
            return_source_documents=True,
        )

        SYSTEM_MESSAGE = (
            "You are an internal document expert and you respond to the query"
            " in 1 to 5 sentences. If the answer is a list, write bullet points."
        )
        if return_all_chunks:
            # BUG FIX: the original concatenation produced
            # "...bullet points.Group the answer per document" with no
            # separating space or terminating period.
            SYSTEM_MESSAGE += " Group the answer per document."
        SYSTEM_MESSAGE += " \n\nQuery:\n"
        result = qa_chain({"query": f"{SYSTEM_MESSAGE}{query}"})

        with answer_col:
            st.markdown("#### Answer")
            st.markdown(result["result"])

        with sources_col:
            st.markdown("#### Sources")
            # Dead code removed: a `lines` list was built here but never used.
            for i, source_doc in enumerate(result["source_documents"], start=1):
                st.markdown(f"* CHUNK: {i}")
                st.markdown(f"original doc: {source_doc.metadata['source']}")
                st.markdown(f"{source_doc.page_content}")