Vincent Claes committed on
Commit 5288ac6
1 Parent(s): ee30e14

working version with streamlit

Files changed (3):
  1. README.md +2 -3
  2. app.py +129 -70
  3. requirements.txt +2 -3
README.md CHANGED
@@ -3,13 +3,12 @@ title: Internal DOC QA
  emoji:
  colorFrom: purple
  colorTo: blue
- sdk: gradio
- sdk_version: 3.39.0
+ sdk: streamlit
  app_file: app.py
  pinned: false
  ---

- # Internal DOC QA
+ # Internal DOC QA

  ```bash
  make deps
app.py CHANGED
@@ -1,78 +1,137 @@
- import gradio as gr
  from langchain.document_loaders import PyPDFLoader, DirectoryLoader
  from langchain.text_splitter import RecursiveCharacterTextSplitter
  from langchain.embeddings import OpenAIEmbeddings
  from langchain.vectorstores import Chroma
- from langchain.retrievers import SVMRetriever
- from langchain.chains import RetrievalQA
- from langchain.chat_models import ChatOpenAI
-
-
- def load_data():
-     # load the documents
-     loader = DirectoryLoader('./data', glob="**/*.pdf", show_progress=True, loader_cls=PyPDFLoader)
-     docs = loader.load()
-     # replace all new lines with spaces
-     [setattr(doc, "page_content", doc.page_content.replace("\n", " ")) for doc in docs]
-     print(docs)
-
-     # split the documents into chunks
-     text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
-     all_splits = text_splitter.split_documents(docs)
-
-     # construct vector store
-     vectorstore = Chroma.from_documents(documents=all_splits, embedding=OpenAIEmbeddings())
-     # https://python.langchain.com/docs/use_cases/question_answering.html#go-deeper-3
-     svm_retriever = SVMRetriever.from_documents(all_splits, OpenAIEmbeddings())
-     return svm_retriever, vectorstore
-
- svm_retriever, vectorstore = load_data()
-
- def process_question(question, history, svm_retriever=svm_retriever, vectorstore=vectorstore):
-     docs_svm = svm_retriever.get_relevant_documents(question)
-     print(len(docs_svm))
-     llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)
-     qa_chain = RetrievalQA.from_chain_type(llm, retriever=vectorstore.as_retriever(), return_source_documents=True)
-     result = qa_chain({"query": question})
-
-     output = f"""============RESULT==============
-     \n
-     {result["result"]}
-     \n
-     ============SOURCES=============
-     """
-
-     # Initialize an empty list to hold the lines
-     lines = []
-
-     source_docs = [(x.metadata["source"], x.page_content) for x in result["source_documents"]]
-     for i, doc in enumerate(source_docs):
-         lines.append(f"* CHUNK: {i} *")
-         lines.append(f"original doc: {doc[0]}")
-         lines.append(f"{doc[1]}")
-         lines.append('')  # for a newline between chunks
-
-     # Join the lines with a newline character to get the multi-line string
-     output += '\n'.join(lines)
-     return output
-
-
- iface = gr.ChatInterface(
-     title="Internal DOC QA",
-     theme=gr.themes.Soft,
-     fn=process_question,  # the function to wrap
-     # inputs="text",  # the input type
-     # outputs="text",  # the output type
-     examples=[
-         ["what is the process of raising an incident?"],
-         ["What is Cx0 program management?"],
-         ["What is the process for identifying risks that can impact the desired outcomes of a project?"],
-         ["What is the release management process?"],
-     ],
  )

- if __name__ == "__main__":
-     iface.launch()
+ import os
+ import streamlit as st
+ from langchain.chains import RetrievalQA
+ from langchain.chat_models import ChatOpenAI
+
  from langchain.document_loaders import PyPDFLoader, DirectoryLoader
  from langchain.text_splitter import RecursiveCharacterTextSplitter
  from langchain.embeddings import OpenAIEmbeddings
  from langchain.vectorstores import Chroma

+ model = "gpt-3.5-turbo"
+
+ st.set_page_config(
+     page_title="Randstad Digital Doc QA", page_icon=":robot_face:", layout="wide"
  )
+ st.header("Randstad Digital Doc QA :robot_face:")
+
+ openai_api_key = os.environ.get("OPENAI_API_KEY")
+
+ if not openai_api_key:
+     st.warning(
+         "Set the OPENAI_API_KEY environment variable. You can get a key at"
+         " https://platform.openai.com/account/api-keys."
+     )
+
+
+ @st.cache_resource(show_spinner=False)
+ def load_data():
+     with st.spinner(
+         text="Loading and indexing the documents – hang tight! This should take 1-2 minutes."
+     ):
+         # load the documents
+         loader = DirectoryLoader(
+             "./data", glob="**/*.pdf", show_progress=True, loader_cls=PyPDFLoader
+         )
+         docs = loader.load()
+         # replace all new lines with spaces
+         for doc in docs:
+             setattr(doc, "page_content", doc.page_content.replace("\n", " "))
+
+         # split the documents into chunks
+         text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
+         all_splits = text_splitter.split_documents(docs)
+
+         # prefix each chunk with the name of its source document
+         for doc in all_splits:
+             file_name = doc.metadata["source"]
+             setattr(doc, "page_content", f"document: {file_name}\n{doc.page_content}")
+
+         # construct vector store
+         vectorstore = Chroma.from_documents(
+             documents=all_splits, embedding=OpenAIEmbeddings()
+         )
+         # https://python.langchain.com/docs/use_cases/question_answering.html#go-deeper-3
+         # svm_retriever = SVMRetriever.from_documents(all_splits, OpenAIEmbeddings())
+         return vectorstore
+
+
+ vectorstore = load_data()
+
+ with st.form(key="qa_form"):
+     query = st.text_area("Ask me anything about the documentation!")
+     submit = st.form_submit_button("Submit")
+
+ with st.expander("Examples"):
+     with st.form(key="ex1"):
+         ex1_query = "what is the process of raising an incident?"
+         if st.form_submit_button(ex1_query):
+             query = ex1_query
+             submit = True
+         ex2_query = "what is the release management process?"
+         if st.form_submit_button(ex2_query):
+             query = ex2_query
+             submit = True
+         ex3_query = "What is the process for identifying risks that can impact the desired outcomes of a project?"
+         if st.form_submit_button(ex3_query):
+             query = ex3_query
+             submit = True
+         ex4_query = "What is the process?"
+         if st.form_submit_button(ex4_query):
+             query = ex4_query
+             submit = True
+         ex5_query = "What is Cx0 program management?"
+         if st.form_submit_button(ex5_query):
+             query = ex5_query
+             submit = True
+
+
+ with st.expander("Advanced Options"):
+     return_all_chunks = st.checkbox("Group answer per document")
+
+
+ def is_query_valid(query: str) -> bool:
+     if not query:
+         st.error("Please enter a question!")
+         return False
+     return True
+
+
+ if submit:
+     if not is_query_valid(query):
+         st.stop()
+     with st.spinner(text="Thinking about an answer ..."):
+         # output columns
+         answer_col, sources_col = st.columns(2)
+
+         # llm = get_llm(model=model, openai_api_key=openai_api_key, temperature=0)
+         llm = ChatOpenAI(model_name=model, temperature=0)
+         qa_chain = RetrievalQA.from_chain_type(
+             llm=llm,
+             chain_type="stuff",
+             retriever=vectorstore.as_retriever(search_kwargs={"k": 6}),
+             return_source_documents=True,
+         )
+         SYSTEM_MESSAGE = "You are an internal document expert and you respond to the query in 1 to 5 sentences. If the answer is a list, write bullet points."
+         if return_all_chunks:
+             SYSTEM_MESSAGE += " Group the answer per document."
+         SYSTEM_MESSAGE += "\n\nQuery:\n"
+         result = qa_chain({"query": f"{SYSTEM_MESSAGE}{query}"})
+
+         with answer_col:
+             st.markdown("#### Answer")
+             st.markdown(result["result"])
+
+         with sources_col:
+             st.markdown("#### Sources")
+             source_docs = [
+                 (x.metadata["source"], x.page_content)
+                 for x in result["source_documents"]
+             ]
+             for i, doc in enumerate(source_docs, start=1):
+                 st.markdown(f"* CHUNK: {i}")
+                 st.markdown(f"original doc: {doc[0]}")
+                 st.markdown(f"{doc[1]}")
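One loose end in the new app.py: the commented-out `get_llm(...)` call refers to a helper that is not defined anywhere in this commit. A minimal sketch of what it could look like, assuming it does nothing more than construct the `ChatOpenAI` client with the key read earlier (the name and signature come from the commented call; the body is an assumption):

```python
from langchain.chat_models import ChatOpenAI


def get_llm(model: str, openai_api_key: str, temperature: float = 0):
    # Hypothetical helper, not part of this commit. Assumes it only wraps
    # ChatOpenAI, passing the API key explicitly instead of relying on the
    # OPENAI_API_KEY environment variable.
    return ChatOpenAI(
        model_name=model, openai_api_key=openai_api_key, temperature=temperature
    )
```

Note that `load_data()` is decorated with `@st.cache_resource`, so the PDF loading and Chroma indexing run once per process rather than on every Streamlit rerun; the spinner's 1-2 minute estimate only applies to the first load.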
requirements.txt CHANGED
@@ -1,7 +1,6 @@
+ streamlit
  openai
- chromadb
  langchain
+ chromadb
  pypdf
  tiktoken
- scikit-learn
- gradio
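A quick way to sanity-check the indexing and retrieval outside Streamlit is a standalone script that mirrors the parameters in app.py. A minimal sketch, assuming `./data` contains at least one PDF and `OPENAI_API_KEY` is exported (the script itself is not part of this commit):

```python
# Hypothetical smoke test, reusing the same loading, splitting, and retrieval
# settings as app.py. Assumes OPENAI_API_KEY is set and ./data holds PDFs.
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import DirectoryLoader, PyPDFLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma

# load and chunk the documents exactly as the app does
docs = DirectoryLoader("./data", glob="**/*.pdf", loader_cls=PyPDFLoader).load()
splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
splits = splitter.split_documents(docs)

# index the chunks and build the QA chain
vectorstore = Chroma.from_documents(documents=splits, embedding=OpenAIEmbeddings())
qa_chain = RetrievalQA.from_chain_type(
    llm=ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0),
    chain_type="stuff",
    retriever=vectorstore.as_retriever(search_kwargs={"k": 6}),
    return_source_documents=True,
)

result = qa_chain({"query": "What is the release management process?"})
print(result["result"])
for doc in result["source_documents"]:
    print(doc.metadata["source"])
```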