File size: 4,765 Bytes
5288ac6
 
 
 
 
9c2548e
 
 
 
611aebd
5288ac6
611aebd
5288ac6
 
 
611aebd
5288ac6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
611aebd
5288ac6
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
import os
import streamlit as st
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI

from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma

# Chat model used for answering questions (referenced again in the QA chain below).
model = "gpt-3.5-turbo"


st.set_page_config(
    page_title="Randstad Digital Doc QA", page_icon=":robot_face:", layout="wide"
)
st.header("Randstad Digital Doc QA :robot_face:")

# BUG FIX: os.environ["OPENAI_API_KEY"] raises KeyError when the variable is
# unset, so the warning below could never be shown. Use .get() so a missing
# key yields None and the user sees the instructions instead of a crash.
openai_api_key = os.environ.get("OPENAI_API_KEY")


if not openai_api_key:
    st.warning(
        "Enter your OpenAI API key in the sidebar. You can get a key at"
        " https://platform.openai.com/account/api-keys."
    )

@st.cache_resource(show_spinner=False)
def load_data():
    """Load PDFs from ./data, chunk them, and build a Chroma vector store.

    Cached via st.cache_resource, so the slow load/embed step runs only once
    per server process.

    Returns:
        Chroma: vector store over the chunked, embedded documents.
    """
    with st.spinner(
        text="Loading and indexing the documents – hang tight! This should take 1-2 minutes."
    ):
        # Load every PDF found recursively under ./data.
        loader = DirectoryLoader(
            "./data", glob="**/*.pdf", show_progress=True, loader_cls=PyPDFLoader
        )
        docs = loader.load()

        # Flatten hard line breaks so the splitter sees continuous text.
        # (Direct attribute assignment instead of setattr with a literal name.)
        for doc in docs:
            doc.page_content = doc.page_content.replace("\n", " ")

        # Split the documents into overlapping chunks.
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
        all_splits = text_splitter.split_documents(docs)

        # Prefix each chunk with its source file so the LLM can attribute
        # (and, when asked, group) answers per document.
        for doc in all_splits:
            file_name = doc.metadata["source"]
            doc.page_content = f"document: {file_name}\n{doc.page_content}"

        # Construct the vector store from the chunks using OpenAI embeddings.
        vectorstore = Chroma.from_documents(
            documents=all_splits, embedding=OpenAIEmbeddings()
        )
        # https://python.langchain.com/docs/use_cases/question_answering.html#go-deeper-3
        # svm_retriever = SVMRetriever.from_documents(all_splits, OpenAIEmbeddings())
        return vectorstore


# Build (or fetch the cached) vector store once at app start.
vectorstore = load_data()

# Main question form: free-text query plus a submit button.
with st.form(key="qa_form"):
    # Typo fix: "documenation" -> "documentation".
    query = st.text_area("Ask me anything about the documentation!")
    submit = st.form_submit_button("Submit")

# Canned example questions — clicking one submits it as the query.
with st.expander("Examples"):
    with st.form(key="ex1"):
        ex1_query = "what is the process of raising an incident?"
        if st.form_submit_button(ex1_query):
            query = ex1_query
            submit = True
        ex2_query = "what is the release management process?"
        if st.form_submit_button(ex2_query):
            query = ex2_query
            submit = True
        ex3_query = "What is process for identifying risks that can impact the desired outcomes of a project?"
        if st.form_submit_button(ex3_query):
            query = ex3_query
            submit = True
        ex4_query = "What is the process?"
        if st.form_submit_button(ex4_query):
            query = ex4_query
            submit = True
        ex5_query = "What is Cx0 program management?"
        if st.form_submit_button(ex5_query):
            # BUG FIX: this button previously assigned ex4_query (copy-paste error).
            query = ex5_query
            submit = True


# Advanced options: when checked, the prompt built below asks the LLM to
# group its answer per source document.
with st.expander("Advanced Options"):
    return_all_chunks = st.checkbox("Group answer per document")


def is_query_valid(query: str) -> bool:
    """Return True for a non-empty query; otherwise show an error and return False."""
    if query:
        return True
    st.error("Please enter a question!")
    return False


if submit:
    # Reject empty queries before doing any work.
    if not is_query_valid(query):
        st.stop()
    with st.spinner(text="Thinking about an answer ..."):
        # Output columns: answer on the left, source chunks on the right.
        answer_col, sources_col = st.columns(2)

        # llm = get_llm(model=model, openai_api_key=openai_api_key, temperature=0)
        # Use the module-level `model` constant instead of repeating the literal.
        llm = ChatOpenAI(model_name=model, temperature=0)
        qa_chain = RetrievalQA.from_chain_type(
            llm=llm,
            chain_type="stuff",
            retriever=vectorstore.as_retriever(search_kwargs={"k": 6}),
            return_source_documents=True,
        )
        # Instruction prepended to the user's query.
        SYSTEM_MESSAGE = (
            "You are an internal document expert and you respond to the query in"
            " 1 to 5 sentences. If the answer is a list, write bullet points."
        )
        if return_all_chunks:
            # BUG FIX: was appended without a separator, producing
            # "...bullet points.Group the answer per document".
            SYSTEM_MESSAGE += " Group the answer per document."
        SYSTEM_MESSAGE += " \n\nQuery:\n"
        result = qa_chain({"query": f"{SYSTEM_MESSAGE}{query}"})

    with answer_col:
        st.markdown("#### Answer")
        st.markdown(result["result"])

    with sources_col:
        st.markdown("#### Sources")
        # Show each retrieved chunk alongside the file it came from.
        # (Removed the unused `lines` accumulator and the `source_docs`
        # intermediate with opaque doc[0]/doc[1] indexing.)
        for i, doc in enumerate(result["source_documents"], start=1):
            st.markdown(f"* CHUNK: {i}")
            st.markdown(f"original doc: {doc.metadata['source']}")
            st.markdown(f"{doc.page_content}")