File size: 4,765 Bytes
5288ac6
 
 
 
 
9c2548e
 
 
 
611aebd
5288ac6
611aebd
5288ac6
 
 
611aebd
5288ac6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
611aebd
5288ac6
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
import os
import streamlit as st
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI

from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma

# Chat model used for answering questions (referenced again in the QA chain below).
model = "gpt-3.5-turbo"


st.set_page_config(
    page_title="Randstad Digital Doc QA", page_icon=":robot_face:", layout="wide"
)
st.header("Randstad Digital Doc QA :robot_face:")

# BUG FIX: os.environ["OPENAI_API_KEY"] raises KeyError when the variable is
# unset, so the warning below could never be shown. Use .get() so a missing
# key yields None and the user sees the instructions instead of a crash.
openai_api_key = os.environ.get("OPENAI_API_KEY")


if not openai_api_key:
    st.warning(
        "Enter your OpenAI API key in the sidebar. You can get a key at"
        " https://platform.openai.com/account/api-keys."
    )

@st.cache_resource(show_spinner=False)
def load_data():
    """Load PDFs from ./data, chunk them, and build a Chroma vector store.

    Cached via st.cache_resource, so the slow load/embed step runs only once
    per server process.

    Returns:
        Chroma: vector store over the chunked, embedded documents.
    """
    with st.spinner(
        text="Loading and indexing the documents – hang tight! This should take 1-2 minutes."
    ):
        # Load every PDF found recursively under ./data.
        loader = DirectoryLoader(
            "./data", glob="**/*.pdf", show_progress=True, loader_cls=PyPDFLoader
        )
        docs = loader.load()

        # Flatten hard line breaks so the splitter sees continuous text.
        # (Direct attribute assignment instead of setattr with a literal name.)
        for doc in docs:
            doc.page_content = doc.page_content.replace("\n", " ")

        # Split the documents into overlapping chunks.
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
        all_splits = text_splitter.split_documents(docs)

        # Prefix each chunk with its source file so the LLM can attribute
        # (and, when asked, group) answers per document.
        for doc in all_splits:
            file_name = doc.metadata["source"]
            doc.page_content = f"document: {file_name}\n{doc.page_content}"

        # Construct the vector store from the chunks using OpenAI embeddings.
        vectorstore = Chroma.from_documents(
            documents=all_splits, embedding=OpenAIEmbeddings()
        )
        # https://python.langchain.com/docs/use_cases/question_answering.html#go-deeper-3
        # svm_retriever = SVMRetriever.from_documents(all_splits, OpenAIEmbeddings())
        return vectorstore


# Build (or fetch the cached) vector store once at app start.
vectorstore = load_data()

# Main question form: free-text query plus a submit button.
with st.form(key="qa_form"):
    # Typo fix: "documenation" -> "documentation".
    query = st.text_area("Ask me anything about the documentation!")
    submit = st.form_submit_button("Submit")

# Canned example questions — clicking one submits it as the query.
with st.expander("Examples"):
    with st.form(key="ex1"):
        ex1_query = "what is the process of raising an incident?"
        if st.form_submit_button(ex1_query):
            query = ex1_query
            submit = True
        ex2_query = "what is the release management process?"
        if st.form_submit_button(ex2_query):
            query = ex2_query
            submit = True
        ex3_query = "What is process for identifying risks that can impact the desired outcomes of a project?"
        if st.form_submit_button(ex3_query):
            query = ex3_query
            submit = True
        ex4_query = "What is the process?"
        if st.form_submit_button(ex4_query):
            query = ex4_query
            submit = True
        ex5_query = "What is Cx0 program management?"
        if st.form_submit_button(ex5_query):
            # BUG FIX: this button previously assigned ex4_query (copy-paste error).
            query = ex5_query
            submit = True


# Advanced options: when checked, the prompt built below asks the LLM to
# group its answer per source document.
with st.expander("Advanced Options"):
    return_all_chunks = st.checkbox("Group answer per document")


def is_query_valid(query: str) -> bool:
    """Return True for a non-empty query; otherwise show an error and return False."""
    if query:
        return True
    st.error("Please enter a question!")
    return False


if submit:
    # Reject empty queries before doing any work.
    if not is_query_valid(query):
        st.stop()
    with st.spinner(text="Thinking about an answer ..."):
        # Output columns: answer on the left, source chunks on the right.
        answer_col, sources_col = st.columns(2)

        # llm = get_llm(model=model, openai_api_key=openai_api_key, temperature=0)
        # Use the module-level `model` constant instead of repeating the literal.
        llm = ChatOpenAI(model_name=model, temperature=0)
        qa_chain = RetrievalQA.from_chain_type(
            llm=llm,
            chain_type="stuff",
            retriever=vectorstore.as_retriever(search_kwargs={"k": 6}),
            return_source_documents=True,
        )
        # Instruction prepended to the user's query.
        SYSTEM_MESSAGE = (
            "You are an internal document expert and you respond to the query in"
            " 1 to 5 sentences. If the answer is a list, write bullet points."
        )
        if return_all_chunks:
            # BUG FIX: was appended without a separator, producing
            # "...bullet points.Group the answer per document".
            SYSTEM_MESSAGE += " Group the answer per document."
        SYSTEM_MESSAGE += " \n\nQuery:\n"
        result = qa_chain({"query": f"{SYSTEM_MESSAGE}{query}"})

    with answer_col:
        st.markdown("#### Answer")
        st.markdown(result["result"])

    with sources_col:
        st.markdown("#### Sources")
        # Show each retrieved chunk alongside the file it came from.
        # (Removed the unused `lines` accumulator and the `source_docs`
        # intermediate with opaque doc[0]/doc[1] indexing.)
        for i, doc in enumerate(result["source_documents"], start=1):
            st.markdown(f"* CHUNK: {i}")
            st.markdown(f"original doc: {doc.metadata['source']}")
            st.markdown(f"{doc.page_content}")