Vincent Claes committed on
Commit 5288ac6
1 Parent(s): ee30e14

working version with streamlit

Files changed (3):
  1. README.md +2 -3
  2. app.py +129 -70
  3. requirements.txt +2 -3
README.md CHANGED
@@ -3,13 +3,12 @@ title: Internal DOC QA
  emoji:
  colorFrom: purple
  colorTo: blue
- sdk: gradio
- sdk_version: 3.39.0
+ sdk: streamlit
  app_file: app.py
  pinned: false
  ---

- # Internal DOC QA
+ # Internal DOC QA

  ```bash
  make deps
app.py CHANGED
@@ -1,78 +1,137 @@
- import gradio as gr
  from langchain.document_loaders import PyPDFLoader, DirectoryLoader
  from langchain.text_splitter import RecursiveCharacterTextSplitter
  from langchain.embeddings import OpenAIEmbeddings
  from langchain.vectorstores import Chroma
- from langchain.retrievers import SVMRetriever
- from langchain.chains import RetrievalQA
- from langchain.chat_models import ChatOpenAI
-
-
- def load_data():
-     # load the documents
-     loader = DirectoryLoader('./data', glob="**/*.pdf", show_progress=True, loader_cls=PyPDFLoader)
-     docs = loader.load()
-     # replace all new lines with spaces
-     [setattr(doc, "page_content", doc.page_content.replace("\n", " ")) for doc in docs]
-     print(docs)
-
-     # split the documents into chunks
-     text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
-     all_splits = text_splitter.split_documents(docs)
-
-     # construct vector store
-     vectorstore = Chroma.from_documents(documents=all_splits, embedding=OpenAIEmbeddings())
-     # https://python.langchain.com/docs/use_cases/question_answering.html#go-deeper-3
-     svm_retriever = SVMRetriever.from_documents(all_splits, OpenAIEmbeddings())
-     return svm_retriever, vectorstore
-
- svm_retriever, vectorstore = load_data()
-
- def process_question(question, history, svm_retriever=svm_retriever, vectorstore=vectorstore):
-     docs_svm = svm_retriever.get_relevant_documents(question)
-     print(len(docs_svm))
-     llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)
-     qa_chain = RetrievalQA.from_chain_type(llm, retriever=vectorstore.as_retriever(), return_source_documents=True)
-     result = qa_chain({"query": question})
-
-     output = f"""============RESULT==============
-     \n
-     {result["result"]}
-     \n
-     ============SOURCES=============
-     """
-
-     # Initialize an empty list to hold the lines
-     lines = []
-
-     source_docs = [(x.metadata["source"], x.page_content) for x in result["source_documents"]]
-     for i, doc in enumerate(source_docs):
-         lines.append(f"* CHUNK: {i} *")
-         lines.append(f"original doc: {doc[0]}")
-         lines.append(f"{doc[1]}")
-         lines.append('')  # for a newline between chunks
-
-     # Join the lines with a newline character to get the multi-line string
-     output += '\n'.join(lines)
-     return output
-
-
- iface = gr.ChatInterface(
-     title="Internal DOC QA",
-     theme=gr.themes.Soft,
-     fn=process_question,  # the function to wrap
-     # inputs="text",  # the input type
-     # outputs="text",  # the output type
-     examples=[
-         ["what is the process of raising an incident?"],
-         ["What is Cx0 program management?"],
-         ["What is the process for identifying risks that can impact the desired outcomes of a project?"],
-         ["What is the release management process?"],
-     ],
  )

- if __name__ == "__main__":
-     iface.launch()
+ import os
+ import streamlit as st
+ from langchain.chains import RetrievalQA
+ from langchain.chat_models import ChatOpenAI
+
  from langchain.document_loaders import PyPDFLoader, DirectoryLoader
  from langchain.text_splitter import RecursiveCharacterTextSplitter
  from langchain.embeddings import OpenAIEmbeddings
  from langchain.vectorstores import Chroma

+ model = "gpt-3.5-turbo"
+
+ st.set_page_config(
+     page_title="Randstad Digital Doc QA", page_icon=":robot_face:", layout="wide"
  )
+ st.header("Randstad Digital Doc QA :robot_face:")
+
+ openai_api_key = os.environ.get("OPENAI_API_KEY")
+
+ if not openai_api_key:
+     st.warning(
+         "Set the OPENAI_API_KEY environment variable. You can get a key at"
+         " https://platform.openai.com/account/api-keys."
+     )
+
+
+ @st.cache_resource(show_spinner=False)
+ def load_data():
+     with st.spinner(
+         text="Loading and indexing the documents – hang tight! This should take 1-2 minutes."
+     ):
+         # load the documents
+         loader = DirectoryLoader(
+             "./data", glob="**/*.pdf", show_progress=True, loader_cls=PyPDFLoader
+         )
+         docs = loader.load()
+         # replace all new lines with spaces
+         for doc in docs:
+             setattr(doc, "page_content", doc.page_content.replace("\n", " "))
+
+         # split the documents into chunks
+         text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
+         all_splits = text_splitter.split_documents(docs)
+
+         # prefix each chunk with the name of its source document
+         for doc in all_splits:
+             file_name = doc.metadata["source"]
+             setattr(doc, "page_content", f"document: {file_name}\n{doc.page_content}")
+
+         # construct vector store
+         vectorstore = Chroma.from_documents(
+             documents=all_splits, embedding=OpenAIEmbeddings()
+         )
+         # https://python.langchain.com/docs/use_cases/question_answering.html#go-deeper-3
+         # svm_retriever = SVMRetriever.from_documents(all_splits, OpenAIEmbeddings())
+         return vectorstore
+
+
+ vectorstore = load_data()
+
+ with st.form(key="qa_form"):
+     query = st.text_area("Ask me anything about the documentation!")
+     submit = st.form_submit_button("Submit")
+
+ with st.expander("Examples"):
+     with st.form(key="ex1"):
+         ex1_query = "what is the process of raising an incident?"
+         if st.form_submit_button(ex1_query):
+             query = ex1_query
+             submit = True
+         ex2_query = "what is the release management process?"
+         if st.form_submit_button(ex2_query):
+             query = ex2_query
+             submit = True
+         ex3_query = "What is the process for identifying risks that can impact the desired outcomes of a project?"
+         if st.form_submit_button(ex3_query):
+             query = ex3_query
+             submit = True
+         ex4_query = "What is the process?"
+         if st.form_submit_button(ex4_query):
+             query = ex4_query
+             submit = True
+         ex5_query = "What is Cx0 program management?"
+         if st.form_submit_button(ex5_query):
+             query = ex5_query
+             submit = True
+
+
+ with st.expander("Advanced Options"):
+     return_all_chunks = st.checkbox("Group answer per document")
+
+
+ def is_query_valid(query: str) -> bool:
+     if not query:
+         st.error("Please enter a question!")
+         return False
+     return True
+
+
+ if submit:
+     if not is_query_valid(query):
+         st.stop()
+     with st.spinner(text="Thinking about an answer ..."):
+         # output columns
+         answer_col, sources_col = st.columns(2)
+
+         # llm = get_llm(model=model, openai_api_key=openai_api_key, temperature=0)
+         llm = ChatOpenAI(model_name=model, temperature=0)
+         qa_chain = RetrievalQA.from_chain_type(
+             llm=llm,
+             chain_type="stuff",
+             retriever=vectorstore.as_retriever(search_kwargs={"k": 6}),
+             return_source_documents=True,
+         )
+         SYSTEM_MESSAGE = "You are an internal document expert and you respond to the query in 1 to 5 sentences. If the answer is a list, write bullet points."
+         if return_all_chunks:
+             SYSTEM_MESSAGE += " Group the answer per document."
+         SYSTEM_MESSAGE += "\n\nQuery:\n"
+         result = qa_chain({"query": f"{SYSTEM_MESSAGE}{query}"})
+
+         with answer_col:
+             st.markdown("#### Answer")
+             st.markdown(result["result"])
+
+         with sources_col:
+             st.markdown("#### Sources")
+             source_docs = [
+                 (x.metadata["source"], x.page_content)
+                 for x in result["source_documents"]
+             ]
+             for i, doc in enumerate(source_docs, start=1):
+                 st.markdown(f"* CHUNK: {i}")
+                 st.markdown(f"original doc: {doc[0]}")
+                 st.markdown(f"{doc[1]}")
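One loose end in the new app.py: the commented-out `get_llm(...)` call refers to a helper that is not defined anywhere in this commit. A minimal sketch of what it could look like, assuming it does nothing more than construct the `ChatOpenAI` client with the key read earlier (the name and signature come from the commented call; the body is an assumption):

```python
from langchain.chat_models import ChatOpenAI


def get_llm(model: str, openai_api_key: str, temperature: float = 0):
    # Hypothetical helper, not part of this commit. Assumes it only wraps
    # ChatOpenAI, passing the API key explicitly instead of relying on the
    # OPENAI_API_KEY environment variable.
    return ChatOpenAI(
        model_name=model, openai_api_key=openai_api_key, temperature=temperature
    )
```

Note that `load_data()` is decorated with `@st.cache_resource`, so the PDF loading and Chroma indexing run once per process rather than on every Streamlit rerun; the spinner's 1-2 minute estimate only applies to the first load.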
requirements.txt CHANGED
@@ -1,7 +1,6 @@
+ streamlit
  openai
- chromadb
  langchain
+ chromadb
  pypdf
  tiktoken
- scikit-learn
- gradio
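A quick way to sanity-check the indexing and retrieval outside Streamlit is a standalone script that mirrors the parameters in app.py. A minimal sketch, assuming `./data` contains at least one PDF and `OPENAI_API_KEY` is exported (the script itself is not part of this commit):

```python
# Hypothetical smoke test, reusing the same loading, splitting, and retrieval
# settings as app.py. Assumes OPENAI_API_KEY is set and ./data holds PDFs.
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import DirectoryLoader, PyPDFLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma

# load and chunk the documents exactly as the app does
docs = DirectoryLoader("./data", glob="**/*.pdf", loader_cls=PyPDFLoader).load()
splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
splits = splitter.split_documents(docs)

# index the chunks and build the QA chain
vectorstore = Chroma.from_documents(documents=splits, embedding=OpenAIEmbeddings())
qa_chain = RetrievalQA.from_chain_type(
    llm=ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0),
    chain_type="stuff",
    retriever=vectorstore.as_retriever(search_kwargs={"k": 6}),
    return_source_documents=True,
)

result = qa_chain({"query": "What is the release management process?"})
print(result["result"])
for doc in result["source_documents"]:
    print(doc.metadata["source"])
```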