fracapuano committed on
Commit
4f5c619
1 Parent(s): 0e17089

fix: bug fixing inheritance

Browse files
Files changed (1) hide show
  1. qa/qa.py +130 -79
qa/qa.py CHANGED
@@ -5,11 +5,63 @@ from typing import Text, Union
5
 
6
  multiple_files = True
7
 
8
- def clear_submit():
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
  """
10
- Toggles the file_submitted internal session state variable to False.
11
  """
12
- st.session_state["file_submitted"] = False
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
 
14
  def set_openai_api_key(api_key:Text)->bool:
15
  """Sets the internal OpenAI API key to the given value.
@@ -17,15 +69,14 @@ def set_openai_api_key(api_key:Text)->bool:
17
  Args:
18
  api_key (Text): OpenAI API key
19
  """
20
- if not (api_key.startswith('sk-') and len(api_key)==51):
21
- st.error("Invalid OpenAI API key! Please provide a valid key.")
22
- return False
23
 
24
  st.session_state["OPENAI_API_KEY"] = api_key
25
  st.session_state["api_key_configured"] = True
26
  return True
27
 
28
- def file_to_doc(file:Union[PDFFile, DocxFile, TxtFile, CodeFile]) -> None:
29
  """Converts a file to a document using specialized parsers."""
30
  if file.name.endswith(".pdf"):
31
  doc = parse_pdf(file)
@@ -43,14 +94,9 @@ def file_to_doc(file:Union[PDFFile, DocxFile, TxtFile, CodeFile]) -> None:
43
  # def document_embedding_pipeline(file:Union[PDFFile, DocxFile, TxtFile, CodeFile]) -> None:
44
 
45
  def qa_main():
46
- st.markdown("<h2>This app allows to chat with files!</h2>", unsafe_allow_html=True)
47
  st.write("Just upload something using and start chatting with a version of GPT4 that has read the file!")
48
 
49
- index = None
50
- doc = None
51
-
52
- upload_document_greenlight = False
53
- uploaded_processed_document_greenlight = False
54
  # OpenAI API Key - TODO: consider adding a key valid for everyone
55
  # st.header("Configure OpenAI API Key")
56
  # st.warning('Please enter your OpenAI API Key!', icon='⚠️')
@@ -63,88 +109,93 @@ def qa_main():
63
  # help="You can get your API key from https://platform.openai.com/account/api-keys.",
64
  # value=st.session_state.get("OPENAI_API_KEY", ""),
65
  # )
 
66
  user_secret = st.secrets["OPENAI_API_KEY"]
67
  if user_secret:
68
  if set_openai_api_key(user_secret):
69
- st.success('OpenAI API key successfully accessed!', icon='✅')
70
- upload_document_greenlight = True
 
 
 
71
 
72
- if upload_document_greenlight:
73
  # File that needs to be queried
74
  st.header("Upload a file")
75
- uploaded_file = st.file_uploader(
76
  "Upload a pdf, docx, or txt file (scanned documents not supported)",
77
  type=["pdf", "docx", "txt", "py", "json", "html", "css", "md"],
78
  help="Scanned documents are not supported yet 🥲",
79
- on_change=clear_submit,
80
- accept_multiple_files=multiple_files
 
81
  )
82
-
83
- # reading the uploaded files
84
- text = []
85
- if len(uploaded_file) != 0:
86
- # toggle internal file submission state to True
87
- st.session_state["file_submitted"] = True
88
- for file in uploaded_file:
89
- # parse the file using custom parsers
90
- file_doc = file_to_doc(file)
91
- # converts the files into a list of documents
92
- file_text = text_to_docs(text=tuple(file_doc), file_name=file.name)
93
- text.extend(file_text)
94
 
95
- # embeds the documents using OpenAI API
96
  try:
97
- with st.spinner("Indexing the document... This might take a while!"):
98
- index = embed_docs(tuple(text))
99
- st.session_state["api_key_configured"] = True
100
  except OpenAIError as e:
101
  st.error("OpenAI error encountered: ", e._message)
102
-
103
- uploaded_processed_document_greenlight = True
104
-
105
- if uploaded_processed_document_greenlight:
106
- if "messages" not in st.session_state:
107
- st.session_state["messages"] = []
108
-
109
- for message in st.session_state.messages:
110
- with st.chat_message(message["role"]):
111
- st.markdown(message["content"])
112
-
113
- if prompt := st.chat_input("Ask the document something..."):
114
- st.session_state.messages.append({"role": "user", "content": prompt})
115
- with st.chat_message("user"):
116
- st.markdown(prompt)
117
-
118
- with st.chat_message("assistant"):
119
- message_placeholder = st.empty()
120
- # retrieving the most relevant sources
121
- sources = search_docs(index, prompt)
122
- # producing the answer, live
123
- full_response = ""
124
- for answer_bit in get_answer(sources, prompt)["output_text"]:
125
- full_response += answer_bit
126
- message_placeholder.markdown(full_response + "▌")
127
-
128
- message_placeholder.markdown(full_response)
129
 
130
- # answer = get_answer(sources, prompt)
131
- # message_placeholder.markdown(answer["output_text"])
132
-
133
- # st.session_state.messages.append({"role": "assistant", "content": answer["output_text"]})
134
- st.session_state.messages.append({"role": "assistant", "content": full_response})
 
 
 
 
 
 
 
 
 
135
 
136
- # This might be useful to add memory to the chatbot harnessing a more low-level approach
137
- # llm = ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo")
 
138
 
139
- # memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True, output_key='answer')
140
- # retriever = your_vector_store.as_retriever()
 
 
141
 
142
- # # Create the multipurpose chain
143
- # qachat = ConversationalRetrievalChain.from_llm(
144
- # llm=ChatOpenAI(temperature=0),
145
- # memory=memory,
146
- # retriever=retriever,
147
- # return_source_documents=True
148
- # )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
149
 
150
- # qachat("Ask your question here...")
 
5
 
6
  multiple_files = True
7
 
8
def query_pipeline(index:VectorStore, query:Text, stream_answer:bool=False)->Text:
    """Answers a query against a single index: retrieval first, then answer generation.

    Args:
        index (VectorStore): index searched for the sources relevant to the query.
        query (Text): question forwarded to both the search and the answering step.
        stream_answer (bool, optional): forwarded unchanged to get_answer. Defaults to False.

    Returns:
        Text: the "output_text" entry of the result returned by get_answer.
    """
    # looking up the pieces of the knowledge base that are relevant for this query
    relevant_sources = search_docs(index, query=query)
    # producing the answer from the retrieved sources
    response = get_answer(relevant_sources, query=query, stream_answer=stream_answer)

    return response["output_text"]
16
+
17
def toggle_process_document():
    """Flips the "processing_document_greenlight" flag stored in the session state.

    When the flag has never been set it is first seeded with True, so the very
    first call leaves it at False.
    """
    flag_name = "processing_document_greenlight"
    session = st.session_state

    # seeding the flag the first time this function runs
    if flag_name not in session:
        session[flag_name] = True

    # inverting whatever value is currently stored
    session[flag_name] = not session[flag_name]
23
+
24
def register_new_file_name(file_name):
    """
    Adds a file name to the "uploaded_file_names" list kept in the session state.
    The list is created on the first call.
    """
    session = st.session_state

    # creating the container the first time a name is registered
    if "uploaded_file_names" not in session:
        session["uploaded_file_names"] = []

    session["uploaded_file_names"].append(file_name)
32
+
33
def clear_index():
    """
    Clears the index from the internal session state.
    This is a non reversible operation. Nothing happens when no index is stored.
    """
    # both the guard and the documented intent target st.session_state: deleting
    # globals()["index"] instead raised KeyError whenever no module-level `index`
    # existed, and never removed the session entry that was just checked
    if "index" in st.session_state:
        del st.session_state["index"]
40
+
41
def clear_session_state():
    """
    Removes every entry from the session state, one key at a time.
    This is a non reversible operation.
    """
    session = st.session_state
    for stored_key in session.keys():
        del session[stored_key]
48
+
49
def register_new_file(new_file):
    """
    Adds the given file(s) to the "uploaded_files" list kept in the session state.
    The list is created on the first call; `new_file` is consumed with `extend`,
    so every element it yields is stored individually.
    """
    session = st.session_state

    # creating the container the first time a file is registered
    if "uploaded_files" not in session:
        session["uploaded_files"] = []

    session["uploaded_files"].extend(new_file)
57
+
58
def clear_all_files():
    """Resets the "uploaded_files" entry of the internal session state to an empty list."""
    no_files = []
    st.session_state["uploaded_files"] = no_files
61
+
62
def append_uploaded_files(file):
    """Appends the uploaded files to the internal session state.

    Args:
        file: iterable of uploaded files; every element is added to the list
            stored under "uploaded_files".
    """
    # .get("uploaded_files", []) extended a throwaway list whenever the key was
    # missing, silently dropping the files: the list is stored before extending
    if "uploaded_files" not in st.session_state:
        st.session_state["uploaded_files"] = []

    st.session_state["uploaded_files"].extend(file)
65
 
66
  def set_openai_api_key(api_key:Text)->bool:
67
  """Sets the internal OpenAI API key to the given value.
 
69
  Args:
70
  api_key (Text): OpenAI API key
71
  """
72
+ if not check_openai_api_key(api_key=api_key):
73
+ raise ValueError("Invalid OpenAI API key! Please provide a valid key.")
 
74
 
75
  st.session_state["OPENAI_API_KEY"] = api_key
76
  st.session_state["api_key_configured"] = True
77
  return True
78
 
79
+ def parse_file(file:Union[PDFFile, DocxFile, TxtFile, CodeFile]) -> None:
80
  """Converts a file to a document using specialized parsers."""
81
  if file.name.endswith(".pdf"):
82
  doc = parse_pdf(file)
 
94
  # def document_embedding_pipeline(file:Union[PDFFile, DocxFile, TxtFile, CodeFile]) -> None:
95
 
96
  def qa_main():
97
+ """Main function for the QA app."""
98
  st.write("Just upload something using and start chatting with a version of GPT4 that has read the file!")
99
 
 
 
 
 
 
100
  # OpenAI API Key - TODO: consider adding a key valid for everyone
101
  # st.header("Configure OpenAI API Key")
102
  # st.warning('Please enter your OpenAI API Key!', icon='⚠️')
 
109
  # help="You can get your API key from https://platform.openai.com/account/api-keys.",
110
  # value=st.session_state.get("OPENAI_API_KEY", ""),
111
  # )
112
+
113
  user_secret = st.secrets["OPENAI_API_KEY"]
114
  if user_secret:
115
  if set_openai_api_key(user_secret):
116
+ # removing this when the OpenAI API key is hardcoded
117
+ # st.success('OpenAI API key successfully accessed!', icon='✅')
118
+
119
+ # greenlight for next step, i.e. uploading the document to chat with
120
+ st.session_state["upload_document_greenlight"] = True
121
 
122
+ if st.session_state.get("upload_document_greenlight"):
123
  # File that needs to be queried
124
  st.header("Upload a file")
125
+ st.file_uploader(
126
  "Upload a pdf, docx, or txt file (scanned documents not supported)",
127
  type=["pdf", "docx", "txt", "py", "json", "html", "css", "md"],
128
  help="Scanned documents are not supported yet 🥲",
129
+ accept_multiple_files=multiple_files,
130
+ #on_change=toggle_process_document,
131
+ key="uploaded_file"
132
  )
133
+
134
+ documents = {}
135
+ indexes = {}
136
+ for file in st.session_state["uploaded_file"]:
137
+ parsed_file = parse_file(file)
138
+ # converts the files into a list of documents
139
+ document = text_to_docs(pages=tuple(parsed_file), file_name=file.name)
140
+ documents[file.name] = document
 
 
 
 
141
 
142
+ with st.spinner(f"Indexing {file.name} (might take some time)"):
143
  try:
144
+ # indexing the document uploaded
145
+ indexes[file.name] = embed_docs(file_name=file.name, _docs=tuple(document))
 
146
  except OpenAIError as e:
147
  st.error("OpenAI error encountered: ", e._message)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
148
 
149
+ if len(documents)>1:
150
+ # documents to be indexed when providing the query
151
+ st.multiselect(
152
+ label="Select the documents to be indexed",
153
+ options=list(documents.keys()),
154
+ key="multiselect_documents_choices",
155
+ )
156
+
157
+ elif len(documents)==1:
158
+ st.session_state["multiselect_documents_choices"] = [list(documents.keys())[0]]
159
+
160
+ # this is the code that actually performs the chat process
161
+ if "messages" not in st.session_state: # checking if there is any cache history
162
+ st.session_state["messages"] = []
163
 
164
+ for message in st.session_state.messages:
165
+ with st.chat_message(message["role"]):
166
+ st.markdown(message["content"], unsafe_allow_html=True)
167
 
168
+ if prompt:=st.chat_input("Ask the document something..."):
169
+
170
+ if prompt=="1":
171
+ prompt="What is this document about?"
172
 
173
+ st.session_state.messages.append({"role": "user", "content": prompt})
174
+
175
+ with st.chat_message("user"):
176
+ st.markdown(prompt)
177
+
178
+ with st.chat_message("assistant"):
179
+ # full_response will store every question asked to all the document(s) considered
180
+ full_response = ""
181
+ message_placeholder = st.empty()
182
+
183
+ # asking the same question to all of the documents considered
184
+ for chat_document in st.session_state["multiselect_documents_choices"]:
185
+ # keeping track of what is asked to what document
186
+ full_response += \
187
+ f"<i>Asking</i> <b>{chat_document}</b> <i>question</i> <b>{prompt}</b></i><br>"
188
+ message_placeholder.markdown(full_response, unsafe_allow_html=True)
189
+ # retrieving the vector store associated to the chat document considered
190
+ chat_index = indexes[chat_document]
191
+ # producing the answer considered, live
192
+ for answer_bit in query_pipeline(chat_index, prompt, stream_answer=True):
193
+ full_response += answer_bit
194
+ message_placeholder.markdown(full_response + "▌", unsafe_allow_html=True)
195
+ # appending a final entering
196
+ full_response += "<br>"
197
+ message_placeholder.markdown(full_response, unsafe_allow_html=True)
198
+
199
+ # appending the final response obtained after having asked all the documents
200
+ st.session_state.messages.append({"role": "assistant", "content": full_response})
201