fracapuano committed on
Commit 59359cb · 1 Parent(s): 4f5c619

fix: bug fixing through appropriate caching

Files changed (1): qa/utils.py (+57 -20)
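
The fix relies on two documented st.cache_data behaviors: the cache key is built by hashing the function's arguments, and any parameter whose name starts with an underscore is excluded from that hash. A minimal sketch of the pattern, with illustrative names not taken from this repo:

    import streamlit as st

    @st.cache_data
    def lookup(key: str, _resource: dict) -> str:
        # "key" is hashed into the cache key; "_resource" is skipped
        # because of the leading underscore, so it may be unhashable.
        return _resource[key]
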
qa/utils.py CHANGED
@@ -15,6 +15,8 @@ import streamlit as st
 from .prompts import STUFF_PROMPT
 from pypdf import PdfReader
 from langchain.memory import ConversationBufferWindowMemory
+import openai
+

 class PDFFile:
     """A PDF file class for typing purposes."""
@@ -48,6 +50,25 @@ class HashDocument(Document):
     return hash(content)


+@st.cache_data
+def check_openai_api_key(api_key: str) -> bool:
+    """Checks the given OpenAI API key and returns True if it is valid, False otherwise.
+    Checking is performed with a minimal test call to the Completions API."""
+    if not (api_key.startswith("sk-") and len(api_key) == 51):
+        st.error("Invalid OpenAI API key! Please provide a valid key.")
+        return False
+    # set the openai api key to the given value
+    openai.api_key = api_key
+    try:
+        _ = openai.Completion.create(
+            engine="davinci",
+            prompt="This is a call test to test out the API Key.",
+            max_tokens=5,
+        )
+    except openai.error.AuthenticationError:
+        return False
+    return True
+
 @st.cache_data
 def parse_docx(file: BytesIO) -> str:
     text = docx2txt.process(file)
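
The new validator uses the pre-1.0 openai SDK surface (openai.Completion.create, openai.error.AuthenticationError), consistent with the "As of Aug 2023" note removed elsewhere in this commit; the hard-coded 51-character length matches key formats of that era. A hedged sketch of a call site, which this commit does not show:

    # Hypothetical call site (not part of this commit):
    api_key = st.sidebar.text_input("OpenAI API key", type="password")
    if api_key and check_openai_api_key(api_key):
        st.session_state["OPENAI_API_KEY"] = api_key
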
@@ -95,20 +116,20 @@ def get_text_splitter(
     return text_splitter

 @st.cache_data
-def text_to_docs(text: Union[Text, Tuple[Text]], **kwargs) -> List[Document]:
+def text_to_docs(pages: Union[Text, Tuple[Text]], **kwargs) -> List[HashDocument]:
     """
-    Converts a string or frozenset of strings to a list of Documents
+    Converts a string or tuple of page contents to a list of HashDocuments (for efficient caching)
     with metadata.
     """
     # sanity check on the input provided
-    if not isinstance(text, (str, tuple)):
+    if not isinstance(pages, (str, tuple)):
         raise ValueError("Text must be either a string or a list of strings. Got: {type(text)}")
-    elif isinstance(text, str):
+    elif isinstance(pages, str):
         # Take a single string as one page - make it a tuple so that it is hashable
-        text = (text, )
-    if isinstance(text, tuple):
+        pages = (pages, )
+    if isinstance(pages, tuple):
         # map each page into a document instance
-        page_docs = [HashDocument(page_content=page) for page in text]
+        page_docs = [HashDocument(page_content=page) for page in pages]
         # Add page numbers as metadata
         for i, doc in enumerate(page_docs):
             doc.metadata["page"] = i + 1
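
Since the input is normalized to a tuple, a caller can pass either a single string or one string per page; a small sketch (the page strings are assumptions, not from the repo):

    # Hypothetical usage: page strings in, chunked HashDocuments out,
    # each page carrying 1-based page-number metadata.
    pages = ("text of page one", "text of page two")
    chunks = text_to_docs(pages)
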
@@ -135,10 +156,12 @@ def text_to_docs(text: Union[Text, Tuple[Text]], **kwargs) -> List[Document]:

     return doc_chunks

-
 @st.cache_data
-def embed_docs(_docs: Tuple[Document]) -> VectorStore:
-    """Embeds a list of Documents and returns a FAISS index"""
+def embed_docs(file_name: Text, _docs: Tuple[Document]) -> VectorStore:
+    """
+    Embeds a list of Documents and returns a FAISS index.
+    Adds a dummy file_name variable to permit caching.
+    """
     # Embed the chunks
     embeddings = OpenAIEmbeddings(openai_api_key=st.session_state.get("OPENAI_API_KEY"))
     index = FAISS.from_documents(list(_docs), embeddings)
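
file_name exists purely as a hashable surrogate cache key: _docs is excluded from hashing by its leading underscore, so before this change every call shared a single cache entry and a newly uploaded file could get a stale index back, which is presumably the caching bug this commit fixes. A plausible call site (an assumption, not shown in the diff):

    # Hypothetical call site: key the cache on the uploaded file's name,
    # pass the unhashable Document tuple through unhashed.
    index = embed_docs(uploaded_file.name, tuple(chunks))
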
@@ -146,7 +169,8 @@ def embed_docs(_docs: Tuple[Document]) -> VectorStore:
     return index


-@st.cache_data
+# caching removed - consider reintroducing it later depending on performance
+# @st.cache_data
 def search_docs(_index: VectorStore, query: str, k: int = 5) -> List[Document]:
     """Searches a FAISS index for similar chunks to the query
     and returns a list of Documents."""
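
Caching is dropped here rather than fixed: with _index excluded from hashing, the cache key would reduce to (query, k), so the same query against a different document would return stale hits. One hedged way to reintroduce it, mirroring embed_docs (an assumption, not part of the commit):

    @st.cache_data
    def search_docs_keyed(file_name: str, _index: VectorStore, query: str, k: int = 5) -> List[Document]:
        # hypothetical variant: file_name makes the cache key document-specific
        return _index.similarity_search(query, k=k)
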
@@ -156,17 +180,29 @@ def search_docs(_index: VectorStore, query: str, k: int = 5) -> List[Document]:
     return docs


-@st.cache_data
-def get_answer(_docs: List[Document], query: str) -> Dict[str, Any]:
+# caching removed - consider reintroducing it later depending on performance
+# @st.cache_data
+def get_answer(
+    _docs: List[Document],
+    query: str,
+    model: str = "gpt-4",
+    stream_answer: bool = True) -> Dict[str, Any]:
     """Gets an answer to a question from a list of Documents."""
-    memory = ConversationBufferWindowMemory(k=5, input_key="question")  # only considering the last 5 messages

     # Create the chain to be used in this specific setting
     chain = load_qa_with_sources_chain(
-        ChatOpenAI(temperature=0, openai_api_key=st.session_state.get("OPENAI_API_KEY"), model="gpt-4", streaming=True),
+        ChatOpenAI(temperature=0, openai_api_key=st.session_state.get("OPENAI_API_KEY"), model=model, streaming=stream_answer),
         chain_type="stuff",
-        prompt=STUFF_PROMPT,
-        # memory=memory NOTE: As of Aug 2023, memory is not supported in the QA chain - uncomment this line when it is supported
+        prompt=STUFF_PROMPT
+        # verbose=True,
+        # chain_type_kwargs={
+        #     "verbose": True,
+        #     "prompt": query,
+        #     "memory": ConversationBufferWindowMemory(
+        #         k=5,
+        #         memory_key="history",
+        #         input_key="question"),
+        # }
     )
     # also returning the text of the source used to form the answer
     answer = chain(
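
The chain invocation's arguments fall outside this hunk and are not reproduced here. Based on the refactored signature, a call would look roughly like this (a sketch; the document list and query string are assumptions):

    # Hypothetical usage of the parameterized signature:
    docs = search_docs(index, "What is the notice period?")
    answer = get_answer(docs, query="What is the notice period?", model="gpt-4", stream_answer=False)
    print(answer["output_text"])  # get_sources() parses this same field below
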
@@ -174,14 +210,14 @@ def get_answer(_docs: List[Document], query: str) -> Dict[str, Any]:
     )
     return answer

-
-@st.cache_data
+# caching removed - consider reintroducing it later depending on performance
+# @st.cache_data
 def get_sources(answer: Dict[str, Any], docs: List[Document]) -> List[Document]:
     """Gets the source documents for an answer."""

     # Get sources for the answer
     source_keys = [s for s in answer["output_text"].split("SOURCES: ")[-1].split(", ")]
-
+    # Retrieve the documents the actual sources refer to
     source_docs = []
     for doc in docs:
         if doc.metadata["source"] in source_keys:
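
The split logic above is easiest to see on a concrete string (the source-id format here is hypothetical):

    text = "The notice period is 30 days. SOURCES: 2-1, 2-3"
    keys = text.split("SOURCES: ")[-1].split(", ")
    # keys == ["2-1", "2-3"]; docs whose metadata["source"] matches are kept
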
@@ -189,6 +225,7 @@ def get_sources(answer: Dict[str, Any], docs: List[Document]) -> List[Document]:

     return source_docs

+# this function could be removed - it is not used anymore
 def wrap_text_in_html(text: str) -> str:
     """Wraps each text block separated by newlines in <p> tags"""
     if isinstance(text, list):