Spaces:
Runtime error
Runtime error
fracapuano
committed on
Commit
·
59359cb
1
Parent(s):
4f5c619
fix: bug fixing through appropriate caching
Browse files- qa/utils.py +57 -20
qa/utils.py
CHANGED
@@ -15,6 +15,8 @@ import streamlit as st
|
|
15 |
from .prompts import STUFF_PROMPT
|
16 |
from pypdf import PdfReader
|
17 |
from langchain.memory import ConversationBufferWindowMemory
|
|
|
|
|
18 |
|
19 |
class PDFFile:
|
20 |
"""A PDF file class for typing purposes."""
|
@@ -48,6 +50,25 @@ class HashDocument(Document):
|
|
48 |
return hash(content)
|
49 |
|
50 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
51 |
@st.cache_data
|
52 |
def parse_docx(file: BytesIO) -> str:
|
53 |
text = docx2txt.process(file)
|
@@ -95,20 +116,20 @@ def get_text_splitter(
|
|
95 |
return text_splitter
|
96 |
|
97 |
@st.cache_data
|
98 |
-
def text_to_docs(
|
99 |
"""
|
100 |
-
Converts a string or frozenset of
|
101 |
with metadata.
|
102 |
"""
|
103 |
# sanity check on the input provided
|
104 |
-
if not isinstance(
|
105 |
raise ValueError("Text must be either a string or a list of strings. Got: {type(text)}")
|
106 |
-
elif isinstance(
|
107 |
# Take a single string as one page - make it a tuple so that is hashable
|
108 |
-
|
109 |
-
if isinstance(
|
110 |
# map each page into a document instance
|
111 |
-
page_docs = [HashDocument(page_content=page) for page in
|
112 |
# Add page numbers as metadata
|
113 |
for i, doc in enumerate(page_docs):
|
114 |
doc.metadata["page"] = i + 1
|
@@ -135,10 +156,12 @@ def text_to_docs(text: Union[Text, Tuple[Text]], **kwargs) -> List[Document]:
|
|
135 |
|
136 |
return doc_chunks
|
137 |
|
138 |
-
|
139 |
@st.cache_data
|
140 |
-
def embed_docs(_docs: Tuple[Document]) -> VectorStore:
|
141 |
-
"""
|
|
|
|
|
|
|
142 |
# Embed the chunks
|
143 |
embeddings = OpenAIEmbeddings(openai_api_key=st.session_state.get("OPENAI_API_KEY"))
|
144 |
index = FAISS.from_documents(list(_docs), embeddings)
|
@@ -146,7 +169,8 @@ def embed_docs(_docs: Tuple[Document]) -> VectorStore:
|
|
146 |
return index
|
147 |
|
148 |
|
149 |
-
|
|
|
150 |
def search_docs(_index: VectorStore, query: str, k:int=5) -> List[Document]:
|
151 |
"""Searches a FAISS index for similar chunks to the query
|
152 |
and returns a list of Documents."""
|
@@ -156,17 +180,29 @@ def search_docs(_index: VectorStore, query: str, k:int=5) -> List[Document]:
|
|
156 |
return docs
|
157 |
|
158 |
|
159 |
-
|
160 |
-
|
|
|
|
|
|
|
|
|
|
|
161 |
"""Gets an answer to a question from a list of Documents."""
|
162 |
-
memory = ConversationBufferWindowMemory(k=5, input_key="question") # only considering the last 5 messages
|
163 |
|
164 |
# Create the chain to be used in this specific setting
|
165 |
chain = load_qa_with_sources_chain(
|
166 |
-
ChatOpenAI(temperature=0, openai_api_key=st.session_state.get("OPENAI_API_KEY"), model=
|
167 |
chain_type="stuff",
|
168 |
-
prompt=STUFF_PROMPT
|
169 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
170 |
)
|
171 |
# also returnig the text of the source used to form the answer
|
172 |
answer = chain(
|
@@ -174,14 +210,14 @@ def get_answer(_docs: List[Document], query: str) -> Dict[str, Any]:
|
|
174 |
)
|
175 |
return answer
|
176 |
|
177 |
-
|
178 |
-
@st.cache_data
|
179 |
def get_sources(answer: Dict[str, Any], docs: List[Document]) -> List[Document]:
|
180 |
"""Gets the source documents for an answer."""
|
181 |
|
182 |
# Get sources for the answer
|
183 |
source_keys = [s for s in answer["output_text"].split("SOURCES: ")[-1].split(", ")]
|
184 |
-
|
185 |
source_docs = []
|
186 |
for doc in docs:
|
187 |
if doc.metadata["source"] in source_keys:
|
@@ -189,6 +225,7 @@ def get_sources(answer: Dict[str, Any], docs: List[Document]) -> List[Document]:
|
|
189 |
|
190 |
return source_docs
|
191 |
|
|
|
192 |
def wrap_text_in_html(text: str) -> str:
|
193 |
"""Wraps each text block separated by newlines in <p> tags"""
|
194 |
if isinstance(text, list):
|
|
|
15 |
from .prompts import STUFF_PROMPT
|
16 |
from pypdf import PdfReader
|
17 |
from langchain.memory import ConversationBufferWindowMemory
|
18 |
+
import openai
|
19 |
+
|
20 |
|
21 |
class PDFFile:
|
22 |
"""A PDF file class for typing purposes."""
|
|
|
50 |
return hash(content)
|
51 |
|
52 |
|
53 |
+
@st.cache_data
def check_openai_api_key(api_key: str) -> bool:
    """Check whether the given OpenAI API key is valid.

    A cheap syntactic check (the documented ``sk-`` prefix) is performed
    first; if that passes, validity is confirmed by issuing a minimal
    completion request and catching an authentication failure.

    Args:
        api_key: The candidate OpenAI API key.

    Returns:
        True if the key authenticates successfully, False otherwise.
    """
    # Syntactic sanity check. NOTE(review): the original also required
    # len(api_key) == 51, but OpenAI keys are no longer a fixed 51 chars
    # (e.g. project-scoped "sk-proj-..." keys are longer), so only the
    # "sk-" prefix is enforced here; the API round-trip below is the
    # authoritative check.
    if not api_key.startswith('sk-'):
        st.error("Invalid OpenAI API key! Please provide a valid key.")
        return False
    # Set the module-level key so the probe call below uses it.
    openai.api_key = api_key
    try:
        # Minimal round-trip request: an AuthenticationError here means
        # the key is syntactically plausible but not actually valid.
        _ = openai.Completion.create(
            engine="davinci",
            prompt="This is a call test to test out the API Key.",
            max_tokens=5,
        )
    except openai.error.AuthenticationError:
        return False
    return True
|
71 |
+
|
72 |
@st.cache_data
|
73 |
def parse_docx(file: BytesIO) -> str:
|
74 |
text = docx2txt.process(file)
|
|
|
116 |
return text_splitter
|
117 |
|
118 |
@st.cache_data
|
119 |
+
def text_to_docs(pages: Union[Text, Tuple[Text]], **kwargs) -> List[HashDocument]:
|
120 |
"""
|
121 |
+
Converts a string or frozenset of pages content to a list of HashDocuments (for efficient caching)
|
122 |
with metadata.
|
123 |
"""
|
124 |
# sanity check on the input provided
|
125 |
+
if not isinstance(pages, (str, tuple)):
|
126 |
raise ValueError("Text must be either a string or a list of strings. Got: {type(text)}")
|
127 |
+
elif isinstance(pages, str):
|
128 |
# Take a single string as one page - make it a tuple so that is hashable
|
129 |
+
pages = (pages, )
|
130 |
+
if isinstance(pages, tuple):
|
131 |
# map each page into a document instance
|
132 |
+
page_docs = [HashDocument(page_content=page) for page in pages]
|
133 |
# Add page numbers as metadata
|
134 |
for i, doc in enumerate(page_docs):
|
135 |
doc.metadata["page"] = i + 1
|
|
|
156 |
|
157 |
return doc_chunks
|
158 |
|
|
|
159 |
@st.cache_data
|
160 |
+
def embed_docs(file_name:Text, _docs: Tuple[Document]) -> VectorStore:
|
161 |
+
"""
|
162 |
+
Embeds a list of Documents and returns a FAISS index.
|
163 |
+
Adds a dummy file_name variable to permit caching.
|
164 |
+
"""
|
165 |
# Embed the chunks
|
166 |
embeddings = OpenAIEmbeddings(openai_api_key=st.session_state.get("OPENAI_API_KEY"))
|
167 |
index = FAISS.from_documents(list(_docs), embeddings)
|
|
|
169 |
return index
|
170 |
|
171 |
|
172 |
+
# removing caching - consider to reintroduce it afterwise considering performance
|
173 |
+
# @st.cache_data
|
174 |
def search_docs(_index: VectorStore, query: str, k:int=5) -> List[Document]:
|
175 |
"""Searches a FAISS index for similar chunks to the query
|
176 |
and returns a list of Documents."""
|
|
|
180 |
return docs
|
181 |
|
182 |
|
183 |
+
# removing caching - consider to reintroduce it afterwise considering performance
|
184 |
+
# @st.cache_data
|
185 |
+
def get_answer(
|
186 |
+
_docs: List[Document],
|
187 |
+
query: str,
|
188 |
+
model: str="gpt-4",
|
189 |
+
stream_answer:bool=True) -> Dict[str, Any]:
|
190 |
"""Gets an answer to a question from a list of Documents."""
|
|
|
191 |
|
192 |
# Create the chain to be used in this specific setting
|
193 |
chain = load_qa_with_sources_chain(
|
194 |
+
ChatOpenAI(temperature=0, openai_api_key=st.session_state.get("OPENAI_API_KEY"), model=model, streaming=stream_answer),
|
195 |
chain_type="stuff",
|
196 |
+
prompt=STUFF_PROMPT
|
197 |
+
# verbose=True,
|
198 |
+
# chain_type_kwargs={
|
199 |
+
# "verbose": True,
|
200 |
+
# "prompt": query,
|
201 |
+
# "memory": ConversationBufferWindowMemory(
|
202 |
+
# k=5,
|
203 |
+
# memory_key="history",
|
204 |
+
# input_key="question"),
|
205 |
+
# }
|
206 |
)
|
207 |
# also returnig the text of the source used to form the answer
|
208 |
answer = chain(
|
|
|
210 |
)
|
211 |
return answer
|
212 |
|
213 |
+
# removing caching - consider to reintroduce it afterwise considering performance
|
214 |
+
# @st.cache_data
|
215 |
def get_sources(answer: Dict[str, Any], docs: List[Document]) -> List[Document]:
|
216 |
"""Gets the source documents for an answer."""
|
217 |
|
218 |
# Get sources for the answer
|
219 |
source_keys = [s for s in answer["output_text"].split("SOURCES: ")[-1].split(", ")]
|
220 |
+
# Retrieving the documents the actual sources refer to
|
221 |
source_docs = []
|
222 |
for doc in docs:
|
223 |
if doc.metadata["source"] in source_keys:
|
|
|
225 |
|
226 |
return source_docs
|
227 |
|
228 |
+
# this function could be removed - it is not used anymore
|
229 |
def wrap_text_in_html(text: str) -> str:
|
230 |
"""Wraps each text block separated by newlines in <p> tags"""
|
231 |
if isinstance(text, list):
|