fracapuano committed on
Commit 59359cb · 1 Parent(s): 4f5c619

fix: bug fixing through appropriate caching

Files changed (1): qa/utils.py (+57 -20)
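
The fix relies on two documented st.cache_data behaviors: the cache key is built by hashing the function's arguments, and any parameter whose name starts with an underscore is excluded from that hash. A minimal sketch of the pattern, with illustrative names not taken from this repo:

    import streamlit as st

    @st.cache_data
    def lookup(key: str, _resource: dict) -> str:
        # "key" is hashed into the cache key; "_resource" is skipped
        # because of the leading underscore, so it may be unhashable.
        return _resource[key]
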
qa/utils.py CHANGED
@@ -15,6 +15,8 @@ import streamlit as st
 from .prompts import STUFF_PROMPT
 from pypdf import PdfReader
 from langchain.memory import ConversationBufferWindowMemory
+import openai
+

 class PDFFile:
     """A PDF file class for typing purposes."""
@@ -48,6 +50,25 @@ class HashDocument(Document):
     return hash(content)


+@st.cache_data
+def check_openai_api_key(api_key: str) -> bool:
+    """Checks the given OpenAI API key and returns True if it is valid, False otherwise.
+    Checking is performed with a minimal test call to the Completions API."""
+    if not (api_key.startswith("sk-") and len(api_key) == 51):
+        st.error("Invalid OpenAI API key! Please provide a valid key.")
+        return False
+    # set the openai api key to the given value
+    openai.api_key = api_key
+    try:
+        _ = openai.Completion.create(
+            engine="davinci",
+            prompt="This is a call test to test out the API Key.",
+            max_tokens=5,
+        )
+    except openai.error.AuthenticationError:
+        return False
+    return True
+
 @st.cache_data
 def parse_docx(file: BytesIO) -> str:
     text = docx2txt.process(file)
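
The new validator uses the pre-1.0 openai SDK surface (openai.Completion.create, openai.error.AuthenticationError), consistent with the "As of Aug 2023" note removed elsewhere in this commit; the hard-coded 51-character length matches key formats of that era. A hedged sketch of a call site, which this commit does not show:

    # Hypothetical call site (not part of this commit):
    api_key = st.sidebar.text_input("OpenAI API key", type="password")
    if api_key and check_openai_api_key(api_key):
        st.session_state["OPENAI_API_KEY"] = api_key
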
@@ -95,20 +116,20 @@ def get_text_splitter(
     return text_splitter

 @st.cache_data
-def text_to_docs(text: Union[Text, Tuple[Text]], **kwargs) -> List[Document]:
+def text_to_docs(pages: Union[Text, Tuple[Text]], **kwargs) -> List[HashDocument]:
     """
-    Converts a string or frozenset of strings to a list of Documents
+    Converts a string or tuple of page contents to a list of HashDocuments (for efficient caching)
     with metadata.
     """
     # sanity check on the input provided
-    if not isinstance(text, (str, tuple)):
+    if not isinstance(pages, (str, tuple)):
         raise ValueError("Text must be either a string or a list of strings. Got: {type(text)}")
-    elif isinstance(text, str):
+    elif isinstance(pages, str):
         # Take a single string as one page - make it a tuple so that it is hashable
-        text = (text, )
-    if isinstance(text, tuple):
+        pages = (pages, )
+    if isinstance(pages, tuple):
         # map each page into a document instance
-        page_docs = [HashDocument(page_content=page) for page in text]
+        page_docs = [HashDocument(page_content=page) for page in pages]
         # Add page numbers as metadata
         for i, doc in enumerate(page_docs):
             doc.metadata["page"] = i + 1
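
Since the input is normalized to a tuple, a caller can pass either a single string or one string per page; a small sketch (the page strings are assumptions, not from the repo):

    # Hypothetical usage: page strings in, chunked HashDocuments out,
    # each page carrying 1-based page-number metadata.
    pages = ("text of page one", "text of page two")
    chunks = text_to_docs(pages)
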
@@ -135,10 +156,12 @@ def text_to_docs(text: Union[Text, Tuple[Text]], **kwargs) -> List[Document]:

     return doc_chunks

-
 @st.cache_data
-def embed_docs(_docs: Tuple[Document]) -> VectorStore:
-    """Embeds a list of Documents and returns a FAISS index"""
+def embed_docs(file_name: Text, _docs: Tuple[Document]) -> VectorStore:
+    """
+    Embeds a list of Documents and returns a FAISS index.
+    Adds a dummy file_name variable to permit caching.
+    """
     # Embed the chunks
     embeddings = OpenAIEmbeddings(openai_api_key=st.session_state.get("OPENAI_API_KEY"))
     index = FAISS.from_documents(list(_docs), embeddings)
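
file_name exists purely as a hashable surrogate cache key: _docs is excluded from hashing by its leading underscore, so before this change every call shared a single cache entry and a newly uploaded file could get a stale index back, which is presumably the caching bug this commit fixes. A plausible call site (an assumption, not shown in the diff):

    # Hypothetical call site: key the cache on the uploaded file's name,
    # pass the unhashable Document tuple through unhashed.
    index = embed_docs(uploaded_file.name, tuple(chunks))
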
@@ -146,7 +169,8 @@ def embed_docs(_docs: Tuple[Document]) -> VectorStore:
     return index


-@st.cache_data
+# caching removed - consider reintroducing it later depending on performance
+# @st.cache_data
 def search_docs(_index: VectorStore, query: str, k: int = 5) -> List[Document]:
     """Searches a FAISS index for similar chunks to the query
     and returns a list of Documents."""
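
Caching is dropped here rather than fixed: with _index excluded from hashing, the cache key would reduce to (query, k), so the same query against a different document would return stale hits. One hedged way to reintroduce it, mirroring embed_docs (an assumption, not part of the commit):

    @st.cache_data
    def search_docs_keyed(file_name: str, _index: VectorStore, query: str, k: int = 5) -> List[Document]:
        # hypothetical variant: file_name makes the cache key document-specific
        return _index.similarity_search(query, k=k)
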
@@ -156,17 +180,29 @@ def search_docs(_index: VectorStore, query: str, k: int = 5) -> List[Document]:
     return docs


-@st.cache_data
-def get_answer(_docs: List[Document], query: str) -> Dict[str, Any]:
+# caching removed - consider reintroducing it later depending on performance
+# @st.cache_data
+def get_answer(
+    _docs: List[Document],
+    query: str,
+    model: str = "gpt-4",
+    stream_answer: bool = True) -> Dict[str, Any]:
     """Gets an answer to a question from a list of Documents."""
-    memory = ConversationBufferWindowMemory(k=5, input_key="question")  # only considering the last 5 messages

     # Create the chain to be used in this specific setting
     chain = load_qa_with_sources_chain(
-        ChatOpenAI(temperature=0, openai_api_key=st.session_state.get("OPENAI_API_KEY"), model="gpt-4", streaming=True),
+        ChatOpenAI(temperature=0, openai_api_key=st.session_state.get("OPENAI_API_KEY"), model=model, streaming=stream_answer),
         chain_type="stuff",
-        prompt=STUFF_PROMPT,
-        # memory=memory NOTE: As of Aug 2023, memory is not supported in the QA chain - uncomment this line when it is supported
+        prompt=STUFF_PROMPT
+        # verbose=True,
+        # chain_type_kwargs={
+        #     "verbose": True,
+        #     "prompt": query,
+        #     "memory": ConversationBufferWindowMemory(
+        #         k=5,
+        #         memory_key="history",
+        #         input_key="question"),
+        # }
     )
     # also returning the text of the source used to form the answer
     answer = chain(
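
The chain invocation's arguments fall outside this hunk and are not reproduced here. Based on the refactored signature, a call would look roughly like this (a sketch; the document list and query string are assumptions):

    # Hypothetical usage of the parameterized signature:
    docs = search_docs(index, "What is the notice period?")
    answer = get_answer(docs, query="What is the notice period?", model="gpt-4", stream_answer=False)
    print(answer["output_text"])  # get_sources() parses this same field below
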
@@ -174,14 +210,14 @@ def get_answer(_docs: List[Document], query: str) -> Dict[str, Any]:
     )
     return answer

-
-@st.cache_data
+# caching removed - consider reintroducing it later depending on performance
+# @st.cache_data
 def get_sources(answer: Dict[str, Any], docs: List[Document]) -> List[Document]:
     """Gets the source documents for an answer."""

     # Get sources for the answer
     source_keys = [s for s in answer["output_text"].split("SOURCES: ")[-1].split(", ")]
-
+    # Retrieve the documents the actual sources refer to
     source_docs = []
     for doc in docs:
         if doc.metadata["source"] in source_keys:
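
The split logic above is easiest to see on a concrete string (the source-id format here is hypothetical):

    text = "The notice period is 30 days. SOURCES: 2-1, 2-3"
    keys = text.split("SOURCES: ")[-1].split(", ")
    # keys == ["2-1", "2-3"]; docs whose metadata["source"] matches are kept
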
@@ -189,6 +225,7 @@ def get_sources(answer: Dict[str, Any], docs: List[Document]) -> List[Document]:

     return source_docs

+# this function could be removed - it is not used anymore
 def wrap_text_in_html(text: str) -> str:
     """Wraps each text block separated by newlines in <p> tags"""
     if isinstance(text, list):