fracapuano commited on
Commit
51fe9d2
1 Parent(s): ffc32ba

Add files via upload

Browse files
README.md CHANGED
@@ -1,10 +1,10 @@
1
  ---
2
- title: AISandbox
3
- emoji: 🐠
4
  colorFrom: blue
5
- colorTo: pink
6
  sdk: streamlit
7
- sdk_version: 1.26.0
8
  app_file: app.py
9
  pinned: false
10
  license: mit
 
1
  ---
2
+ title: Chat With Files
3
+ emoji: 💬
4
  colorFrom: blue
5
+ colorTo: yellow
6
  sdk: streamlit
7
+ sdk_version: 1.25.0
8
  app_file: app.py
9
  pinned: false
10
  license: mit
app.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import streamlit as st
from transformers import pipeline

st.set_page_config(page_title="2023 FS Hackathon")
st.title("Founder's Studio AI Sandbox 🕹️")
expander = st.expander("Click here to close this intro", expanded=True)
expander.write(
    """
    This web app allows you to perform common Natural Language Processing tasks, select a task below to get started.
    These tasks are intended to help you validate your intuition and build a proof of concept for your idea.

    If a task you deem useful is not listed here, feel free to get in touch with Founder's Studio team at francesco.capuano@bain.com.
    Happy hackathon!
    """
)

st.subheader(":point_down: Use the following drop-down menu to select a task!")

# Task labels — keep these in sync with the dispatch below.
OPTION1 = "Chat with a file"  # fixed typo: was "Chat wiht a file"
OPTION2 = "Text summarization"
OPTION_N = "OTHER"

option = st.selectbox(
    "Please select a task 🤖",
    options=[OPTION1, OPTION2, OPTION_N],
)

if option == OPTION_N:
    # Compare against the constant (not a duplicated "OTHER" literal) so
    # renaming the option cannot silently break this branch.
    user_suggestion = st.text_input("Please specify the task you would like to perform", value="")
    if user_suggestion:
        st.write("Thanks for your suggestion, we will get back to you soon!")
    # Stop the script here instead of raising NotImplementedError: the old
    # flow fell through to a `raise` (showing a traceback to the user)
    # whenever "OTHER" was selected before any suggestion was typed.
    st.stop()

if option == OPTION1:
    # Imported lazily so the heavy QA dependencies load only when needed.
    from qa import qa_main
    with st.container():
        qa_main()

elif option == OPTION2:
    from summarization import summarization_main
    with st.container():
        summarization_main()
qa/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
# Re-export the public API of qa.qa (notably qa_main) at package level.
from .qa import *
qa/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (173 Bytes). View file
 
qa/__pycache__/embeddings.cpython-310.pyc ADDED
Binary file (4.04 kB). View file
 
qa/__pycache__/prompts.cpython-310.pyc ADDED
Binary file (2.18 kB). View file
 
qa/__pycache__/qa.cpython-310.pyc ADDED
Binary file (3.37 kB). View file
 
qa/__pycache__/utils.cpython-310.pyc ADDED
Binary file (5.46 kB). View file
 
qa/prompts.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from langchain.prompts import PromptTemplate

## One might consider using a shorter template to reduce the number of tokens in the model input
# Few-shot prompt for "stuff"-type QA with sources: one worked example
# (ARPA-H) demonstrates citing the minimal set of sources, after which the
# real {question} and the retrieved {summaries} are substituted in.
template = """Create a final answer to the given questions using the provided document (in no particular order) as references. ALWAYS include a "SOURCES" section in your answer including only the minimal set of sources needed to answer the question. If you are unable to answer the question, simply state that you do not know. Do not attempt to fabricate an answer and leave the SOURCES section empty.
---------
QUESTION: What is the purpose of ARPA-H?
=========
Content: More support for patients and families. \n\nTo get there, I call on Congress to fund ARPA-H, the Advanced Research Projects Agency for Health. \n\nIt’s based on DARPA—the Defense Department project that led to the Internet, GPS, and so much more. \n\nARPA-H will have a singular purpose—to drive breakthroughs in cancer, Alzheimer’s, diabetes, and more.
Source: 1-32
Content: While we’re at it, let’s make sure every American can get the health care they need. \n\nWe’ve already made historic investments in health care. \n\nWe’ve made it easier for Americans to get the care they need, when they need it. \n\nWe’ve made it easier for Americans to get the treatments they need, when they need them. \n\nWe’ve made it easier for Americans to get the medications they need, when they need them.
Source: 1-33
Content: The V.A. is pioneering new ways of linking toxic exposures to disease, already helping veterans get the care they deserve. \n\nWe need to extend that same care to all Americans. \n\nThat’s why I’m calling on Congress to pass legislation that would establish a national registry of toxic exposures, and provide health care and financial assistance to those affected.
Source: 1-30
=========
FINAL ANSWER: The purpose of ARPA-H is to drive breakthroughs in cancer, Alzheimer’s, diabetes, and more.
SOURCES: 1-32
---------
QUESTION: {question}
=========
{summaries}
=========
FINAL ANSWER:"""

# PromptTemplate consumed by load_qa_with_sources_chain(chain_type="stuff")
# in qa.utils.get_answer; the "summaries" variable carries the retrieved
# document chunks.
STUFF_PROMPT = PromptTemplate(
    template=template, input_variables=["summaries", "question"]
)
qa/qa.py ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from streamlit_chat import message
3
+ from openai.error import OpenAIError
4
+ from .utils import (
5
+ parse_docx,
6
+ parse_pdf,
7
+ parse_txt,
8
+ search_docs,
9
+ embed_docs,
10
+ text_to_docs,
11
+ get_answer,
12
+ )
13
+ from uuid import uuid4
14
+
15
def clear_submit():
    """Reset the "submit" flag so a changed input or file requires pressing
    Submit again before a new answer is generated."""
    st.session_state["submit"] = False
17
+
18
def set_openai_api_key(api_key: str):
    """Store the user-provided OpenAI API key in Streamlit session state."""
    st.session_state["OPENAI_API_KEY"] = api_key
20
+
21
def qa_main():
    """Render the "Chat With File" page.

    Collects an OpenAI API key and a document in the sidebar, chunks and
    embeds the document into a FAISS index, then answers questions about
    it through a chat-style interface.
    """
    st.markdown("<h1>This app allows to chat with files!</h1>", unsafe_allow_html=True)
    st.markdown(
        """
        Developed using LangChain and OpenAI Embeddings.</p>
        Before hitting on "Submit", please make sure you have uploaded a file and entered a question.

        You can upload files using the sidebar on the left.
        """,
        unsafe_allow_html=True
    )
    index = None
    doc = None

    with st.sidebar:
        user_secret = st.text_input(
            "OpenAI API Key",
            type="password",
            placeholder="Paste your OpenAI API key here (sk-...)",
            help="You can get your API key from https://platform.openai.com/account/api-keys.",
            value=st.session_state.get("OPENAI_API_KEY", ""),
        )
        if user_secret:
            set_openai_api_key(user_secret)

        uploaded_file = st.file_uploader(
            "Upload a pdf, docx, or txt file",
            type=["pdf", "docx", "txt", "py", "json", "html", "css", "md"],
            help="Scanned documents are not supported yet!",
            on_change=clear_submit,
            accept_multiple_files=False,
        )
        # Parse the uploaded file into raw text: parse_pdf returns a list
        # of page strings, parse_docx/parse_txt return a single string.
        if uploaded_file is not None:
            if uploaded_file.name.endswith(".pdf"):
                doc = parse_pdf(uploaded_file)
            elif uploaded_file.name.endswith(".docx"):
                doc = parse_docx(uploaded_file)
            elif uploaded_file.name.endswith(".txt"):
                doc = parse_txt(uploaded_file)
            else:
                st.error("File type not yet supported! Supported files: [.pdf, .docx, .txt]")
                doc = None

        # Only index once a document has been parsed: tuple(doc) raised
        # TypeError on every rerun before a file was uploaded (doc=None),
        # and tuple() on a plain string (docx/txt) split it into single
        # characters instead of pages.
        if doc is not None:
            pages = tuple(doc) if isinstance(doc, list) else doc
            text = text_to_docs(text=pages)
            st.write(text[:1])

            try:
                with st.spinner("Indexing document(s)... This may take some time."):
                    index = embed_docs(tuple(text))
                st.session_state["api_key_configured"] = True
            except OpenAIError as e:
                st.error(e._message)

    tab1, tab2 = st.tabs(["Chat With File", "About the Application"])
    with tab1:
        # Chat history lives in session state so it survives Streamlit reruns.
        if 'generated' not in st.session_state:
            st.session_state['generated'] = []

        if 'past' not in st.session_state:
            st.session_state['past'] = []

        def get_text():
            # Only show the question box once an API key has been entered.
            if user_secret:
                st.header("Ask me something about the document:")
                input_text = st.text_area("You:", on_change=clear_submit)
                return input_text

        user_input = get_text()

        button = st.button("Submit")
        if button or st.session_state.get("submit"):
            if not user_input:
                st.error("Please enter a question!")
            elif index is None:
                # Guard: similarity search on a missing index would crash.
                st.error("Please upload a file first!")
            else:
                st.session_state["submit"] = True
                sources = search_docs(index, user_input)
                try:
                    answer = get_answer(sources, user_input)

                    st.session_state.past.append(user_input)
                    st.session_state.generated.append(answer["output_text"])

                except OpenAIError as e:
                    st.error(e._message)

        # Render newest exchange first; uuid4 keys keep widget keys unique.
        if st.session_state['past']:
            for i in range(len(st.session_state['past'])-1, -1, -1):
                message(st.session_state['generated'][i], key=str(uuid4()))
                message(st.session_state['past'][i], is_user=True, key=str(uuid4()))

    with tab2:
        st.write('See sources')
qa/utils.py ADDED
@@ -0,0 +1,160 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
2
+ from langchain.vectorstores.faiss import FAISS
3
+ from langchain import OpenAI
4
+ from langchain.chains.qa_with_sources import load_qa_with_sources_chain
5
+ from langchain.embeddings.openai import OpenAIEmbeddings
6
+ from langchain.llms import OpenAI
7
+ from langchain.docstore.document import Document
8
+ from langchain.vectorstores import FAISS, VectorStore
9
+ import docx2txt
10
+ from typing import List, Dict, Any, Union, Text, Tuple
11
+ import re
12
+ from io import BytesIO
13
+ import streamlit as st
14
+ from .prompts import STUFF_PROMPT
15
+ from pypdf import PdfReader
16
+ from openai.error import AuthenticationError
17
+
18
class HashDocument(Document):
    """A Document whose hash is derived from its page content and metadata.

    Lets documents live in hashable containers (e.g. the tuples handed to
    st.cache_data-decorated functions).
    """
    def __hash__(self):
        # Metadata values may be non-strings (text_to_docs stores the
        # integer "page" and "chunk" numbers), so coerce each to str —
        # "".join over ints raised TypeError in the original.
        content = self.page_content + "".join(str(self.metadata[k]) for k in self.metadata)
        return hash(content)
23
+
24
@st.cache_data
def parse_docx(file: BytesIO) -> str:
    """Extract the text of a .docx file, collapsing runs of blank lines."""
    raw_text = docx2txt.process(file)
    # Collapse multiple blank lines into a single paragraph break.
    return re.sub(r"\n\s*\n", "\n\n", raw_text)
30
+
31
+
32
@st.cache_data
def parse_pdf(file: BytesIO) -> List[str]:
    """Extract text from each page of a PDF, cleaning line-break artifacts.

    Returns one cleaned string per page.
    """
    pages = []
    for page in PdfReader(file).pages:
        cleaned = page.extract_text()
        # Merge hyphenated words
        cleaned = re.sub(r"(\w+)-\n(\w+)", r"\1\2", cleaned)
        # Fix newlines in the middle of sentences
        cleaned = re.sub(r"(?<!\n\s)\n(?!\s\n)", " ", cleaned.strip())
        # Remove multiple newlines
        cleaned = re.sub(r"\n\s*\n", "\n\n", cleaned)
        pages.append(cleaned)
    return pages
48
+
49
+
50
@st.cache_data
def parse_txt(file: BytesIO) -> str:
    """Decode an uploaded text file as UTF-8, collapsing runs of blank lines."""
    decoded = file.read().decode("utf-8")
    # Collapse multiple blank lines into a single paragraph break.
    return re.sub(r"\n\s*\n", "\n\n", decoded)
56
+
57
+
58
@st.cache_data
def text_to_docs(text: Union[Text, Tuple[Text]]) -> List[Document]:
    """Convert a string (one page) or a tuple of strings (one per page)
    into a list of chunked Documents with page/chunk/source metadata.

    Each chunk's metadata gets a 1-based "page", a 0-based "chunk", and a
    "source" id of the form "page-chunk" (matched by get_sources against
    the SOURCES section of an answer).

    Raises:
        ValueError: if ``text`` is neither a string nor a tuple.
    """
    if isinstance(text, str):
        # Treat a single string as a one-page document and fall through to
        # the tuple handling below.  (The original `if`/`elif` structure
        # converted the string but then skipped the chunking branch
        # entirely, silently returning None for string input.)
        text = (text,)
    if not isinstance(text, tuple):
        raise ValueError(f"Text must be either a string or a list of strings. Got: {type(text)}")

    # Map each page into a document instance with a 1-based page number.
    page_docs = [HashDocument(page_content=page) for page in text]
    for page_number, page_doc in enumerate(page_docs, start=1):
        page_doc.metadata["page"] = page_number

    # Split pages into chunks.
    doc_chunks = []
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=800,
        separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""],
        chunk_overlap=20,  # minimal overlap to capture semantic overlap across chunks
    )

    for page_doc in page_docs:
        chunks = text_splitter.split_text(page_doc.page_content)
        for chunk_number, chunk in enumerate(chunks):
            # Distinct name (chunk_doc) — the original rebound `doc` inside
            # the loop, shadowing the page document it was reading from.
            chunk_doc = HashDocument(
                page_content=chunk,
                metadata={"page": page_doc.metadata["page"], "chunk": chunk_number},
            )
            chunk_doc.metadata["source"] = f"{chunk_doc.metadata['page']}-{chunk_doc.metadata['chunk']}"
            doc_chunks.append(chunk_doc)

    return doc_chunks
97
+
98
+
99
@st.cache_data
def embed_docs(_docs: Tuple[Document]) -> VectorStore:
    """Embed a tuple of Documents with OpenAI and return a FAISS index.

    The leading underscore on ``_docs`` tells st.cache_data not to hash
    the argument.

    Raises:
        AuthenticationError: if no API key is stored in session state.
    """
    api_key = st.session_state.get("OPENAI_API_KEY")
    # Guard clause instead of if/else nesting: refuse to embed without a key.
    if not api_key:
        raise AuthenticationError(
            "Enter your OpenAI API key in the sidebar. You can get a key at https://platform.openai.com/account/api-keys."
        )
    embeddings = OpenAIEmbeddings(openai_api_key=api_key)
    return FAISS.from_documents(list(_docs), embeddings)
113
+
114
@st.cache_data
def search_docs(_index: VectorStore, query: str) -> List[Document]:
    """Return the 5 chunks in the FAISS index most similar to ``query``."""
    return _index.similarity_search(query, k=5)
122
+
123
+
124
@st.cache_data
def get_answer(_docs: List[Document], query: str) -> Dict[str, Any]:
    """Answer ``query`` from the given Documents via a stuff-type
    QA-with-sources chain; the returned dict also carries the text of the
    source chunks used to form the answer."""
    llm = OpenAI(
        temperature=0,
        openai_api_key=st.session_state.get("OPENAI_API_KEY"),
    )
    chain = load_qa_with_sources_chain(llm, chain_type="stuff", prompt=STUFF_PROMPT)
    return chain({"input_documents": _docs, "question": query})
139
+
140
+
141
@st.cache_data
def get_sources(answer: Dict[str, Any], docs: List[Document]) -> List[Document]:
    """Return the Documents whose "source" metadata id appears in the
    SOURCES line of the answer's output text."""
    # e.g. "... SOURCES: 1-32, 2-0" -> {"1-32", "2-0"}
    source_keys = answer["output_text"].split("SOURCES: ")[-1].split(", ")
    return [doc for doc in docs if doc.metadata["source"] in source_keys]
154
+
155
def wrap_text_in_html(text: str) -> str:
    """Wrap each newline-separated block of ``text`` in <p> tags.

    A list of page strings is first joined with horizontal rules.
    """
    if isinstance(text, list):
        # Add horizontal rules between pages
        text = "\n<hr/>\n".join(text)
    paragraphs = (f"<p>{line}</p>" for line in text.split("\n"))
    return "".join(paragraphs)
requirements.txt ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ docx2txt==0.8
2
+ langchain==0.0.274
3
+ openai==0.27.9
4
+ pypdf==3.15.4
5
+ streamlit==1.25.0
6
+ streamlit_chat==0.1.1
7
+ tenacity==8.2.3
8
+ transformers==4.32.0
9
+ altair<5
10
+ torch==2.0.1
11
+ tiktoken==0.4.0
12
+ faiss-cpu==1.7.4
summarization/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
# Re-export the public API of summarization.summarization (notably
# summarization_main) at package level.
from .summarization import *
summarization/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (195 Bytes). View file
 
summarization/__pycache__/summarization.cpython-310.pyc ADDED
Binary file (2.01 kB). View file
 
summarization/summarization.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from transformers import pipeline
3
+
4
@st.cache_resource
def summarization_model():
    """Build the Pegasus-XSum summarization pipeline (cached across reruns)."""
    checkpoint = "google/pegasus-xsum"
    return pipeline(
        task="summarization",
        model=checkpoint,
        tokenizer=checkpoint,
    )
13
+
14
def _summarize_and_show(text: str) -> None:
    """Load the summarizer, summarize ``text``, and render the result.

    Shared by the free-text and file-upload flows (the original duplicated
    this logic verbatim in both branches).
    """
    with st.spinner(text="Loading summarization model..."):
        summarizer = summarization_model()
    with st.spinner(text="Summarizing text..."):
        summary = summarizer(text, max_length=130, min_length=30)
    st.text(summary[0]["summary_text"])


def summarization_main():
    """Render the text-summarization page: accept text from a textarea or
    an uploaded .txt file and display a Pegasus-XSum summary."""
    st.markdown("<h2 style='text-align: center; color:grey;'>Text Summarization</h2>", unsafe_allow_html=True)
    st.markdown("<h3 style='text-align: left; color:#F63366; font-size:18px;'><b>What is text summarization about?<b></h3>", unsafe_allow_html=True)
    st.write("Text summarization is producing a shorter version of a given text while preserving its important information.")
    st.markdown('___')
    source = st.radio("How would you like to start? Choose an option below", ["I want to input some text", "I want to upload a file"])
    if source == "I want to input some text":
        sample_text = ""
        text = st.text_area("Input a text in English (10,000 characters max) or use the example below", value=sample_text, max_chars=10000, height=330)

        if st.button("Get summary"):
            _summarize_and_show(text)

    elif source == "I want to upload a file":
        uploaded_file = st.file_uploader("Choose a .txt file to upload", type=["txt"])
        if uploaded_file is not None:
            raw_text = str(uploaded_file.read(), "utf-8")
            text = st.text_area("", value=raw_text, height=330)
            if st.button("Get summary"):
                _summarize_and_show(text)