amrithhun commited on
Commit
b0dbd35
1 Parent(s): cb0b4d5

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +289 -0
app.py ADDED
@@ -0,0 +1,289 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import PyPDF2
3
+ import random
4
+ import itertools
5
+ import streamlit as st
6
+ from io import StringIO
7
+ from langchain.vectorstores import FAISS
8
+ from langchain.chains import RetrievalQA
9
+ from langchain.chat_models import ChatOpenAI
10
+ from langchain.retrievers import SVMRetriever
11
+ from langchain.chains import QAGenerationChain
12
+ from langchain.embeddings.openai import OpenAIEmbeddings
13
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
14
+ from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
15
+ from langchain.callbacks.base import CallbackManager
16
+ from langchain.embeddings import HuggingFaceEmbeddings
17
+
18
+
19
+ st.set_page_config(page_title="PDF Analyzer",page_icon=':shark:')
20
+
21
@st.cache_data
def load_docs(files):
    """Read uploaded documents and return their concatenated text.

    Args:
        files: iterable of Streamlit ``UploadedFile`` objects; ``.pdf`` and
            ``.txt`` files are supported, anything else triggers a UI warning
            and is skipped.

    Returns:
        str: the extracted text of all supported files joined together
        (empty string if nothing could be read).
    """
    st.info("`Reading doc ...`")
    all_text = ""
    for file_path in files:
        # Lower-case so ".PDF"/".TXT" uploads are handled too.
        file_extension = os.path.splitext(file_path.name)[1].lower()
        if file_extension == ".pdf":
            pdf_reader = PyPDF2.PdfReader(file_path)
            text = ""
            for page in pdf_reader.pages:
                # extract_text() can return None for pages with no
                # extractable text; guard against TypeError on +=.
                text += page.extract_text() or ""
            all_text += text
        elif file_extension == ".txt":
            stringio = StringIO(file_path.getvalue().decode("utf-8"))
            text = stringio.read()
            all_text += text
        else:
            st.warning('Please provide txt or pdf.', icon="⚠️")
    return all_text
40
+
41
+
42
+
43
+
44
@st.cache_resource
def create_retriever(_embeddings, splits, retriever_type):
    """Build a retriever over the text chunks.

    Args:
        _embeddings: embeddings object (leading underscore so Streamlit's
            cache does not try to hash it).
        splits: list of text chunks to index.
        retriever_type: "SIMILARITY SEARCH" (FAISS) or
            "SUPPORT VECTOR MACHINES" (SVMRetriever).

    Returns:
        A retriever instance, or None when the vectorstore could not be
        built or the retriever type is unrecognized (an error is shown
        in the UI in both cases).
    """
    if retriever_type == "SIMILARITY SEARCH":
        try:
            vectorstore = FAISS.from_texts(splits, _embeddings)
        except (IndexError, ValueError) as e:
            st.error(f"Error creating vectorstore: {e}")
            return None
        retriever = vectorstore.as_retriever(k=5)
    elif retriever_type == "SUPPORT VECTOR MACHINES":
        retriever = SVMRetriever.from_texts(splits, _embeddings)
    else:
        # Previously an unknown type fell through to `return retriever`
        # and raised UnboundLocalError; fail with a clear message instead.
        st.error(f"Unknown retriever type: {retriever_type}")
        return None

    return retriever
57
+
58
@st.cache_resource
def split_texts(text, chunk_size, overlap, split_method):
    """Split a document into overlapping character chunks.

    Args:
        text: full document text.
        chunk_size: maximum characters per chunk.
        overlap: characters shared between consecutive chunks.
        split_method: kept for interface compatibility with callers, but
            intentionally ignored — RecursiveCharacterTextSplitter is the
            only splitter supported.

    Returns:
        list[str]: non-empty list of chunks; stops the Streamlit app via
        st.stop() if splitting produced nothing.
    """
    st.info("`Splitting doc ...`")

    # The original body dead-assigned `split_method` here, silently
    # shadowing the argument; the parameter is unused by design.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=overlap)

    splits = text_splitter.split_text(text)
    if not splits:
        st.error("Failed to split document")
        st.stop()

    return splits
77
+
78
@st.cache_data
def generate_eval(text, N, chunk):
    """Generate N sample question-answer pairs from random slices of *text*.

    Args:
        text: document text to draw questions from.
        N: number of QA pairs to attempt.
        chunk: size (chars) of each random slice a question is drawn from.

    Returns:
        list: flattened list of QA dicts (each with 'question'/'answer'
        keys as produced by QAGenerationChain); failed generations are
        skipped with a UI warning.
    """
    st.info("`Generating sample questions ...`")
    n = len(text)
    # Clamp so randint's upper bound is never negative when the document
    # is shorter than `chunk` (the original raised ValueError there).
    chunk = min(chunk, n)
    starting_indices = [random.randint(0, n - chunk) for _ in range(N)]
    sub_sequences = [text[i:i + chunk] for i in starting_indices]
    chain = QAGenerationChain.from_llm(ChatOpenAI(temperature=0))
    eval_set = []
    for i, b in enumerate(sub_sequences):
        try:
            qa = chain.run(b)
            eval_set.append(qa)
            st.write("Creating Question:", i + 1)
        except Exception:
            # Narrowed from a bare `except:` so KeyboardInterrupt and
            # SystemExit are no longer swallowed.
            st.warning('Error generating question %s.' % str(i + 1), icon="⚠️")
    eval_set_full = list(itertools.chain.from_iterable(eval_set))
    return eval_set_full
100
+
101
+
102
+ # ...
103
+
104
def main():
    """Streamlit entry point: render the page chrome, collect the OpenAI
    API key, ingest uploaded documents, generate sample questions, and
    answer user questions against the indexed content."""

    # Footer pinned to the bottom of the viewport (author credit).
    foot = f"""
<div style="
position: fixed;
bottom: 0;
left: 30%;
right: 0;
width: 50%;
padding: 0px 0px;
text-align: center;
">
<p>Made by <a href='https://twitter.com/mehmet_ba7'>Mehmet Balioglu</a></p>
</div>
"""

    st.markdown(foot, unsafe_allow_html=True)

    # Add custom CSS
    # NOTE(review): the .css-* selectors target Streamlit's generated class
    # names, which change between Streamlit versions — verify on upgrade.
    st.markdown(
        """
<style>

#MainMenu {visibility: hidden;
# }
footer {visibility: hidden;
}
.css-card {
border-radius: 0px;
padding: 30px 10px 10px 10px;
background-color: #f8f9fa;
box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
margin-bottom: 10px;
font-family: "IBM Plex Sans", sans-serif;
}

.card-tag {
border-radius: 0px;
padding: 1px 5px 1px 5px;
margin-bottom: 10px;
position: absolute;
left: 0px;
top: 0px;
font-size: 0.6rem;
font-family: "IBM Plex Sans", sans-serif;
color: white;
background-color: green;
}

.css-zt5igj {left:0;
}

span.css-10trblm {margin-left:0;
}

div.css-1kyxreq {margin-top: -40px;
}






</style>
""",
        unsafe_allow_html=True,
    )
    # Sidebar logo; assumes img/logo1.png exists relative to the app's
    # working directory — TODO confirm the asset ships with the app.
    st.sidebar.image("img/logo1.png")

    # Page title with a small "beta" badge.
    st.write(
        f"""
<div style="display: flex; align-items: center; margin-left: 0;">
<h1 style="display: inline-block;">PDF Analyzer</h1>
<sup style="margin-left:5px;font-size:small; color: green;">beta</sup>
</div>
""",
        unsafe_allow_html=True,
    )

    st.sidebar.title("Menu")

    # Embedding backend choice: OpenAI (remote API) or HuggingFace (local).
    embedding_option = st.sidebar.radio(
        "Choose Embeddings", ["OpenAI Embeddings", "HuggingFace Embeddings(slower)"])

    retriever_type = st.sidebar.selectbox(
        "Choose Retriever", ["SIMILARITY SEARCH", "SUPPORT VECTOR MACHINES"])

    # Use RecursiveCharacterTextSplitter as the default and only text splitter
    splitter_type = "RecursiveCharacterTextSplitter"

    # Ask for the OpenAI key once and cache it in session state; the key is
    # exported through the environment so the langchain clients pick it up.
    if 'openai_api_key' not in st.session_state:
        openai_api_key = st.text_input(
            'Please enter your OpenAI API key or [get one here](https://platform.openai.com/account/api-keys)', value="", placeholder="Enter the OpenAI API key which begins with sk-")
        if openai_api_key:
            st.session_state.openai_api_key = openai_api_key
            os.environ["OPENAI_API_KEY"] = openai_api_key
        else:
            #warning_text = 'Please enter your OpenAI API key. Get yours from here: [link](https://platform.openai.com/account/api-keys)'
            #warning_html = f'<span>{warning_text}</span>'
            #st.markdown(warning_html, unsafe_allow_html=True)
            # No key entered yet: stop rendering the rest of the app.
            return
    else:
        os.environ["OPENAI_API_KEY"] = st.session_state.openai_api_key

    uploaded_files = st.file_uploader("Upload a PDF or TXT Document", type=[
        "pdf", "txt"], accept_multiple_files=True)

    if uploaded_files:
        # Check if last_uploaded_files is not in session_state or if uploaded_files are different from last_uploaded_files
        if 'last_uploaded_files' not in st.session_state or st.session_state.last_uploaded_files != uploaded_files:
            st.session_state.last_uploaded_files = uploaded_files
            # A new upload invalidates previously generated sample questions.
            if 'eval_set' in st.session_state:
                del st.session_state['eval_set']

        # Load and process the uploaded PDF or TXT files.
        loaded_text = load_docs(uploaded_files)
        st.write("Documents uploaded and processed.")

        # Split the document into chunks
        splits = split_texts(loaded_text, chunk_size=1000,
                             overlap=0, split_method=splitter_type)

        # Display the number of text chunks
        num_chunks = len(splits)
        st.write(f"Number of text chunks: {num_chunks}")

        # Embed using OpenAI embeddings or HuggingFace embeddings
        if embedding_option == "OpenAI Embeddings":
            embeddings = OpenAIEmbeddings()
        elif embedding_option == "HuggingFace Embeddings(slower)":
            # Replace "bert-base-uncased" with the desired HuggingFace model
            embeddings = HuggingFaceEmbeddings()

        retriever = create_retriever(embeddings, splits, retriever_type)

        # Initialize the RetrievalQA chain with streaming output
        # NOTE(review): StreamingStdOutCallbackHandler streams tokens to the
        # server's stdout, not to the Streamlit UI.
        callback_handler = StreamingStdOutCallbackHandler()
        callback_manager = CallbackManager([callback_handler])

        chat_openai = ChatOpenAI(
            streaming=True, callback_manager=callback_manager, verbose=True, temperature=0)
        qa = RetrievalQA.from_chain_type(llm=chat_openai, retriever=retriever, chain_type="stuff", verbose=True)

        # Check if there are no generated question-answer pairs in the session state
        if 'eval_set' not in st.session_state:
            # Use the generate_eval function to generate question-answer pairs
            num_eval_questions = 10  # Number of question-answer pairs to generate
            st.session_state.eval_set = generate_eval(
                loaded_text, num_eval_questions, 3000)

        # Display the question-answer pairs in the sidebar with smaller text
        for i, qa_pair in enumerate(st.session_state.eval_set):
            st.sidebar.markdown(
                f"""
<div class="css-card">
<span class="card-tag">Question {i + 1}</span>
<p style="font-size: 12px;">{qa_pair['question']}</p>
<p style="font-size: 12px;">{qa_pair['answer']}</p>
</div>
""",
                unsafe_allow_html=True,
            )
            # <h4 style="font-size: 14px;">Question {i + 1}:</h4>
            # <h4 style="font-size: 14px;">Answer {i + 1}:</h4>
        st.write("Ready to answer questions.")

        # Question and answering
        user_question = st.text_input("Enter your question:")
        if user_question:
            answer = qa.run(user_question)
            st.write("Answer:", answer)
286
+
287
+
288
# Script entry point: run the Streamlit app.
if __name__ == "__main__":
    main()