Spaces:

BirdseyeSharing
/

Chat

Running

App Files Files Community

jeonghin committed on May 6

Commit

f880c97

•

1 Parent(s): 429e86f

Initial commit

Browse files

Files changed (3) hide show

.gitignore +171 -0
app.py +202 -0
requirements.txt +81 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,171 @@

+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+# C extensions
+*.so
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+# Translations
+*.mo
+*.pot
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+# Flask stuff:
+instance/
+.webassets-cache
+# Scrapy stuff:
+.scrapy
+# Sphinx documentation
+docs/_build/
+# PyBuilder
+.pybuilder/
+target/
+# Jupyter Notebook
+.ipynb_checkpoints
+# IPython
+profile_default/
+ipython_config.py
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+# SageMath parsed files
+*.sage.py
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+birdseye_venv/
+birdseye/migrations/
+# Spyder project settings
+.spyderproject
+.spyproject
+# Rope project settings
+.ropeproject
+# mkdocs documentation
+/site
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+# Pyre type checker
+.pyre/
+# pytype static type analyzer
+.pytype/
+# Cython debug symbols
+cython_debug/
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
+# MacOS
+.DS_Store
+# Certificate
+Birdseye.pem
+Birdseye2.pem
+RECOVERY-CODES-Jeong Hin  Chin.txt

app.py ADDED Viewed

	@@ -0,0 +1,202 @@

+import streamlit as st
+# from htmlTemplates import css, bot_template, user_template
+from dotenv import load_dotenv
+# from PyPDF2 import PdfReader
+import os
+import mysql.connector
+from langchain.text_splitter import CharacterTextSplitter
+from langchain_community.embeddings import HuggingFaceInstructEmbeddings
+from langchain_community.vectorstores import FAISS
+from langchain_community.llms import HuggingFaceHub
+from langchain_openai import ChatOpenAI
+from langchain_openai import OpenAIEmbeddings
+from langchain.memory import ConversationBufferMemory
+from langchain.chains import ConversationalRetrievalChain
+def get_pdf_text(slug):
+    load_dotenv()
+    text = ""
+    try:
+        conn = mysql.connector.connect(
+            user=os.getenv("SQL_USER"),
+            password=os.getenv("SQL_PWD"),
+            host=os.getenv("SQL_HOST"),
+            database="Birdseye_DB",
+        )
+        cursor = conn.cursor()
+        # Execute a query
+        cursor.execute("SELECT ocr_text FROM birdseye_temp WHERE slug = %s", (slug,))
+        # Fetch the results
+        rows = cursor.fetchall()
+        for row in rows:
+            if row[0]:
+                text += row[0]
+    except mysql.connector.Error as err:
+        st.error(f"Error: {err}")
+    finally:
+        if conn.is_connected():
+            cursor.close()
+            conn.close()
+    return text
+def get_text_chunks(text):
+    """
+    Splits the given text into chunks based on specified character settings.
+    Parameters:
+    - text (str): The text to be split into chunks.
+    Returns:
+    - list: A list of text chunks.
+    """
+    text_splitter = CharacterTextSplitter(
+        separator="\n", chunk_size=1000, chunk_overlap=200, length_function=len
+    )
+    chunks = text_splitter.split_text(text)
+    return chunks
+def get_vectorstore(text_chunks):
+    """
+    Generates a vector store from a list of text chunks using specified embeddings.
+    Parameters:
+    - text_chunks (list of str): Text segments to convert into vector embeddings.
+    Returns:
+    - FAISS: A FAISS vector store containing the embeddings of the text chunks.
+    """
+    embeddings = OpenAIEmbeddings()
+    vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
+    return vectorstore
+def get_conversation_chain(vectorstore):
+    """
+    Initializes a conversational retrieval chain that uses a large language model
+    for generating responses based on the provided vector store.
+    Parameters:
+    - vectorstore (FAISS): A vector store to be used for retrieving relevant content.
+    Returns:
+    - ConversationalRetrievalChain: An initialized conversational chain object.
+    """
+    try:
+        llm = ChatOpenAI(model_name="gpt-4-1106-preview")
+        memory = ConversationBufferMemory(
+            memory_key="chat_history", return_messages=True
+        )
+        conversation_chain = ConversationalRetrievalChain.from_llm(
+            llm=llm, retriever=vectorstore.as_retriever(), memory=memory
+        )
+        return conversation_chain
+    except Exception as e:
+        raise  # Re-raise exception to handle it or log it properly elsewhere
+def handle_userinput(user_question):
+    response = st.session_state.conversation(
+        {
+            "question": f"Based on the memory and the provided document, answer the following user question: {user_question}. If the question is unrelated to memory or the document, just mention that you cannot provide an answer."
+        }
+    )
+    st.session_state.chat_history = response["chat_history"]
+    for i, message in reversed(list(enumerate(st.session_state.chat_history))):
+        if i % 2 == 0:
+            st.write(
+                user_template.replace("{{MSG}}", message.content),
+                unsafe_allow_html=True,
+            )
+        else:
+            st.write(
+                bot_template.replace("{{MSG}}", message.content), unsafe_allow_html=True
+            )
+def chat(slug):
+    """
+    Manages the chat interface in the Streamlit application, handling the conversation
+    flow and displaying the chat history.
+    """
+    text_chunks = get_text_chunks(get_pdf_text(slug))
+    vectorstore = get_vectorstore(text_chunks)
+    st.session_state.conversation = get_conversation_chain(vectorstore)
+    if len(st.session_state.messages) == 1:
+        message = st.session_state.messages[0]
+        with st.chat_message(message["role"]):
+            st.write(message["content"])
+    else:
+        for message in st.session_state.messages:
+            with st.chat_message(message["role"]):
+                st.write(message["content"])
+    # User-provided prompt
+    if prompt := st.chat_input():
+        st.session_state.messages.append({"role": "user", "content": prompt})
+        st.session_state.prompts = prompt
+        with st.chat_message("user"):
+            st.write(prompt)
+    if st.session_state.messages[-1]["role"] != "system":
+        with st.spinner("Generating response..."):
+            response = st.session_state.conversation.invoke(
+                {"question": st.session_state.prompts}
+            )
+        with st.chat_message("system"):
+            message_content = response["chat_history"][-1].content
+            st.session_state.messages.append(
+                {"role": "system", "content": message_content}
+            )
+            st.write(message_content)
+def init():
+    """
+    Initializes the session state variables used in the Streamlit application and
+    loads environment variables.
+    """
+    if "pdf" not in st.session_state:
+        st.session_state["pdf"] = False
+    if "conversation" not in st.session_state:
+        st.session_state.conversation = None
+    if "chat_history" not in st.session_state:
+        st.session_state.chat_history = None
+    if "messages" not in st.session_state.keys():
+        st.session_state.messages = [
+            {
+                "role": "system",
+                "content": "What do you want to learn about the document? Ask me a question!",
+            }
+        ]
+def main():
+    init()
+    query_params = st.query_params
+    slug = query_params.get("slug")
+    load_dotenv()
+    st.title("Chat with GPT :books:")
+    if slug:
+        chat(slug)
+    else:
+        st.error("Please return to Birdseye and select a document.")
+if __name__ == "__main__":
+    main()

requirements.txt ADDED Viewed

	@@ -0,0 +1,81 @@

+aiohttp==3.9.5
+aiosignal==1.3.1
+altair==4.0.0
+annotated-types==0.6.0
+anyio==4.3.0
+attrs==23.2.0
+blinker==1.8.1
+cachetools==5.3.3
+certifi==2024.2.2
+charset-normalizer==3.3.2
+click==8.1.7
+dataclasses-json==0.6.5
+distro==1.9.0
+entrypoints==0.4
+faiss-cpu==1.7.4
+frozenlist==1.4.1
+gitdb==4.0.11
+GitPython==3.1.43
+h11==0.14.0
+httpcore==1.0.5
+httpx==0.27.0
+idna==3.7
+Jinja2==3.1.4
+jsonpatch==1.33
+jsonpointer==2.4
+jsonschema==4.22.0
+jsonschema-specifications==2023.12.1
+langchain==0.1.16
+langchain-community==0.0.32
+langchain-core==0.1.42
+langchain-openai==0.1.3
+langchain-text-splitters==0.0.1
+langsmith==0.1.54
+markdown-it-py==3.0.0
+MarkupSafe==2.1.5
+marshmallow==3.21.2
+mdurl==0.1.2
+multidict==6.0.5
+mypy-extensions==1.0.0
+mysql==0.0.3
+mysql-connector-python==8.4.0
+mysql-connector-python-rf==2.2.2
+mysqlclient==2.2.0
+numpy==1.26.4
+openai==1.25.2
+orjson==3.10.3
+packaging==23.2
+pandas==2.2.2
+pillow==10.3.0
+protobuf==4.25.3
+pyarrow==16.0.0
+pydantic==2.7.1
+pydantic_core==2.18.2
+pydeck==0.9.0
+Pygments==2.18.0
+PyPDF2==3.0.1
+python-dateutil==2.9.0.post0
+python-dotenv==1.0.0
+pytz==2024.1
+PyYAML==6.0.1
+referencing==0.35.1
+regex==2024.4.28
+requests==2.31.0
+rich==13.7.1
+rpds-py==0.18.0
+six==1.16.0
+smmap==5.0.1
+sniffio==1.3.1
+SQLAlchemy==2.0.30
+streamlit==1.33.0
+tenacity==8.2.3
+tiktoken==0.6.0
+toml==0.10.2
+toolz==0.12.1
+tornado==6.4
+tqdm==4.66.4
+typing-inspect==0.9.0
+typing_extensions==4.11.0
+tzdata==2024.1
+urllib3==2.2.1
+yarl==1.9.4