jeonghin committed on
Commit
f880c97
1 Parent(s): 429e86f

Initial commit

Browse files
Files changed (3) hide show
  1. .gitignore +171 -0
  2. app.py +202 -0
  3. requirements.txt +81 -0
.gitignore ADDED
@@ -0,0 +1,171 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # poetry
98
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102
+ #poetry.lock
103
+
104
+ # pdm
105
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106
+ #pdm.lock
107
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108
+ # in version control.
109
+ # https://pdm.fming.dev/#use-with-ide
110
+ .pdm.toml
111
+
112
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113
+ __pypackages__/
114
+
115
+ # Celery stuff
116
+ celerybeat-schedule
117
+ celerybeat.pid
118
+
119
+ # SageMath parsed files
120
+ *.sage.py
121
+
122
+ # Environments
123
+ .env
124
+ .venv
125
+ env/
126
+ venv/
127
+ ENV/
128
+ env.bak/
129
+ venv.bak/
130
+ birdseye_venv/
131
+ birdseye/migrations/
132
+
133
+ # Spyder project settings
134
+ .spyderproject
135
+ .spyproject
136
+
137
+ # Rope project settings
138
+ .ropeproject
139
+
140
+ # mkdocs documentation
141
+ /site
142
+
143
+ # mypy
144
+ .mypy_cache/
145
+ .dmypy.json
146
+ dmypy.json
147
+
148
+ # Pyre type checker
149
+ .pyre/
150
+
151
+ # pytype static type analyzer
152
+ .pytype/
153
+
154
+ # Cython debug symbols
155
+ cython_debug/
156
+
157
+ # PyCharm
158
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
159
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
160
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
161
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
162
+ #.idea/
163
+
164
+ # MacOS
165
+ .DS_Store
166
+
167
+ # Certificate
168
+ Birdseye.pem
169
+ Birdseye2.pem
170
+
171
+ RECOVERY-CODES-Jeong Hin Chin.txt
app.py ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+
3
+ # from htmlTemplates import css, bot_template, user_template
4
+
5
+ from dotenv import load_dotenv
6
+
7
+ # from PyPDF2 import PdfReader
8
+
9
+ import os
10
+ import mysql.connector
11
+
12
+ from langchain.text_splitter import CharacterTextSplitter
13
+ from langchain_community.embeddings import HuggingFaceInstructEmbeddings
14
+ from langchain_community.vectorstores import FAISS
15
+ from langchain_community.llms import HuggingFaceHub
16
+ from langchain_openai import ChatOpenAI
17
+ from langchain_openai import OpenAIEmbeddings
18
+ from langchain.memory import ConversationBufferMemory
19
+ from langchain.chains import ConversationalRetrievalChain
20
+
21
+
22
def get_pdf_text(slug):
    """
    Fetches the OCR text stored for a document and returns it as one string.
    Parameters:
    - slug: Value matched against the `slug` column of `birdseye_temp`.
    Returns:
    - str: Every non-empty `ocr_text` value found for the slug, concatenated
      in the order the rows are returned. Empty when no row matches or when
      the database call fails (the error is shown with `st.error`).
    """
    load_dotenv()

    # Bind both handles before the try block so the cleanup in `finally` is
    # safe even when connect() itself raises; otherwise `conn` would be
    # unbound there and the handled database error would be replaced by an
    # UnboundLocalError.
    conn = None
    cursor = None
    parts = []
    try:
        conn = mysql.connector.connect(
            user=os.getenv("SQL_USER"),
            password=os.getenv("SQL_PWD"),
            host=os.getenv("SQL_HOST"),
            database="Birdseye_DB",
        )
        cursor = conn.cursor()

        # Parameterized query: the slug is passed separately from the SQL text.
        cursor.execute("SELECT ocr_text FROM birdseye_temp WHERE slug = %s", (slug,))

        # Skip NULL / empty OCR cells.
        parts = [row[0] for row in cursor.fetchall() if row[0]]

    except mysql.connector.Error as err:
        st.error(f"Error: {err}")
    finally:
        if conn is not None and conn.is_connected():
            if cursor is not None:
                cursor.close()
            conn.close()
    return "".join(parts)
51
+
52
+
53
def get_text_chunks(text):
    """
    Breaks a text into overlapping chunks with a newline-based character splitter.
    Parameters:
    - text (str): The text to split.
    Returns:
    - list: The chunks produced by `CharacterTextSplitter.split_text`.
    """
    splitter_settings = {
        "separator": "\n",
        "chunk_size": 1000,
        "chunk_overlap": 200,
        "length_function": len,
    }
    return CharacterTextSplitter(**splitter_settings).split_text(text)
66
+
67
+
68
def get_vectorstore(text_chunks):
    """
    Builds a FAISS vector store from text chunks embedded with OpenAIEmbeddings.
    Parameters:
    - text_chunks (list of str): Text segments to embed and index.
    Returns:
    - FAISS: The vector store created by `FAISS.from_texts`.
    """
    return FAISS.from_texts(texts=text_chunks, embedding=OpenAIEmbeddings())
79
+
80
+
81
def get_conversation_chain(vectorstore):
    """
    Initializes a conversational retrieval chain that uses a large language model
    for generating responses based on the provided vector store.
    Parameters:
    - vectorstore (FAISS): A vector store to be used for retrieving relevant content.
    Returns:
    - ConversationalRetrievalChain: An initialized conversational chain object.
    Raises:
    - Any exception raised while constructing the model, memory, or chain
      propagates to the caller unchanged.
    """
    llm = ChatOpenAI(model_name="gpt-4-1106-preview")
    # The memory object stores the running chat history under "chat_history",
    # so callers only need to pass the new question on each turn.
    memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
    return ConversationalRetrievalChain.from_llm(
        llm=llm, retriever=vectorstore.as_retriever(), memory=memory
    )
101
+
102
+
103
def handle_userinput(user_question):
    """
    Sends a question to the conversation chain held in session state, stores the
    returned chat history in session state, and renders it newest message first.
    Parameters:
    - user_question (str): The question typed by the user; it is embedded in a
      fixed instruction prompt before being sent to the chain.
    """
    response = st.session_state.conversation.invoke(
        {
            "question": f"Based on the memory and the provided document, answer the following user question: {user_question}. If the question is unrelated to memory or the document, just mention that you cannot provide an answer."
        }
    )
    st.session_state.chat_history = response["chat_history"]

    # Even indexes are treated as user turns and odd indexes as replies.
    # Messages are rendered with st.chat_message rather than the HTML
    # templates: the template import is commented out in this file (so the
    # names were undefined), and this also avoids injecting raw message text
    # into HTML.
    for i, message in reversed(list(enumerate(st.session_state.chat_history))):
        role = "user" if i % 2 == 0 else "system"
        with st.chat_message(role):
            st.write(message.content)
121
+
122
+
123
def chat(slug):
    """
    Manages the chat interface in the Streamlit application, handling the conversation
    flow and displaying the chat history.
    Parameters:
    - slug: Identifier of the document whose text backs the conversation; it is
      passed to `get_pdf_text`.
    """

    # Build the retrieval chain only when none exists yet for this slug.
    # Rebuilding it on every call re-embeds the whole document and replaces
    # the chain's memory, which discards the conversation so far.
    if (
        st.session_state.get("conversation") is None
        or st.session_state.get("conversation_slug") != slug
    ):
        text_chunks = get_text_chunks(get_pdf_text(slug))
        if not text_chunks:
            # Nothing to index (no text found, or the database call failed).
            st.error("No text could be loaded for this document.")
            return
        vectorstore = get_vectorstore(text_chunks)
        st.session_state.conversation = get_conversation_chain(vectorstore)
        st.session_state.conversation_slug = slug

    # Replay the stored transcript.
    for message in st.session_state.messages:
        with st.chat_message(message["role"]):
            st.write(message["content"])

    # User-provided prompt
    if prompt := st.chat_input():
        st.session_state.messages.append({"role": "user", "content": prompt})
        st.session_state.prompts = prompt
        with st.chat_message("user"):
            st.write(prompt)

    # A trailing non-"system" message means the latest user prompt has not
    # been answered yet.
    if st.session_state.messages[-1]["role"] != "system":

        with st.spinner("Generating response..."):
            response = st.session_state.conversation.invoke(
                {"question": st.session_state.prompts}
            )

        with st.chat_message("system"):
            message_content = response["chat_history"][-1].content
            st.session_state.messages.append(
                {"role": "system", "content": message_content}
            )
            st.write(message_content)
163
+
164
+
165
def init():
    """
    Seeds the Streamlit session state with a default value for every key the
    application uses, leaving keys that are already present untouched.
    """

    defaults = {
        "pdf": False,
        "conversation": None,
        "chat_history": None,
        # Opening message shown before the user has asked anything.
        "messages": [
            {
                "role": "system",
                "content": "What do you want to learn about the document? Ask me a question!",
            }
        ],
    }
    for key, value in defaults.items():
        if key not in st.session_state:
            st.session_state[key] = value
184
+
185
+
186
def main():
    """
    Entry point: prepares session state, reads the `slug` query parameter, and
    either starts the chat for that document or shows an error asking the user
    to pick one.
    """
    init()
    slug = st.query_params.get("slug")

    load_dotenv()
    st.title("Chat with GPT :books:")

    # Guard clause: without a slug there is no document to chat about.
    if not slug:
        st.error("Please return to Birdseye and select a document.")
        return

    chat(slug)


if __name__ == "__main__":
    main()
requirements.txt ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ aiohttp==3.9.5
2
+ aiosignal==1.3.1
3
+ altair==4.0.0
4
+ annotated-types==0.6.0
5
+ anyio==4.3.0
6
+ attrs==23.2.0
7
+ blinker==1.8.1
8
+ cachetools==5.3.3
9
+ certifi==2024.2.2
10
+ charset-normalizer==3.3.2
11
+ click==8.1.7
12
+ dataclasses-json==0.6.5
13
+ distro==1.9.0
14
+ entrypoints==0.4
15
+ faiss-cpu==1.7.4
16
+ frozenlist==1.4.1
17
+ gitdb==4.0.11
18
+ GitPython==3.1.43
19
+ h11==0.14.0
20
+ httpcore==1.0.5
21
+ httpx==0.27.0
22
+ idna==3.7
23
+ Jinja2==3.1.4
24
+ jsonpatch==1.33
25
+ jsonpointer==2.4
26
+ jsonschema==4.22.0
27
+ jsonschema-specifications==2023.12.1
28
+ langchain==0.1.16
29
+ langchain-community==0.0.32
30
+ langchain-core==0.1.42
31
+ langchain-openai==0.1.3
32
+ langchain-text-splitters==0.0.1
33
+ langsmith==0.1.54
34
+ markdown-it-py==3.0.0
35
+ MarkupSafe==2.1.5
36
+ marshmallow==3.21.2
37
+ mdurl==0.1.2
38
+ multidict==6.0.5
39
+ mypy-extensions==1.0.0
40
+ mysql==0.0.3
41
+ mysql-connector-python==8.4.0
42
+ mysql-connector-python-rf==2.2.2
43
+ mysqlclient==2.2.0
44
+ numpy==1.26.4
45
+ openai==1.25.2
46
+ orjson==3.10.3
47
+ packaging==23.2
48
+ pandas==2.2.2
49
+ pillow==10.3.0
50
+ protobuf==4.25.3
51
+ pyarrow==16.0.0
52
+ pydantic==2.7.1
53
+ pydantic_core==2.18.2
54
+ pydeck==0.9.0
55
+ Pygments==2.18.0
56
+ PyPDF2==3.0.1
57
+ python-dateutil==2.9.0.post0
58
+ python-dotenv==1.0.0
59
+ pytz==2024.1
60
+ PyYAML==6.0.1
61
+ referencing==0.35.1
62
+ regex==2024.4.28
63
+ requests==2.31.0
64
+ rich==13.7.1
65
+ rpds-py==0.18.0
66
+ six==1.16.0
67
+ smmap==5.0.1
68
+ sniffio==1.3.1
69
+ SQLAlchemy==2.0.30
70
+ streamlit==1.33.0
71
+ tenacity==8.2.3
72
+ tiktoken==0.6.0
73
+ toml==0.10.2
74
+ toolz==0.12.1
75
+ tornado==6.4
76
+ tqdm==4.66.4
77
+ typing-inspect==0.9.0
78
+ typing_extensions==4.11.0
79
+ tzdata==2024.1
80
+ urllib3==2.2.1
81
+ yarl==1.9.4