Athsara committed on
Commit
5e416e3
·
verified ·
1 Parent(s): e7afd7f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +173 -173
app.py CHANGED
@@ -1,173 +1,173 @@
1
- import os
2
- import streamlit as st
3
- from dotenv import load_dotenv
4
- from PyPDF2 import PdfReader
5
- from langchain.text_splitter import CharacterTextSplitter
6
- from langchain.embeddings import HuggingFaceEmbeddings # Changed to HuggingFace
7
- from langchain.vectorstores import FAISS
8
- from langchain.chat_models import ChatOpenAI # For LLM
9
- from langchain.memory import ConversationBufferMemory
10
- from langchain.chains import ConversationalRetrievalChain
11
- from htmlTemplates import css, bot_template, user_template
12
-
13
- # Function to extract text from PDF documents
14
- def get_pdf_text(pdf_docs):
15
- text = ""
16
- for pdf in pdf_docs:
17
- pdf_reader = PdfReader(pdf)
18
- for page in pdf_reader.pages:
19
- extracted_text = page.extract_text()
20
- if extracted_text:
21
- text += extracted_text
22
- return text
23
-
24
- # Function to split text into manageable chunks
25
- def get_text_chunks(text):
26
- text_splitter = CharacterTextSplitter(
27
- separator="\n",
28
- chunk_size=1000,
29
- chunk_overlap=200,
30
- length_function=len
31
- )
32
- chunks = text_splitter.split_text(text)
33
- return chunks
34
-
35
- # Function to create a vector store using HuggingFace embeddings
36
- def get_vectorstore(text_chunks, huggingface_api_key):
37
- embeddings = HuggingFaceEmbeddings(
38
- model_name="sentence-transformers/all-MiniLM-L6-v2", # Choose an appropriate model
39
- model_kwargs={"use_auth_token": huggingface_api_key}
40
- )
41
-
42
- vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
43
- return vectorstore
44
-
45
- # Function to initialize the conversational retrieval chain with GrokAI
46
- def get_conversation_chain(vectorstore, grok_api_key, grok_api_base):
47
- llm = ChatOpenAI(
48
- openai_api_key=grok_api_key,
49
- openai_api_base=grok_api_base,
50
- model_name="grok-beta", # Specify GrokAI's model
51
- temperature=0.5
52
- )
53
-
54
- memory = ConversationBufferMemory(
55
- memory_key='chat_history', return_messages=True
56
- )
57
- conversation_chain = ConversationalRetrievalChain.from_llm(
58
- llm=llm, # Use the configured GrokAI LLM
59
- retriever=vectorstore.as_retriever(),
60
- memory=memory
61
- )
62
- return conversation_chain
63
-
64
- # Function to handle user input and generate responses
65
- def handle_userinput(user_question):
66
- if st.session_state.conversation is None:
67
- st.warning("Documents are still being processed. Please wait.")
68
- return
69
-
70
- response = st.session_state.conversation({'question': user_question})
71
- st.session_state.chat_history = response['chat_history']
72
-
73
- # Function triggered when the user presses Enter in the input box
74
- def on_enter():
75
- user_question = st.session_state.user_question
76
- if user_question:
77
- handle_userinput(user_question)
78
- st.session_state.user_question = "" # Clear the input box
79
-
80
- # Function to load and process PDF documents
81
- def load_and_process_pdfs(folder_path, huggingface_api_key, grok_api_key, grok_api_base):
82
- pdf_files = [file for file in os.listdir(folder_path) if file.lower().endswith('.pdf')]
83
- if not pdf_files:
84
- st.error(f"No PDF files found in the directory: {folder_path}")
85
- return
86
-
87
- pdf_docs = []
88
- for file in pdf_files:
89
- file_path = os.path.join(folder_path, file)
90
- pdf_docs.append(file_path)
91
-
92
- with st.spinner("Processing documents..."):
93
- # Extract text from PDFs
94
- with st.spinner("Extracting text from PDFs..."):
95
- pdf_file_objects = [open(file, 'rb') for file in pdf_docs]
96
- raw_text = get_pdf_text(pdf_file_objects)
97
- # Close the files after reading
98
- for f in pdf_file_objects:
99
- f.close()
100
-
101
- # Split text into chunks
102
- with st.spinner("Splitting text into chunks..."):
103
- text_chunks = get_text_chunks(raw_text)
104
-
105
- # Create vector store using HuggingFace embeddings
106
- with st.spinner("Creating vector store..."):
107
- vectorstore = get_vectorstore(text_chunks, huggingface_api_key)
108
-
109
- # Initialize conversation chain with GrokAI LLM
110
- with st.spinner("Initializing conversation chain..."):
111
- st.session_state.conversation = get_conversation_chain(vectorstore, grok_api_key, grok_api_base)
112
-
113
- st.success("Documents processed successfully!")
114
-
115
- # Function to display chat history with auto-scrolling
116
- def display_chat_history():
117
- if st.session_state.chat_history:
118
- for i, message in enumerate(st.session_state.chat_history):
119
- if i % 2 == 0:
120
- st.markdown(user_template.replace("{{MSG}}", message.content), unsafe_allow_html=True)
121
- else:
122
- st.markdown(bot_template.replace("{{MSG}}", message.content), unsafe_allow_html=True)
123
-
124
- # Inject JavaScript to scroll the entire page to the bottom
125
- scroll_script = """
126
- <script>
127
- // Function to scroll to the bottom of the page
128
- function scrollToBottom() {
129
- window.scrollTo({ top: document.body.scrollHeight, behavior: 'smooth' });
130
- }
131
- // Delay to ensure the DOM is fully rendered
132
- setTimeout(scrollToBottom, 100);
133
- </script>
134
- """
135
- st.markdown(scroll_script, unsafe_allow_html=True)
136
-
137
- # Main function to run the Streamlit app
138
- def main():
139
- load_dotenv()
140
-
141
- # Retrieve credentials from .env
142
- grok_api_key = os.getenv("GROK_API_KEY")
143
- grok_api_base = "https://api.x.ai/v1" # GrokAI's API base URL
144
- huggingface_api_key = os.getenv("HUGGINGFACEHUB_API_TOKEN")
145
-
146
- st.set_page_config(page_title="Chat with AI Tax Agent", page_icon=":books:")
147
- st.write(css, unsafe_allow_html=True)
148
-
149
- if "conversation" not in st.session_state:
150
- st.session_state.conversation = None
151
- if "chat_history" not in st.session_state:
152
- st.session_state.chat_history = []
153
-
154
- # Title Section
155
- st.header("Chat with AI Tax Agent :books:")
156
-
157
- # Automatically load and process PDFs on startup
158
- if st.session_state.conversation is None:
159
- documents_folder = "./documents/" # Specify your documents folder path here
160
- load_and_process_pdfs(documents_folder, huggingface_api_key, grok_api_key, grok_api_base)
161
-
162
- # Chat History Section
163
- display_chat_history()
164
-
165
- # Input Box Section
166
- st.text_input(
167
- "Ask a question about your documents:",
168
- key='user_question',
169
- on_change=on_enter
170
- )
171
-
172
- if __name__ == '__main__':
173
- main()
 
1
+ import os
2
+ import streamlit as st
3
+ from dotenv import load_dotenv
4
+ from PyPDF2 import PdfReader
5
+ from langchain.text_splitter import CharacterTextSplitter
6
+ from langchain.embeddings import HuggingFaceEmbeddings # Changed to HuggingFace
7
+ from langchain.vectorstores import FAISS
8
+ from langchain.chat_models import ChatOpenAI # For LLM
9
+ from langchain.memory import ConversationBufferMemory
10
+ from langchain.chains import ConversationalRetrievalChain
11
+ from htmlTemplates import css, bot_template, user_template
12
+
13
# Function to extract text from PDF documents
def get_pdf_text(pdf_docs):
    """Return the concatenated text of every page in every given PDF.

    Pages whose extraction yields nothing (e.g. scanned images) are skipped.
    """
    pages_text = []
    for document in pdf_docs:
        reader = PdfReader(document)
        for page in reader.pages:
            content = page.extract_text()
            if content:
                pages_text.append(content)
    return "".join(pages_text)
23
+
24
# Function to split text into manageable chunks
def get_text_chunks(text):
    """Split *text* on newlines into ~1000-char chunks with 200-char overlap."""
    splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,
    )
    return splitter.split_text(text)
34
+
35
# Function to create a vector store using HuggingFace embeddings
def get_vectorstore(text_chunks, huggingface_api_key):
    """Embed *text_chunks* with a MiniLM sentence-transformer and index them in FAISS."""
    hf_embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2",  # compact general-purpose model
        model_kwargs={"use_auth_token": huggingface_api_key},
    )
    return FAISS.from_texts(texts=text_chunks, embedding=hf_embeddings)
44
+
45
# Function to initialize the conversational retrieval chain with GrokAI
def get_conversation_chain(vectorstore, grok_api_key, grok_api_base):
    """Build a ConversationalRetrievalChain backed by GrokAI and *vectorstore*.

    GrokAI exposes an OpenAI-compatible endpoint, so ChatOpenAI is reused
    with a custom base URL and model name.
    """
    grok_llm = ChatOpenAI(
        openai_api_key=grok_api_key,
        openai_api_base=grok_api_base,
        model_name="grok-beta",
        temperature=0.5,
    )
    chat_memory = ConversationBufferMemory(
        memory_key='chat_history',
        return_messages=True,
    )
    return ConversationalRetrievalChain.from_llm(
        llm=grok_llm,
        retriever=vectorstore.as_retriever(),
        memory=chat_memory,
    )
63
+
64
# Function to handle user input and generate responses
def handle_userinput(user_question):
    """Run *user_question* through the conversation chain and store the new history."""
    conversation = st.session_state.conversation
    if conversation is None:
        # Chain is built asynchronously at startup; nothing to answer with yet.
        st.warning("Documents are still being processed. Please wait.")
        return

    response = conversation({'question': user_question})
    st.session_state.chat_history = response['chat_history']
72
+
73
# Function triggered when the user presses Enter in the input box
def on_enter():
    """Widget callback: submit the typed question, then reset the input box."""
    question = st.session_state.user_question
    if question:
        handle_userinput(question)
    st.session_state.user_question = ""  # Clear the input box
79
+
80
# Function to load and process PDF documents
def load_and_process_pdfs(folder_path, huggingface_api_key, grok_api_key, grok_api_base):
    """Index every PDF in *folder_path* and store the chat chain in session state.

    Pipeline: extract text -> chunk -> embed into FAISS -> build the
    ConversationalRetrievalChain, writing it to
    ``st.session_state.conversation``. Shows an error and returns early
    when the folder contains no PDFs.

    Parameters:
        folder_path: Directory scanned (non-recursively) for ``*.pdf`` files.
        huggingface_api_key: Token forwarded to the HuggingFace embeddings.
        grok_api_key: API key for the GrokAI chat model.
        grok_api_base: Base URL of the GrokAI (OpenAI-compatible) endpoint.
    """
    pdf_files = [file for file in os.listdir(folder_path) if file.lower().endswith('.pdf')]
    if not pdf_files:
        st.error(f"No PDF files found in the directory: {folder_path}")
        return

    pdf_docs = [os.path.join(folder_path, file) for file in pdf_files]

    with st.spinner("Processing documents..."):
        # Extract text from PDFs
        with st.spinner("Extracting text from PDFs..."):
            pdf_file_objects = []
            try:
                # Open inside the try so every already-opened handle is
                # closed even if a later open() or the extraction raises
                # (the original version leaked handles on failure).
                for path in pdf_docs:
                    pdf_file_objects.append(open(path, 'rb'))
                raw_text = get_pdf_text(pdf_file_objects)
            finally:
                for f in pdf_file_objects:
                    f.close()

        # Split text into chunks
        with st.spinner("Splitting text into chunks..."):
            text_chunks = get_text_chunks(raw_text)

        # Create vector store using HuggingFace embeddings
        with st.spinner("Creating vector store..."):
            vectorstore = get_vectorstore(text_chunks, huggingface_api_key)

        # Initialize conversation chain with GrokAI LLM
        with st.spinner("Initializing conversation chain..."):
            st.session_state.conversation = get_conversation_chain(
                vectorstore, grok_api_key, grok_api_base
            )

    st.success("Documents processed successfully!")
114
+
115
# Function to display chat history with auto-scrolling
def display_chat_history():
    """Render the stored chat turns and auto-scroll the page to the latest one."""
    if st.session_state.chat_history:
        for index, message in enumerate(st.session_state.chat_history):
            # Even indices are user turns, odd indices are bot replies.
            template = user_template if index % 2 == 0 else bot_template
            st.markdown(template.replace("{{MSG}}", message.content), unsafe_allow_html=True)

        # Inject JavaScript to scroll the entire page to the bottom
        scroll_script = """
        <script>
        // Function to scroll to the bottom of the page
        function scrollToBottom() {
            window.scrollTo({ top: document.body.scrollHeight, behavior: 'smooth' });
        }
        // Delay to ensure the DOM is fully rendered
        setTimeout(scrollToBottom, 100);
        </script>
        """
        st.markdown(scroll_script, unsafe_allow_html=True)
136
+
137
# Main function to run the Streamlit app
def main():
    """Configure the page, index the bundled PDFs once, and run the chat UI."""
    load_dotenv()

    # Retrieve credentials from .env
    grok_api_key = os.getenv("GROK_API_KEY")
    grok_api_base = "https://api.x.ai/v1"  # GrokAI's API base URL
    huggingface_api_key = os.getenv("HUGGINGFACEHUB_API_TOKEN")

    st.set_page_config(page_title="Chat with AI Tax Agent", page_icon=":books:")
    st.write(css, unsafe_allow_html=True)

    # Ensure session keys exist before any widget callback can fire.
    if "conversation" not in st.session_state:
        st.session_state.conversation = None
    if "chat_history" not in st.session_state:
        st.session_state.chat_history = []

    # Title Section
    st.header("Chat with AI Tax Agent :books:")

    # Automatically load and process PDFs on startup
    if st.session_state.conversation is None:
        documents_folder = "./documents/"  # Specify your documents folder path here
        load_and_process_pdfs(
            documents_folder, huggingface_api_key, grok_api_key, grok_api_base
        )

    # Chat History Section
    display_chat_history()

    # Input Box Section
    st.text_input(
        "Ask a question about your documents:",
        key='user_question',
        on_change=on_enter,
    )
171
+
172
if __name__ == '__main__':
    # Script entry point: launch the Streamlit app.
    main()