DebabrataHalder committed
Commit c50e492 · verified · 1 Parent(s): e64c9f9

Update app.py

Files changed (1):
  app.py +96 -219
app.py CHANGED
@@ -1,246 +1,123 @@
- # import os
- # import logging
- # from dotenv import load_dotenv
- # import streamlit as st
- # from PyPDF2 import PdfReader
- # from langchain.text_splitter import CharacterTextSplitter
- # # from langchain.embeddings import HuggingFaceInstructEmbeddings
- # from langchain_cohere import CohereEmbeddings
- # from langchain.vectorstores import FAISS
- # from langchain.memory import ConversationBufferMemory
- # from langchain.chains import ConversationalRetrievalChain
- # # from langchain.llms import Ollama
- # from langchain_groq import ChatGroq
-
- # # Load environment variables
- # load_dotenv()
-
- # # Set up logging
- # logging.basicConfig(
- #     level=logging.INFO,
- #     format='%(asctime)s - %(levelname)s - %(message)s'
- # )
-
- # # Function to extract text from PDF files
- # def get_pdf_text(pdf_docs):
- #     text = ""
- #     for pdf in pdf_docs:
- #         pdf_reader = PdfReader(pdf)
- #         for page in pdf_reader.pages:
- #             text += page.extract_text()
- #     return text
-
- # # Function to split the extracted text into chunks
- # def get_text_chunks(text):
- #     text_splitter = CharacterTextSplitter(
- #         separator="\n",
- #         chunk_size=1000,
- #         chunk_overlap=200,
- #         length_function=len
- #     )
- #     chunks = text_splitter.split_text(text)
- #     return chunks
-
- # # Function to create a FAISS vectorstore
- # # def get_vectorstore(text_chunks):
- # #     embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl")
- # #     vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
- # #     return vectorstore
-
- # def get_vectorstore(text_chunks):
- #     cohere_api_key = os.getenv("COHERE_API_KEY")
- #     embeddings = CohereEmbeddings(model="embed-english-v3.0", cohere_api_key=cohere_api_key)
- #     vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
- #     return vectorstore
-
- # # Function to set up the conversational retrieval chain
- # def get_conversation_chain(vectorstore):
- #     try:
- #         # llm = Ollama(model="llama3.2:1b")
- #         llm = ChatGroq(model="llama-3.3-70b-versatile", temperature=0.5)
- #         memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)
-
- #         conversation_chain = ConversationalRetrievalChain.from_llm(
- #             llm=llm,
- #             retriever=vectorstore.as_retriever(),
- #             memory=memory
- #         )
-
- #         logging.info("Conversation chain created successfully.")
- #         return conversation_chain
- #     except Exception as e:
- #         logging.error(f"Error creating conversation chain: {e}")
- #         st.error("An error occurred while setting up the conversation chain.")
-
- # # Handle user input
- # def handle_userinput(user_question):
- #     if st.session_state.conversation is not None:
- #         response = st.session_state.conversation({'question': user_question})
- #         st.session_state.chat_history = response['chat_history']
-
- #         for i, message in enumerate(st.session_state.chat_history):
- #             if i % 2 == 0:
- #                 st.write(f"*User:* {message.content}")
- #             else:
- #                 st.write(f"*Bot:* {message.content}")
- #     else:
- #         st.warning("Please process the documents first.")
-
- # # Main function to run the Streamlit app
- # def main():
- #     load_dotenv()
- #     st.set_page_config(page_title="Chat with multiple PDFs", page_icon=":books:")
-
- #     if "conversation" not in st.session_state:
- #         st.session_state.conversation = None
- #     if "chat_history" not in st.session_state:
- #         st.session_state.chat_history = None
-
- #     st.header("Chat with multiple PDFs :books:")
- #     user_question = st.text_input("Ask a question about your documents:")
- #     if user_question:
- #         handle_userinput(user_question)
-
- #     with st.sidebar:
- #         st.subheader("Your documents")
- #         pdf_docs = st.file_uploader(
- #             "Upload your PDFs here and click on 'Process'", accept_multiple_files=True
- #         )
- #         if st.button("Process"):
- #             with st.spinner("Processing..."):
- #                 raw_text = get_pdf_text(pdf_docs)
- #                 text_chunks = get_text_chunks(raw_text)
- #                 vectorstore = get_vectorstore(text_chunks)
- #                 st.session_state.conversation = get_conversation_chain(vectorstore)
-
- # if __name__ == '__main__':
- #     main()
-
-
-
-
-
-
-
- import streamlit as st
  import os
  from dotenv import load_dotenv
- import PyPDF2
- import requests
- import cohere
- from langchain_text_splitters import RecursiveCharacterTextSplitter
- from langchain_community.vectorstores import FAISS
  from langchain_cohere import CohereEmbeddings

  # Load environment variables
  load_dotenv()
- GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
- COHERE_API_KEY = os.getenv("COHERE_API_KEY")

- # Initialize Cohere client
- co = cohere.Client(COHERE_API_KEY)

- # Configure Streamlit
- st.set_page_config(page_title="RAG Chatbot with Gemini & Cohere")
- st.title("🤖 Multi-Model RAG Chatbot")
-
- # Initialize session state
- if "messages" not in st.session_state:
-     st.session_state.messages = []
- if "vector_store" not in st.session_state:
-     st.session_state.vector_store = None
-
- # File upload and processing
- uploaded_file = st.file_uploader("Upload a PDF document", type="pdf")
-
- if uploaded_file and not st.session_state.vector_store:
-     # Process PDF
-     pdf_reader = PyPDF2.PdfReader(uploaded_file)
      text = ""
-     for page in pdf_reader.pages:
-         text += page.extract_text()
-
-     # Split text
-     text_splitter = RecursiveCharacterTextSplitter(
          chunk_size=1000,
-         chunk_overlap=200
      )
      chunks = text_splitter.split_text(text)

-     # Create embeddings and vector store
-     embeddings = CohereEmbeddings(
-         cohere_api_key=COHERE_API_KEY,
-         model="embed-english-v3.0",
-         user_agent="rag-chatbot-v1"
-     )
-     st.session_state.vector_store = FAISS.from_texts(
-         texts=chunks,
-         embedding=embeddings
-     )
-
- # Display chat messages
- for message in st.session_state.messages:
-     with st.chat_message(message["role"]):
-         st.markdown(message["content"])

- # Query expansion function
- def expand_query(query):
-     prompt = f"""Generate 3 query variations that help answer: {query}
-     Format as numbered bullet points:"""

-     response = co.generate(
-         prompt=prompt,
-         max_tokens=100,
-         temperature=0.7
-     )
-     expanded_queries = [query] + [q.split(". ")[1] for q in response.generations[0].text.split("\n") if q]
-     return expanded_queries

- # Gemini API call
- def generate_with_gemini(context, query):
-     url = f"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent?key={GEMINI_API_KEY}"

-     system_prompt = f"""You're an expert assistant. Use this context to answer:
-     {context}

-     Apply Chain of Abstraction and Grounding (CAG):
-     1. Identify key concepts
-     2. Create abstract relationships
-     3. Ground in specific examples
-     4. Synthesize final answer"""

-     headers = {"Content-Type": "application/json"}
-     data = {
-         "contents": [{
-             "parts": [{
-                 "text": f"{system_prompt}\n\nQuestion: {query}"
-             }]
-         }]
-     }

-     response = requests.post(url, json=data, headers=headers)
-     return response.json()["candidates"][0]["content"]["parts"][0]["text"]

- # Chat input
- if prompt := st.chat_input("Ask about the document"):
-     st.session_state.messages.append({"role": "user", "content": prompt})

-     with st.chat_message("user"):
-         st.markdown(prompt)

-     # Query expansion
-     expanded_queries = expand_query(prompt)

-     # Retrieve documents
-     docs = []
-     for query in expanded_queries:
-         docs.extend(st.session_state.vector_store.similarity_search(query, k=2))

-     # Generate response
-     context = "\n\n".join([doc.page_content for doc in docs])
-     response = generate_with_gemini(context, prompt)

-     with st.chat_message("assistant"):
-         st.markdown(response)

-     st.session_state.messages.append({"role": "assistant", "content": response})
  import os
+ import logging
  from dotenv import load_dotenv
+ import streamlit as st
+ from PyPDF2 import PdfReader
+ from langchain.text_splitter import CharacterTextSplitter
+ # from langchain.embeddings import HuggingFaceInstructEmbeddings
  from langchain_cohere import CohereEmbeddings
+ from langchain.vectorstores import FAISS
+ from langchain.memory import ConversationBufferMemory
+ from langchain.chains import ConversationalRetrievalChain
+ # from langchain.llms import Ollama
+ from langchain_groq import ChatGroq

  # Load environment variables
  load_dotenv()

+ # Set up logging
+ logging.basicConfig(
+     level=logging.INFO,
+     format='%(asctime)s - %(levelname)s - %(message)s'
+ )

+ # Function to extract text from PDF files
+ def get_pdf_text(pdf_docs):
      text = ""
+     for pdf in pdf_docs:
+         pdf_reader = PdfReader(pdf)
+         for page in pdf_reader.pages:
+             text += page.extract_text()
+     return text
+
+ # Function to split the extracted text into chunks
+ def get_text_chunks(text):
+     text_splitter = CharacterTextSplitter(
+         separator="\n",
          chunk_size=1000,
+         chunk_overlap=200,
+         length_function=len
      )
      chunks = text_splitter.split_text(text)
+     return chunks

+ # Function to create a FAISS vectorstore
+ # def get_vectorstore(text_chunks):
+ #     embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl")
+ #     vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
+ #     return vectorstore

+ def get_vectorstore(text_chunks):
+     cohere_api_key = os.getenv("COHERE_API_KEY")
+     embeddings = CohereEmbeddings(model="embed-english-v3.0", cohere_api_key=cohere_api_key)
+     vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
+     return vectorstore
+
+ # Function to set up the conversational retrieval chain
+ def get_conversation_chain(vectorstore):
+     try:
+         # llm = Ollama(model="llama3.2:1b")
+         llm = ChatGroq(model="llama-3.3-70b-versatile", temperature=0.5)
+         memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)
+
+         conversation_chain = ConversationalRetrievalChain.from_llm(
+             llm=llm,
+             retriever=vectorstore.as_retriever(),
+             memory=memory
+         )
+
+         logging.info("Conversation chain created successfully.")
+         return conversation_chain
+     except Exception as e:
+         logging.error(f"Error creating conversation chain: {e}")
+         st.error("An error occurred while setting up the conversation chain.")

+ # Handle user input
+ def handle_userinput(user_question):
+     if st.session_state.conversation is not None:
+         response = st.session_state.conversation({'question': user_question})
+         st.session_state.chat_history = response['chat_history']

+         for i, message in enumerate(st.session_state.chat_history):
+             if i % 2 == 0:
+                 st.write(f"*User:* {message.content}")
+             else:
+                 st.write(f"*Bot:* {message.content}")
+     else:
+         st.warning("Please process the documents first.")

+ # Main function to run the Streamlit app
+ def main():
+     load_dotenv()
+     st.set_page_config(page_title="Chat with multiple PDFs", page_icon=":books:")

+     if "conversation" not in st.session_state:
+         st.session_state.conversation = None
+     if "chat_history" not in st.session_state:
+         st.session_state.chat_history = None

+     st.header("Chat with multiple PDFs :books:")
+     user_question = st.text_input("Ask a question about your documents:")
+     if user_question:
+         handle_userinput(user_question)

+     with st.sidebar:
+         st.subheader("Your documents")
+         pdf_docs = st.file_uploader(
+             "Upload your PDFs here and click on 'Process'", accept_multiple_files=True
+         )
+         if st.button("Process"):
+             with st.spinner("Processing..."):
+                 raw_text = get_pdf_text(pdf_docs)
+                 text_chunks = get_text_chunks(raw_text)
+                 vectorstore = get_vectorstore(text_chunks)
+                 st.session_state.conversation = get_conversation_chain(vectorstore)

+ if __name__ == '__main__':
+     main()
 
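A few notes on the updated app.py. The new version returns to the legacy import paths langchain.vectorstores and langchain.text_splitter, while the removed version already used the split-out packages. Recent LangChain releases deprecate the legacy paths (and the newest ones drop them), so the equivalents below may be worth keeping; this is a sketch assuming the langchain-community and langchain-text-splitters packages are installed:

# Split-out equivalents of the legacy langchain.* paths used in app.py.
# Assumes langchain-community and langchain-text-splitters are installed.
from langchain_community.vectorstores import FAISS
from langchain_text_splitters import CharacterTextSplitter

The rest of the module works unchanged with either pair of imports.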
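get_pdf_text concatenates page.extract_text() directly. For scanned or image-only pages extract_text() can come back empty, and some PyPDF2 versions return None there, which would make the += line raise a TypeError. A minimal defensive sketch, otherwise identical in behavior:

from PyPDF2 import PdfReader

def get_pdf_text(pdf_docs):
    text = ""
    for pdf in pdf_docs:
        pdf_reader = PdfReader(pdf)
        for page in pdf_reader.pages:
            # extract_text() may yield None or "" on image-only pages;
            # the "or" guard keeps the concatenation from raising.
            text += page.extract_text() or ""
    return text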
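handle_userinput labels turns by even/odd position in chat_history, which assumes the history always alternates user and bot strictly. Because ConversationBufferMemory is built with return_messages=True, the history holds typed message objects, so the role can be read from each message instead. A sketch assuming the langchain_core package that ships with current LangChain; render_history is a hypothetical helper, not part of this commit:

import streamlit as st
from langchain_core.messages import HumanMessage

def render_history(chat_history):
    # Label each turn by its message type rather than by list position.
    for message in chat_history:
        role = "User" if isinstance(message, HumanMessage) else "Bot"
        st.write(f"*{role}:* {message.content}")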
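On the removed side, expand_query parsed the generated variations with q.split(". ")[1], which raises IndexError for any line not shaped like "1. ...". If that helper ever comes back, a more tolerant parser could look like the sketch below; parse_numbered_variations is a hypothetical name, and the function is plain string handling, independent of the Cohere client:

import re

def parse_numbered_variations(generated_text, original_query):
    # Accept "1. foo", "2) bar", or unnumbered lines; skip blanks.
    variations = []
    for line in generated_text.splitlines():
        line = line.strip()
        if not line:
            continue
        match = re.match(r"\d+[.)]\s*(.+)", line)
        variations.append(match.group(1) if match else line)
    return [original_query] + variations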
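The retrieval chain can also be exercised without the Streamlit UI, which is handy for smoke-testing the keys. A minimal sketch, assuming app.py's helpers are importable and that COHERE_API_KEY and GROQ_API_KEY are set in the environment (langchain_groq's ChatGroq reads GROQ_API_KEY by default):

from dotenv import load_dotenv
from app import get_text_chunks, get_vectorstore, get_conversation_chain

load_dotenv()
# Build a tiny index from in-memory text instead of an uploaded PDF.
chunks = get_text_chunks("FAISS stores the chunk embeddings.\nThe chain retrieves them per question.")
chain = get_conversation_chain(get_vectorstore(chunks))
# ConversationalRetrievalChain returns its reply under "answer" and,
# with memory attached, the accumulated "chat_history".
result = chain({"question": "What does FAISS store here?"})
print(result["answer"])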
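One compatibility note on the call site: st.session_state.conversation({'question': user_question}) goes through Chain.__call__, which newer LangChain releases deprecate in favor of invoke. Assuming such a release, the two lines in handle_userinput would become (result keys unchanged):

response = st.session_state.conversation.invoke({'question': user_question})
st.session_state.chat_history = response['chat_history']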