ramhemanth580 committed
Commit 65358b5
1 Parent(s): 79dfccb

Upload app.py.py

Files changed (1)
  1. app.py.py +263 -0
app.py.py ADDED
@@ -0,0 +1,263 @@
+ # -*- coding: utf-8 -*-
+ """RAG Conversational Chat Application using LangChain, Mixtral 8x7B Instruct, and the Pinecone vector DB
+
+ ### Step-1: Upload Documents and Load with Langchain Document Loader
+ - Upload the documents to Google Colab.
+ - Use the Langchain document loader to load the documents.
+
+ ### Step-2: Perform Chunking
+ - Perform chunking on the loaded documents.
+
+ ### Step-3: Initialize LLM and Use Huggingface Embedding Model
+ - Initialize a Large Language Model (LLM).
+ - Use the Huggingface embedding model to convert the chunks into embeddings.
+
+ ### Step-4: Initialize Vector Database
+ - Initialize a vector database to store the resulting embeddings.
+
+ ### Step-5: Upload Embeddings to Vector Database
+ - Upload the embeddings to the vector database.
+
+ ### Step-6: Create Langchain Conversational Buffer Memory
+ - Create a Langchain conversational buffer memory.
+
+ ### Step-7: Create Prompt Template
+ - Create a prompt template for generating responses.
+
+ ### Step-8: Use Langchain ConversationalRetrievalChain
+ - Use Langchain's ConversationalRetrievalChain to build the conversational chat.
+
+ ### Step-9: Create Front End with Streamlit
+ - Create a front end for the application using Streamlit.
+
+ ### Step-10: Upload Code to GitHub
+ - Upload the code to a GitHub repository.
+
+ ### Step-11: Deploy App in Huggingface Spaces
+ - Deploy the application in Huggingface Spaces.
+
+ ### Step-12: Create Documentation
+ - Create documentation for the entire process followed.
+ """
+
+ # Installing the required libraries
+ # !pip install langchain
+ # !pip install pypdf
+ # !pip install sentence-transformers==2.2.2
+ # !pip install pinecone-client==2.2.4
+ # !pip install unstructured
+ # !pip install "unstructured[pdf]"
+
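+ # The code below also imports PyPDF2 and streamlit, and HuggingFaceHub relies on the
+ # huggingface_hub client; when running outside a pre-configured environment these may
+ # need to be installed as well (illustrative, using the usual PyPI package names):
+ # !pip install PyPDF2
+ # !pip install streamlit
+ # !pip install huggingface_hub
+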
+ # initializing the Huggingface API to access embedding models
+ # from google.colab import userdata
+ # HUGGINGFACE_API_KEY = userdata.get('Hugging_Face_API_Key')
+ # HUGGINGFACE_API_KEY=HUGGINGFACE_API_KEY
+
+ # Creating a directory to store the data
+
+ # from langchain.document_loaders import PyPDFDirectoryLoader
+
+ # loader = PyPDFDirectoryLoader("data")
+
+ # importing all the required libraries
+ from PyPDF2 import PdfReader
+ from langchain.chains.question_answering import load_qa_chain
+ from langchain.prompts import PromptTemplate
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain.memory import ConversationBufferMemory
+ from langchain.chains import ConversationalRetrievalChain
+
+ # from langchain.document_loaders import PyPDFDirectoryLoader
+ # loader = PyPDFDirectoryLoader("data")
+ # data = loader.load()
+
+ # len(data)
+
+ import os
+
+ huggingfacehub_api_token = os.getenv("HF_API_TOKEN")
+
+ # Extract and concatenate the text from all uploaded PDF files
+ def get_pdf_text(pdf_docs):
+     text = ""
+     for pdf in pdf_docs:
+         pdf_reader = PdfReader(pdf)
+         for page in pdf_reader.pages:
+             text += page.extract_text() or ""
+     return text
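+
+ # Example usage (illustrative): PdfReader accepts file-like objects, so the UploadedFile
+ # objects returned by st.file_uploader in main() can be passed in directly:
+ #   raw_text = get_pdf_text(pdf_docs)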
+
+ # creating chunks from the extracted text
+ def get_text_chunks(text):
+     text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
+     chunks = text_splitter.split_text(text)
+     return chunks
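+
+ # With chunk_size=1000 and chunk_overlap=100, a ~2,500-character document is split into
+ # roughly three chunks of up to 1,000 characters each, with neighbouring chunks sharing
+ # about 100 characters of overlapping text (figures are illustrative).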
+
+ # # creating chunking for the above data
+ # from langchain.text_splitter import RecursiveCharacterTextSplitter
+ # text_splitter=RecursiveCharacterTextSplitter(chunk_size=200,chunk_overlap=20)
+ # chunked_data=text_splitter.split_text(data)
+
+ # Create embeddings using Huggingface embeddings
+ import sentence_transformers
+ from langchain.embeddings import HuggingFaceEmbeddings
+ embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
+
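+ # Note: all-MiniLM-L6-v2 produces 384-dimensional embeddings, so the Pinecone index used
+ # below must have been created with dimension 384 (cosine similarity is the usual choice).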
+
+ # Initializing Pinecone
+ PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY', 'f7384d73-ea97-45ca-abaa-9b14327fd50f')
+ PINECONE_API_ENV = os.environ.get('PINECONE_API_ENV', 'gcp-starter')
+
+ import pinecone
+ # initialize pinecone
+ pinecone.init(
+     api_key=PINECONE_API_KEY,      # find at app.pinecone.io
+     environment=PINECONE_API_ENV   # next to the API key in the console
+ )
+ index_name = "pinecone-demo"  # put the name of your Pinecone index here
+
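+ # Assumption: the "pinecone-demo" index already exists in this environment;
+ # Pinecone.from_texts / Pinecone.from_existing_index below upsert into and query
+ # that index, they do not create it.
+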
+ from langchain.vectorstores import Pinecone
+
+ # Load the data into the Pinecone database
+ def get_vector_store(text_chunks):
+     #docsearch = Pinecone.from_texts(chunked_data, embeddings, index_name=index_name)
+     docsearch = Pinecone.from_texts(text_chunks, embeddings, index_name=index_name)
+     return docsearch
+
+
+ # query = "How many topics are covered?"
+ # docs = docsearch.similarity_search(query, k=1)
+ # docs
+
+ from langchain import HuggingFaceHub
+
+ llm = HuggingFaceHub(huggingfacehub_api_token=huggingfacehub_api_token, repo_id="mistralai/Mixtral-8x7B-Instruct-v0.1")
+
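+ # Generation settings can optionally be passed through model_kwargs, e.g. (values illustrative):
+ # llm = HuggingFaceHub(huggingfacehub_api_token=huggingfacehub_api_token,
+ #                      repo_id="mistralai/Mixtral-8x7B-Instruct-v0.1",
+ #                      model_kwargs={"temperature": 0.5, "max_new_tokens": 512})
+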
+ # from langchain.chains import RetrievalQA
+ # # retriever = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=docsearch.as_retriever())
+ # retriever = docsearch.as_retriever(search_kwargs={"k": 2})
+
+ # qa_chain = RetrievalQA.from_chain_type(llm=llm,
+ #                                        chain_type="stuff",
+ #                                        retriever=retriever,
+ #                                        return_source_documents=True)
+
+ # question = "What are the Technical Skills to learn for a Promising AI Career?"
+
+ # print(qa_chain(question))
+
+ ## Adding the memory component
+ memory = ConversationBufferMemory(
+     memory_key="chat_history",
+     return_messages=True
+ )
+
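+ # ConversationBufferMemory stores the running conversation under the "chat_history" key;
+ # ConversationalRetrievalChain uses it to rewrite follow-up questions into standalone
+ # queries before retrieving documents.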
+
+ import streamlit as st
+
+
+ # Chat history
+ #chat = llm.start_chat(history=[])
+ # initialize session state for the chat history if it doesn't exist
+ if 'chat_history' not in st.session_state:
+     st.session_state['chat_history'] = []
+
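+ # Streamlit re-runs this script from top to bottom on every interaction, but
+ # st.session_state persists for the lifetime of the user's session, so the chat
+ # history above survives re-runs.
+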
+ def user_input(user_question):
+     # Load the embeddings once, using the same model that was used for indexing
+     embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
+
+     # Pinecone search using the loaded embeddings
+     docsearch = Pinecone.from_existing_index(index_name, embeddings)
+     docs = docsearch.similarity_search(user_question)
+
+     # Define the prompt template
+     template = """Answer the question in as much detail as possible from the provided context, and make sure to provide all the details.
+     If the answer is not available in the provided context, just say "answer is not available in the context"; do not provide a wrong answer.\n\n
+     {context}
+     Do not repeat the context; provide only the answer to the question, in the following format.
+     Question: {question}
+     Helpful Answer:"""
+     prompt = PromptTemplate(input_variables=["context", "question"], template=template)
+     #prompt = PromptTemplate(input_variables=["question"], template=template)
+
+     # Create the retriever and chain using the loaded embeddings
+     retriever = docsearch.as_retriever()
+     qa_chain = ConversationalRetrievalChain.from_llm(
+         llm,
+         retriever=retriever,
+         memory=memory
+     )
+
+     # Build the context string from the retrieved documents
+     # (consider filtering or summarizing the retrieved documents)
+     context = " ".join(doc.page_content for doc in docs[:3])
+
+     # Inject the prompt into the query (alternative approach)
+     query = f"{template.format(context=context, question=user_question)}\nQuestion: {user_question}"
+     #query = f"{template.format( question=user_question)}\nQuestion: {user_question}"
+
+     response = qa_chain(
+         {"question": query},
+         return_only_outputs=True
+     )
+
+     # Display the response
+     #st.write("Reply: ", response["answer"])
+     Ans = extract_helpful_answer(response)
+     st.write(Ans)
+
+     # Feature to load the chat history
+     if st.button("Load Chat History"):
+         # add the user query and the response to the session chat history
+         st.session_state['chat_history'].append(("you", user_question))
+         # for chunk in Ans:
+         #     #st.write(chunk.text)
+         #     st.session_state['chat_history'].append(("AI Assistant", chunk))
+         st.session_state['chat_history'].append(("AI Assistant", Ans))
+         st.subheader("The chat history is")
+         for role, text in st.session_state['chat_history']:
+             st.write(f"{role}: {text}")
+
+     # Feature to load related context from the uploaded documents
+     if st.button("Load Related Context from Your Document"):
+         related_context = docs
+         st.subheader("Related Context from Your Document:")
+         for doc in related_context:
+             st.write(f"Document: {doc.page_content}")
+             st.write("\n")
+     else:
+         st.warning("Please enter a question before loading related context.")
+
+ def extract_helpful_answer(response):
+     # The model output may echo the prompt, so split on the "Helpful Answer:" delimiter
+     parts = response["answer"].split("Helpful Answer:")
+
+     # Return the text that follows the last "Helpful Answer:" marker
+     return parts[-1].strip()
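+
+ # Example (illustrative): if response["answer"] ends with
+ # "... Helpful Answer: Transformers use self-attention.", this returns
+ # "Transformers use self-attention."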
+
+
+ def main():
+     #st.set_page_config("Chat PDF")
+     st.header("Chat with PDF using Mistral")
+
+     user_question = st.text_input("Ask a Question from the PDF Files")
+
+     if user_question:
+         user_input(user_question)
+
+     with st.sidebar:
+         st.title("Menu:")
+         pdf_docs = st.file_uploader("Upload your PDF Files and Click on the Submit & Process Button", accept_multiple_files=True)
+         if st.button("Submit & Process"):
+             with st.spinner("Processing..."):
+                 raw_text = get_pdf_text(pdf_docs)
+                 text_chunks = get_text_chunks(raw_text)
+                 get_vector_store(text_chunks)
+                 st.success("Done")
+
+
+ if __name__ == "__main__":
+     main()
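+
+ # To run locally (assuming this file is saved as app.py):
+ #   streamlit run app.py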