ramhemanth580 committed on
Commit
62bc827
1 Parent(s): 65358b5

Delete app.py.py

Files changed (1)
  1. app.py.py +0 -263
app.py.py DELETED
@@ -1,263 +0,0 @@
- # -*- coding: utf-8 -*-
- """RAG Conversational Chat Application using LangChain, Mistral AI's Mixtral-8x7B-Instruct, and the Pinecone vector DB
-
- ### Step-1: Upload Documents and Load with Langchain Document Loader
- - Upload the documents to Google Colab.
- - Use the Langchain document loader to load the documents.
-
- ### Step-2: Perform Chunking
- - Perform chunking on the loaded documents.
-
- ### Step-3: Initialize LLM and Use Huggingface Embedding Model
- - Initialize a Large Language Model (LLM).
- - Use the Huggingface embedding model to convert the chunks into embeddings.
-
- ### Step-4: Initialize Vector Database
- - Initialize a vector database to store the resulting embeddings.
-
- ### Step-5: Upload Embeddings to Vector Database
- - Upload the embeddings to the vector database.
-
- ### Step-6: Create Langchain Conversational Buffer Memory
- - Create a Langchain conversational buffer memory.
-
- ### Step-7: Create Prompt Template
- - Create a prompt template for generating responses.
-
- ### Step-8: Use Langchain RetrievalQA
- - Use Langchain RetrievalQA (here, a ConversationalRetrievalChain) to build the conversational chat.
-
- ### Step-9: Create Front End with Streamlit
- - Create a front end for the application using Streamlit.
-
- ### Step-10: Upload Code to GitHub
- - Upload the code to a GitHub repository.
-
- ### Step-11: Deploy App in Huggingface Spaces
- - Deploy the application in Huggingface Spaces.
-
- ### Step-12: Create Documentation
- - Create documentation for the entire process followed.
- """
-
- # Installing the required libraries
- # !pip install langchain
- # !pip install pypdf
- # !pip install sentence-transformers==2.2.2
- # !pip install pinecone-client==2.2.4
- # !pip install unstructured
- # !pip install "unstructured[pdf]"
-
- # initializing the Huggingface API to access embedding models
- # from google.colab import userdata
- # HUGGINGFACE_API_KEY = userdata.get('Hugging_Face_API_Key')
- # HUGGINGFACE_API_KEY=HUGGINGFACE_API_KEY
-
- # Creating a directory to store the data
-
- # from langchain.document_loaders import PyPDFDirectoryLoader
-
- # loader = PyPDFDirectoryLoader("data")
-
- # importing all the required Libraries
- from PyPDF2 import PdfReader
- from langchain.chains.question_answering import load_qa_chain
- from langchain.prompts import PromptTemplate
- from langchain.text_splitter import RecursiveCharacterTextSplitter
- from langchain.memory import ConversationBufferMemory
- from langchain.chains import ConversationalRetrievalChain
-
- # from langchain.document_loaders import PyPDFDirectoryLoader
- # loader = PyPDFDirectoryLoader("data")
- # data = loader.load()
-
- # len(data)
-
- import os
-
- huggingfacehub_api_token = os.getenv("HF_API_TOKEN")
-
- def get_pdf_text(pdf_docs):
-     text = ""
-     for pdf in pdf_docs:
-         pdf_reader = PdfReader(pdf)
-         for page in pdf_reader.pages:
-             text += page.extract_text()
-     return text
-
- # creating chunking for the above data
- def get_text_chunks(text):
-     text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
-     chunks = text_splitter.split_text(text)
-     return chunks
-
- # # creating chunking for the above data
- # from langchain.text_splitter import RecursiveCharacterTextSplitter
- # text_splitter=RecursiveCharacterTextSplitter(chunk_size=200,chunk_overlap=20)
- # chunked_data=text_splitter.split_text(data)
-
- # Create Embeddings using Huggingface Embeddings
- import sentence_transformers
- from langchain.embeddings import HuggingFaceEmbeddings
- embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
-
-
- # Initializing Pinecone
- PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY', 'f7384d73-ea97-45ca-abaa-9b14327fd50f')
- PINECONE_API_ENV = os.environ.get('PINECONE_API_ENV', 'gcp-starter')
-
- import pinecone
- # initialize pinecone
- pinecone.init(
-     api_key=PINECONE_API_KEY,  # find at app.pinecone.io
-     environment=PINECONE_API_ENV  # next to api key in console
- )
- index_name = "pinecone-demo"  # put in the name of your pinecone index here
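# Note: pinecone.init(...) above and langchain.vectorstores.Pinecone below match the
# pinecone-client 2.x API pinned in the install comments (2.2.4). As a rough sketch of an
# assumed upgrade path (not used by this app), pinecone-client >= 3.0 replaces init() with
# a client object:
#   from pinecone import Pinecone as PineconeClient
#   pc = PineconeClient(api_key=PINECONE_API_KEY)
#   index = pc.Index(index_name)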
-
- from langchain.vectorstores import Pinecone
-
- # Load the data into pinecone database
- def get_vector_store(text_chunks):
-     #docsearch = Pinecone.from_texts(chunked_data, embeddings, index_name=index_name)
-     docsearch = Pinecone.from_texts([t for t in text_chunks], embeddings, index_name=index_name)
-     return docsearch
-
-
- # query = "How many topics are covered?"
- # docs = docsearch.similarity_search(query, k=1)
- # docs
-
- from langchain import HuggingFaceHub
-
- llm = HuggingFaceHub(huggingfacehub_api_token=huggingfacehub_api_token, repo_id="mistralai/Mixtral-8x7B-Instruct-v0.1")
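# The HuggingFaceHub LLM above uses the endpoint's default generation settings. A minimal
# sketch of passing sampling parameters through model_kwargs (the values below are
# illustrative assumptions, not part of the original app):
#   llm = HuggingFaceHub(
#       huggingfacehub_api_token=huggingfacehub_api_token,
#       repo_id="mistralai/Mixtral-8x7B-Instruct-v0.1",
#       model_kwargs={"temperature": 0.5, "max_new_tokens": 512},
#   )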
133
-
134
- # from langchain.chains import RetrievalQA
135
- # # retriever = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=docsearch.as_retriever())
136
- # retriever = docsearch.as_retriever(search_kwargs={"k": 2})
137
-
138
- # qa_chain = RetrievalQA.from_chain_type(llm=llm,
139
- # chain_type="stuff",
140
- # retriever=retriever,
141
- # return_source_documents=True)
142
-
143
- #question = "What are the Technical Skills to learn for a Promising AI Career?"
144
-
145
- #print(qa_chain(question))
146
-
147
- ## Adding Memory component
148
- memory = ConversationBufferMemory(
149
- memory_key="chat_history",
150
- return_messages=True, max_history_length=5
151
- )
152
-
153
-
154
- import streamlit as st
155
-
156
-
157
- # Chat History
158
- #chat = llm.start_chat(history=[])
159
- # intialize session state for chat history if it doesn't exist
160
- if 'chat_history' not in st.session_state:
161
- st.session_state['chat_history'] = []
162
-
- def user_input(user_question):
-     # Load embeddings only once (assuming same model for both)
-     embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
-
-     # Pinecone search using the loaded embeddings
-     docsearch = Pinecone.from_existing_index(index_name, embeddings)
-     docs = docsearch.similarity_search(user_question)
-
-     # Define prompt template
-     template = """Answer the question as detailed as possible from the provided context, make sure to provide all the details,
-     if the answer is not available in the provided context just say, "answer is not available in the context", don't provide the wrong answer\n\n
-     {context}
-     Do not provide the Context, provide the Answer only, to the question in the following format
-     Question: {question}
-     Helpful Answer:"""
-     prompt = PromptTemplate(input_variables=["context", "question"], template=template)
-     #prompt = PromptTemplate(input_variables=["question"], template=template)
-
-     # Create retriever and chain using the loaded embeddings
-     retriever = docsearch.as_retriever()
-     qa_chain = ConversationalRetrievalChain.from_llm(
-         llm,
-         retriever=retriever,
-         memory=memory
-     )
-
-     # Extract context from retrieved documents (replace with your logic)
-     # Consider filtering or summarizing retrieved documents
-     # Join the page text of the top retrieved chunks (Document objects expose page_content)
-     context = " ".join(doc.page_content for doc in docs[:3])
-
-     # Inject prompt into query (alternative approach)
-     query = f"{template.format(context=context, question=user_question)}\nQuestion: {user_question}"
-     #query = f"{template.format( question=user_question)}\nQuestion: {user_question}"
-
-     response = qa_chain(
-         {"question": query},
-         return_only_outputs=True
-     )
-
-     # Display response
-     #st.write("Reply: ", response["answer"])
-     Ans = extract_helpful_answer(response)
-     st.write(Ans)
-
-     # Feature to load Chat history
-     if st.button("Load Chat History"):
-         # add user query and response to session chat history
-         st.session_state['chat_history'].append(("you", user_question))
-         # for chunk in Ans:
-         #     #st.write(chunk.text)
-         #     st.session_state['chat_history'].append(("AI Assistant", chunk))
-         st.session_state['chat_history'].append(("AI Assistant", Ans))
-         st.subheader("The chat history is ")
-         for role, text in st.session_state['chat_history']:
-             st.write(f"{role}: {text}")
-
-     # Feature to load Related Context from the uploaded Documents
-     if st.button("Load Related Context from Your Document"):
-         related_context = docs
-         st.subheader("Related Context from Your Document:")
-         for doc in related_context:
-             st.write(f"Document: {doc}")
-             st.write("\n")
-     else:
-         st.warning("Please enter a question before loading related context.")
-
- def extract_helpful_answer(response):
-     # Split the response by the delimiter "Helpful Answer:"
-     parts = response["answer"].split("Helpful Answer:")
-
-     # Return whatever follows the last "Helpful Answer:" marker; using parts[-1] avoids an
-     # IndexError when the marker appears fewer than twice in the model output
-     return parts[-1].strip()
-
-
-
-
- def main():
-     #st.set_page_config("Chat PDF")
-     st.header("Chat with PDF using Mistral")
-
-     user_question = st.text_input("Ask a Question from the PDF Files")
-
-     if user_question:
-         user_input(user_question)
-
-     with st.sidebar:
-         st.title("Menu:")
-         pdf_docs = st.file_uploader("Upload your PDF Files and Click on the Submit & Process Button", accept_multiple_files=True)
-         if st.button("Submit & Process"):
-             with st.spinner("Processing..."):
-                 raw_text = get_pdf_text(pdf_docs)
-                 text_chunks = get_text_chunks(raw_text)
-                 get_vector_store(text_chunks)
-                 st.success("Done")
-
-
-
- if __name__ == "__main__":
-     main()
-
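The deleted script is a self-contained Streamlit app. A minimal sketch of a local run, assuming the file is renamed to app.py and the environment variables the code reads are exported (the commands are assumptions, not part of the commit):

    export HF_API_TOKEN=...          # token read via os.getenv("HF_API_TOKEN")
    export PINECONE_API_KEY=...      # overrides the default baked into the script
    export PINECONE_API_ENV=gcp-starter
    streamlit run app.py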