maitykritadhi committed
Commit: 0f39449
Parent(s): 63da8c0

Upload app.py

app.py ADDED
@@ -0,0 +1,164 @@
"""Streamlit app for chatting with uploaded PDFs.

Documents are chunked and indexed into ChromaDB; questions are answered by a
Groq-hosted LLM over the retrieved chunks.
"""

import os
import shutil
import streamlit as st
import chromadb
import config as cf

from langchain.chains.question_answering import load_qa_chain
from langchain.prompts import PromptTemplate
# from langchain_community.embeddings import SentenceTransformerEmbeddings
from sentence_transformers import SentenceTransformer
from langchain_groq import ChatGroq
from langchain.schema import Document
from source.utils.data_processing import ProcessDocs

from source.utils.store_data import get_vector_store, check_pdfs_chromadb, save_uploaded_files
from source.utils.process_data import get_pdf_text, get_text_chunks


llm = None
def get_conversational_chain(model):
    """Build a 'stuff' question-answering chain backed by the selected Groq model."""
    global llm

    # prompt_template = """
    # Answer the question as detailed as possible from the provided context; make sure to provide
    # all the details. If the answer is not in the provided context, just say "answer is not
    # available in the context"; don't provide a wrong answer.
    #
    # Context:\n{context}\n
    # Question:\n{question}\n
    #
    # Answer:
    # """

    # model = ChatGoogleGenerativeAI(model="gemini-pro", temperature=0.3)
    if model == 'gemma-7b-it':
        llm = ChatGroq(temperature=0, model_name="gemma-7b-it")
    elif model == 'mixtral-8x7b-32768':
        llm = ChatGroq(temperature=0, model_name="mixtral-8x7b-32768")
    elif model == 'llama3-70b-8192':
        llm = ChatGroq(temperature=0, model_name="llama3-70b-8192")
    elif model == 'llama3-8b-8192':
        llm = ChatGroq(temperature=0, model_name="llama3-8b-8192")
    else:
        raise ValueError(f"Unsupported model: {model}")

    # prompt = PromptTemplate(template=prompt_template, input_variables=["context", "question"])
    chain = load_qa_chain(llm, chain_type="stuff",
                          # prompt=prompt
                          )
    return chain
def user_input(user_question, model):
    """Retrieve the most relevant chunks from ChromaDB and answer with the selected model."""
    embedding_model = SentenceTransformer("all-mpnet-base-v2")

    chain = get_conversational_chain(model)
    docs = []

    # Embed the question and fetch the top 5 matching chunks from the persisted collection.
    input_embeddings = embedding_model.encode(user_question).tolist()
    client = chromadb.PersistentClient("chromadb")
    collection = client.get_collection("Chromadb_pdf")

    results = collection.query(
        query_embeddings=[input_embeddings],
        n_results=5,
        include=['distances', 'metadatas', 'documents']
    )

    if results['documents'] and results['documents'][0]:
        pdf_names = set()
        pg_num = []
        for i in range(len(results['documents'][0])):
            document = results['documents'][0][i]
            metadata = results['metadatas'][0][i]
            pdf_name = metadata['pdf_name']
            page_number = metadata['page_number']

            # Wrap each retrieved chunk as a LangChain Document so the QA chain can consume it.
            docs.append(Document(
                page_content=document,
                metadata={
                    'source': pdf_name,
                    'page': page_number
                }
            ))

            pdf_names.add(pdf_name)
            pg_num.append(str(page_number))

        response = chain(
            {"input_documents": docs,
             "question": user_question},
            return_only_outputs=False
        )

        st.write("Reply:", response["output_text"])
        st.write("Metadata: ", f"PDF Name(s): {', '.join(sorted(pdf_names))}, Page Numbers: {', '.join(pg_num)}")
    else:
        st.write("No results found.")
def main():
    st.set_page_config("Chat PDF")
    model = st.selectbox("Select Model", ["llama3-8b-8192", "llama3-70b-8192", "mixtral-8x7b-32768", "gemma-7b-it"])
    st.header("Chat with PDF after Uploading")

    user_question = st.text_input("Ask a Question from the PDF Files")

    if user_question:
        db_obj = ProcessDocs(cf.db_collection_name)
        response = db_obj.retrieval_qa(user_question, model)
        st.write("Response:", response)
        # user_input(user_question, model)  # alternative: query ChromaDB directly

    with st.sidebar:
        st.title("Menu:")
        pdf_docs = st.file_uploader("Upload your PDF Files and Click on the Submit & Process Button", accept_multiple_files=True)
        db_obj = ProcessDocs(cf.db_collection_name)
        if st.button("Submit & Process"):
            new_files = [doc.name for doc in pdf_docs] if pdf_docs else []

            if new_files:
                # Reset the download directory and save the freshly uploaded PDFs there.
                if os.path.exists(cf.pdf_download_path):
                    shutil.rmtree(cf.pdf_download_path)
                os.makedirs(cf.pdf_download_path)

                save_uploaded_files(pdf_docs, cf.pdf_download_path)

                with st.spinner("Processing..."):
                    # Load, split, and index only the files not already in the collection.
                    new_unique_files = db_obj.identify_new_uploaded_files()
                    pdf_docs = db_obj.create_pdf_docx_loader(new_unique_files, model)
                    splits = db_obj.split_documents(pdf_docs)
                    db_obj.vector_store(splits)
                    # raw_text = get_pdf_text(cf.pdf_download_path)
                    # text_chunks = get_text_chunks(raw_text)
                    # get_vector_store(text_chunks)

                    st.success("Done")
            else:
                st.success("No new files to process")


if __name__ == "__main__":
    main()
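app.py relies on a local config module and the source.utils helpers (ProcessDocs, save_uploaded_files, etc.) that are not part of this commit, so only the attribute names it reads are visible here. Below is a minimal, hypothetical sketch of what config.py might contain, inferred from how cf.db_collection_name and cf.pdf_download_path are used above; the actual values in the repo may differ.

# config.py -- hypothetical sketch, not part of this commit.
# Name of the ChromaDB collection that ProcessDocs indexes into and queries;
# "Chromadb_pdf" is assumed here because user_input() reads a collection by that name.
db_collection_name = "Chromadb_pdf"
# Directory where uploaded PDFs are saved before indexing; "docs" is assumed
# from the commented-out docs_directory hint in main().
pdf_download_path = "docs"

To try the app locally, the Groq-hosted models need an API key (ChatGroq typically reads it from the GROQ_API_KEY environment variable), and the UI is launched with streamlit run app.py.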