Spaces:
Sleeping
Sleeping
File size: 3,333 Bytes
9ed0ab0 df697c8 9ed0ab0 df697c8 9ed0ab0 df697c8 9ed0ab0 df697c8 9ed0ab0 df697c8 9ed0ab0 df697c8 9ed0ab0 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 |
from dotenv import load_dotenv
load_dotenv()
import os
import pickle
import streamlit as st
from scanned_pdf_parser import get_text_from_scanned_pdf
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.llms import GooglePalm
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyPDFLoader
from langchain.vectorstores import FAISS
from langchain.docstore.document import Document
# LLM used for answering; high temperature favors more varied phrasing.
llm = GooglePalm(temperature=0.9)
# Page scaffolding: title, instructions, and the PDF upload widget.
st.title("PDF Query Tool")
st.write("Upload your PDF and ask question from it")
uploaded_file = st.file_uploader("Choose a PDF file")
# Two mutable slots: one for status messages, one for the question input,
# so reruns overwrite rather than stack widgets.
main_placeholder = st.empty()
second_placeholder = st.empty()
if uploaded_file:
    filename = uploaded_file.name
    # Accept only PDFs; lower-case the name so mixed-case extensions
    # (.Pdf, .pDF) are accepted too.
    if not filename.lower().endswith('.pdf'):
        main_placeholder.warning("Choose PDF Document !!!")
        # st.stop() ends only this script run; exit() would kill the
        # whole Streamlit server process.
        st.stop()
    elif not os.path.exists(filename):
        main_placeholder.text("Data Loading Started...")
        # Persist the upload to disk so PyPDFLoader can open it by path.
        with open(filename, 'wb') as f:
            f.write(uploaded_file.getbuffer())
        pdf_loader = PyPDFLoader(filename)
        documents = pdf_loader.load()
        # Concatenate page text in one pass (avoids quadratic += in a loop).
        raw_text = ''.join(doc.page_content for doc in documents)
        # Almost no extractable text -> likely an image-only (scanned) PDF,
        # so fall back to OCR.
        if len(raw_text) < 10:
            main_placeholder.text("It looks like Scanned PDF, No worries converting it...")
            raw_text = get_text_from_scanned_pdf(filename)
        main_placeholder.text("Splitting text into smaller chunks...")
        text_splitter = RecursiveCharacterTextSplitter(
            separators=['\n\n', '\n', '.', ','],
            chunk_size=2000
        )
        texts = text_splitter.split_text(raw_text)
        docs = [Document(page_content=t) for t in texts]
        embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-base")
        main_placeholder.text("Storing data into Vector Database...")
        vectorstore = FAISS.from_documents(docs, embeddings)
        # Cache the index per-file so repeat questions skip re-embedding.
        # NOTE(review): pickling a FAISS store is fragile across versions and
        # unsafe if the .pkl could come from elsewhere — FAISS.save_local /
        # load_local is the supported persistence path; kept as-is so existing
        # cached .pkl files remain readable.
        with open(f'vector_store_{filename}.pkl', "wb") as f:
            pickle.dump(vectorstore, f)
        main_placeholder.text("Data Loading Completed...")

    # Question input lives inside the upload branch: it needs
    # uploaded_file.name to locate the cached vector store.
    query = second_placeholder.text_input("Question:")
    if query:
        store_path = f'vector_store_{uploaded_file.name}.pkl'
        if os.path.exists(store_path):
            # NOTE(review): pickle.load on a file that could be tampered with
            # executes arbitrary code — see persistence note above.
            with open(store_path, "rb") as f:
                vector_store = pickle.load(f)
            # Minimal stuff-prompt: retrieved chunks go in <context>,
            # the user's question follows.
            prompt_template = """
            <context>
            {context}
            </context>
            Question: {question}
            Assistant:"""
            prompt = PromptTemplate(
                template=prompt_template, input_variables=["context", "question"]
            )
            chain = RetrievalQA.from_chain_type(
                llm=llm,
                chain_type="stuff",
                # k=1: answer from the single most similar chunk only.
                retriever=vector_store.as_retriever(search_type="similarity", search_kwargs={"k": 1}),
                return_source_documents=True,
                chain_type_kwargs={"prompt": prompt}
            )
            with st.spinner("Searching for the answer..."):
                result = chain({"query": query})
            st.header("Answer")
            st.write(result["result"])
|