File size: 4,377 Bytes
ad57ebd c520917 47892e4 c520917 47892e4 bd9359c e5a4890 e39f02b 45f6902 ad57ebd d95bc3e ad57ebd 47892e4 ad57ebd d95bc3e ad57ebd 0106e5c d95bc3e bd9359c ad57ebd d95bc3e 47892e4 d95bc3e 2637c8c 47892e4 ad57ebd 0106e5c d95bc3e 2626c12 d95bc3e 2626c12 ad57ebd 47892e4 2626c12 47892e4 ad57ebd d95bc3e 47892e4 d95bc3e 47892e4 2626c12 24caaa3 ad57ebd 47892e4 d95bc3e 47892e4 d95bc3e 0106e5c d95bc3e 0106e5c ad57ebd b3f61e5 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 |
import streamlit as st
from PyPDF2 import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import os
from langchain_community.vectorstores import FAISS
from langchain.chains.question_answering import load_qa_chain
from langchain.prompts import PromptTemplate
from dotenv import load_dotenv
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain import HuggingFaceHub
from dotenv import load_dotenv
import os
load_dotenv()
def get_pdf_text(pdf_docs):
    """Extract and concatenate text from every page of the given PDF files.

    Args:
        pdf_docs: iterable of file-like objects (as produced by Streamlit's
            file_uploader) that PyPDF2's PdfReader can open.

    Returns:
        str: the concatenated text of all pages across all documents.
        Pages with no extractable text (e.g. scanned images) contribute
        an empty string instead of raising.
    """
    parts = []
    for pdf in pdf_docs:
        pdf_reader = PdfReader(pdf)
        for page in pdf_reader.pages:
            # BUG FIX: extract_text() may return None for image-only pages;
            # the original `text += page.extract_text()` raised TypeError then.
            parts.append(page.extract_text() or "")
    # join() avoids quadratic repeated string concatenation on large PDFs.
    return "".join(parts)
def get_text_chunks(text):
    """Break *text* into overlapping chunks suitable for embedding.

    Uses a RecursiveCharacterTextSplitter with 10,000-character chunks and a
    1,000-character overlap so that context spanning a chunk boundary is not
    lost.

    Args:
        text: the full document text to split.

    Returns:
        list[str]: the chunked text.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=10000,
        chunk_overlap=1000,
    )
    return splitter.split_text(text)
def get_vector_store(text_chunks, hf):
    """Embed *text_chunks* with the *hf* embedding model and persist the
    resulting FAISS index.

    Args:
        text_chunks: list of text chunks to embed.
        hf: embedding model (HuggingFaceBgeEmbeddings) used by FAISS.

    Side effects:
        Writes the index to the local "faiss_index" directory, which
        user_input() later reloads.
    """
    index = FAISS.from_texts(text_chunks, embedding=hf)
    index.save_local("faiss_index")
def get_conversational_chain():
    """Build and return a "stuff"-type question-answering chain.

    The prompt instructs the model to answer strictly from the supplied PDF
    context, to cite it, and to admit when the answer is not present.

    Returns:
        A load_qa_chain chain wired to a HuggingFace-hosted flan-t5-xl model.

    Raises:
        KeyError: if HUGGINGFACEHUB_API_TOKEN is not set in the environment.
    """
    template = """Answer the question concisely, focusing on the most relevant and important details from the PDF context. Refrain from mentioning any mathematical equations, even if they are present in provided context. Focus on the textual information available. Please provide direct quotations or references from PDF to back up your response. If the answer is not found within the PDF, please state "answer is not available in the context."\n\nContext:\n {context}?\nQuestion: \n{question}\nExample response format:Overview: (brief summary or introduction)Key points: (point 1: paragraph for key details)(point 2: paragraph for key details)...Use a mix of paragraphs and points to effectively convey the information."""
    # Low temperature keeps the model factual rather than creative.
    llm = HuggingFaceHub(
        repo_id="google/flan-t5-xl",
        model_kwargs={"temperature": 0.2, "max_length": 100},
        token=os.environ['HUGGINGFACEHUB_API_TOKEN'],
    )
    qa_prompt = PromptTemplate(
        template=template,
        input_variables=["context", "question"],
    )
    return load_qa_chain(llm, chain_type="stuff", prompt=qa_prompt)
def user_input(user_question, hf):
    """Answer *user_question* against the persisted FAISS index.

    Loads the "faiss_index" saved by get_vector_store(), retrieves the most
    similar chunks, runs them through the QA chain, and renders the reply
    in the Streamlit UI. Shows an error instead if no index exists yet.

    Args:
        user_question: the question typed by the user.
        hf: the same embedding model used to build the index.
    """
    try:
        # allow_dangerous_deserialization is required to reload a locally
        # pickled FAISS index; safe here because we wrote the file ourselves.
        index = FAISS.load_local(
            "faiss_index", hf, allow_dangerous_deserialization=True
        )
        docs = index.similarity_search(user_question)
    except FileNotFoundError:
        st.error("No index found. Please upload PDFs and click 'Submit & Process' first.")
        return
    qa_chain = get_conversational_chain()
    result = qa_chain.invoke(
        {"input_documents": docs, "question": user_question},
        return_only_outputs=True,
    )
    st.write("Reply: ", result["output_text"], "")
def _build_embeddings():
    """Create the CPU-backed BAAI/bge-large-en embedding model used for both
    indexing (get_vector_store) and retrieval (user_input)."""
    return HuggingFaceBgeEmbeddings(
        model_name="BAAI/bge-large-en",
        model_kwargs={'device': 'cpu'},
        encode_kwargs={'normalize_embeddings': True},
    )


def main():
    """Streamlit UI entry point: upload/ingest PDFs and answer questions."""
    st.set_page_config(page_title="Chat with PDFs", page_icon="")
    st.header("Enterprise Brain Sub-Component")
    st.header("RAG based Chatbot ")
    user_question = st.text_input("Ask a Question from PDF file(s)")
    hf = None
    with st.sidebar:
        st.title("Menu ✨")
        pdf_docs = st.file_uploader(
            "Upload your PDF Files and Click on the Submit & Process Button ",
            accept_multiple_files=True)
        if pdf_docs:
            with st.spinner("Processing..."):
                try:
                    raw_text = get_pdf_text(pdf_docs)
                    text_chunks = get_text_chunks(raw_text)
                    hf = _build_embeddings()
                    get_vector_store(text_chunks, hf)
                    st.success("Done ✨")
                except Exception as e:
                    # Best-effort UI feedback; errors here must not kill the app.
                    st.error(f"An error occurred: {e}")
    if user_question:
        # BUG FIX: previously `hf` was bound only inside the upload branch,
        # so asking a question before processing PDFs (or after a processing
        # failure) raised NameError. Build the embeddings lazily instead;
        # user_input() still reports a friendly error if no index exists.
        if hf is None:
            hf = _build_embeddings()
        user_input(user_question, hf)


if __name__ == "__main__":
    main()