File size: 4,503 Bytes
1be99a2
ad57ebd
c520917
 
 
47892e4
 
 
bd9359c
1be99a2
e39f02b
 
45f6902
ad57ebd
d95bc3e
ad57ebd
 
 
 
47892e4
ad57ebd
 
 
d95bc3e
ad57ebd
 
 
 
0106e5c
d95bc3e
bd9359c
ad57ebd
 
 
d95bc3e
 
47892e4
d95bc3e
4706238
47892e4
 
ad57ebd
 
0106e5c
d95bc3e
2626c12
ad55563
 
d95bc3e
 
 
2626c12
fb9d2d1
 
 
2626c12
ad57ebd
47892e4
 
 
 
2626c12
47892e4
ad57ebd
d95bc3e
47892e4
d95bc3e
47892e4
2626c12
24caaa3
fb9d2d1
ad57ebd
 
47892e4
 
d95bc3e
47892e4
d95bc3e
 
 
0106e5c
 
 
 
 
d95bc3e
 
0106e5c
 
fb9d2d1
0106e5c
ad57ebd
 
ad55563
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
import os
import streamlit as st
from PyPDF2 import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain.chains.question_answering import load_qa_chain
from langchain.prompts import PromptTemplate
from dotenv import load_dotenv
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_community.llms import HuggingFaceHub

load_dotenv()

def get_pdf_text(pdf_docs):
    """Extracts text from all pages of provided PDF documents"""
    text = ""
    for pdf in pdf_docs:
        pdf_reader = PdfReader(pdf)
        for page in pdf_reader.pages:
            text += page.extract_text()
    return text

def get_text_chunks(text):
    """Splits text into chunks of 10,000 characters with 1,000 character overlap"""
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=10000, chunk_overlap=1000)
    chunks = text_splitter.split_text(text)
    return chunks

def get_vector_store(text_chunks, hf):
    """Creates and saves a FAISS vector store from text chunks"""
    vector_store = FAISS.from_texts(text_chunks, embedding=hf)
    vector_store.save_local("faiss_index")

def get_conversational_chain():
    """Creates and returns a conversational chain for question answering"""
    prompt_template = """Answer the question concisely, focusing on the most relevant and important details from the PDF context. Refrain from mentioning any mathematical equations, even if they are present in provided context. Focus on the textual information available. Please provide direct quotations or references from PDF to back up your response. If the answer is not found within the PDF, please state "answer is not available in the context."\n\nContext:\n {context}?\nQuestion: \n{question}\nExample response format:Overview: (brief summary or introduction)Key points: (point 1: paragraph for key details)(point 2: paragraph for key details)...Use a mix of paragraphs and points to effectively convey the information."""

    # Adjust temperature parameter to lower value to reduce model creativity & focus on factual accuracy
    model = HuggingFaceHub(repo_id="meta-llama/Meta-Llama-3.1-8B-Instruct", model_kwargs={"temperature": 0.2, "max_length": 100}, token=os.environ['HUGGINGFACEHUB_API_TOKEN'])
    prompt = PromptTemplate(template=prompt_template, input_variables=["context", "question"])
    chain = load_qa_chain(model, chain_type="stuff", prompt=prompt)
    return chain

def user_input(user_question, hf):
    """Processes user question and provides a response"""
    try:
        # Load the FAISS index without the 'allow_dangerous_deserialization' argument
        new_db = FAISS.load_local("faiss_index", hf)
        docs = new_db.similarity_search(user_question)
    except FileNotFoundError:
        st.error("No index found. Please upload PDFs and click 'Submit & Process' first.")
        return
    except Exception as e:
        st.error(f"An error occurred while loading the index: {e}")
        return

    chain = get_conversational_chain()
    response = chain.invoke(
        {"input_documents": docs, "question": user_question},
        return_only_outputs=True
    )
    st.write("Reply: ", response["output_text"], "")

def main():
    """Streamlit UI"""
    st.set_page_config(page_title="Chat with PDFs", page_icon="")
    st.header("RAG based Chatbot ")

    user_question = st.text_input("Ask a Question from PDF file(s)")

    hf = None
    with st.sidebar:
        st.title("Menu ✨")
        pdf_docs = st.file_uploader("Upload your PDF Files and Click on the Submit & Process Button ",
                                     accept_multiple_files=True)
        if pdf_docs:
            with st.spinner("Processing..."):
                try:
                    raw_text = get_pdf_text(pdf_docs)
                    text_chunks = get_text_chunks(raw_text)
                    model_name = "BAAI/bge-large-en"
                    model_kwargs = {'device': 'cpu'}
                    encode_kwargs = {'normalize_embeddings': True}
                    hf = HuggingFaceBgeEmbeddings(model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs)
                    get_vector_store(text_chunks, hf)
                    st.success("Done ✨")
                except Exception as e:
                    st.error(f"An error occurred: {e}")

    if user_question and hf:
        user_input(user_question, hf)

if __name__ == "__main__":
    main()