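"""Streamlit chatbot for question answering over uploaded PDFs.

Pipeline: PDFMinerLoader -> RecursiveCharacterTextSplitter -> SentenceTransformer
embeddings -> FAISS index -> RetrievalQA over a local seq2seq HuggingFace model.

Uses the pre-split LangChain imports; newer releases move these loaders,
embeddings, and vector stores into the langchain_community package.

Run with (assuming this file is saved as app.py):
    streamlit run app.py
"""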
import base64
import os

import streamlit as st
from langchain.chains import RetrievalQA
from langchain.document_loaders import PDFMinerLoader
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.llms import HuggingFacePipeline
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from streamlit_chat import message
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline
import torch

st.set_page_config(layout="wide")

def process_answer(instruction, qa_chain):
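    """Run the QA chain on a query and return the generated answer."""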
    return qa_chain.run(instruction)


def get_file_size(file):
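    """Return the size in bytes of an uploaded file-like object without consuming it."""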
    file.seek(0, os.SEEK_END)
    file_size = file.tell()
    file.seek(0)
    return file_size


@st.cache_resource
def data_ingestion():
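    """Load every PDF under docs/, split it into chunks, embed the chunks,
    and persist a FAISS index to ./faiss_index.
    """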
    documents = []
    for root, _dirs, files in os.walk("docs"):
        for file in files:
            if file.endswith(".pdf"):
                loader = PDFMinerLoader(os.path.join(root, file))
                # Accumulate pages from every PDF, not just the last one found
                documents.extend(loader.load())

    # An overlap equal to the chunk size would duplicate every chunk;
    # a small overlap is enough to preserve context across boundaries.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    splits = text_splitter.split_documents(documents)

    # create embeddings here
    embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
    vectordb = FAISS.from_documents(splits, embeddings)
    vectordb.save_local("faiss_index")


@st.cache_resource
def initialize_qa_chain(selected_model):
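    """Build a RetrievalQA chain: load the selected seq2seq model on CPU,
    wrap it in a HuggingFace text2text pipeline, and attach the saved
    FAISS index as the retriever.
    """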
    # Constants
    CHECKPOINT = selected_model
    TOKENIZER = AutoTokenizer.from_pretrained(CHECKPOINT)
    # device_map expects a string (or dict), not a torch.device
    BASE_MODEL = AutoModelForSeq2SeqLM.from_pretrained(CHECKPOINT, device_map="cpu", torch_dtype=torch.float32)
    pipe = pipeline(
        'text2text-generation',
        model=BASE_MODEL,
        tokenizer=TOKENIZER,
        max_length=256,
        do_sample=True,
        temperature=0.3,
        top_p=0.95,
        # device=torch.device('cpu')
    )

    llm = HuggingFacePipeline(pipeline=pipe)
    embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

    # Note: recent LangChain releases also require allow_dangerous_deserialization=True here
    vectordb = FAISS.load_local("faiss_index", embeddings)

    # Build a QA chain
    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=vectordb.as_retriever(),
    )
    return qa_chain


@st.cache_data
def display_pdf(file):
    """Display the PDF at the given file path as an embedded iframe."""
    try:
        # Opening file from file path
        with open(file, "rb") as f:
            base64_pdf = base64.b64encode(f.read()).decode('utf-8')

        # Embedding PDF in HTML
        pdf_display = f'<iframe src="data:application/pdf;base64,{base64_pdf}" width="100%" height="600" type="application/pdf"></iframe>'

        # Displaying File
        st.markdown(pdf_display, unsafe_allow_html=True)
    except Exception as e:
        st.error(f"An error occurred while displaying the PDF: {e}")


# Display conversation history using Streamlit messages
def display_conversation(history):
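    """Render the chat history as alternating user/bot messages."""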
    for i in range(len(history["generated"])):
        message(history["past"][i], is_user=True, key=f"{i}_user")
        message(history["generated"][i], key=str(i))


def main():
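    """Drive the Streamlit UI: model selection, PDF upload, ingestion, and chat."""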
    # Add a sidebar for model selection
    model_options = ["MBZUAI/LaMini-T5-738M", "google/flan-t5-base", "google/flan-t5-small"]
    selected_model = st.sidebar.selectbox("Select Model", model_options)
    
    st.markdown("<h1 style='text-align: center; color: blue;'>Custom PDF Chatbot πŸ¦œπŸ“„ </h1>", unsafe_allow_html=True)
    st.markdown("<h2 style='text-align: center; color:red;'>Upload your PDF, and Ask Questions πŸ‘‡</h2>", unsafe_allow_html=True)

    uploaded_file = st.file_uploader("Upload a PDF", type=["pdf"])

    if uploaded_file is not None:
        file_details = {
            "Filename": uploaded_file.name,
            "File size": get_file_size(uploaded_file)
        }
        os.makedirs("docs", exist_ok=True)
        filepath = os.path.join("docs", uploaded_file.name)
        try:
            with open(filepath, "wb") as temp_file:
                temp_file.write(uploaded_file.read())

            col1, col2 = st.columns([1, 2])
            with col1:
                st.markdown("<h4 style color:black;'>File details</h4>", unsafe_allow_html=True)
                st.json(file_details)
                st.markdown("<h4 style color:black;'>File preview</h4>", unsafe_allow_html=True)
                pdf_view = display_pdf(filepath)

            with col2:
                st.success(f"Model selected: {selected_model}")
                with st.spinner("Creating embeddings..."):
                    data_ingestion()
                st.success("Embeddings created successfully!")
                st.markdown("<h4 style='color:black;'>Chat Here</h4>", unsafe_allow_html=True)

                user_input = st.text_input("Ask a question about your PDF", key="input")

                # Initialize session state for generated responses and past messages
                if "generated" not in st.session_state:
                    st.session_state["generated"] = ["I am ready to help you"]
                if "past" not in st.session_state:
                    st.session_state["past"] = ["Hey there!"]

                # Search the database for a response based on user input and update session state
                if user_input:
                    answer = process_answer({'query': user_input}, initialize_qa_chain(selected_model))
                    st.session_state["past"].append(user_input)
                    st.session_state["generated"].append(answer)

                # Display conversation history using Streamlit messages
                if st.session_state["generated"]:
                    display_conversation(st.session_state)

        except Exception as e:
            st.error(f"An error occurred: {e}")


if __name__ == "__main__":
    main()