File size: 6,043 Bytes
3aaeae4
4095cfa
1bf2670
 
3aaeae4
 
 
 
 
 
 
5915d7a
 
bef48d1
 
 
999a1fd
 
 
ea0b1ef
 
071e28b
3176d98
 
c15629e
 
 
1bf2670
d3bca06
 
b4ffaef
 
 
 
 
 
 
 
 
 
 
86943bc
 
 
 
 
 
 
 
 
bc2edb8
 
b31777c
bc2edb8
b31777c
 
bc2edb8
b31777c
 
 
bc2edb8
b31777c
 
 
 
 
 
bc2edb8
b31777c
 
bc2edb8
b31777c
 
 
f72b341
 
b31777c
 
bc2edb8
c15629e
 
 
 
 
 
 
 
 
 
 
 
 
 
bc2edb8
34ff935
bc2edb8
34ff935
 
 
bc2edb8
 
34ff935
bc2edb8
34ff935
 
 
 
bc2edb8
34ff935
 
 
b4ffaef
3aaeae4
 
 
 
 
b27a1d5
3ff5e5b
78337de
 
 
3aaeae4
 
 
 
 
 
 
 
3ff5e5b
78337de
3aaeae4
4095cfa
34ff935
 
3aaeae4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
##############################################################
#   app.py   -  Pennwick PDF Chat
#
#   HuggingFace Spaces application 
#
#   Mike Pastor  February 2024


import streamlit as st
from dotenv import load_dotenv

from PyPDF2 import PdfReader

from htmlTemplates import css, bot_template, user_template


#  from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain_community.embeddings import HuggingFaceInstructEmbeddings

# from langchain.vectorstores import FAISS
from langchain_community.vectorstores import FAISS

from langchain.text_splitter import CharacterTextSplitter

from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain


#  from langchain.llms import HuggingFaceHub
from langchain_community.llms import HuggingFaceHub

def get_pdf_text(pdf_docs):
    """Return the text of every page of every given PDF as one string.

    Args:
        pdf_docs: Iterable of PDF sources, each passed as-is to
            ``PdfReader``. ``None`` or an empty iterable is treated as
            "no documents".

    Returns:
        The page texts concatenated in document order, then page order,
        with no separator inserted between pages. Empty string when there
        is nothing to read.
    """
    page_texts = []
    for pdf in pdf_docs or ():
        pdf_reader = PdfReader(pdf)
        for page in pdf_reader.pages:
            # A page without extractable text (e.g. a scanned image) may
            # yield None or "" depending on the PyPDF2 version; coerce to ""
            # so one such page cannot abort the whole run with a TypeError.
            page_texts.append(page.extract_text() or "")
    # Join once instead of growing a string with += on every page.
    return "".join(page_texts)

#  Chunk size and overlap must not exceed the model's capacity!
#
def get_text_chunks(text):
    """Split raw text into overlapping chunks with CharacterTextSplitter.

    The splitter is configured to break on newlines, with a chunk size of
    800 and an overlap of 200, both measured with ``len``.

    Args:
        text: The full text to split.

    Returns:
        Whatever ``CharacterTextSplitter.split_text`` returns for ``text``.
    """
    splitter_options = {
        "separator": "\n",
        "chunk_size": 800,    # previously 1000
        "chunk_overlap": 200,
        "length_function": len,
    }
    return CharacterTextSplitter(**splitter_options).split_text(text)


def get_vectorstore(text_chunks):
    """Embed the text chunks and build a FAISS vector store from them.

    Progress messages are written to the Streamlit page (and one to stdout)
    while the embedding model is created and the index is built.

    Args:
        text_chunks: Non-empty list of strings to embed and index.

    Returns:
        The vector store produced by ``FAISS.from_texts``.

    Raises:
        ValueError: If ``text_chunks`` is empty.
    """
    # Fail fast with a clear message: with no chunks there is nothing to
    # index, and the error would otherwise only surface from inside the FAISS
    # builder, after the large embedding model has already been loaded.
    if not text_chunks:
        raise ValueError(
            "No text chunks to index - the uploaded documents contained "
            "no extractable text."
        )

    st.write('Here in vector store....', unsafe_allow_html=True)

    # Requires:
    #   pip install InstructorEmbedding
    #   pip install sentence-transformers==2.2.2
    embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl")

    st.write('Here in vector store - got embeddings ', unsafe_allow_html=True)
    print('have Embeddings:   ')

    # FAISS is used here; Chroma and other vector databases are alternatives.
    vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
    st.write('FAISS succeeds:   ')

    return vectorstore

def get_conversation_chain(vectorstore):
    """Build a conversational retrieval chain over the given vector store.

    The chain combines a HuggingFaceHub LLM ("google/flan-t5-xxl"), a
    retriever obtained from ``vectorstore.as_retriever()``, and a
    ConversationBufferMemory stored under the 'chat_history' key.

    Args:
        vectorstore: Object exposing ``as_retriever()``.

    Returns:
        The chain created by ``ConversationalRetrievalChain.from_llm``.
    """
    # Other candidates noted by the author:
    #   google/bigbird-roberta-base, facebook/bart-large
    hub_llm = HuggingFaceHub(
        repo_id="google/flan-t5-xxl",
        model_kwargs={"temperature": 0.5, "max_length": 512},
    )
    history_buffer = ConversationBufferMemory(
        memory_key='chat_history',
        return_messages=True,
    )
    return ConversationalRetrievalChain.from_llm(
        llm=hub_llm,
        retriever=vectorstore.as_retriever(),
        memory=history_buffer,
    )

def handle_userinput(user_question):
    """Send the question to the conversation chain and render the chat.

    The chain stored in ``st.session_state.conversation`` is called with the
    question; the 'chat_history' entry of its response is saved to
    ``st.session_state.chat_history`` and every message in it is written to
    the page, alternating between the user and bot HTML templates.

    Args:
        user_question: The question text entered by the user.

    Returns:
        None. If no chain exists yet, a warning is shown and nothing else
        happens.
    """
    # Function-scope import keeps this block self-contained.
    from html import escape

    chain = st.session_state.conversation
    if chain is None:
        # main() initialises the chain to None until files are processed;
        # calling it in that state would raise "'NoneType' is not callable".
        st.warning("Please upload and process your documents before asking a question.")
        return

    response = chain({'question': user_question})
    st.session_state.chat_history = response['chat_history']

    for i, message in enumerate(st.session_state.chat_history):
        # Even positions use the user template, odd positions the bot one.
        template = user_template if i % 2 == 0 else bot_template
        # The text is rendered with unsafe_allow_html=True, so escape it to
        # stop markup in a question or answer from being interpreted as HTML.
        st.write(
            template.replace("{{MSG}}", escape(message.content)),
            unsafe_allow_html=True,
        )




def main():
    """Render the Pennwick PDF chat page and wire up its interactions.

    Configures the page, initialises the ``conversation`` and
    ``chat_history`` session entries, shows the question box, and provides a
    sidebar where PDFs are uploaded and, on button press, turned into a
    vector store and conversation chain (with the elapsed time reported).
    """
    from datetime import datetime

    # load_dotenv()

    # Give Streamlit the icon path directly. The previous code called
    # Image.open(), but no `Image` name is imported in this file, so the app
    # failed with a NameError before rendering anything.
    st.set_page_config(page_title="Pennwick PDF Analyzer", page_icon="robot_icon.png")

    st.write(css, unsafe_allow_html=True)

    if "conversation" not in st.session_state:
        st.session_state.conversation = None
    if "chat_history" not in st.session_state:
        st.session_state.chat_history = None

    st.header("Pennwick File Analyzer ")

    user_question = st.text_input("Ask the Model a question about your uploaded documents:")
    if user_question:
        handle_userinput(user_question)

    with st.sidebar:

        st.subheader("Your documents")
        pdf_docs = st.file_uploader(
            "Upload your PDFs here and click on 'Process'", accept_multiple_files=True)

        # Upon button press
        if st.button("Process these files"):
            if not pdf_docs:
                # Nothing uploaded: avoid loading models just to fail later.
                st.warning("Please upload at least one PDF before processing.")
            else:
                with st.spinner("Processing..."):

                    #################################################################
                    #  Track the overall time for file processing into Vectors
                    #
                    global_now = datetime.now()
                    global_current_time = global_now.strftime("%H:%M:%S")
                    st.write("Vectorizing Files - Current Time =", global_current_time)

                    # get pdf text
                    raw_text = get_pdf_text(pdf_docs)

                    # get the text chunks
                    text_chunks = get_text_chunks(raw_text)

                    if not text_chunks:
                        # e.g. scanned PDFs without a text layer: there is
                        # nothing to embed, so report it and skip indexing.
                        st.error("No extractable text was found in the uploaded files.")
                    else:
                        # create vector store
                        vectorstore = get_vectorstore(text_chunks)

                        # create conversation chain
                        st.session_state.conversation = get_conversation_chain(vectorstore)

                        # Mission Complete!
                        global_later = datetime.now()
                        st.write("Files Vectorized - Total EXECUTION Time =",
                                 (global_later - global_now), global_later)


# Script entry point: start the app only when this file is executed directly.
if __name__ == "__main__":
    main()