File size: 4,419 Bytes
d6626e6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
337ce72
d6626e6
 
 
 
 
 
 
 
 
 
 
 
 
bf65290
d6626e6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
337ce72
 
d6626e6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
264f598
d6626e6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
337ce72
d6626e6
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
import streamlit as st
import openai
import os
from dotenv import load_dotenv
from PyPDF2 import PdfReader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings, HuggingFaceInstructEmbeddings
from langchain.vectorstores import FAISS
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain.chat_models import ChatOpenAI
from htmlTemplates import css, bot_template, user_template
from PIL import Image

def get_pdf_text(pdf_docs):
    """Concatenate the extracted text of every page of every PDF.

    Args:
        pdf_docs: Iterable of PDF sources that ``PdfReader`` can open
            (``main`` passes the files returned by the Streamlit uploader).

    Returns:
        One string holding the page texts in document/page order, with no
        separator inserted between pages. Returns ``""`` when ``pdf_docs``
        is empty.
    """
    # Collect the pieces and join once instead of growing a string with +=.
    page_texts = []
    for pdf in pdf_docs:
        pdf_reader = PdfReader(pdf)
        for page in pdf_reader.pages:
            # Defensive: treat a missing extraction result (None) as empty
            # text instead of failing with a str + None TypeError.
            page_texts.append(page.extract_text() or "")
    return "".join(page_texts)

# documentation for CharacterTextSplitter:
# https://python.langchain.com/en/latest/modules/indexes/text_splitters/examples/character_text_splitter.html
def get_text_chunk(text, chunk_size=1000, chunk_overlap=200):
    """Split raw text into overlapping chunks for embedding.

    The text is split on newlines with ``CharacterTextSplitter``; sizes are
    measured with ``len``, i.e. in characters.

    Args:
        text: The full text to split.
        chunk_size: Target maximum size of each chunk. Defaults to 1000,
            the value this function previously hard-coded.
        chunk_overlap: Size of the overlap shared by consecutive chunks.
            Defaults to 200, the value previously hard-coded.

    Returns:
        Whatever ``CharacterTextSplitter.split_text`` returns for ``text``
        (the chunks, in order).
    """
    text_splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )
    return text_splitter.split_text(text)

# Embedding with the OpenAI embeddings API. Warning: this will cost you money.

def get_vectorstore_openAI(text_chunks):
    """Build a FAISS vector store from ``text_chunks`` using OpenAI embeddings.

    Args:
        text_chunks: The text chunks to embed and index.

    Returns:
        The vector store produced by ``FAISS.from_texts``.
    """
    return FAISS.from_texts(texts=text_chunks, embedding=OpenAIEmbeddings())

# Embedding with instructor-xl on your local machine, for free.
# You can find more details at: https://huggingface.co/hkunlp/instructor-xl
def get_vectorstore(text_chunks):
    """Build a FAISS vector store using the ``hkunlp/instructor-xl`` model.

    Args:
        text_chunks: The text chunks to embed and index.

    Returns:
        The vector store produced by ``FAISS.from_texts``.
    """
    instructor_embeddings = HuggingFaceInstructEmbeddings(
        model_name="hkunlp/instructor-xl"
    )
    store = FAISS.from_texts(
        texts=text_chunks,
        embedding=instructor_embeddings,
    )
    return store

def get_conversation_chain(vectorstore):
    """Create a conversational retrieval chain over ``vectorstore``.

    Args:
        vectorstore: Object exposing ``as_retriever()``; its retriever is
            handed to the chain.

    Returns:
        The chain built by ``ConversationalRetrievalChain.from_llm`` from a
        ``ChatOpenAI`` model, the store's retriever, and a buffer memory
        stored under the ``chat_history`` key.
    """
    chat_model = ChatOpenAI()
    history = ConversationBufferMemory(
        memory_key='chat_history',
        return_messages=True,
    )
    doc_retriever = vectorstore.as_retriever()
    return ConversationalRetrievalChain.from_llm(
        llm=chat_model,
        retriever=doc_retriever,
        memory=history,
    )

def handle_userinput(user_question):
    """Send the question to the conversation chain and render the chat.

    Reads the chain from ``st.session_state.conversation``, stores the
    returned ``chat_history`` in ``st.session_state.chat_history``, and
    writes every message using the user/bot HTML templates (even indices
    use the user template, odd indices the bot template).

    Args:
        user_question: The question text typed by the user.

    Returns:
        None. If no conversation chain exists yet, a warning is shown and
        nothing else happens.
    """
    # Function-scope import so this block does not depend on file-level edits.
    from html import escape

    conversation = st.session_state.conversation
    if conversation is None:
        # main() initialises the chain to None until PDFs are processed;
        # calling None would raise "'NoneType' object is not callable".
        st.warning("Please upload and process your PDFs before asking a question.")
        return

    response = conversation({'question': user_question})
    st.session_state.chat_history = response['chat_history']

    for i, message in enumerate(st.session_state.chat_history):
        template = user_template if i % 2 == 0 else bot_template
        # The result is rendered with unsafe_allow_html=True, so escape the
        # message text to keep markup in it from being interpreted as HTML.
        st.write(
            template.replace("{{MSG}}", escape(message.content)),
            unsafe_allow_html=True,
        )

def main():
    """Run the Streamlit page: question box plus a sidebar PDF processor.

    Sets the OpenAI API key from the ``OPENAI_API_KEY`` environment
    variable, configures the page, initialises the ``conversation`` and
    ``chat_history`` session entries, forwards a typed question to
    ``handle_userinput``, and, when "Process" is clicked, builds the
    conversation chain from the uploaded PDFs.
    """
    ##############################################################################
    # Load the OpenAI API key.
    # NOTE: the load_dotenv() call below is disabled, so a .env file is not
    # read here; the key must already be present in the process environment.
    # load_dotenv()
    openai.api_key = os.getenv("OPENAI_API_KEY")

    ##############################################################################
    # Set up the basic page.
    st.set_page_config(page_title="Chat With multiple PDFs", page_icon=":books:")
    st.write(css, unsafe_allow_html=True)

    # Initialise session_state entries once so they are not reset on rerun.
    if "conversation" not in st.session_state:
        st.session_state.conversation = None
    if "chat_history" not in st.session_state:
        st.session_state.chat_history = None

    st.header("Chat based on PDF you provided :books:")
    user_question = st.text_input("Ask a question about your documents:")

    if user_question:
        handle_userinput(user_question)

    with st.sidebar:
        st.subheader("Your PDF documents")
        pdf_docs = st.file_uploader(
            "Upload your pdfs here and click on 'Process'",
            accept_multiple_files=True,
        )
        # Runs only on the rerun triggered by pressing the button.
        if st.button("Process"):
            if not pdf_docs:
                # Nothing uploaded: skip the pipeline instead of sending an
                # empty chunk list to the vector-store builder.
                st.warning("Please upload at least one PDF before processing.")
            else:
                with st.spinner("Processing"):
                    # Get the PDF text.
                    raw_text = get_pdf_text(pdf_docs)
                    print('raw_text is created')

                    # Get the text chunks.
                    text_chunks = get_text_chunk(raw_text)
                    print('text_chunks are generated')

                    if not text_chunks:
                        # No text could be extracted, so there is nothing
                        # to embed or index.
                        st.error("No extractable text was found in the uploaded PDFs.")
                    else:
                        # Create the vector store.
                        vectorstore = get_vectorstore_openAI(text_chunks)
                        print('vectorstore is created')

                        # Create the conversation chain.
                        st.session_state.conversation = get_conversation_chain(vectorstore)
                        print('conversation chain created')
    



# Script entry point: main() is called only when this file is executed as a
# script, not when it is imported as a module.
# To run this application, you need to run "streamlit run app.py".
if __name__ == '__main__':
    main()