File size: 6,129 Bytes
51fe9d2
 
0489db2
 
 
0e17089
51fe9d2
 
0489db2
 
 
 
 
7a7c4d5
0489db2
51fe9d2
0489db2
 
 
7a7c4d5
 
 
 
51fe9d2
0489db2
7a7c4d5
51fe9d2
0489db2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51fe9d2
0489db2
 
 
 
51fe9d2
 
 
d5bd88b
 
0489db2
0e17089
 
 
 
 
 
 
 
 
 
 
 
0489db2
7a7c4d5
0e17089
d5bd88b
 
 
 
 
 
 
 
 
 
0e17089
d5bd88b
 
0e17089
 
 
d5bd88b
 
0e17089
 
 
 
 
 
 
 
d5bd88b
 
 
 
 
 
0489db2
d5bd88b
0489db2
d5bd88b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51fe9d2
d5bd88b
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
import streamlit as st
from openai.error import OpenAIError
from .utils import *
from typing import Text, Union

# Whether the Streamlit file uploader accepts several files at once
# (passed as accept_multiple_files=...; True makes the widget return a list).
multiple_files = True

def clear_submit() -> None:
    """Reset the ``file_submitted`` flag in the Streamlit session state.

    Registered as the ``on_change`` callback of the file uploader so that
    picking a new file invalidates the previous submission.
    """
    st.session_state.update(file_submitted=False)

def set_openai_api_key(api_key: Text) -> bool:
    """Validate and store the OpenAI API key in the Streamlit session state.

    Args:
        api_key (Text): OpenAI API key; expected to start with ``sk-``.

    Returns:
        bool: True if the key looks valid and was stored, False otherwise
        (an error message is shown in the UI on failure).
    """
    # OpenAI keys no longer have a fixed 51-character length (project-scoped
    # "sk-proj-..." keys are longer), so the old exact-length test rejected
    # valid keys. Check only the prefix and a sane minimum length.
    if not (api_key.startswith("sk-") and len(api_key) >= 20):
        st.error("Invalid OpenAI API key! Please provide a valid key.")
        return False

    st.session_state["OPENAI_API_KEY"] = api_key
    st.session_state["api_key_configured"] = True
    return True

def file_to_doc(file: Union[PDFFile, DocxFile, TxtFile, CodeFile]):
    """Convert an uploaded file to a document using the matching parser.

    Args:
        file: Uploaded file object exposing a ``name`` attribute.

    Returns:
        The parsed document, or ``None`` (with an error shown in the UI)
        when the file type is unsupported.
    """
    # Extensions routed to the plain-text parser.
    text_extensions = (".txt", ".py", ".json", ".html", ".css", ".md")
    if file.name.endswith(".pdf"):
        doc = parse_pdf(file)
    elif file.name.endswith(".docx"):
        doc = parse_docx(file)
    # BUG FIX: the original used ``file.name.split["."][1]`` — subscripting
    # the bound method raises TypeError — and even as ``split(".")[1]`` the
    # dot-less extension could never match the dotted entries, so text/code
    # files were never parsed. ``endswith`` with a tuple handles all cases.
    elif file.name.endswith(text_extensions):
        doc = parse_txt(file)
    else:
        st.error("File type not yet supported! Supported files: [.pdf, .docx, .txt, .py, .json, .html, .css, .md]")
        doc = None

    return doc

# this function can be used to define a single doc processing pipeline
# def document_embedding_pipeline(file:Union[PDFFile, DocxFile, TxtFile, CodeFile]) -> None:  

def qa_main():
    """Render the chat-with-your-documents Streamlit page.

    Flow: read the OpenAI key from ``st.secrets``, let the user upload one
    or more files, parse and embed them into a search index, then run a
    chat loop that answers questions grounded in the indexed documents.
    All state (key, submission flag, chat history) lives in
    ``st.session_state``.
    """
    st.markdown("<h2>This app allows to chat with files!</h2>", unsafe_allow_html=True)
    st.write("Just upload something using and start chatting with a version of GPT4 that has read the file!")
    
    # Search index over the embedded documents; built after upload.
    index = None
    doc = None

    # Gate flags: upload UI is shown only after the key is configured, and
    # the chat UI only after documents were parsed and indexed.
    upload_document_greenlight = False
    uploaded_processed_document_greenlight = False
    # OpenAI API Key - TODO: consider adding a key valid for everyone
    # st.header("Configure OpenAI API Key")
    # st.warning('Please enter your OpenAI API Key!', icon='⚠️')

    # uncomment the following lines to add a user-specific key
    # user_secret = st.text_input(
    #     "Insert your OpenAI API key here ([get your API key](https://platform.openai.com/account/api-keys)).",
    #     type="password",
    #     placeholder="Paste your OpenAI API key here (sk-...)",
    #     help="You can get your API key from https://platform.openai.com/account/api-keys.",
    #     value=st.session_state.get("OPENAI_API_KEY", ""),
    # )
    # NOTE(review): reads the app-wide key from Streamlit secrets; this
    # raises if OPENAI_API_KEY is missing from secrets.toml — confirm that
    # is the intended failure mode rather than st.secrets.get(...).
    user_secret = st.secrets["OPENAI_API_KEY"]
    if user_secret:
        if set_openai_api_key(user_secret):
            st.success('OpenAI API key successfully accessed!', icon='✅')
            upload_document_greenlight = True
    
    if upload_document_greenlight:
        # File that needs to be queried
        st.header("Upload a file")
        # With accept_multiple_files=True the widget returns a (possibly
        # empty) list; the len() check below relies on that.
        uploaded_file = st.file_uploader(
            "Upload a pdf, docx, or txt file (scanned documents not supported)",
            type=["pdf", "docx", "txt", "py", "json", "html", "css", "md"],
            help="Scanned documents are not supported yet 🥲",
            on_change=clear_submit, 
            accept_multiple_files=multiple_files
        )
            
        # reading the uploaded files
        text = []
        if len(uploaded_file) != 0:
            # toggle internal file submission state to True
            st.session_state["file_submitted"] = True
            for file in uploaded_file:
                # parse the file using custom parsers
                file_doc = file_to_doc(file)
                # converts the files into a list of documents
                # NOTE(review): tuple(file_doc) assumes the parser result is
                # iterable and never None (file_to_doc returns None for
                # unsupported types) — TODO confirm upstream filtering.
                file_text = text_to_docs(text=tuple(file_doc), file_name=file.name)
                text.extend(file_text)
        
            # embeds the documents using OpenAI API
            try:
                with st.spinner("Indexing the document... This might take a while!"):
                    index = embed_docs(tuple(text))
                    st.session_state["api_key_configured"] = True
            except OpenAIError as e:
                # NOTE(review): st.error's second positional parameter is
                # `icon`, so e._message is likely not rendered as part of the
                # message here — verify against the Streamlit API.
                st.error("OpenAI error encountered: ", e._message)
        
            uploaded_processed_document_greenlight = True
        
    if uploaded_processed_document_greenlight: 
        # Initialize the chat transcript on first run of the chat UI.
        if "messages" not in st.session_state:
            st.session_state["messages"] = []

        # Replay the stored conversation so it survives Streamlit reruns.
        for message in st.session_state.messages:
            with st.chat_message(message["role"]):
                st.markdown(message["content"])

        if prompt := st.chat_input("Ask the document something..."):
            st.session_state.messages.append({"role": "user", "content": prompt})
            with st.chat_message("user"):
                st.markdown(prompt)

            with st.chat_message("assistant"):
                message_placeholder = st.empty()
                # retrieving the most relevant sources
                sources = search_docs(index, prompt)
                # producing the answer, live
                # NOTE(review): iterating get_answer(...)["output_text"] walks
                # the string character by character to fake streaming — each
                # "answer_bit" is a single character, not a token. Confirm this
                # is intentional; it re-renders markdown once per character.
                full_response = ""
                for answer_bit in get_answer(sources, prompt)["output_text"]:
                    full_response += answer_bit
                    message_placeholder.markdown(full_response + "▌")
                
                message_placeholder.markdown(full_response)

                # answer = get_answer(sources, prompt)
                # message_placeholder.markdown(answer["output_text"])
            
            # st.session_state.messages.append({"role": "assistant", "content": answer["output_text"]})
            st.session_state.messages.append({"role": "assistant", "content": full_response})

# This might be useful to add memory to the chatbot harnessing a more low-level approach
# llm = ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo")

# memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True, output_key='answer')
# retriever = your_vector_store.as_retriever()

# # Create the multipurpose chain
# qachat = ConversationalRetrievalChain.from_llm(
#     llm=ChatOpenAI(temperature=0),
#     memory=memory,
#     retriever=retriever, 
#     return_source_documents=True
# )

# qachat("Ask your question here...")