File size: 9,460 Bytes
8999dd1
 
17161a6
8999dd1
df07373
17161a6
9ee3f2a
0c3c33c
 
 
d75ee7e
0c3c33c
17161a6
ace378a
8999dd1
 
 
 
 
17161a6
 
 
 
 
 
 
 
 
 
df07373
17161a6
8999dd1
df07373
 
c8ffb29
04f5a93
 
df07373
8999dd1
df07373
 
8999dd1
df07373
 
 
 
 
 
17161a6
8999dd1
 
17161a6
8999dd1
 
 
 
 
 
 
17161a6
df07373
 
 
 
8999dd1
df07373
 
 
 
17161a6
df07373
17161a6
 
df07373
 
 
ace378a
 
 
 
df07373
17161a6
df07373
c8ffb29
507bba8
17161a6
 
27e2e36
 
17161a6
 
ace378a
17161a6
 
 
507bba8
df07373
 
27e2e36
 
df07373
 
17161a6
 
 
 
 
 
 
df07373
04f5a93
 
 
507bba8
04f5a93
 
27e2e36
 
04f5a93
 
 
17161a6
 
8999dd1
df07373
8999dd1
17161a6
df07373
 
 
 
17161a6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
df07373
 
 
8999dd1
 
 
 
 
17161a6
8999dd1
df07373
c8ffb29
df07373
 
 
8999dd1
17161a6
 
 
 
 
 
 
 
04f5a93
 
 
 
 
 
 
 
17161a6
 
 
 
 
 
 
 
8999dd1
04f5a93
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
import os
import tempfile
import uuid
import streamlit as st
from dotenv import load_dotenv
from qdrant_client import models

from utils import setup_openai_embeddings, setup_qdrant_client, delete_collection, is_document_embedded
from embed import embed_documents_into_qdrant
from preprocess import split_documents, update_metadata, load_documents_OCR
from retrieve import retrieve_documents_from_collection
from summarize import summarize_documents

# Load environment variables (via python-dotenv) once at import time so the
# os.getenv() lookups used throughout this module can see them.
load_dotenv()

def main():
    """Render the sidebar (upload + navigation) and dispatch to the chosen page.

    Streamlit re-executes this function on every user interaction, so all
    state that must survive a rerun lives in ``st.session_state``:

    * ``uploaded_collection_name`` - name of the per-session Qdrant collection
      (``None`` until documents are added to the current chat).
    * ``summaries`` - mapping of original filename -> summary text.

    The two sidebar buttons persist the uploads to temporary files, hand them
    to the embedding helpers, and always remove the temporary files again
    afterwards so they do not accumulate on disk.
    """
    st.sidebar.title("PDF Management")
    uploaded_files = st.sidebar.file_uploader("Upload PDF files", type=["pdf"], accept_multiple_files=True)

    if 'uploaded_collection_name' not in st.session_state:
        st.session_state['uploaded_collection_name'] = None

    if uploaded_files:
        if st.sidebar.button("Add Docs to Data Bank"):
            files_info = save_uploaded_files(uploaded_files)
            try:
                embed_documents_to_data_bank(files_info)
            finally:
                # The temp files were created with delete=False; once embedding
                # has finished (or failed) nothing else needs them.
                _remove_temp_files(files_info)

        if st.sidebar.button("Add Docs to Current Chat"):
            files_info = save_uploaded_files(uploaded_files)
            try:
                add_docs_to_current_chat(files_info)
            finally:
                _remove_temp_files(files_info)

    # Page title shown in the sidebar -> function that renders the page.
    pages = {
        "Lex Document Summarization": page_summarization,
        "Chat with RSCA": page_qna,
        "Chat with Uploaded Docs": page_chat_with_uploaded_docs,
        "Chat with VOO": page_chat_with_voo
    }

    st.sidebar.title("Page Navigation")
    page = st.sidebar.radio("Select a page", tuple(pages.keys()))

    # Initialize session state for summarization results if not already set
    if 'summaries' not in st.session_state:
        st.session_state['summaries'] = {}

    # Call the page function based on the user selection
    if page:
        pages[page](uploaded_files)


def _remove_temp_files(files_info):
    """Best-effort removal of the temporary files listed in *files_info*.

    *files_info* is an iterable of ``(temp_path, original_name)`` tuples as
    returned by ``save_uploaded_files``.
    """
    for temp_path, _ in files_info:
        try:
            os.remove(temp_path)
        except OSError:
            # Already gone or not removable; cleanup must never break the UI.
            pass

def save_uploaded_files(uploaded_files):
    """Write each upload to its own temporary ``.pdf`` file.

    Every item of *uploaded_files* must expose ``getvalue()`` (the raw bytes)
    and ``name`` (the original filename).  The temporary files are created
    with ``delete=False``, so they stay on disk after this function returns.

    Returns:
        A list of ``(temp_path, original_name)`` tuples, in input order.
    """
    def _persist(upload):
        # One closed temp file per upload; the context manager closes the handle.
        handle = tempfile.NamedTemporaryFile(delete=False, suffix='.pdf')
        with handle:
            handle.write(upload.getvalue())
        return (handle.name, upload.name)

    return [_persist(upload) for upload in uploaded_files]

def page_summarization(uploaded_files):
    """Page for document summarization.

    Renders one "Summarize <name>" button per uploaded file.  When a button is
    pressed the file is written to a temporary PDF, run through
    ``load_documents_OCR`` and ``summarize_documents``, and the result is
    cached in ``st.session_state['summaries']`` under the original filename so
    it is still displayed on later reruns.

    The temporary file is created only for the file whose button was pressed
    (Streamlit reruns this function on every interaction, so saving all
    uploads up front would write new temp files on each rerun) and is removed
    again once summarization has finished or failed.

    Args:
        uploaded_files: The sidebar uploads; each item exposes ``name`` and
            ``getvalue()``.  May be empty/None, in which case only the title
            is rendered.
    """
    st.title("Lex Document Summarization")
    if not uploaded_files:
        return

    for uploaded_file in uploaded_files:
        original_name = uploaded_file.name
        summary_button = st.button(f"Summarize {original_name}", key=original_name)
        if not (summary_button or original_name in st.session_state['summaries']):
            continue

        with st.container():
            st.write(f"Summary for {original_name}:")
            if summary_button:  # Only summarize if the button is pressed
                temp_path = None
                try:
                    temp_path = save_uploaded_files([uploaded_file])[0][0]
                    documents = load_documents_OCR(temp_path, os.getenv('UNSTRUCTURED_API'))
                    summary = summarize_documents(documents, os.getenv('OPENAI_API_KEY'))
                    st.session_state['summaries'][original_name] = summary  # Store summary in session state
                except Exception as e:
                    st.error(f"Failed to summarize {original_name}: {str(e)}")
                finally:
                    # The temp file was created with delete=False; remove it
                    # now that the summary has been produced (or has failed).
                    if temp_path is not None:
                        try:
                            os.remove(temp_path)
                        except OSError:
                            pass
            if original_name in st.session_state['summaries']:
                st.text_area("", value=st.session_state['summaries'][original_name], height=1000, key=f"summary_{original_name}")
            else:
                st.error(f"No summary found for {original_name}. Please click the summarize button.")

def page_qna(uploaded_files):
    """Render the "Chat with RSCA" page.

    Shows a question box and a "Get Answer" button; on click the question is
    passed to ``handle_query`` and the returned text is written to the page.
    An empty question produces an error message instead.  *uploaded_files* is
    accepted for a uniform page signature and is not used here.
    """
    st.title("Chat with RSCA")
    question = st.text_area("Enter your question here:", height=300)

    if not st.button('Get Answer'):
        return

    if not question:
        st.error("Please enter a question to get an answer.")
        return

    reply = handle_query(question)
    st.write(reply)

def page_chat_with_uploaded_docs(uploaded_files):
    """Page for chatting with uploaded documents.

    Questions are answered from the per-session collection whose name is held
    in ``st.session_state['uploaded_collection_name']``.  If no collection has
    been created yet, the user is told to add documents first instead of
    issuing a query against a ``None`` collection name.

    When a collection exists, a "Delete Embedded Collection" button is shown.
    The session's collection name is cleared and success is reported unless
    ``delete_collection`` explicitly returns ``False``; any other return value
    (including ``None``) is treated as success.

    Args:
        uploaded_files: Unused; accepted for a uniform page signature.
    """
    st.title("Chat with Uploaded Documents")
    user_query = st.text_area("Enter your question here:", height=300)
    collection_name = st.session_state['uploaded_collection_name']

    if st.button('Get Answer'):
        if not user_query:
            st.error("Please enter a question to get an answer.")
        elif not collection_name:
            # Nothing has been embedded for this session yet, so there is no
            # collection to search.
            st.error("No documents have been added to the current chat yet. "
                     "Upload PDFs and click 'Add Docs to Current Chat' first.")
        else:
            answer = handle_uploaded_docs_query(user_query, collection_name)
            st.write(answer)

    if collection_name:
        if st.button('Delete Embedded Collection'):
            outcome = delete_collection(collection_name, os.getenv('QDRANT_URL'), os.getenv('QDRANT_API_KEY'))
            if outcome is False:
                # Keep the name in session state so the user can retry.
                st.error(f"Failed to delete collection {collection_name}")
            else:
                st.session_state['uploaded_collection_name'] = None
                st.success(f"Deleted collection {collection_name}")

def page_chat_with_voo(uploaded_files):
    """Render the "Chat with VOO" page.

    Collects a question and, when "Get Answer" is clicked, writes the text
    returned by ``handle_voo_query`` to the page; an empty question yields an
    error message.  *uploaded_files* is not used by this page.
    """
    st.title("Chat with VOO")
    question = st.text_area("Enter your question here:", height=300)
    clicked = st.button('Get Answer')

    if clicked and question:
        reply = handle_voo_query(question)
        st.write(reply)
    elif clicked:
        st.error("Please enter a question to get an answer.")

def embed_documents_to_data_bank(files_info):
    """Embed every not-yet-embedded file into the data bank.

    *files_info* is an iterable of ``(temp_path, original_name)`` tuples.  For
    each file that ``is_document_embedded`` does not report as present, the
    document is loaded via OCR, tagged with its original name, split, and
    handed to ``embed_documents_into_qdrant`` with ``'Lex-v1'`` as the final
    argument.  Progress and failures are reported through Streamlit
    messages; errors for one file do not stop the remaining files.
    """
    for pdf_path, display_name in files_info:
        # Skip files that are already known to the data bank.
        if is_document_embedded(display_name):
            st.info(f"{display_name} is already embedded.")
            continue

        try:
            chunks = split_documents(
                update_metadata(
                    load_documents_OCR(pdf_path, os.getenv('UNSTRUCTURED_API')),
                    display_name,
                )
            )
            if not chunks:
                st.error(f"No documents found or extracted from {display_name}")
                continue
            embed_documents_into_qdrant(
                chunks,
                os.getenv('OPENAI_API_KEY'),
                os.getenv('QDRANT_URL'),
                os.getenv('QDRANT_API_KEY'),
                'Lex-v1',
            )
            st.success(f"Embedded {display_name} into Data Bank")
        except Exception as exc:
            st.error(f"Failed to embed {display_name}: {exc!s}")

def add_docs_to_current_chat(files_info):
    """Function to add documents to the current chat session.

    On first use a Qdrant collection named ``session-<uuid4>`` is created and
    its name is stored in ``st.session_state['uploaded_collection_name']``.
    The name is stored only after the collection has been created
    successfully, so a failed creation does not leave the session pointing at
    a collection that does not exist.  Each file is then loaded via OCR,
    tagged with its original name, split, and embedded into that collection.

    Args:
        files_info: Iterable of ``(temp_path, original_name)`` tuples.

    Outcomes are reported via Streamlit messages; a failure for one file does
    not stop the remaining files.
    """
    qdrant_url = os.getenv('QDRANT_URL')
    qdrant_api_key = os.getenv('QDRANT_API_KEY')

    if not st.session_state['uploaded_collection_name']:
        new_collection = f"session-{uuid.uuid4()}"
        try:
            client = setup_qdrant_client(qdrant_url, qdrant_api_key)
            client.create_collection(
                collection_name=new_collection,
                # NOTE(review): 1536 presumably matches the dimensionality of
                # the embedding model used by embed_documents_into_qdrant - confirm.
                vectors_config=models.VectorParams(size=1536, distance=models.Distance.COSINE)
            )
        except Exception as e:
            st.error(f"Failed to create a collection for the current chat: {str(e)}")
            return
        # Publish the name only once the collection really exists.
        st.session_state['uploaded_collection_name'] = new_collection

    collection_name = st.session_state['uploaded_collection_name']

    for temp_path, original_name in files_info:
        # NOTE(review): is_document_embedded receives only the filename; which
        # collection it checks is not visible here - confirm it is the right
        # test for a per-session collection.
        if is_document_embedded(original_name):
            st.info(f"{original_name} is already embedded.")
            continue
        try:
            documents = load_documents_OCR(temp_path, os.getenv('UNSTRUCTURED_API'))
            documents = update_metadata(documents, original_name)
            documents = split_documents(documents)
            if documents:
                embed_documents_into_qdrant(
                    documents,
                    os.getenv('OPENAI_API_KEY'),
                    qdrant_url,
                    qdrant_api_key,
                    collection_name=collection_name,
                )
                st.success(f"Embedded {original_name}")
            else:
                st.error(f"No documents found or extracted from {original_name}")
        except Exception as e:
            st.error(f"Failed to embed {original_name}: {str(e)}")

def handle_query(query):
    """Answer *query* from the ``'Lex-v1'`` collection.

    Returns the retrieved answer, a fixed "no answer" message when the result
    is falsy, or an error string if retrieval raises.
    """
    try:
        result = retrieve_documents_from_collection(
            query,
            os.getenv('OPENAI_API_KEY'),
            os.getenv('QDRANT_URL'),
            os.getenv('QDRANT_API_KEY'),
            'Lex-v1',
        )
        return result if result else "No relevant answer found."
    except Exception as exc:
        return f"Error processing the query: {exc!s}"

def handle_uploaded_docs_query(query, collection_name):
    """Answer *query* from the collection named *collection_name*.

    Returns the retrieved answer, a fixed "no answer" message when the result
    is falsy, or an error string if retrieval raises.
    """
    try:
        result = retrieve_documents_from_collection(
            query,
            os.getenv('OPENAI_API_KEY'),
            os.getenv('QDRANT_URL'),
            os.getenv('QDRANT_API_KEY'),
            collection_name,
        )
        return result if result else "No relevant answer found."
    except Exception as exc:
        return f"Error processing the query: {exc!s}"

def handle_voo_query(query):
    """Answer *query* from the ``'Lex-v2'`` collection.

    Returns the retrieved answer, a fixed "no answer" message when the result
    is falsy, or an error string if retrieval raises.
    """
    try:
        result = retrieve_documents_from_collection(
            query,
            os.getenv('OPENAI_API_KEY'),
            os.getenv('QDRANT_URL'),
            os.getenv('QDRANT_API_KEY'),
            'Lex-v2',
        )
        return result if result else "No relevant answer found."
    except Exception as exc:
        return f"Error processing the query: {exc!s}"

# NOTE(review): this definition shadows the ``delete_collection`` imported from
# ``utils`` at the top of the file; confirm which implementation is intended.
def delete_collection(collection_name, qdrant_url, qdrant_api_key):
    """Delete a Qdrant collection.

    Args:
        collection_name: Name of the collection to delete.
        qdrant_url: Passed to ``setup_qdrant_client``.
        qdrant_api_key: Passed to ``setup_qdrant_client``.

    Returns:
        ``True`` if the delete call completed without raising, ``False`` if it
        raised (the error is printed).  Callers that ignore the return value
        behave as before.
    """
    client = setup_qdrant_client(qdrant_url, qdrant_api_key)
    try:
        client.delete_collection(collection_name=collection_name)
    except Exception as e:
        print("Failed to delete collection:", e)
        return False
    return True

# Run the app only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()