taaha3244 committed on
Commit
df07373
1 Parent(s): c90e83e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +79 -70
app.py CHANGED
@@ -1,103 +1,112 @@
1
  import os
2
- from dotenv import load_dotenv
3
  import tempfile
4
  import streamlit as st
5
- from langchain_community.document_loaders import PyPDFLoader
6
-
7
- from main import summarize_pdf_document
8
- from main import retrieve_documents
9
- from main import embed_document_data
10
- from main import is_document_embedded
11
-
12
-
13
-
14
 
15
  load_dotenv()
16
 
17
  def main():
18
  st.sidebar.title("PDF Management")
19
  uploaded_files = st.sidebar.file_uploader("Upload PDF files", type=["pdf"], accept_multiple_files=True)
20
- files_info = [] # Initialize files_info to an empty list before checking for uploaded files
 
 
 
 
 
 
21
 
22
- if uploaded_files:
23
- files_info = save_uploaded_files(uploaded_files)
24
- process_documents(files_info)
25
- if st.button('Add Uploaded Documents in Q nd A'):
26
- embed_documents(files_info)
27
 
28
- # Call to display the Q&A section unconditionally
29
- display_qna_section(files_info)
30
 
 
 
 
 
 
 
 
31
 
32
  def save_uploaded_files(uploaded_files):
33
  """Save uploaded files to temporary directory and return their file paths along with original filenames."""
34
  files_info = []
35
  for uploaded_file in uploaded_files:
36
- # Create a temporary file
37
  with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmpfile:
38
- # Write contents of the uploaded file to the temporary file
39
  tmpfile.write(uploaded_file.getvalue())
40
- # Append both the temp file path and the original name
41
  files_info.append((tmpfile.name, uploaded_file.name))
42
  return files_info
43
 
44
-
45
def process_documents(files_info):
    """Render the summary section and summarize every file on request.

    ``files_info`` is an iterable of ``(temp_path, original_name)`` pairs.
    Nothing is summarized until the button is pressed; a failure on one file
    is shown as an error and does not stop the remaining files.
    """
    st.header("Document Summaries")
    if not st.button('Summarize Uploaded Documents'):
        return

    api_key_name = 'OPENAI_API_KEY'
    for temp_path, original_name in files_info:
        # One container per document keeps its caption and summary together.
        with st.container():
            st.write(f"Summary for {original_name}:")
            try:
                summary = summarize_pdf_document(temp_path, os.getenv(api_key_name))
                st.text_area(
                    "",
                    value=summary,
                    height=200,
                    key=f"summary_{original_name}",
                )
            except Exception as e:
                st.error(f"Failed to summarize {original_name}: {str(e)}")
59
-
60
-
61
def embed_documents(files_info):
    """Embed each not-yet-embedded PDF, labelling its pages with the original filename.

    ``files_info`` is an iterable of ``(temp_path, original_name)`` pairs.
    Files reported as already embedded are skipped with an info message;
    a failure on one file is shown as an error and the loop continues.
    """
    for temp_path, original_name in files_info:
        if is_document_embedded(original_name):
            st.info(f"{original_name} is already embedded.")
            continue

        try:
            relabelled = []
            for page in PyPDFLoader(temp_path).load():
                # Replace the temp-file path with the name the user uploaded.
                page.metadata['source'] = original_name
                relabelled.append(page)

            embed_document_data(relabelled)
            st.success(f"Embedded {original_name}")
        except Exception as e:
            st.error(f"Failed to embed {original_name}: {str(e)}")
82
 
83
-
84
def display_qna_section(files_info):
    """Render the question form and show the answer once it is submitted.

    ``files_info`` is accepted for call-site compatibility; this function
    does not read it.
    """
    st.header("Question and Answer")
    with st.form("qa_form"):
        user_query = st.text_input("Enter your question here:")
        submit_button = st.form_submit_button('Get Answer')

    # Nothing to show until the form has actually been submitted.
    if not submit_button:
        return

    if user_query:
        st.write(handle_query(user_query))
    else:
        st.error("Please enter a question to get an answer.")
96
-
97
def handle_query(query):
    """Return whatever ``retrieve_documents`` produces for ``query``."""
    return retrieve_documents(query)
 
 
 
101
 
102
  if __name__ == "__main__":
103
- main()
 
1
  import os
 
2
  import tempfile
3
  import streamlit as st
4
+ from dotenv import load_dotenv
5
+ from main import (
6
+ load_and_split_documents, summarize_documents, embed_documents_into_qdrant,
7
+ retrieve_documents, is_document_embedded, load_documents, split_documents,
8
+ update_metadata, load_documents_OCR
9
+ )
 
 
 
10
 
11
  load_dotenv()
12
 
13
  def main():
14
  st.sidebar.title("PDF Management")
15
  uploaded_files = st.sidebar.file_uploader("Upload PDF files", type=["pdf"], accept_multiple_files=True)
16
+ model_name = st.sidebar.selectbox("Choose your model:", ["gpt-3.5-turbo", "gpt-4-turbo"]) # Model selection
17
+ use_ocr = st.sidebar.checkbox("Use OCR for document processing")
18
+
19
+ if st.sidebar.button('Add Uploaded Documents in Q&A'):
20
+ if uploaded_files:
21
+ files_info = save_uploaded_files(uploaded_files)
22
+ embed_documents(files_info, model_name, use_ocr)
23
 
24
+ pages = {
25
+ "Lex Document Summarization": page_summarization,
26
+ "Lex Q&A": page_qna
27
+ }
 
28
 
29
+ st.sidebar.title("Page Navigation")
30
+ page = st.sidebar.radio("Select a page", tuple(pages.keys()))
31
 
32
+ # Initialize session state for summarization results if not already set
33
+ if 'summaries' not in st.session_state:
34
+ st.session_state['summaries'] = {}
35
+
36
+ # Call the page function based on the user selection
37
+ if page:
38
+ pages[page](uploaded_files, model_name, use_ocr)
39
 
40
  def save_uploaded_files(uploaded_files):
41
  """Save uploaded files to temporary directory and return their file paths along with original filenames."""
42
  files_info = []
43
  for uploaded_file in uploaded_files:
 
44
  with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmpfile:
 
45
  tmpfile.write(uploaded_file.getvalue())
 
46
  files_info.append((tmpfile.name, uploaded_file.name))
47
  return files_info
48
 
49
def page_summarization(uploaded_files, model_name, use_ocr):
    """Page for document summarization.

    Shows one "Summarize" button per uploaded file. Summaries are stored in
    ``st.session_state['summaries']`` keyed by the uploaded file name, so a
    summary produced earlier is shown again on later reruns.

    Args:
        uploaded_files: Uploaded file objects exposing ``name`` and ``getvalue()``.
        model_name: Model identifier forwarded to ``summarize_documents``.
        use_ocr: When true, load the document through ``load_documents_OCR``.
    """
    st.title("Lex Document Summarization")
    if not uploaded_files:
        return

    # Keep the page usable even if the caller has not initialised the cache.
    if 'summaries' not in st.session_state:
        st.session_state['summaries'] = {}
    summaries = st.session_state['summaries']

    for uploaded_file in uploaded_files:
        original_name = uploaded_file.name
        summary_button = st.button(f"Summarize {original_name}", key=original_name)
        if not (summary_button or original_name in summaries):
            continue

        with st.container():
            st.write(f"Summary for {original_name}:")
            if summary_button:  # Only summarize if the button is pressed
                _summarize_upload(uploaded_file, model_name, use_ocr)
            # A failed first attempt leaves no cached entry; indexing the cache
            # unconditionally would raise KeyError right after the error message.
            if original_name in summaries:
                st.text_area("", value=summaries[original_name], height=200, key=f"summary_{original_name}")


def _summarize_upload(uploaded_file, model_name, use_ocr):
    """Summarize one upload, cache the result, and report failures in the UI."""
    original_name = uploaded_file.name
    temp_path = None
    try:
        # Only the file being summarised is written to disk, and only now,
        # rather than every upload on every rerun.
        [(temp_path, _)] = save_uploaded_files([uploaded_file])
        if use_ocr:
            documents = load_documents_OCR(temp_path, os.getenv('UNSTRUCTURED_API'))
        else:
            documents = load_and_split_documents(temp_path)
        summary = summarize_documents(model_name, documents, os.getenv('OPENAI_API_KEY'))
        st.session_state['summaries'][original_name] = summary
    except Exception as e:
        st.error(f"Failed to summarize {original_name}: {str(e)}")
    finally:
        # The temp copy is created with delete=False; remove it once done.
        if temp_path is not None:
            try:
                os.remove(temp_path)
            except OSError:
                pass
70
+
71
def page_qna(uploaded_files, model_name, use_ocr):
    """Page for Q&A functionality.

    Only ``model_name`` is used here; ``uploaded_files`` and ``use_ocr`` are
    accepted so every page function shares the same call signature.
    """
    st.title("Lex Question and Answer")
    question = st.text_area("Enter your question here:", height=300)

    asked = st.button('Get Answer')
    if not asked:
        return

    if not question:
        st.error("Please enter a question to get an answer.")
        return

    st.write(handle_query(question, model_name))
81
+
82
def embed_documents(files_info, model_name, use_ocr):
    """Load, relabel, split, and embed every file that is not embedded yet.

    ``files_info`` is an iterable of ``(temp_path, original_name)`` pairs.
    ``model_name`` is accepted but not used by this function. Files reported
    as already embedded are skipped with an info message; a failure on one
    file is shown as an error and the loop moves on to the next file.
    """
    for temp_path, original_name in files_info:
        if is_document_embedded(original_name):
            st.info(f"{original_name} is already embedded.")
            continue

        try:
            if use_ocr:
                loaded = load_documents_OCR(temp_path, os.getenv('UNSTRUCTURED_API'))
            else:
                loaded = load_documents(temp_path)

            # Relabel with the uploaded name before splitting.
            chunks = split_documents(update_metadata(loaded, original_name))
            if not chunks:
                st.error(f"No documents found or extracted from {original_name}")
                continue

            embed_documents_into_qdrant(
                chunks,
                os.getenv('OPENAI_API_KEY'),
                os.getenv('QDRANT_URL'),
                os.getenv('QDRANT_API_KEY'),
                'Lex-v1',
            )
            st.success(f"Embedded {original_name}")
        except Exception as e:
            st.error(f"Failed to embed {original_name}: {str(e)}")
102
 
103
def handle_query(query, model_name):
    """Answer ``query`` via ``retrieve_documents`` and always return displayable text.

    Returns the retrieved answer, a fixed fallback message when the answer is
    falsy, or an error message when anything in the lookup raises.
    """
    try:
        result = retrieve_documents(
            query,
            os.getenv('OPENAI_API_KEY'),
            os.getenv('QDRANT_URL'),
            os.getenv('QDRANT_API_KEY'),
            model_name,
        )
        if not result:
            return "No relevant answer found."
        return result
    except Exception as e:
        return f"Error processing the query: {str(e)}"
110
 
111
  if __name__ == "__main__":
112
+ main()