msr2903 committed on
Commit
3eed450
·
1 Parent(s): e9c51d1

Update app.py, pages.py, and section_extract.py

Browse files

Adding a new feature: per-page session state, so you don't have to wait for processing again if a page has already been processed.

__pycache__/pages.cpython-312.pyc CHANGED
Binary files a/__pycache__/pages.cpython-312.pyc and b/__pycache__/pages.cpython-312.pyc differ
 
__pycache__/section_extract.cpython-312.pyc CHANGED
Binary files a/__pycache__/section_extract.cpython-312.pyc and b/__pycache__/section_extract.cpython-312.pyc differ
 
app.py CHANGED
@@ -1,5 +1,7 @@
1
  import streamlit as st
2
- from pages import home, cover, underwriter, income_statement, balance_sheet, cash_flow
 
 
3
 
4
  # Define pages
5
  pages = {
 
1
  import streamlit as st
2
+ from pages import home, cover, underwriter, income_statement, balance_sheet, cash_flow, uploader_sidebar
3
+
4
+ uploader_sidebar()
5
 
6
  # Define pages
7
  pages = {
pages.py CHANGED
@@ -2,44 +2,99 @@ import streamlit as st
2
  from section_extract import find_cover, find_underwriter, find_financial
3
  from streamlit_pdf_viewer import pdf_viewer
4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  def home():
6
  st.title("Prospectus Lens")
7
- st.write("Welcome to the Prospectus Lens! Upload the PDF of the prospectus below!")
8
- uploaded_file = st.file_uploader("Upload your Prospectus File", accept_multiple_files=False, type=["pdf"])
9
- st.session_state["uploaded_file"] = uploaded_file
10
- st.caption("Made with ❤️ by @michael_sr24")
11
 
12
  def cover():
13
- temp_cover_page_path = find_cover(uploaded_file=st.session_state.get("uploaded_file"))
14
- if temp_cover_page_path:
15
- pdf_viewer(temp_cover_page_path)
16
  else:
17
- st.warning("Could not process the PDF file.")
18
 
19
  def underwriter():
20
- temp_page_path = find_underwriter(uploaded_file=st.session_state.get("uploaded_file"))
21
- if temp_page_path:
22
- pdf_viewer(temp_page_path)
23
  else:
24
- st.warning("Could not extract the underwriter section.")
25
 
26
  def income_statement():
27
- temp_section_path = find_financial(uploaded_file=st.session_state.get("uploaded_file"), section_name="income_statement")
28
- if temp_section_path:
29
- pdf_viewer(temp_section_path)
30
  else:
31
- st.warning("Could not extract the income statement section.")
32
 
33
  def balance_sheet():
34
- temp_section_path = find_financial(uploaded_file=st.session_state.get("uploaded_file"), section_name="balance_sheet")
35
- if temp_section_path:
36
- pdf_viewer(temp_section_path)
37
  else:
38
- st.warning("Could not extract the balance sheet section.")
39
 
40
  def cash_flow():
41
- temp_section_path = find_financial(uploaded_file=st.session_state.get("uploaded_file"), section_name="cash_flow")
42
- if temp_section_path:
43
- pdf_viewer(temp_section_path)
44
  else:
45
- st.warning("Could not extract the cash flow section.")
 
2
  from section_extract import find_cover, find_underwriter, find_financial
3
  from streamlit_pdf_viewer import pdf_viewer
4
 
5
def uploader_sidebar():
    """Render the sidebar PDF uploader and keep per-file session state in sync.

    When a file is present in the uploader, it is stored under
    ``st.session_state["uploaded_file"]``. The cached section paths, the
    ``"processing"`` flags and ``"all_processed"`` are (re)initialised whenever
    the uploaded file differs from the one the cached state was built for, so
    results from a previously uploaded prospectus are never shown for a new
    one. ``process_sections()`` is then called to advance extraction.

    Nothing is changed when the uploader is empty.
    """
    uploaded_file = st.sidebar.file_uploader(
        "Upload your Prospectus File", accept_multiple_files=False, type=["pdf"]
    )
    st.sidebar.caption("Made with ❤️ by @michael_sr24")

    if not uploaded_file:
        return

    section_keys = (
        "cover_path",
        "underwriter_path",
        "income_statement_path",
        "balance_sheet_path",
        "cash_flow_path",
    )

    # Identify the upload so a replaced file can be told apart from a rerun
    # with the same file. ``getattr`` keeps this working if an attribute is
    # unavailable on the installed Streamlit version.
    file_signature = (
        getattr(uploaded_file, "file_id", None),
        getattr(uploaded_file, "name", None),
        getattr(uploaded_file, "size", None),
    )
    is_new_file = (
        "uploaded_file" not in st.session_state
        or "processing" not in st.session_state
        or st.session_state.get("uploaded_file_signature") != file_signature
    )

    st.session_state["uploaded_file"] = uploaded_file
    if is_new_file:
        # Drop everything cached for the previous file (if any).
        st.session_state["uploaded_file_signature"] = file_signature
        for key in section_keys:
            st.session_state[key] = None
        st.session_state["processing"] = {key: False for key in section_keys}
        st.session_state["all_processed"] = False

    process_sections()
29
+
30
def process_sections():
    """Extract the next unprocessed section of the uploaded prospectus.

    At most one pending section is handled per call (the loop stops after the
    first unprocessed key), so repeated calls on successive reruns work
    through the sections one by one. The extractor result is stored under the
    section key in ``st.session_state`` only when it is a path string; any
    other result (``None``, ``False``, ``True``) is stored as ``None`` so that
    consumers never receive a non-path value. ``"all_processed"`` is refreshed
    after each call.

    Does nothing when no ``"processing"`` map exists or when all sections are
    already processed.
    """
    if "processing" not in st.session_state:
        return
    if st.session_state.get("all_processed", False):
        return

    # Section key -> extractor taking the uploaded file.
    extractors = {
        "cover_path": find_cover,
        "underwriter_path": find_underwriter,
        "income_statement_path": lambda pdf: find_financial(pdf, "income_statement"),
        "balance_sheet_path": lambda pdf: find_financial(pdf, "balance_sheet"),
        "cash_flow_path": lambda pdf: find_financial(pdf, "cash_flow"),
    }

    processing = st.session_state["processing"]
    for key, processed in processing.items():
        if processed:
            continue

        extractor = extractors.get(key)
        if extractor is not None:
            result = extractor(st.session_state["uploaded_file"])
            # Extractors may signal "nothing found" with a bool; keep only paths.
            st.session_state[key] = result if isinstance(result, str) else None

        processing[key] = True  # Mark as processed
        break

    # Check if all sections are processed
    st.session_state["all_processed"] = all(processing.values())
51
+
52
def show_section(section_key):
    """Render the cached PDF for ``section_key`` or a status message.

    If ``st.session_state`` holds a truthy value for ``section_key`` it is
    passed to ``pdf_viewer``. Otherwise an info message is shown while the
    section is not yet flagged as processed, and a warning once it is.
    """
    section_path = st.session_state.get(section_key)
    if section_path:
        pdf_viewer(section_path)
        return

    label = section_key.replace('_', ' ')
    already_processed = st.session_state["processing"].get(section_key, False)
    if already_processed:
        st.warning(f"Could not process {label}.")
    else:
        st.info(f"{label.capitalize()} is still being processed.")
62
+
63
def home():
    """Render the landing page: the app title and a pointer to the sidebar uploader."""
    st.title("Prospectus Lens")
    st.write(
        "Welcome to the Prospectus Lens! "
        "Upload the PDF of the prospectus on the left sidebar!"
    )
 
 
 
66
 
67
def cover():
    """Render the Cover page, or ask for an upload if no file is in session state."""
    st.title("Cover")
    if "uploaded_file" not in st.session_state:
        st.warning("Please upload a file first!")
        return
    show_section("cover_path")
73
 
74
def underwriter():
    """Render the Underwriter page, or ask for an upload if no file is in session state."""
    st.title("Underwriter")
    if "uploaded_file" not in st.session_state:
        st.warning("Please upload a file first!")
        return
    show_section("underwriter_path")
80
 
81
def income_statement():
    """Render the Income Statement page, or ask for an upload if no file is in session state."""
    st.title("Income Statement")
    if "uploaded_file" not in st.session_state:
        st.warning("Please upload a file first!")
        return
    show_section("income_statement_path")
87
 
88
def balance_sheet():
    """Render the Balance Sheet page, or ask for an upload if no file is in session state."""
    st.title("Balance Sheet")
    if "uploaded_file" not in st.session_state:
        st.warning("Please upload a file first!")
        return
    show_section("balance_sheet_path")
94
 
95
def cash_flow():
    """Render the Cash Flow page, or ask for an upload if no file is in session state."""
    st.title("Cash Flow")
    if "uploaded_file" not in st.session_state:
        st.warning("Please upload a file first!")
        return
    show_section("cash_flow_path")
section_extract.py CHANGED
@@ -1,206 +1,211 @@
1
- import os
2
- import re
3
- from PyPDF2 import PdfReader, PdfWriter
4
- import streamlit as st
5
- from config import keywords_dict, stop_keywords, anti_keywords
6
-
7
- def find_cover(uploaded_file):
8
- """
9
- Extracts and saves the first page of a PDF to a temporary file.
10
-
11
- Parameters:
12
- uploaded_file: The uploaded PDF file.
13
-
14
- Returns:
15
- str: Path to the temporary file containing the first page of the PDF.
16
- """
17
- section_title = "cover"
18
- st.title(section_title.title())
19
-
20
- if uploaded_file:
21
- try:
22
- # Read the PDF and extract the first page
23
- pdf_reader = PdfReader(uploaded_file)
24
- first_page = pdf_reader.pages[0]
25
-
26
- pdf_writer = PdfWriter()
27
- temp_cover_page_path = os.path.join(f"temp_{section_title}.pdf")
28
- with open(temp_cover_page_path, "wb") as f:
29
- pdf_writer.add_page(first_page)
30
- pdf_writer.write(f)
31
-
32
- # Return the path to the temporary file
33
- return temp_cover_page_path
34
- except Exception as e:
35
- st.error(f"An error occurred while processing the PDF: {e}")
36
- return None
37
- else:
38
- st.warning("Please upload a PDF on the Home page first.")
39
- return None
40
-
41
-
42
- def find_underwriter(uploaded_file):
43
- """
44
- Searches for pages in a PDF containing specific keywords for the 'underwriter' section and returns the extracted file path.
45
-
46
- Parameters:
47
- uploaded_file: The uploaded PDF file.
48
-
49
- Returns:
50
- str: Path to the temporary file containing the extracted 'underwriter' page(s).
51
- """
52
- section_name = "underwriter"
53
- st.title(section_name.title())
54
-
55
- keyword_sets = keywords_dict.get(section_name, [])
56
- if not keyword_sets:
57
- st.error(f"No keywords defined for section: {section_name}")
58
- return None
59
-
60
- if uploaded_file:
61
- try:
62
- pdf_reader = PdfReader(uploaded_file)
63
- total_pages = len(pdf_reader.pages)
64
- start_page = total_pages // 3 # Skip the first 1/3 of the PDF
65
- pages = pdf_reader.pages[start_page:]
66
-
67
- # Loop through the keyword sets
68
- for keyword_set in keyword_sets:
69
- for page_num, page in enumerate(pages, start=start_page + 1):
70
- text = page.extract_text()
71
-
72
- # Check if any keyword in the set is found on the page
73
- if any(re.search(keyword, text, re.IGNORECASE) for keyword in keyword_set):
74
- # Save the matched page to a temporary file
75
- pdf_writer = PdfWriter()
76
- pdf_writer.add_page(page)
77
-
78
- temp_page_path = os.path.join(f"temp_{section_name.lower()}_page_{page_num + 1}.pdf")
79
- with open(temp_page_path, "wb") as f:
80
- pdf_writer.write(f)
81
-
82
- # Return the path of the extracted page
83
- return temp_page_path
84
-
85
- st.warning(f"No pages contain the specified keywords for {section_name}.")
86
- return None
87
- except Exception as e:
88
- st.error(f"An error occurred while processing the PDF: {e}")
89
- return None
90
- else:
91
- st.warning("Please upload a PDF on the Home page first.")
92
- return None
93
-
94
- def find_financial(uploaded_file, section_name):
95
- """
96
- Extracts and displays sections of a PDF based on keyword matches.
97
-
98
- Parameters:
99
- uploaded_file: The uploaded PDF file (Streamlit file uploader object).
100
- section_name: The name of the section to search for (e.g., "income_statement").
101
-
102
- Returns:
103
- bool: True if processing completed without interruptions; False if stopped or an error occurred.
104
- """
105
-
106
- st.title(section_name.replace("_", " ").title())
107
-
108
- if uploaded_file:
109
- try:
110
- pdf_reader = PdfReader(uploaded_file)
111
- total_pages = len(pdf_reader.pages)
112
-
113
- # Step 1: Start from the second half of the PDF
114
- start_page = total_pages // 2
115
- pages = pdf_reader.pages[start_page:]
116
-
117
- section_keywords = keywords_dict.get(section_name, [])
118
- section_stop_keywords = stop_keywords.get(section_name, [])
119
- section_anti_keywords = anti_keywords.get(section_name, [])
120
-
121
- pdf_writer = PdfWriter() # Writer for the extracted pages
122
- extraction_started = False # Flag to check if extraction has started
123
-
124
- for page_num, page in enumerate(pages, start=start_page + 1):
125
- text = page.extract_text()
126
-
127
- # Step 2: Find the keywords within the keywords_dict
128
- if not extraction_started:
129
- for keyword_set in section_keywords:
130
- if all(re.search(keyword, text, re.IGNORECASE) for keyword in keyword_set):
131
- st.write(f"Keywords matched on page {page_num}. Starting extraction.")
132
- pdf_writer.add_page(page)
133
-
134
- # Check for stop keywords on the same page
135
- if any(all(re.search(keyword, text, re.IGNORECASE) for keyword in stop_set)
136
- for stop_set in section_stop_keywords):
137
- st.warning(f"Stop keywords matched on starting page {page_num}. Stopping extraction.")
138
-
139
- # Check for anti-keywords before stopping
140
- if any(all(re.search(keyword, text, re.IGNORECASE) for keyword in anti_set)
141
- for anti_set in section_anti_keywords):
142
- st.write(f"Page {page_num} contains anti-keywords. Excluding from results.")
143
- pdf_writer.pages.pop() # Remove the last added page
144
-
145
- # Save and display the extracted pages (if any)
146
- if len(pdf_writer.pages) > 0:
147
- temp_section_path = os.path.join(f"temp_{section_name}_section.pdf")
148
- with open(temp_section_path, "wb") as f:
149
- pdf_writer.write(f)
150
- return temp_section_path
151
- else:
152
- st.warning(f"No pages matched the criteria for {section_name}.")
153
-
154
- # Stop extraction immediately and signal to stop all processing
155
- return False
156
- else:
157
- # Continue extraction
158
- extraction_started = True
159
- break
160
- elif extraction_started:
161
- # Step 3: Add the page to the output
162
- pdf_writer.add_page(page)
163
-
164
- # Step 4: Check for stop keywords
165
- if any(all(re.search(keyword, text, re.IGNORECASE) for keyword in stop_set)
166
- for stop_set in section_stop_keywords):
167
- st.warning(f"Stopping extraction at page {page_num}. Stop keywords matched.")
168
-
169
- # Step 5: After stopping, check for anti-keywords
170
- if any(all(re.search(keyword, text, re.IGNORECASE) for keyword in anti_set)
171
- for anti_set in section_anti_keywords):
172
- st.write(f"Page {page_num} contains anti-keywords. Excluding from results.")
173
- pdf_writer.pages.pop() # Remove the last added page
174
-
175
- # Save and display the extracted pages (if any)
176
- if len(pdf_writer.pages) > 0:
177
- temp_section_path = os.path.join(f"temp_{section_name}_section.pdf")
178
- with open(temp_section_path, "wb") as f:
179
- pdf_writer.write(f)
180
- return temp_section_path
181
- else:
182
- st.warning(f"No pages matched the criteria for {section_name}.")
183
-
184
- # Stop extraction and signal to stop all processing
185
- return False
186
-
187
- # If extraction finished without hitting stop keywords, save and display the pages
188
- if len(pdf_writer.pages) > 0:
189
- temp_section_path = os.path.join(f"temp_{section_name}_section.pdf")
190
- with open(temp_section_path, "wb") as f:
191
- pdf_writer.write(f)
192
- return temp_section_path
193
- else:
194
- st.warning(f"No pages matched the criteria for {section_name}.")
195
-
196
- # Indicate that processing can continue
197
- return True
198
-
199
- except Exception as e:
200
- st.error(f"An error occurred while processing the PDF: {e}")
201
- # Stop processing due to an error
202
- return False
203
- else:
204
- st.warning("Please upload a PDF on the Home page first.")
205
- # Stop processing since no file is uploaded
 
 
 
 
 
206
  return False
 
1
+ import os
2
+ import re
3
+ from PyPDF2 import PdfReader, PdfWriter
4
+ import streamlit as st
5
+ from config import keywords_dict, stop_keywords, anti_keywords
6
+
7
def find_cover(uploaded_file):
    """
    Save the first page of the uploaded PDF as a standalone temporary PDF.

    Parameters:
        uploaded_file: The uploaded PDF file.

    Returns:
        str | None: Path of the written one-page PDF, or None when no file
        was given or reading/writing the PDF raised an exception (a Streamlit
        warning/error is shown in those cases).
    """
    if not uploaded_file:
        st.warning("Please upload a PDF on the Home page first.")
        return None

    section = "cover"
    try:
        # Pull the first page out of the source document
        reader = PdfReader(uploaded_file)
        cover_page = reader.pages[0]

        writer = PdfWriter()
        output_path = os.path.join(f"temp_{section}_1.pdf")
        with open(output_path, "wb") as out_file:
            writer.add_page(cover_page)
            writer.write(out_file)
    except Exception as e:
        st.error(f"An error occurred while processing the PDF: {e}")
        return None

    # Hand back where the single-page PDF was written
    return output_path
38
+
39
+
40
def find_underwriter(uploaded_file):
    """
    Find the first page matching the 'underwriter' keywords and save it as a PDF.

    The first third of the document is skipped. Keyword sets from
    ``keywords_dict["underwriter"]`` are tried in order; for each set the
    remaining pages are scanned front to back, and the first page containing
    any keyword of the set (case-insensitive regex search) is written to
    ``temp_underwriter_<page_num>.pdf``.

    Parameters:
        uploaded_file: The uploaded PDF file.

    Returns:
        str | None: Path to the temporary file containing the matched page,
        or None when no keywords are configured, no file was given, no page
        matched, or an exception occurred (a Streamlit message is shown).
    """
    section_name = "underwriter"

    keyword_sets = keywords_dict.get(section_name, [])
    if not keyword_sets:
        st.error(f"No keywords defined for section: {section_name}")
        return None

    if not uploaded_file:
        st.warning("Please upload a PDF on the Home page first.")
        return None

    try:
        pdf_reader = PdfReader(uploaded_file)
        total_pages = len(pdf_reader.pages)
        start_page = total_pages // 3  # Skip the first 1/3 of the PDF
        pages = pdf_reader.pages[start_page:]

        # Text extraction is the expensive step; extract each page at most
        # once (lazily) even though every keyword set rescans the pages.
        page_texts = {}

        for keyword_set in keyword_sets:
            for page_num, page in enumerate(pages, start=start_page + 1):
                if page_num not in page_texts:
                    # Guard against a page yielding no text at all.
                    page_texts[page_num] = page.extract_text() or ""
                text = page_texts[page_num]

                # Check if any keyword in the set is found on the page
                if any(re.search(keyword, text, re.IGNORECASE) for keyword in keyword_set):
                    # Save the matched page to a temporary file
                    pdf_writer = PdfWriter()
                    pdf_writer.add_page(page)

                    temp_page_path = os.path.join(f"temp_{section_name}_{page_num}.pdf")
                    with open(temp_page_path, "wb") as f:
                        pdf_writer.write(f)

                    # Return the path of the extracted page
                    return temp_page_path

        st.warning(f"No pages contain the specified keywords for {section_name}.")
        return None
    except Exception as e:
        st.error(f"An error occurred while processing the PDF: {e}")
        return None
90
+
91
def _matches_any_keyword_set(text, keyword_sets):
    """Return True if every keyword of at least one set is found in ``text``."""
    return any(
        all(re.search(keyword, text, re.IGNORECASE) for keyword in keyword_set)
        for keyword_set in keyword_sets
    )


def _save_pages(pages, output_path):
    """Write ``pages`` to a new PDF at ``output_path`` and return the path."""
    pdf_writer = PdfWriter()
    for page in pages:
        pdf_writer.add_page(page)
    with open(output_path, "wb") as f:
        pdf_writer.write(f)
    return output_path


def find_financial(uploaded_file, section_name, max_pages=3):
    """
    Extract a financial-statement section of a PDF into a temporary PDF.

    Scanning starts at the second half of the document. Extraction begins on
    the first page where all keywords of any set in
    ``keywords_dict[section_name]`` match, and ends when one of these happens:

    * a page matches a stop-keyword set (that page is kept unless it also
      matches an anti-keyword set, in which case it is dropped),
    * ``max_pages`` pages have been collected, or
    * the document ends.

    The collected pages are written to
    ``temp_<section_name>_<first>-<last>.pdf``.

    Parameters:
        uploaded_file: The uploaded PDF file (Streamlit file uploader object).
        section_name: The name of the section to search for (e.g., "income_statement").
        max_pages: Maximum number of pages to collect (defaults to 3).

    Returns:
        str | None: Path to the written PDF, or None when no file was given,
        nothing was extracted, or an exception occurred (a Streamlit message
        is shown). A path or None is always returned, never a bool, so the
        result can be used directly as "path or nothing".
    """
    if not uploaded_file:
        st.warning("Please upload a PDF on the Home page first.")
        return None

    try:
        pdf_reader = PdfReader(uploaded_file)
        total_pages = len(pdf_reader.pages)

        # Step 1: Start from the second half of the PDF
        start_page = total_pages // 2
        pages = pdf_reader.pages[start_page:]

        section_keywords = keywords_dict.get(section_name, [])
        section_stop_keywords = stop_keywords.get(section_name, [])
        section_anti_keywords = anti_keywords.get(section_name, [])

        # Pages are collected in a plain list and only turned into a PDF when
        # saving, so dropping a page does not depend on mutating the writer.
        selected_pages = []
        extraction_start_page = None  # Page number where extraction began

        for page_num, page in enumerate(pages, start=start_page + 1):
            text = page.extract_text() or ""

            if extraction_start_page is None:
                # Step 2: Look for the page that opens the section
                if not _matches_any_keyword_set(text, section_keywords):
                    continue
                extraction_start_page = page_num
            elif len(selected_pages) >= max_pages:
                # Page limit reached: the section ended on the previous page
                return _save_pages(
                    selected_pages,
                    os.path.join(f"temp_{section_name}_{extraction_start_page}-{page_num - 1}.pdf"),
                )

            # Step 3: Add the page to the output
            selected_pages.append(page)

            # Step 4: Check for stop keywords (also on the starting page)
            if _matches_any_keyword_set(text, section_stop_keywords):
                # Step 5: A stop page that matches anti-keywords is excluded
                if _matches_any_keyword_set(text, section_anti_keywords):
                    selected_pages.pop()

                if selected_pages:
                    return _save_pages(
                        selected_pages,
                        os.path.join(f"temp_{section_name}_{extraction_start_page}-{page_num}.pdf"),
                    )
                st.warning(f"No pages matched the criteria for {section_name}.")
                return None

        # End of document reached without hitting stop keywords
        if selected_pages:
            return _save_pages(
                selected_pages,
                os.path.join(f"temp_{section_name}_{extraction_start_page}-{page_num}.pdf"),
            )
        st.warning(f"No pages matched the criteria for {section_name}.")
        return None

    except Exception as e:
        st.error(f"An error occurred while processing the PDF: {e}")
        return None