DrishtiSharma committed on
Commit fba6e19 · verified · 1 Parent(s): e9ee2aa

Update app.py

Files changed (1)
  1. app.py +53 -22
app.py CHANGED
@@ -57,17 +57,25 @@ check_poppler_installed()
 
 def load_docs(document_path):
     try:
-        # Load the entire PDF content
-        loader = PyMuPDFLoader(document_path)
-        documents = loader.load()
+        import fitz  # PyMuPDF for text extraction
 
-        # Combine all pages into a single string
-        full_text = "\n".join([doc.page_content for doc in documents])
+        # Step 1: Extract plain text from PDF
+        doc = fitz.open(document_path)
+        extracted_text = []
 
-        # Debug: Verify total text size
-        st.write(f"📄 Total Text Length: {len(full_text)} characters")
+        for page_num, page in enumerate(doc):
+            page_text = page.get_text("text")  # Extract text
+            clean_page_text = clean_extracted_text(page_text)
+            if clean_page_text:  # Keep only non-empty cleaned text
+                extracted_text.append(clean_page_text)
 
-        # Split the text into meaningful chunks
+        doc.close()
+
+        # Step 2: Combine cleaned text
+        full_text = "\n".join(extracted_text)
+        st.write(f"📄 Total Cleaned Text Length: {len(full_text)} characters")
+
+        # Step 3: Chunk the cleaned text
         text_splitter = RecursiveCharacterTextSplitter(
             chunk_size=1000,
             chunk_overlap=100,
@@ -78,7 +86,7 @@ def load_docs(document_path):
         # Debug: Show filtered chunks
         st.write(f"🔍 Total Chunks After Splitting: {len(split_docs)}")
         for i, doc in enumerate(split_docs[:5]):  # Show first 5 chunks
-            st.write(f"Chunk {i + 1}: {doc.page_content[:200]}...")
+            st.write(f"Chunk {i + 1}: {doc.page_content[:300]}...")
 
         return split_docs
     except Exception as e:
@@ -86,6 +94,31 @@ def load_docs(document_path):
         st.stop()
 
 
+def clean_extracted_text(text):
+    """
+    Cleans extracted text to remove metadata, headers, and irrelevant content.
+    """
+    lines = text.split("\n")
+    cleaned_lines = []
+
+    for line in lines:
+        line = line.strip()
+
+        # Filter out lines with metadata patterns
+        if (
+            re.match(r"^(U\.S\.|United States|Sheet|Figure|References|Patent No|Date of Patent)", line)
+            or re.match(r"^\(?\d+\)?$", line)  # Matches single numbers (page numbers)
+            or "Examiner" in line
+            or "Attorney" in line
+            or len(line) < 30  # Skip very short lines
+        ):
+            continue
+
+        cleaned_lines.append(line)
+
+    return "\n".join(cleaned_lines)
+
+
 def already_indexed(vectordb, file_name):
     indexed_sources = set(
         x["source"] for x in vectordb.get(include=["metadatas"])["metadatas"]
@@ -236,15 +269,17 @@ if __name__ == "__main__":
         else:
             st.write("✅ File already downloaded.")
 
-        # Generate PDF preview
-        st.write("🖼️ Generating PDF preview...")
-        preview_image_path = preview_pdf(pdf_path)
-        if preview_image_path:
-            st.session_state.pdf_preview = preview_image_path
-            st.image(preview_image_path, caption="First Page Preview", use_container_width=True)
-        else:
-            st.warning("Failed to generate PDF preview.")
-            st.session_state.pdf_preview = None
+        # Generate PDF preview only if not already displayed
+        if not st.session_state.get("pdf_preview_displayed", False):
+            st.write("🖼️ Generating PDF preview...")
+            preview_image_path = preview_pdf(pdf_path)
+            if preview_image_path:
+                st.session_state.pdf_preview = preview_image_path
+                st.image(preview_image_path, caption="First Page Preview", use_container_width=True)
+                st.session_state["pdf_preview_displayed"] = True
+            else:
+                st.warning("Failed to generate PDF preview.")
+                st.session_state.pdf_preview = None
 
         # Load the document into the system
         st.write("🔄 Loading document into the system...")
@@ -258,10 +293,6 @@ if __name__ == "__main__":
             st.error(f"Failed to load the document: {e}")
             st.stop()
 
-        # Display the PDF preview if available
-        if st.session_state.pdf_preview:
-            st.image(st.session_state.pdf_preview, caption="First Page Preview", use_container_width=True)
-
        # Display previous chat messages
        if st.session_state.messages:
            for message in st.session_state.messages:
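For reference, the line filter added in this commit can be exercised on its own. The sketch below is a minimal, self-contained copy of the new clean_extracted_text logic (the diff itself relies on an existing module-level import of re in app.py); the sample patent-style lines are hypothetical and only illustrate which lines the filter drops.

import re

def clean_extracted_text(text):
    """Drop metadata-style lines, bare page numbers, and very short lines."""
    cleaned_lines = []
    for line in text.split("\n"):
        line = line.strip()
        if (
            re.match(r"^(U\.S\.|United States|Sheet|Figure|References|Patent No|Date of Patent)", line)
            or re.match(r"^\(?\d+\)?$", line)  # bare page numbers like "(12)"
            or "Examiner" in line
            or "Attorney" in line
            or len(line) < 30  # very short lines
        ):
            continue
        cleaned_lines.append(line)
    return "\n".join(cleaned_lines)

# Hypothetical extracted page text: only the long descriptive sentence survives.
sample = "\n".join([
    "U.S. Patent No 9,999,999",
    "(12)",
    "Primary Examiner - Jane Doe",
    "The described apparatus comprises a sensor array coupled to a controller unit.",
])
print(clean_extracted_text(sample))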
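The preview change follows the common Streamlit pattern of gating one-time work behind a session-state flag so that reruns do not re-render it. A stripped-down sketch of that pattern, with an illustrative image path that is not from app.py:

import streamlit as st

# Render the preview only once per session; subsequent reruns skip this block.
if not st.session_state.get("pdf_preview_displayed", False):
    st.image("preview.png", caption="First Page Preview")  # placeholder path
    st.session_state["pdf_preview_displayed"] = True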