Spaces:
Sleeping
Sleeping
DrishtiSharma
committed on
Update app.py
Browse files
app.py
CHANGED
@@ -57,17 +57,25 @@ check_poppler_installed()
|
|
57 |
|
58 |
def load_docs(document_path):
|
59 |
try:
|
60 |
-
#
|
61 |
-
loader = PyMuPDFLoader(document_path)
|
62 |
-
documents = loader.load()
|
63 |
|
64 |
-
#
|
65 |
-
|
|
|
66 |
|
67 |
-
|
68 |
-
|
|
|
|
|
|
|
69 |
|
70 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
71 |
text_splitter = RecursiveCharacterTextSplitter(
|
72 |
chunk_size=1000,
|
73 |
chunk_overlap=100,
|
@@ -78,7 +86,7 @@ def load_docs(document_path):
|
|
78 |
# Debug: Show filtered chunks
|
79 |
st.write(f"🔍 Total Chunks After Splitting: {len(split_docs)}")
|
80 |
for i, doc in enumerate(split_docs[:5]): # Show first 5 chunks
|
81 |
-
st.write(f"Chunk {i + 1}: {doc.page_content[:
|
82 |
|
83 |
return split_docs
|
84 |
except Exception as e:
|
@@ -86,6 +94,31 @@ def load_docs(document_path):
|
|
86 |
st.stop()
|
87 |
|
88 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
89 |
def already_indexed(vectordb, file_name):
|
90 |
indexed_sources = set(
|
91 |
x["source"] for x in vectordb.get(include=["metadatas"])["metadatas"]
|
@@ -236,15 +269,17 @@ if __name__ == "__main__":
|
|
236 |
else:
|
237 |
st.write("✅ File already downloaded.")
|
238 |
|
239 |
-
# Generate PDF preview
|
240 |
-
st.
|
241 |
-
|
242 |
-
|
243 |
-
|
244 |
-
|
245 |
-
|
246 |
-
|
247 |
-
|
|
|
|
|
248 |
|
249 |
# Load the document into the system
|
250 |
st.write("🔄 Loading document into the system...")
|
@@ -258,10 +293,6 @@ if __name__ == "__main__":
|
|
258 |
st.error(f"Failed to load the document: {e}")
|
259 |
st.stop()
|
260 |
|
261 |
-
# Display the PDF preview if available
|
262 |
-
if st.session_state.pdf_preview:
|
263 |
-
st.image(st.session_state.pdf_preview, caption="First Page Preview", use_container_width=True)
|
264 |
-
|
265 |
# Display previous chat messages
|
266 |
if st.session_state.messages:
|
267 |
for message in st.session_state.messages:
|
|
|
57 |
|
58 |
def load_docs(document_path):
|
59 |
try:
|
60 |
+
import fitz # PyMuPDF for text extraction
|
|
|
|
|
61 |
|
62 |
+
# Step 1: Extract plain text from PDF
|
63 |
+
doc = fitz.open(document_path)
|
64 |
+
extracted_text = []
|
65 |
|
66 |
+
for page_num, page in enumerate(doc):
|
67 |
+
page_text = page.get_text("text") # Extract text
|
68 |
+
clean_page_text = clean_extracted_text(page_text)
|
69 |
+
if clean_page_text: # Keep only non-empty cleaned text
|
70 |
+
extracted_text.append(clean_page_text)
|
71 |
|
72 |
+
doc.close()
|
73 |
+
|
74 |
+
# Step 2: Combine cleaned text
|
75 |
+
full_text = "\n".join(extracted_text)
|
76 |
+
st.write(f"📄 Total Cleaned Text Length: {len(full_text)} characters")
|
77 |
+
|
78 |
+
# Step 3: Chunk the cleaned text
|
79 |
text_splitter = RecursiveCharacterTextSplitter(
|
80 |
chunk_size=1000,
|
81 |
chunk_overlap=100,
|
|
|
86 |
# Debug: Show filtered chunks
|
87 |
st.write(f"🔍 Total Chunks After Splitting: {len(split_docs)}")
|
88 |
for i, doc in enumerate(split_docs[:5]): # Show first 5 chunks
|
89 |
+
st.write(f"Chunk {i + 1}: {doc.page_content[:300]}...")
|
90 |
|
91 |
return split_docs
|
92 |
except Exception as e:
|
|
|
94 |
st.stop()
|
95 |
|
96 |
|
97 |
+
def clean_extracted_text(text):
    """
    Cleans extracted text to remove metadata, headers, and irrelevant content.

    Drops lines that look like patent-document boilerplate (US-patent headers,
    sheet/figure captions, bare page numbers, examiner/attorney credits) or
    that are too short to carry real content, then rejoins the survivors
    with newlines.
    """
    # Patent-style boilerplate headers at the start of a line.
    header_pat = r"^(U\.S\.|United States|Sheet|Figure|References|Patent No|Date of Patent)"
    # A bare, possibly parenthesised number — typically a page number.
    number_pat = r"^\(?\d+\)?$"

    def _keep(stripped):
        # Reject boilerplate, page numbers, credit lines, and short fragments.
        if re.match(header_pat, stripped):
            return False
        if re.match(number_pat, stripped):
            return False
        if "Examiner" in stripped or "Attorney" in stripped:
            return False
        # Very short lines (< 30 chars) are assumed to be noise.
        return len(stripped) >= 30

    kept = [ln.strip() for ln in text.split("\n") if _keep(ln.strip())]
    return "\n".join(kept)
|
120 |
+
|
121 |
+
|
122 |
def already_indexed(vectordb, file_name):
|
123 |
indexed_sources = set(
|
124 |
x["source"] for x in vectordb.get(include=["metadatas"])["metadatas"]
|
|
|
269 |
else:
|
270 |
st.write("✅ File already downloaded.")
|
271 |
|
272 |
+
# Generate PDF preview only if not already displayed
|
273 |
+
if not st.session_state.get("pdf_preview_displayed", False):
|
274 |
+
st.write("🖼️ Generating PDF preview...")
|
275 |
+
preview_image_path = preview_pdf(pdf_path)
|
276 |
+
if preview_image_path:
|
277 |
+
st.session_state.pdf_preview = preview_image_path
|
278 |
+
st.image(preview_image_path, caption="First Page Preview", use_container_width=True)
|
279 |
+
st.session_state["pdf_preview_displayed"] = True
|
280 |
+
else:
|
281 |
+
st.warning("Failed to generate PDF preview.")
|
282 |
+
st.session_state.pdf_preview = None
|
283 |
|
284 |
# Load the document into the system
|
285 |
st.write("🔄 Loading document into the system...")
|
|
|
293 |
st.error(f"Failed to load the document: {e}")
|
294 |
st.stop()
|
295 |
|
|
|
|
|
|
|
|
|
296 |
# Display previous chat messages
|
297 |
if st.session_state.messages:
|
298 |
for message in st.session_state.messages:
|