Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -435,11 +435,11 @@ class EnhancedGazaKnowledgeBase:
|
|
| 435 |
logger.info(f"Cached {len(documents)} documents")
|
| 436 |
|
| 437 |
return documents
|
| 438 |
-
|
| 439 |
def _extract_pdf_text(self, pdf_path: Path) -> str:
|
| 440 |
-
"""Use unstructured to extract and chunk PDF text by title"""
|
| 441 |
try:
|
| 442 |
-
elements = partition_pdf(filename=str(pdf_path), strategy="
|
| 443 |
if not elements:
|
| 444 |
logger.warning(f"No elements extracted from {pdf_path}")
|
| 445 |
return ""
|
|
@@ -464,11 +464,18 @@ class EnhancedGazaKnowledgeBase:
|
|
| 464 |
logger.warning(f"Extracted text too short from {pdf_path}")
|
| 465 |
return ""
|
| 466 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 467 |
return full_text
|
| 468 |
except Exception as e:
|
| 469 |
logger.error(f"Unstructured PDF parse failed for {pdf_path}: {e}")
|
| 470 |
return ""
|
| 471 |
|
|
|
|
| 472 |
|
| 473 |
def search(self, query: str, k: int = 5) -> List[Dict[str, Any]]:
|
| 474 |
"""Enhanced search with better error handling and result processing"""
|
|
|
|
| 435 |
logger.info(f"Cached {len(documents)} documents")
|
| 436 |
|
| 437 |
return documents
|
| 438 |
+
|
| 439 |
def _extract_pdf_text(self, pdf_path: Path) -> str:
|
| 440 |
+
"""Use unstructured to extract and chunk PDF text by title, and save as .txt"""
|
| 441 |
try:
|
| 442 |
+
elements = partition_pdf(filename=str(pdf_path), strategy="auto")
|
| 443 |
if not elements:
|
| 444 |
logger.warning(f"No elements extracted from {pdf_path}")
|
| 445 |
return ""
|
|
|
|
| 464 |
logger.warning(f"Extracted text too short from {pdf_path}")
|
| 465 |
return ""
|
| 466 |
|
| 467 |
+
# Save extracted output to .txt next to original PDF
|
| 468 |
+
txt_output = pdf_path.with_suffix(".extracted.txt")
|
| 469 |
+
with open(txt_output, "w", encoding="utf-8") as f:
|
| 470 |
+
f.write(full_text)
|
| 471 |
+
logger.info(f"Saved extracted text to {txt_output.name}")
|
| 472 |
+
|
| 473 |
return full_text
|
| 474 |
except Exception as e:
|
| 475 |
logger.error(f"Unstructured PDF parse failed for {pdf_path}: {e}")
|
| 476 |
return ""
|
| 477 |
|
| 478 |
+
|
| 479 |
|
| 480 |
def search(self, query: str, k: int = 5) -> List[Dict[str, Any]]:
|
| 481 |
"""Enhanced search with better error handling and result processing"""
|