Spaces:

rivapereira123
/

firstaid

Sleeping

rivapereira123 committed on Jul 14

Commit

84c93e1

verified ·

1 Parent(s): 370caed

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -435,11 +435,11 @@ class EnhancedGazaKnowledgeBase:
             logger.info(f"Cached {len(documents)} documents")
         return documents
     def _extract_pdf_text(self, pdf_path: Path) -> str:
-        """Use unstructured to extract and chunk PDF text by title"""
         try:
-            elements = partition_pdf(filename=str(pdf_path), strategy="hi_res")
             if not elements:
                 logger.warning(f"No elements extracted from {pdf_path}")
                 return ""
@@ -464,11 +464,18 @@ class EnhancedGazaKnowledgeBase:
                 logger.warning(f"Extracted text too short from {pdf_path}")
                 return ""
             return full_text
         except Exception as e:
             logger.error(f"Unstructured PDF parse failed for {pdf_path}: {e}")
             return ""
     def search(self, query: str, k: int = 5) -> List[Dict[str, Any]]:
         """Enhanced search with better error handling and result processing"""

             logger.info(f"Cached {len(documents)} documents")
         return documents
     def _extract_pdf_text(self, pdf_path: Path) -> str:
+        """Use unstructured to extract and chunk PDF text by title, and save as .txt"""
         try:
+            elements = partition_pdf(filename=str(pdf_path), strategy="auto")
             if not elements:
                 logger.warning(f"No elements extracted from {pdf_path}")
                 return ""
                 logger.warning(f"Extracted text too short from {pdf_path}")
                 return ""
+            # Save extracted output to .txt next to original PDF
+            txt_output = pdf_path.with_suffix(".extracted.txt")
+            with open(txt_output, "w", encoding="utf-8") as f:
+                f.write(full_text)
+            logger.info(f"Saved extracted text to {txt_output.name}")
             return full_text
         except Exception as e:
             logger.error(f"Unstructured PDF parse failed for {pdf_path}: {e}")
             return ""
     def search(self, query: str, k: int = 5) -> List[Dict[str, Any]]:
         """Enhanced search with better error handling and result processing"""