Spaces:
Sleeping
Sleeping
fix
Browse files
content_analyzer/document_parser.py
CHANGED
|
@@ -140,11 +140,13 @@ class DocumentProcessor:
|
|
| 140 |
)
|
| 141 |
self.gemini_client = None
|
| 142 |
self.genai_module = None # Store the module reference
|
| 143 |
-
|
|
|
|
|
|
|
| 144 |
self._init_gemini_vision()
|
| 145 |
logger.debug(f"DocumentProcessor initialized with cache dir: {self.cache_dir}")
|
| 146 |
logger.debug(f"Chunk size: {parameters.CHUNK_SIZE}, Chunk overlap: {parameters.CHUNK_OVERLAP}")
|
| 147 |
-
logger.debug(f"Chart extraction: {'enabled' if parameters.ENABLE_CHART_EXTRACTION else 'disabled'}")
|
| 148 |
|
| 149 |
def _init_gemini_vision(self):
|
| 150 |
"""Initialize Gemini Vision client for chart analysis."""
|
|
@@ -156,7 +158,7 @@ class DocumentProcessor:
|
|
| 156 |
except ImportError as e:
|
| 157 |
logger.warning(f"google-genai not installed: {e}")
|
| 158 |
logger.info("Install with: pip install google-genai")
|
| 159 |
-
|
| 160 |
return
|
| 161 |
self.genai_module = genai
|
| 162 |
try:
|
|
@@ -165,7 +167,7 @@ class DocumentProcessor:
|
|
| 165 |
logger.info(f"✅ Gemini Vision client initialized")
|
| 166 |
except Exception as e:
|
| 167 |
logger.error(f"❌ Failed to initialize Gemini Vision client: {e}")
|
| 168 |
-
|
| 169 |
|
| 170 |
def validate_files(self, files: List) -> bool:
|
| 171 |
"""
|
|
@@ -288,8 +290,8 @@ class DocumentProcessor:
|
|
| 288 |
def run_pdfplumber():
|
| 289 |
return self._load_pdf_with_pdfplumber(file.name)
|
| 290 |
def run_charts():
|
| 291 |
-
logger.info(f"
|
| 292 |
-
if parameters.ENABLE_CHART_EXTRACTION and self.gemini_client:
|
| 293 |
return self._extract_charts_from_pdf(file.name)
|
| 294 |
return []
|
| 295 |
try:
|
|
@@ -313,7 +315,7 @@ class DocumentProcessor:
|
|
| 313 |
except MemoryError as e:
|
| 314 |
logger.error(f"Out of memory in parallel PDF processing: {e}. Falling back to sequential.")
|
| 315 |
documents = self._load_pdf_with_pdfplumber(file.name)
|
| 316 |
-
if parameters.ENABLE_CHART_EXTRACTION and self.gemini_client:
|
| 317 |
chart_docs = self._extract_charts_from_pdf(file.name)
|
| 318 |
if chart_docs:
|
| 319 |
documents.extend(chart_docs)
|
|
|
|
| 140 |
)
|
| 141 |
self.gemini_client = None
|
| 142 |
self.genai_module = None # Store the module reference
|
| 143 |
+
# Instance-level flag instead of modifying global parameters
|
| 144 |
+
self.chart_extraction_enabled = parameters.ENABLE_CHART_EXTRACTION
|
| 145 |
+
if self.chart_extraction_enabled:
|
| 146 |
self._init_gemini_vision()
|
| 147 |
logger.debug(f"DocumentProcessor initialized with cache dir: {self.cache_dir}")
|
| 148 |
logger.debug(f"Chunk size: {parameters.CHUNK_SIZE}, Chunk overlap: {parameters.CHUNK_OVERLAP}")
|
| 149 |
+
logger.debug(f"Chart extraction: {'enabled' if self.chart_extraction_enabled else 'disabled'}")
|
| 150 |
|
| 151 |
def _init_gemini_vision(self):
|
| 152 |
"""Initialize Gemini Vision client for chart analysis."""
|
|
|
|
| 158 |
except ImportError as e:
|
| 159 |
logger.warning(f"google-genai not installed: {e}")
|
| 160 |
logger.info("Install with: pip install google-genai")
|
| 161 |
+
self.chart_extraction_enabled = False # Instance-level, not global
|
| 162 |
return
|
| 163 |
self.genai_module = genai
|
| 164 |
try:
|
|
|
|
| 167 |
logger.info(f"✅ Gemini Vision client initialized")
|
| 168 |
except Exception as e:
|
| 169 |
logger.error(f"❌ Failed to initialize Gemini Vision client: {e}")
|
| 170 |
+
self.chart_extraction_enabled = False # Instance-level, not global
|
| 171 |
|
| 172 |
def validate_files(self, files: List) -> bool:
|
| 173 |
"""
|
|
|
|
| 290 |
def run_pdfplumber():
|
| 291 |
return self._load_pdf_with_pdfplumber(file.name)
|
| 292 |
def run_charts():
|
| 293 |
+
logger.info(f"chart_extraction_enabled={self.chart_extraction_enabled}, gemini_client={self.gemini_client is not None}")
|
| 294 |
+
if self.chart_extraction_enabled and self.gemini_client:
|
| 295 |
return self._extract_charts_from_pdf(file.name)
|
| 296 |
return []
|
| 297 |
try:
|
|
|
|
| 315 |
except MemoryError as e:
|
| 316 |
logger.error(f"Out of memory in parallel PDF processing: {e}. Falling back to sequential.")
|
| 317 |
documents = self._load_pdf_with_pdfplumber(file.name)
|
| 318 |
+
if self.chart_extraction_enabled and self.gemini_client:
|
| 319 |
chart_docs = self._extract_charts_from_pdf(file.name)
|
| 320 |
if chart_docs:
|
| 321 |
documents.extend(chart_docs)
|