Update main_analyzer.py (#1)
Browse files
- Update main_analyzer.py (21e59ca78f93aa1af8acd53cd3e78c131cc7bb50)
- main_analyzer.py +10 -16
main_analyzer.py
CHANGED
|
@@ -90,7 +90,7 @@ def analyze_pdf(filepath_or_stream: Any) -> Tuple[Dict[str, Any], None]:
|
|
| 90 |
print(f"Analyzer: Mapping {len(detailed_issues_for_mapping)} issues to PDF coordinates...")
|
| 91 |
# ... (rest of mapping logic as before) ...
|
| 92 |
for page_idx in range(doc_for_mapping.page_count):
|
| 93 |
-
page = doc_for_mapping[page_idx]
|
| 94 |
current_page_num_1_based = page_idx + 1
|
| 95 |
unmapped_issues_on_this_page_by_context = defaultdict(list)
|
| 96 |
for issue_dict in detailed_issues_for_mapping:
|
|
@@ -104,9 +104,14 @@ def analyze_pdf(filepath_or_stream: Any) -> Tuple[Dict[str, Any], None]:
|
|
| 104 |
for ctx_str, issues_for_ctx in unmapped_issues_on_this_page_by_context.items():
|
| 105 |
if not ctx_str or not ctx_str.strip(): continue
|
| 106 |
try:
|
| 107 |
-
|
| 108 |
-
if
|
| 109 |
-
try_map_issues_to_page_rects(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 110 |
except Exception as search_exc:
|
| 111 |
print(f"Analyzer: Warning: Error searching for context '{ctx_str[:30].replace(chr(10),' ')}' on page {current_page_num_1_based}: {search_exc}")
|
| 112 |
total_mapped = sum(1 for iss in detailed_issues_for_mapping if iss['is_mapped_to_pdf'])
|
|
@@ -153,15 +158,4 @@ def analyze_pdf(filepath_or_stream: Any) -> Tuple[Dict[str, Any], None]:
|
|
| 153 |
# itself if it received a stream; this isn't happening in the Gradio flow.
|
| 154 |
if doc_for_mapping: # Ensure the fitz document for mapping is closed
|
| 155 |
doc_for_mapping.close()
|
| 156 |
-
print(f"Analyzer: Closed fitz document used for mapping.")
|
| 157 |
-
|
| 158 |
-
# The original finally block for temp_file_for_stream_path:
|
| 159 |
-
# if temp_file_for_stream_path and os.path.exists(temp_file_for_stream_path):
|
| 160 |
-
# try:
|
| 161 |
-
# os.remove(temp_file_for_stream_path)
|
| 162 |
-
# print(f"Analyzer: Cleaned up main temporary PDF file: {temp_file_for_stream_path}")
|
| 163 |
-
# except Exception as e_clean:
|
| 164 |
-
# print(f"Analyzer: Error cleaning up main temporary PDF file {temp_file_for_stream_path}: {e_clean}")
|
| 165 |
-
# This part is removed because temp_file_for_stream_path is never assigned a value
|
| 166 |
-
# in the current structure of analyze_pdf. If analyze_pdf were to handle streams
|
| 167 |
-
# by creating its own temp file, then this cleanup would be relevant for that temp file.
|
|
|
|
| 90 |
print(f"Analyzer: Mapping {len(detailed_issues_for_mapping)} issues to PDF coordinates...")
|
| 91 |
# ... (rest of mapping logic as before) ...
|
| 92 |
for page_idx in range(doc_for_mapping.page_count):
|
| 93 |
+
page = doc_for_mapping[page_idx] # Current PyMuPDF page object
|
| 94 |
current_page_num_1_based = page_idx + 1
|
| 95 |
unmapped_issues_on_this_page_by_context = defaultdict(list)
|
| 96 |
for issue_dict in detailed_issues_for_mapping:
|
|
|
|
| 104 |
for ctx_str, issues_for_ctx in unmapped_issues_on_this_page_by_context.items():
|
| 105 |
if not ctx_str or not ctx_str.strip(): continue
|
| 106 |
try:
|
| 107 |
+
pdf_rects_for_context_occurrences = page.search_for(ctx_str, flags=fitz.TEXT_PRESERVE_LIGATURES | fitz.TEXT_PRESERVE_WHITESPACE)
|
| 108 |
+
if pdf_rects_for_context_occurrences:
|
| 109 |
+
try_map_issues_to_page_rects(
|
| 110 |
+
issues_for_ctx,
|
| 111 |
+
pdf_rects_for_context_occurrences,
|
| 112 |
+
current_page_num_1_based,
|
| 113 |
+
page # Pass the current page object
|
| 114 |
+
)
|
| 115 |
except Exception as search_exc:
|
| 116 |
print(f"Analyzer: Warning: Error searching for context '{ctx_str[:30].replace(chr(10),' ')}' on page {current_page_num_1_based}: {search_exc}")
|
| 117 |
total_mapped = sum(1 for iss in detailed_issues_for_mapping if iss['is_mapped_to_pdf'])
|
|
|
|
| 158 |
# itself if it received a stream; this isn't happening in the Gradio flow.
|
| 159 |
if doc_for_mapping: # Ensure the fitz document for mapping is closed
|
| 160 |
doc_for_mapping.close()
|
| 161 |
+
print(f"Analyzer: Closed fitz document used for mapping.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|