File size: 12,914 Bytes
961b876
 
 
 
 
fee8cba
eb20090
961b876
0918bea
961b876
 
0918bea
961b876
 
0918bea
 
961b876
 
fee8cba
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
961b876
 
fee8cba
 
 
961b876
 
fee8cba
 
 
 
 
961b876
 
fee8cba
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
961b876
0918bea
961b876
 
 
 
 
2c6cadb
fee8cba
2c6cadb
 
eb20090
2c6cadb
 
 
 
eb20090
 
961b876
2c6cadb
0918bea
eb20090
2c6cadb
eb20090
 
 
0918bea
2c6cadb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eb20090
2c6cadb
eb20090
2c6cadb
eb20090
 
 
 
2c6cadb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0918bea
2c6cadb
0918bea
 
 
2c6cadb
eb20090
0918bea
 
 
 
 
 
 
 
 
fee8cba
0918bea
961b876
0918bea
961b876
 
0918bea
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
# pdf_processing.py
import fitz  # PyMuPDF
import pymupdf4llm
import os
import traceback
from typing import Any, Dict, List, Optional # Use standard List, Dict, Optional
from collections import Counter

def convert_rect_to_dict(rect: fitz.Rect) -> Optional[Dict[str, float]]:
    """Converts a fitz.Rect object to a dictionary.

    Args:
        rect: The rectangle to convert.

    Returns:
        A dict with keys "x0", "y0", "x1", "y1", "width", "height", or
        None when rect is None/falsy or not a fitz.Rect instance.
    """
    if not rect or not isinstance(rect, fitz.Rect):
        # print(f"Warning: Invalid rect object received: {rect}") # Can be verbose
        return None
    return {
        "x0": rect.x0, "y0": rect.y0, "x1": rect.x1, "y1": rect.y1,
        "width": rect.width, "height": rect.height
    }

def _get_specific_error_rect_in_context(
    page: fitz.Page,
    context_rect: fitz.Rect,
    error_text_verbatim: str
) -> Optional[fitz.Rect]:
    """
    Tries to find the precise bounding box of error_text_verbatim within
    the larger context_rect on the given page.

    Words inside context_rect are extracted in reading order and scanned
    for the first consecutive run whose normalized (stripped, lower-cased)
    tokens equal the normalized tokens of error_text_verbatim. The union
    of that run's word rectangles is returned.

    Args:
        page: The PyMuPDF page to search on.
        context_rect: Clip region known to contain the error's context.
        error_text_verbatim: The exact error text to locate.

    Returns:
        The combined bounding box of the matched words, or None when the
        text is empty, yields no tokens, or cannot be matched in the clip.
    """
    if not error_text_verbatim or error_text_verbatim.isspace():
        print(f"Debug: _get_specific_error_rect_in_context: error_text_verbatim is empty or whitespace.")
        return None

    # Extract words sorted by position within the given context_rect.
    # Each word entry is (x0, y0, x1, y1, "text", block_no, line_no, word_no).
    words_in_clip = page.get_text("words", clip=context_rect, sort=True)

    error_tokens = error_text_verbatim.strip().split()
    if not error_tokens:
        print(f"Debug: _get_specific_error_rect_in_context: No tokens from error_text_verbatim '{error_text_verbatim}'.")
        return None

    # Normalize once up front instead of per comparison.
    # NOTE: a more robust comparison might strip punctuation or handle
    # hyphenation if LanguageTool splits words differently than PyMuPDF.
    target_tokens = [tok.strip().lower() for tok in error_tokens]
    n_tokens = len(target_tokens)

    for start in range(len(words_in_clip) - n_tokens + 1):
        window = words_in_clip[start:start + n_tokens]
        if all(word[4].strip().lower() == target
               for word, target in zip(window, target_tokens)):
            # First full match wins. Seed the union with the first word's
            # rect rather than an empty fitz.Rect() so the result cannot be
            # accidentally extended toward the page origin on PyMuPDF
            # versions whose include_rect() does not special-case emptiness.
            combined_bbox = fitz.Rect(window[0][:4])
            for word in window[1:]:
                combined_bbox.include_rect(fitz.Rect(word[:4]))
            if not combined_bbox.is_empty:
                return combined_bbox
            return None  # all matched word rects were degenerate
    return None


def try_map_issues_to_page_rects(
    issues_to_map_for_context: List[Dict[str, Any]],
    pdf_rects_from_search: List[fitz.Rect], # Rects for occurrences of the wider context string
    page_number_for_mapping: int,
    page: fitz.Page # The current PyMuPDF page object
) -> int:
    """Assign PDF coordinates to issues by pairing them with found context rects.

    The i-th not-yet-mapped issue is paired with the i-th occurrence of its
    context string on the page; pairing stops when either list is exhausted,
    so surplus issues simply remain unmapped. LanguageTool issues are
    additionally narrowed to the verbatim error text when it can be located
    inside the context occurrence.

    Returns:
        The number of issues newly marked as mapped.
    """
    mapped_count = 0
    # zip() caps the pairing at min(len(issues), len(rects)): issues for a
    # context string should not outnumber that string's occurrences, and if
    # they do, only the first occurrences are used.
    for issue, occurrence_rect in zip(issues_to_map_for_context, pdf_rects_from_search):
        if issue['is_mapped_to_pdf']:
            continue

        # Default to the rectangle of the whole context occurrence.
        rect_for_issue = occurrence_rect

        # LanguageTool issues carry the verbatim error text, so attempt to
        # shrink the rectangle down to just that text.
        if issue.get('source_check_type') == 'LanguageTool':
            verbatim_text = issue.get('error_text_verbatim')
            if verbatim_text:
                refined_rect = _get_specific_error_rect_in_context(
                    page, occurrence_rect, verbatim_text
                )
                if refined_rect:
                    rect_for_issue = refined_rect
                # Otherwise stick with the wider context occurrence rect.

        coord_dict = convert_rect_to_dict(rect_for_issue)
        if coord_dict:
            issue['pdf_coordinates_list'] = [coord_dict]
            issue['is_mapped_to_pdf'] = True
            issue['mapped_page_number'] = page_number_for_mapping
            mapped_count += 1
    return mapped_count


# ... (rest of pdf_processing.py, including extract_majority_font_text_directly and extract_plain_text_from_original_pdf) ...

def _count_font_chars(doc: "fitz.Document") -> Counter:
    """First pass: count characters per (font name, rounded size) over all pages."""
    font_char_counts: Counter = Counter()
    for page_num in range(doc.page_count):
        page = doc[page_num]
        # Using TEXTFLAGS_TEXT for potentially cleaner text from spans.
        text_dict = page.get_text("dict", flags=fitz.TEXTFLAGS_TEXT)
        for block in text_dict.get("blocks", []):
            if block.get("type") != 0:  # type 0 is a text block
                continue
            for line in block.get("lines", []):
                for span in line.get("spans", []):
                    font_size = span.get("size")
                    if font_size is None:
                        continue  # Skip if size is not available
                    text = span["text"]
                    if not text.strip():
                        continue  # Skip purely whitespace spans
                    font_char_counts[(span["font"], int(round(font_size)))] += len(text)
    return font_char_counts


def _collect_majority_font_text(
    doc: "fitz.Document",
    majority_font_name: str,
    majority_font_size_rounded: int,
) -> List[str]:
    """Second pass: gather text of spans matching the majority font, one string per page."""
    all_pages_collected_text: List[str] = []
    for page_num in range(doc.page_count):
        page = doc[page_num]
        # Flags for potentially better whitespace and ligature handling.
        text_page_dict = page.get_text(
            "dict",
            flags=fitz.TEXTFLAGS_TEXT | fitz.TEXT_PRESERVE_LIGATURES | fitz.TEXT_PRESERVE_WHITESPACE)

        page_blocks_text_parts = []
        for block in text_page_dict.get("blocks", []):
            if block.get("type") != 0:  # only text blocks
                continue
            block_lines_text_parts = []
            for line in block.get("lines", []):
                # Keep only spans whose font name and rounded size match.
                span_texts = [
                    span["text"]
                    for span in line.get("spans", [])
                    if span.get("size") is not None
                    and span["font"] == majority_font_name
                    and int(round(span["size"])) == majority_font_size_rounded
                ]
                if span_texts:
                    # Selected spans within a line joined by single spaces.
                    block_lines_text_parts.append(" ".join(span_texts))
            if block_lines_text_parts:
                # Lines within a block joined by single newlines.
                page_blocks_text_parts.append("\n".join(block_lines_text_parts))

        if page_blocks_text_parts:
            # Blocks on a page joined by blank lines (paragraph breaks).
            all_pages_collected_text.append("\n\n".join(page_blocks_text_parts))
    return all_pages_collected_text


def extract_majority_font_text_directly(pdf_path: str) -> str:
    """
    Extracts text from PDF, identifies the majority font and size,
    and then directly assembles a plain text string containing only the text
    that matches this majority font, attempting to preserve basic structure.
    This method does NOT create an intermediate PDF document.

    Args:
        pdf_path: Path to the source PDF file.

    Returns:
        The filtered plain text, or "" when the PDF is empty, has no font
        information, nothing matches the majority font, or an error occurs.
    """
    original_doc = None
    try:
        original_doc = fitz.open(pdf_path)
        if not original_doc.page_count:
            print("FontFilter (Direct): PDF has no pages.")
            return ""

        pdf_basename = os.path.basename(pdf_path)
        print(f"FontFilter (Direct): Analyzing fonts in '{pdf_basename}' ({original_doc.page_count} pages)...")

        # 1. First pass: analyze fonts to find the majority.
        font_char_counts = _count_font_chars(original_doc)
        if not font_char_counts:
            print("FontFilter (Direct): No text with font information found in PDF.")
            return ""

        majority_key, char_count_for_majority = font_char_counts.most_common(1)[0]
        (majority_font_name, majority_font_size_rounded) = majority_key
        print(
            f"FontFilter (Direct): Majority font identified: Name='{majority_font_name}', RoundedSize={majority_font_size_rounded}pt ({char_count_for_majority} chars).")

        # 2. Second pass: extract and assemble text based on the majority font.
        print(
            f"FontFilter (Direct): Extracting text matching majority font (Name='{majority_font_name}', RoundedSize={majority_font_size_rounded}pt)...")
        all_pages_collected_text = _collect_majority_font_text(
            original_doc, majority_font_name, majority_font_size_rounded)

        if not all_pages_collected_text:
            print("FontFilter (Direct): No text matching the majority font was found to extract.")
            return ""

        # A page break is already handled by the \n\n between blocks of
        # different pages; add a custom separator here if more distinct
        # page separation is ever needed.
        final_text = "\n\n".join(all_pages_collected_text)
        print(f"FontFilter (Direct): Successfully extracted text. Total length: {len(final_text)} characters.")
        return final_text

    except Exception as e:
        print(f"Error in extract_majority_font_text_directly for '{pdf_path}': {e}\n{traceback.format_exc()}")
        return ""
    finally:
        if original_doc: original_doc.close()


def extract_plain_text_from_original_pdf(pdf_path: str) -> str:
    """
    Extracts raw plain text from the PDF at pdf_path without any filtering.
    Expects pdf_path to be a valid path to a PDF file.

    Returns the concatenated text of every page, or "" on any error.
    """
    doc = None
    try:
        doc = fitz.open(pdf_path)
        text_chunks = []
        for page in doc:
            text_chunks.append(page.get_text("text"))
        return "".join(text_chunks)
    except Exception as e:
        print(f"Error extracting plain text from original PDF '{pdf_path}': {e}\n{traceback.format_exc()}")
        return ""
    finally:
        if doc: doc.close()