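"""PDF Analyzer.

Extracts text from an uploaded PDF (via PyMuPDF and PyMuPDF4LLM), runs
heuristic manuscript checks and LanguageTool grammar checks, highlights the
detected language issues in an annotated copy of the PDF, and exposes the
whole pipeline through a Gradio interface.
"""
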
import re
import fitz  # PyMuPDF
import language_tool_python
from typing import List, Dict, Any, Tuple
from collections import Counter
import json
import traceback  # Server-side debugging only; avoid exposing tracebacks in user-facing JSON
import io
import tempfile
import os
import gradio as gr

# Set JAVA_HOME environment variable
if 'JAVA_HOME' not in os.environ:
    potential_java_homes = [
        '/usr/lib/jvm/java-11-openjdk-amd64',
        '/usr/lib/jvm/java-17-openjdk-amd64',
        # For macOS users with Homebrew OpenJDK (common paths):
        # '/opt/homebrew/opt/openjdk@11/libexec/openjdk.jdk/Contents/Home', # M1/M2 Macs
        # '/usr/local/opt/openjdk@11/libexec/openjdk.jdk/Contents/Home',    # Intel Macs
        # '/opt/homebrew/opt/openjdk/libexec/openjdk.jdk/Contents/Home',    # Default OpenJDK Homebrew
    ]
    # User-specific path from environment if available
    user_java_home = os.environ.get('USER_JAVA_HOME_CONFIG') # Example custom env var
    if user_java_home and os.path.exists(user_java_home):
        potential_java_homes.insert(0, user_java_home)

    for jh in potential_java_homes:
        if os.path.exists(jh):
            os.environ['JAVA_HOME'] = jh
            print(f"Set JAVA_HOME to: {jh}")
            break
    if 'JAVA_HOME' not in os.environ:
        print("Warning: JAVA_HOME not found or set. LanguageTool might fail.")
        print("Please set JAVA_HOME environment variable to your JDK (version 11+) installation path,")
        print("or ensure your LanguageTool setup (e.g., remote server) does not require it locally.")

# ------------------------------
# Text Extraction & Analysis Functions
# ------------------------------

def extract_pdf_text_for_general_checks(file_path_or_stream) -> str:
    """Extracts the full text (as Markdown) from a PDF path, stream, or bytes using PyMuPDF4LLM, for the general regex checks."""
    temp_file_path_holder = [] 
    pdf_path_for_pymupdf4llm = None
    try:
        if isinstance(file_path_or_stream, str) and os.path.exists(file_path_or_stream):
            pdf_path_for_pymupdf4llm = file_path_or_stream
        elif hasattr(file_path_or_stream, 'read'): # Gradio File(type="binary") gives bytes, wrapped in BytesIO
            with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as temp_file:
                file_path_or_stream.seek(0) 
                temp_file.write(file_path_or_stream.read())
                pdf_path_for_pymupdf4llm = temp_file.name
                temp_file_path_holder.append(pdf_path_for_pymupdf4llm)
        elif isinstance(file_path_or_stream, bytes):
             with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as temp_file:
                temp_file.write(file_path_or_stream)
                pdf_path_for_pymupdf4llm = temp_file.name
                temp_file_path_holder.append(pdf_path_for_pymupdf4llm)
        else:
            print(f"Unsupported input type for PyMuPDF4LLM: {type(file_path_or_stream)}")
            return ""

        if not pdf_path_for_pymupdf4llm:
            print("PDF path could not be determined for PyMuPDF4LLM.")
            return ""
            
        import pymupdf4llm 
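        # The lazy import above keeps pymupdf4llm optional: if it is missing,
        # the surrounding try/except returns "" instead of crashing the app.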
        full_text = pymupdf4llm.to_markdown(pdf_path_for_pymupdf4llm)
        return full_text
        
    except Exception as e:
        print(f"Error extracting text with PyMuPDF4LLM: {str(e)}")
        return ""
    finally:
        if temp_file_path_holder:
            try:
                os.remove(temp_file_path_holder[0])
            except OSError as e_os:
                print(f"Warning: Could not remove temp file {temp_file_path_holder[0]}: {e_os}")


def extract_word_data_and_text_for_lt(file_path_or_stream) -> Tuple[str, List[Dict[str, Any]]]:
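    """Extracts per-word text and coordinates from a PDF using PyMuPDF.

    Returns (text_for_lt, word_coords_data): the document text as one
    space-joined string for LanguageTool, and a list of per-word dicts with
    'text', 'page_num', 'rect', and 'start_offset' (the word's character
    offset within text_for_lt).
    """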
    doc = None
    try:
        if isinstance(file_path_or_stream, str) and os.path.exists(file_path_or_stream):
            doc = fitz.open(file_path_or_stream)
        elif hasattr(file_path_or_stream, 'read'): # BytesIO or tempfile
            file_path_or_stream.seek(0)
            doc = fitz.open(stream=file_path_or_stream.read(), filetype="pdf")
        elif isinstance(file_path_or_stream, bytes):
            doc = fitz.open(stream=file_path_or_stream, filetype="pdf")
        else:
            print(f"Unsupported input type for extract_word_data_and_text_for_lt: {type(file_path_or_stream)}")
            return "", []
    except Exception as e:
        print(f"Error opening PDF in extract_word_data_and_text_for_lt: {e}")
        return "", []

    word_coords_data_intermediate = [] 
    for page_idx, page in enumerate(doc):
        # Using sort=True attempts to get words in reading order.
        words_on_page = page.get_text("words", sort=True) 
        for w_info in words_on_page: # (x0, y0, x1, y1, "word", block_no, line_no, word_no)
            word_text = w_info[4]
            word_rect = fitz.Rect(w_info[0:4])
            if word_text.strip(): # Ensure word is not just whitespace
                word_coords_data_intermediate.append({
                    'text': word_text,
                    'page_num': page_idx,
                    'rect': word_rect,
                })
    doc.close()

    text_for_lt = " ".join([item['text'] for item in word_coords_data_intermediate])
    
    word_coords_data_final = []
    current_char_pos_recalc = 0
    for i, item_data in enumerate(word_coords_data_intermediate):
        final_item = item_data.copy()
        final_item['start_offset'] = current_char_pos_recalc
        word_coords_data_final.append(final_item)
        
        current_char_pos_recalc += len(final_item['text'])
        if i < len(word_coords_data_intermediate) - 1: # Add 1 for the space
            current_char_pos_recalc += 1 

    return text_for_lt, word_coords_data_final


def check_text_presence(full_text: str, search_terms: List[str]) -> Dict[str, bool]:
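    """Case-insensitive presence check for each search term in the full text."""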
    return {term: term.lower() in full_text.lower() for term in search_terms}

def check_metadata(full_text: str) -> Dict[str, Any]:
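    """Heuristic metadata checks: author email, Authors/Keywords headings, and a word-count estimate."""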
    return {
        "author_email_present": bool(re.search(r'\b[\w.-]+?@\w+?\.\w+?\b', full_text)),
        "authors_list_heading_present": bool(re.search(r'(?:Authors?|AUTHORS?):\s*', full_text)),
        "keywords_list_heading_present": bool(re.search(r'(?:Keywords?|KEYWORDS?):\s*', full_text, re.IGNORECASE)),
        "word_count_estimate": len(full_text.split()) if full_text else 0
    }

def check_disclosures(full_text: str) -> Dict[str, bool]:
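    """Checks for common disclosure statements: COI, ethics, funding, data access, and author contributions."""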
    search_terms = [
        "conflict of interest statement", "COI statement",
        "ethics statement", "ethical approval",
        "funding statement", "acknowledgment of funding",
        "data availability statement", "data access statement"
    ]
    # Use a more robust check by looking for variations and combining results
    results = {}
    results["conflict_of_interest_statement"] = any(term.lower() in full_text.lower() for term in search_terms[0:2])
    results["ethics_statement"] = any(term.lower() in full_text.lower() for term in search_terms[2:4])
    results["funding_statement"] = any(term.lower() in full_text.lower() for term in search_terms[4:6])
    results["data_access_statement"] = any(term.lower() in full_text.lower() for term in search_terms[6:8])
    
    # Substring match, so "author contribution" also covers "author contributions".
    has_author_contribution = ("author contribution" in full_text.lower() or
                               "authors contribution" in full_text.lower())  # common variant
    results["author_contribution_statement"] = has_author_contribution
    return results

def check_figures_and_tables_overview(full_text: str) -> Dict[str, bool]:
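    """Checks whether any figure or table references (e.g. 'Figure 1', 'Table 2') appear in the text."""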
    return {
        "figures_mentioned": bool(re.search(r'Fig(?:ure)?\s*\d+', full_text, re.IGNORECASE)),
        "tables_mentioned": bool(re.search(r'Table\s*\d+', full_text, re.IGNORECASE)),
    }

def check_references_overview(full_text: str) -> Dict[str, Any]:
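    """Heuristic checks on the reference apparatus: a References/Bibliography heading,
    bracketed in-text citations, a rough count of reference-list items, and the presence
    of pre-1995 year strings (matched anywhere in the text, so only a rough signal)."""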
    has_references_section = bool(re.search(r"^\s*(?:References|Bibliography)\s*$", full_text, re.IGNORECASE | re.MULTILINE))
    citations_in_text = re.findall(r'\[\d+(?:,\s*\d+)*(?:–\d+)?\]', full_text) # Matches [1], [1,2], [1-3], [1, 2-5]
    
    reference_list_items = []
    if has_references_section:
        match_ref_sec = re.search(r"^\s*(?:References|Bibliography)\s*$(.*)", full_text, re.IGNORECASE | re.MULTILINE | re.DOTALL)
        if match_ref_sec:
            references_text_block = match_ref_sec.group(1)
            reference_list_items = re.findall(r"^\s*(?:\[\d+\]|\d+\.\s)", references_text_block, re.MULTILINE) # [1] or 1.

    return {
        "references_section_heading_present": has_references_section,
        "citations_in_text_count": len(citations_in_text),
        "reference_list_items_count_heuristic": len(reference_list_items),
        "old_references_present_pre_1995": bool(re.search(r'\b(?:19[0-8]\d|199[0-4])\b', full_text)), 
    }

def check_structure_overview(full_text: str) -> Dict[str, Any]:
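    """Checks for an Abstract heading and counts detected IMRaD section headings."""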
    imrad_patterns = [
        r"^\s*Introduction\b",
        r"^\s*(?:Methods|Materials\s+and\s+Methods)\b",
        r"^\s*Results\b",
        r"^\s*Discussion\b",
    ]
    imrad_found_count = sum(
        bool(re.search(pattern, full_text, re.IGNORECASE | re.MULTILINE))
        for pattern in imrad_patterns
    )

    has_abstract_section = bool(re.search(r"^\s*Abstract\b", full_text, re.IGNORECASE | re.MULTILINE))

    return {
        "abstract_section_heading_present": has_abstract_section,
        "imrad_structure_partially_present": imrad_found_count >= 3,  # at least 3 of the 4 IMRaD sections
        "imrad_sections_detected_count": imrad_found_count
    }

def check_language_issues(text_for_lt: str) -> Dict[str, Any]:
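    """Runs LanguageTool (plus a regex check for missing spaces before bracketed
    citations) over the extracted text.

    Returns {"total_issues": int, "issues": [...]} on success, or
    {"error": str, "issues": []} if LanguageTool is unavailable.
    """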
    try:
        tool_path = os.environ.get('LT_PATH')
        # If LT_PATH is set, run LanguageTool from that local installation
        # (remote_server is forced to None in that case); otherwise use the
        # remote server expected at http://localhost:8081.
        language_tool = language_tool_python.LanguageTool(
            'en-US', 
            remote_server='http://localhost:8081' if not tool_path else None, 
            language_tool_path=tool_path if tool_path else None
        )
        matches = language_tool.check(text_for_lt)
        issues = []
        
        for match in matches:
            # Skip rules that are noisy on scientific manuscripts
            # (including the spellchecker, which flags technical terms).
            if match.ruleId in ["EN_SPLIT_WORDS_HYPHEN", "UPPERCASE_SENTENCE_START", "MORFOLOGIK_RULE_EN_US"]: 
                continue
            issues.append({
                "message": match.message,
                "context": match.context.strip(),
                "error_text_segment": match.context[match.contextOffset : match.contextOffset + match.errorLength],
                "suggestions": match.replacements[:3] if match.replacements else [],
                "category": match.category,
                "rule_id": match.ruleId,
                "offset": match.offset, 
                "length": match.errorLength,
            })
        
        regex_pattern = r'\b(\w+)\[(\d+)\]' 
        regex_matches = list(re.finditer(regex_pattern, text_for_lt))
        
        for match_re in regex_matches:
            word = match_re.group(1)
            number = match_re.group(2)
            issues.append({
                "message": f"Missing space before '[' in '{word}[{number}]'. Suggestion: '{word} [{number}]'.",
                "context": text_for_lt[max(match_re.start() - 40, 0):min(match_re.end() + 40, len(text_for_lt))].strip(),
                "error_text_segment": match_re.group(0), 
                "suggestions": [f"{word} [{number}]"],
                "category": "Formatting",
                "rule_id": "MISSING_SPACE_BEFORE_BRACKET_CITATION",
                "offset": match_re.start(),
                "length": match_re.end() - match_re.start(),
            })
        
        return {"total_issues": len(issues), "issues": issues}
    except ConnectionRefusedError:
        error_msg = "LanguageTool Error: Connection to LT server (e.g., http://localhost:8081) refused. Ensure it's running, or configure LT_PATH for local JAR usage."
        print(error_msg)
        return {"error": error_msg, "issues": []}
    except Exception as e:
        error_msg = f"Error checking language issues: {type(e).__name__} - {e}"
        print(error_msg)
        # print(traceback.format_exc()) # For server-side debugging
        return {"error": error_msg, "issues": []}


def check_figure_table_order(full_text: str) -> Dict[str, Any]:
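    """Checks that figure and table numbers form a contiguous sequence and are first mentioned in numerical order."""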
    fig_pattern = r'Fig(?:ure)?\.?\s*(\d+)'  # matches "Fig 1", "Fig. 1", "Figure 1"
    fig_refs_in_order = [int(num) for num in re.findall(fig_pattern, full_text, re.IGNORECASE)]
    
    tbl_pattern = r'Table\s*(\d+)'
    tbl_refs_in_order = [int(num) for num in re.findall(tbl_pattern, full_text, re.IGNORECASE)]

    def analyze_numbering(refs_list, item_type="Item"):
        # Prefix all keys with the item type so figure and table results
        # don't collide when merged into a single dict below.
        key_prefix = item_type.lower()
        issues = []
        if not refs_list:
            return {
                f"{key_prefix}_references_in_order_of_appearance": [],
                f"{key_prefix}_numbering_issues": ["Not mentioned."],
            }

        unique_sorted_refs = sorted(set(refs_list))
        max_ref_num = unique_sorted_refs[-1]

        # Check for numbers missing from the sequence 1..max_ref_num.
        missing_numbers = [num for num in range(1, max_ref_num + 1) if num not in unique_sorted_refs]
        if missing_numbers:
            issues.append(f"Missing {key_prefix}(s) in sequence up to {max_ref_num}: {missing_numbers}")

        # Check that first mentions occur in numerical order
        # (e.g. Figure 1 before Figure 2, not the other way around).
        unique_refs_in_appearance_order = []
        for ref_num in refs_list:
            if ref_num not in unique_refs_in_appearance_order:
                unique_refs_in_appearance_order.append(ref_num)

        if unique_refs_in_appearance_order != sorted(unique_refs_in_appearance_order):
            issues.append(
                f"{item_type}s may not be first mentioned in strict numerical order. "
                f"Sequence of first mentions: {unique_refs_in_appearance_order}"
            )

        return {
            f"{key_prefix}_references_in_order_of_appearance": refs_list,
            f"{key_prefix}_numbering_issues": issues if issues else ["Appears OK based on simple checks."],
        }

    fig_analysis = analyze_numbering(fig_refs_in_order, "Figure")
    tbl_analysis = analyze_numbering(tbl_refs_in_order, "Table")

    return {**fig_analysis, **tbl_analysis}


# ------------------------------
# Highlighting Function
# ------------------------------
def highlight_issues_in_pdf(
    pdf_file_or_stream, 
    word_coords_data: List[Dict[str, Any]], 
    language_issues_list: List[Dict[str, Any]]
    # text_for_lt is implicitly used via offsets stored in language_issues_list
) -> Tuple[List[Dict[str, Any]], bytes]:
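    """Maps LanguageTool character offsets back to PDF coordinates and highlights them.

    For each issue, candidate words are located via their precomputed offsets, the
    error text is re-searched within a padded bounding box on the relevant page, and
    a yellow highlight annotation is added. Returns the issue list enriched with
    'page' and 'coordinates', plus the annotated PDF as bytes.
    """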
    
    doc = None
    try:
        if isinstance(pdf_file_or_stream, str) and os.path.exists(pdf_file_or_stream):
            doc = fitz.open(pdf_file_or_stream)
        elif hasattr(pdf_file_or_stream, 'read'):
            pdf_file_or_stream.seek(0)
            doc = fitz.open(stream=pdf_file_or_stream.read(), filetype="pdf")
        elif isinstance(pdf_file_or_stream, bytes):
            doc = fitz.open(stream=pdf_file_or_stream, filetype="pdf")
        else:
            print(f"Unsupported PDF input type in highlight_issues_in_pdf: {type(pdf_file_or_stream)}")
            return language_issues_list, b"" # Return original issues, no PDF bytes
    except Exception as e:
        print(f"Error opening PDF in highlight_issues_in_pdf: {e}")
        return language_issues_list, b""

    issues_with_coords_and_page = []

    for issue_details in language_issues_list:
        issue_offset = issue_details["offset"]       
        issue_length = issue_details["length"]       
        error_text_to_search = issue_details["error_text_segment"] 

        current_issue_output = issue_details.copy()
        current_issue_output["page"] = 0 
        current_issue_output["coordinates"] = [] # [x0, y0, x1, y1]

        candidate_pdf_words_info = []
        for word_info in word_coords_data:
            word_start_offset = word_info['start_offset']
            word_end_offset = word_start_offset + len(word_info['text'])
            if word_start_offset < (issue_offset + issue_length) and issue_offset < word_end_offset:
                candidate_pdf_words_info.append(word_info)
        
        if not candidate_pdf_words_info:
            issues_with_coords_and_page.append(current_issue_output)
            continue

        page_num_for_issue = candidate_pdf_words_info[0]["page_num"]
        page_to_search_on = doc[page_num_for_issue]
        
        clip_search_rect = fitz.Rect(candidate_pdf_words_info[0]['rect'])
        for i in range(1, len(candidate_pdf_words_info)):
            clip_search_rect.include_rect(candidate_pdf_words_info[i]['rect'])
        
        clip_search_rect.x0 -= 3 # Small padding for search_for
        clip_search_rect.y0 -= 3
        clip_search_rect.x1 += 3
        clip_search_rect.y1 += 3
        clip_search_rect.normalize()

        found_rects_on_page = []
        if error_text_to_search.strip():
            try:
                # Note: PyMuPDF's search_for matching is case-insensitive,
                # even though the LT error_text_segment preserves case.
                found_rects_on_page = page_to_search_on.search_for(error_text_to_search, clip=clip_search_rect, quads=False)
            except Exception as search_e:
                print(f"PyMuPDF search_for error: '{search_e}' for text '{error_text_to_search}' on page {page_num_for_issue+1}. Skipping this highlight.")
        if found_rects_on_page:
            current_issue_output["page"] = page_num_for_issue + 1
            
            overall_bounds = fitz.Rect(found_rects_on_page[0])
            for r_idx in range(1, len(found_rects_on_page)):
                overall_bounds.include_rect(found_rects_on_page[r_idx])
            current_issue_output["coordinates"] = [
                round(overall_bounds.x0, 2), round(overall_bounds.y0, 2),
                round(overall_bounds.x1, 2), round(overall_bounds.y1, 2)
            ]

            for rect_to_highlight in found_rects_on_page:
                if not rect_to_highlight.is_empty and rect_to_highlight.width > 0.1 and rect_to_highlight.height > 0.1: # Min width/height
                    highlight_annot = page_to_search_on.add_highlight_annot(rect_to_highlight)
                    if highlight_annot:
                        highlight_annot.set_colors(stroke=(1, 1, 0)) # Yellow
                        highlight_annot.update(opacity=0.4) # Make highlight slightly transparent
        issues_with_coords_and_page.append(current_issue_output)

    output_pdf_bytes = io.BytesIO()
    try:
        doc.save(output_pdf_bytes, garbage=3, deflate=True) # Options for smaller size
        annotated_pdf_bytes_content = output_pdf_bytes.getvalue()
    except Exception as e:
        print(f"Error saving annotated PDF: {e}")
        annotated_pdf_bytes_content = b""
    finally:
        doc.close()
        output_pdf_bytes.close()
        
    return issues_with_coords_and_page, annotated_pdf_bytes_content

# ------------------------------
# Main Analysis Function
# ------------------------------
def analyze_pdf(pdf_input_data) -> Tuple[Dict[str, Any], bytes]:
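    """Runs all checks on a PDF supplied as a file path, bytes, or binary stream.

    Returns (results_dict, annotated_pdf_bytes); the PDF bytes are None when
    nothing was highlighted.
    """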
    results = {"language_issues": [], "general_document_checks": {}, "analysis_errors": []}
    annotated_pdf_bytes = None

    # Ensure pdf_input_data can be read multiple times if it's a stream
    input_bytes_content = None
    if hasattr(pdf_input_data, 'read'):
        pdf_input_data.seek(0)
        input_bytes_content = pdf_input_data.read()
        # For functions below, create new BytesIO if they expect a stream
    elif isinstance(pdf_input_data, bytes):
        input_bytes_content = pdf_input_data
    elif isinstance(pdf_input_data, str) and os.path.exists(pdf_input_data): # Path
        with open(pdf_input_data, "rb") as f_path:
            input_bytes_content = f_path.read()
    else:
        results["analysis_errors"].append(f"Invalid PDF input data type: {type(pdf_input_data)}")
        return results, None

    if not input_bytes_content:
        results["analysis_errors"].append("PDF input data is empty or unreadable.")
        return results, None

    try:
        # General checks use PyMuPDF4LLM text
        pdf_stream_for_general = io.BytesIO(input_bytes_content)
        full_text_for_general_checks = extract_pdf_text_for_general_checks(pdf_stream_for_general)
        pdf_stream_for_general.close()
        
        if full_text_for_general_checks:
             results["general_document_checks"] = {
                "metadata": check_metadata(full_text_for_general_checks),
                "disclosures": check_disclosures(full_text_for_general_checks),
                "figures_tables_overview": check_figures_and_tables_overview(full_text_for_general_checks),
                "references_overview": check_references_overview(full_text_for_general_checks),
                "structure_overview": check_structure_overview(full_text_for_general_checks),
                "figure_table_order": check_figure_table_order(full_text_for_general_checks),
            }
        else:
            results["analysis_errors"].append("Failed to extract text using PyMuPDF4LLM for general checks.")

        # Language checks and highlighting use word-based extraction
        pdf_stream_for_lt = io.BytesIO(input_bytes_content)
        text_for_lt, word_coords_data = extract_word_data_and_text_for_lt(pdf_stream_for_lt)
        pdf_stream_for_lt.close()
        
        if not text_for_lt and not word_coords_data:
            results["analysis_errors"].append("Could not extract word data for language analysis and highlighting.")
        else:
            language_issues_result = check_language_issues(text_for_lt) # text_for_lt is passed here
            if "error" in language_issues_result:
                results["analysis_errors"].append(f"Language check error: {language_issues_result['error']}")
            
            lt_issues_list = language_issues_result.get("issues", [])
            
            if lt_issues_list:
                pdf_stream_for_highlighting = io.BytesIO(input_bytes_content)
                updated_lt_issues_list, annotated_pdf_bytes = highlight_issues_in_pdf(
                    pdf_stream_for_highlighting, 
                    word_coords_data, 
                    lt_issues_list
                )
                pdf_stream_for_highlighting.close()
                results["language_issues"] = updated_lt_issues_list
            else:
                # No issues found; if LanguageTool failed, the error is already
                # recorded in analysis_errors and lt_issues_list is empty.
                results["language_issues"] = lt_issues_list

        if not results["analysis_errors"]: 
            del results["analysis_errors"]

        return results, annotated_pdf_bytes

    except Exception as e:
        error_msg = f"Critical error in analyze_pdf: {type(e).__name__} - {e}"
        print(error_msg)
        # print(traceback.format_exc()) # Server-side debug
        current_errors = results.get("analysis_errors", [])
        current_errors.append(error_msg)
        results["analysis_errors"] = current_errors
        return results, None

# ------------------------------
# Gradio Interface
# ------------------------------
def process_upload(file_bytes_from_gradio): 
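    """Gradio callback: analyzes the uploaded PDF bytes and returns the JSON
    report plus a path to the annotated PDF (or None)."""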
    if file_bytes_from_gradio is None:
        return json.dumps({"error_message": "No file uploaded"}, indent=2), None
    
    try:
        # analyze_pdf accepts bytes, binary streams, or file paths.
        results, annotated_pdf_output_bytes = analyze_pdf(file_bytes_from_gradio) 
        
        # Sanitize results for JSON (e.g., convert fitz.Rect if any slipped through)
        # This should ideally be handled within each check function if it returns complex objects not meant for JSON.
        # For now, assume results are JSON-serializable.

        results_json = json.dumps(results, indent=2, ensure_ascii=False)

        if annotated_pdf_output_bytes:
            with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp_annotated_pdf_file:
                tmp_annotated_pdf_file.write(annotated_pdf_output_bytes)
                return results_json, tmp_annotated_pdf_file.name # Gradio needs a path for gr.File output
        else:
            return results_json, None
            
    except Exception as e:
        error_msg = f"Error processing file in Gradio interface: {type(e).__name__} - {e}"
        print(error_msg)
        # print(traceback.format_exc()) # Server-side debug
        return json.dumps({"error_message": error_msg}, indent=2), None


def create_interface():
    with gr.Blocks(title="PDF Analyzer", theme=gr.themes.Glass()) as interface:
        gr.Markdown("# PDF Document Analyzer")
        gr.Markdown(
            "Upload a PDF to check for common manuscript issues. "
            "Language checks use LanguageTool (EN-US). Ensure your LanguageTool setup is correct "
            "(e.g., local server on port 8081, or LT_PATH environment variable for local JAR)."
        )
        
        with gr.Row():
            file_input = gr.File(
                label="Upload PDF Document",
                file_types=[".pdf"],
                type="binary" # Receives bytes
            )
        
        analyze_btn = gr.Button("Analyze PDF", variant="primary", scale=0) # scale=0 for smaller button
        
        gr.Markdown("## Analysis Results")
        with gr.Tabs():
            with gr.TabItem("Detailed Report"):
                results_output = gr.JSON(label="JSON Report", scale=2) # Increased scale for more space
            with gr.TabItem("Annotated PDF"):
                # Changed to gr.File for download, as direct PDF viewer is not standard in Gradio
                pdf_output_display = gr.File(label="Download Annotated PDF (if issues were highlighted)", interactive=False)

        analyze_btn.click(
            fn=process_upload,
            inputs=[file_input],
            outputs=[results_output, pdf_output_display]
        )
        gr.Markdown("---")
        gr.Markdown("Developed with PyMuPDF, LanguageTool, and Gradio. Alpha version.")
    return interface

if __name__ == "__main__":
    print("PDF Analyzer launching...")
    print("Ensure LanguageTool is accessible (e.g., server at http://localhost:8081 or LT_PATH set).")
    # Example: To run LT server: java -cp languagetool-server.jar org.languagetool.server.HTTPServer --port 8081 --allow-origin "*"
    # Example: os.environ['LT_PATH'] = '/path/to/languagetool-6.X/' (if you have the full distribution)
    
    interface = create_interface()
    interface.launch(
        share=True,  # creates a temporary public share link
        # server_name="0.0.0.0", # To allow access from network
        # server_port=7860 
    )