# texmetrics-regex-checks-gradio-1-testing
#
# Sleeping
#
# texmetrics-regex-checks-gradio-1-testing
#
# File size: 27,886 Bytes

import io
import json
import os
import re
import tempfile
import traceback # Keep for debugging, but try to minimize in final user-facing JSON
from collections import Counter
from typing import List, Dict, Any, Optional, Tuple

import fitz  # PyMuPDF
import gradio as gr
import language_tool_python

# Make sure JAVA_HOME is defined before LanguageTool is used.
# When the variable is absent, the first existing directory among an optional
# USER_JAVA_HOME_CONFIG override and a few common Linux JDK locations is
# adopted; if none exists, a warning is printed instead.
if 'JAVA_HOME' not in os.environ:
    # Optional user-specific location; only considered when it exists on disk.
    user_java_home = os.environ.get('USER_JAVA_HOME_CONFIG') # Example custom env var
    if user_java_home and os.path.exists(user_java_home):
        potential_java_homes = [user_java_home]
    else:
        potential_java_homes = []

    potential_java_homes.extend([
        '/usr/lib/jvm/java-11-openjdk-amd64',
        '/usr/lib/jvm/java-17-openjdk-amd64',
        # For macOS users with Homebrew OpenJDK (common paths):
        # '/opt/homebrew/opt/openjdk@11/libexec/openjdk.jdk/Contents/Home', # M1/M2 Macs
        # '/usr/local/opt/openjdk@11/libexec/openjdk.jdk/Contents/Home',    # Intel Macs
        # '/opt/homebrew/opt/openjdk/libexec/openjdk.jdk/Contents/Home',    # Default OpenJDK Homebrew
    ])

    for jh in potential_java_homes:
        if os.path.exists(jh):
            os.environ['JAVA_HOME'] = jh
            print(f"Set JAVA_HOME to: {jh}")
            break
    else:
        # The loop finished without a break: no candidate directory exists.
        print("Warning: JAVA_HOME not found or set. LanguageTool might fail.")
        print("Please set JAVA_HOME environment variable to your JDK (version 11+) installation path,")
        print("or ensure your LanguageTool setup (e.g., remote server) does not require it locally.")

# ------------------------------
# Text Extraction & Analysis Functions
# ------------------------------

def extract_pdf_text_for_general_checks(file_path_or_stream) -> str:
    """Extract the full text of a PDF as Markdown using PyMuPDF4LLM.

    Args:
        file_path_or_stream: Path of an existing PDF file, an object with
            ``seek``/``read`` (e.g. ``io.BytesIO``), or raw ``bytes``.
            Streams and bytes are copied to a temporary ``.pdf`` file because
            ``pymupdf4llm.to_markdown`` is called with a file path here.

    Returns:
        The Markdown text, or ``""`` when the input type is unsupported or
        any step fails (the error is printed, never raised).
    """
    temp_pdf_path = None  # set only when this call created a temporary copy
    try:
        if isinstance(file_path_or_stream, str) and os.path.exists(file_path_or_stream):
            pdf_path = file_path_or_stream
        elif hasattr(file_path_or_stream, 'read') or isinstance(file_path_or_stream, bytes):
            with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as temp_file:
                # Record the path before writing so the `finally` clause can
                # remove the file even if seek/read/write raises.
                temp_pdf_path = temp_file.name
                if hasattr(file_path_or_stream, 'read'):
                    file_path_or_stream.seek(0)
                    temp_file.write(file_path_or_stream.read())
                else:
                    temp_file.write(file_path_or_stream)
            pdf_path = temp_pdf_path
        else:
            print(f"Unsupported input type for PyMuPDF4LLM: {type(file_path_or_stream)}")
            return ""

        # Imported lazily so a missing package is reported via the handler below.
        import pymupdf4llm
        return pymupdf4llm.to_markdown(pdf_path)

    except Exception as e:
        print(f"Error extracting text with PyMuPDF4LLM: {str(e)}")
        return ""
    finally:
        if temp_pdf_path is not None:
            try:
                os.remove(temp_pdf_path)
            except OSError as e_os:
                print(f"Warning: Could not remove temp file {temp_pdf_path}: {e_os}")


def extract_word_data_and_text_for_lt(file_path_or_stream) -> Tuple[str, List[Dict[str, Any]]]:
    """Extract words with their positions and build the text sent to LanguageTool.

    Args:
        file_path_or_stream: Path of an existing PDF file, an object with
            ``seek``/``read``, or raw ``bytes``.

    Returns:
        ``(text_for_lt, word_coords_data)``. ``text_for_lt`` is every
        non-blank word joined by single spaces. Each entry of
        ``word_coords_data`` holds ``'text'``, ``'page_num'`` (0-based page
        index), ``'rect'`` (``fitz.Rect``) and ``'start_offset'`` (index of
        the word's first character in ``text_for_lt``). ``("", [])`` is
        returned when the input is unsupported or the PDF cannot be opened.
    """
    try:
        if isinstance(file_path_or_stream, str) and os.path.exists(file_path_or_stream):
            doc = fitz.open(file_path_or_stream)
        elif hasattr(file_path_or_stream, 'read'): # BytesIO or tempfile
            file_path_or_stream.seek(0)
            doc = fitz.open(stream=file_path_or_stream.read(), filetype="pdf")
        elif isinstance(file_path_or_stream, bytes):
            doc = fitz.open(stream=file_path_or_stream, filetype="pdf")
        else:
            print(f"Unsupported input type for extract_word_data_and_text_for_lt: {type(file_path_or_stream)}")
            return "", []
    except Exception as e:
        print(f"Error opening PDF in extract_word_data_and_text_for_lt: {e}")
        return "", []

    word_coords_data: List[Dict[str, Any]] = []
    next_offset = 0  # position in text_for_lt where the next word will start
    try:
        for page_idx, page in enumerate(doc):
            # sort=True asks PyMuPDF to return the words in reading order.
            # Each entry is (x0, y0, x1, y1, "word", block_no, line_no, word_no).
            for w_info in page.get_text("words", sort=True):
                word_text = w_info[4]
                if not word_text.strip():  # skip whitespace-only "words"
                    continue
                word_coords_data.append({
                    'text': word_text,
                    'page_num': page_idx,
                    'rect': fitz.Rect(w_info[0:4]),
                    'start_offset': next_offset,
                })
                # +1 for the single space that the join below inserts.
                next_offset += len(word_text) + 1
    finally:
        # Release the document even when extraction raises.
        doc.close()

    text_for_lt = " ".join(item['text'] for item in word_coords_data)
    return text_for_lt, word_coords_data


def check_text_presence(full_text: str, search_terms: List[str]) -> Dict[str, bool]:
    """Report, per search term, whether it occurs in the text (case-insensitive).

    Args:
        full_text: Text to search.
        search_terms: Terms to look for as plain substrings.

    Returns:
        Mapping of each original term to ``True`` when its lowercase form is
        a substring of the lowercased text.
    """
    # Lowercase the text once instead of once per term.
    haystack = full_text.lower()
    return {term: term.lower() in haystack for term in search_terms}

def check_metadata(full_text: str) -> Dict[str, Any]:
    """Look for basic manuscript metadata in the extracted text.

    Args:
        full_text: Full document text.

    Returns:
        Dict with ``author_email_present``, ``authors_list_heading_present``
        and ``keywords_list_heading_present`` booleans, plus
        ``word_count_estimate`` (whitespace-separated token count, or the
        string ``"Missing"`` when the text is empty).
    """
    # Domain labels may contain hyphens and there may be several dot-separated
    # labels (e.g. jane@my-uni.ac.uk); the local part may also contain '+'.
    email_pattern = r'\b[\w.+-]+@[\w-]+(?:\.[\w-]+)+\b'
    return {
        "author_email_present": bool(re.search(email_pattern, full_text)),
        "authors_list_heading_present": bool(re.search(r'(?:Authors?|AUTHORS?):\s*', full_text)),
        "keywords_list_heading_present": bool(re.search(r'(?:Keywords?|KEYWORDS?):\s*', full_text, re.IGNORECASE)),
        "word_count_estimate": len(full_text.split()) if full_text else "Missing"
    }

def check_disclosures(full_text: str) -> Dict[str, bool]:
    """Detect common disclosure statements by case-insensitive phrase search.

    Args:
        full_text: Full document text.

    Returns:
        Dict with one boolean per statement type:
        ``conflict_of_interest_statement``, ``ethics_statement``,
        ``funding_statement``, ``data_access_statement`` and
        ``author_contribution_statement``. A statement counts as present
        when any of its phrase variants occurs in the text.
    """
    # Lowercase once, and fold the typographic apostrophe so that
    # "Authors’ contributions" and "Authors' contributions" look the same.
    haystack = full_text.lower().replace("\u2019", "'")

    phrase_variants = {
        "conflict_of_interest_statement": ("conflict of interest statement", "coi statement"),
        "ethics_statement": ("ethics statement", "ethical approval"),
        "funding_statement": ("funding statement", "acknowledgment of funding"),
        "data_access_statement": ("data availability statement", "data access statement"),
        # Singular forms also match the plural ("... contributions").
        "author_contribution_statement": (
            "author contribution",
            "authors contribution",
            "authors' contribution",
            "author's contribution",
        ),
    }
    return {
        key: any(phrase in haystack for phrase in phrases)
        for key, phrases in phrase_variants.items()
    }

def check_figures_and_tables_overview(full_text: str) -> Dict[str, bool]:
    """Report whether the text mentions any numbered figure or table.

    Args:
        full_text: Full document text.

    Returns:
        Dict with ``figures_mentioned`` and ``tables_mentioned`` booleans.
    """
    return {
        # The optional dot makes the abbreviated form "Fig. 1" count too.
        "figures_mentioned": bool(re.search(r'Fig(?:ure)?\.?\s*\d+', full_text, re.IGNORECASE)),
        "tables_mentioned": bool(re.search(r'Table\s*\d+', full_text, re.IGNORECASE)),
    }

def check_references_overview(full_text: str) -> Dict[str, Any]:
    """Summarise citation and reference-list usage with regex heuristics.

    Args:
        full_text: Full document text (plain text or Markdown).

    Returns:
        Dict with:
        ``references_section_heading_present`` - a line consisting of
        "References" or "Bibliography" exists (Markdown ``#``/emphasis
        markers and a leading section number are tolerated);
        ``citations_in_text_count`` - number of bracketed numeric citations
        such as ``[1]``, ``[1,2]``, ``[1-3]`` or ``[1, 2-5]`` found before
        the references heading (in the whole text when there is no heading);
        ``reference_list_items_count_heuristic`` - lines after the heading
        that start with ``[n]`` or ``n. ``;
        ``old_references_present_pre_1995`` - any number 1900-1994 occurs
        anywhere in the text.
    """
    # Built by concatenation (not an f-string) because of the `{1,6}` quantifier.
    heading_pattern = (
        r"^\s*(?:#{1,6}\s*)?[*_]{0,3}\s*(?:\d+\.?\s+)?"
        r"(?:References|Bibliography)"
        r"\s*[*_]{0,3}\s*$(.*)"
    )
    heading_match = re.search(heading_pattern, full_text, re.IGNORECASE | re.MULTILINE | re.DOTALL)

    if heading_match:
        body_text = full_text[:heading_match.start()]
        references_text_block = heading_match.group(1)
    else:
        body_text = full_text
        references_text_block = ""

    # A citation is a comma-separated list of numbers or ranges; ranges may
    # use a hyphen or an en dash.
    number_or_range = r'\d+(?:\s*[-\u2013]\s*\d+)?'
    citation_pattern = r'\[' + number_or_range + r'(?:\s*,\s*' + number_or_range + r')*\]'
    citations_in_text = re.findall(citation_pattern, body_text)

    # Entries of the reference list: lines starting with "[1]" or "1. ".
    reference_list_items = re.findall(r"^\s*(?:\[\d+\]|\d+\.\s)", references_text_block, re.MULTILINE)

    return {
        "references_section_heading_present": heading_match is not None,
        "citations_in_text_count": len(citations_in_text),
        "reference_list_items_count_heuristic": len(reference_list_items),
        # NOTE(review): this scans the whole text, so any number in the range
        # 1900-1994 triggers it, not only years inside the reference list.
        "old_references_present_pre_1995": bool(re.search(r'\b(?:19[0-8]\d|199[0-4])\b', full_text)),
    }

def check_structure_overview(full_text: str) -> Dict[str, Any]:
    """Check for an Abstract heading and the IMRaD section headings.

    A heading is recognised when a line starts with the section name,
    optionally preceded by Markdown heading/emphasis markers (``#``, ``*``,
    ``_``) and a section number such as ``2.`` or ``3.1``. Matching is
    case-insensitive.

    Args:
        full_text: Full document text (plain text or Markdown).

    Returns:
        Dict with ``abstract_section_heading_present`` (bool),
        ``imrad_structure_partially_present`` (bool, at least 3 of the 4
        IMRaD sections found) and ``imrad_sections_detected_count`` (int, 0-4).
    """
    # Built by concatenation (not an f-string) because of the `{1,6}` quantifier.
    heading_prefix = r"^\s*(?:#{1,6}\s*)?[*_]{0,3}\s*(?:\d+(?:\.\d+)*\.?\s+)?"
    # Like \b after the name, but also accepts a closing "_" emphasis marker.
    name_end = r"(?![^\W_])"
    flags = re.IGNORECASE | re.MULTILINE

    def has_heading(name_alternatives: str) -> bool:
        pattern = heading_prefix + "(?:" + name_alternatives + ")" + name_end
        return bool(re.search(pattern, full_text, flags))

    # One entry per IMRaD section; "Methods" and "Materials and Methods"
    # count as the same section.
    imrad_section_patterns = (
        r"Introduction",
        r"Methods|Materials\s+and\s+Methods",
        r"Results",
        r"Discussion",
    )
    imrad_found_count = sum(has_heading(names) for names in imrad_section_patterns)

    return {
        "abstract_section_heading_present": has_heading(r"Abstract"),
        "imrad_structure_partially_present": imrad_found_count >= 3, # e.g. at least 3 of 4 main sections
        "imrad_sections_detected_count": imrad_found_count
    }

def check_language_issues(text_for_lt: str) -> Dict[str, Any]:
    """Run LanguageTool (en-US) plus a citation-spacing regex over the text.

    LanguageTool is reached through ``http://localhost:8081`` unless the
    ``LT_PATH`` environment variable is set, in which case no remote server
    is passed and ``LT_PATH`` is forwarded as ``language_tool_path``.

    Args:
        text_for_lt: Text to check; issue offsets refer to this string.

    Returns:
        On success ``{"total_issues": int, "issues": [...]}`` where each
        issue has ``message``, ``context``, ``error_text_segment``,
        ``suggestions`` (at most 3 for LanguageTool matches), ``category``,
        ``rule_id``, ``offset`` and ``length``. On failure
        ``{"error": str, "issues": []}``; errors are printed, not raised.
    """
    # Rules skipped as common false positives or stylistic choices.
    ignored_rule_ids = {"EN_SPLIT_WORDS_HYPHEN", "UPPERCASE_SENTENCE_START", "MORFOLOGIK_RULE_EN_US"}
    language_tool = None
    try:
        tool_path = os.environ.get('LT_PATH')
        # NOTE(review): confirm that the installed language_tool_python
        # version accepts the `language_tool_path` keyword; if it does not,
        # the TypeError is reported through the generic handler below.
        language_tool = language_tool_python.LanguageTool(
            'en-US', 
            remote_server='http://localhost:8081' if not tool_path else None, 
            language_tool_path=tool_path if tool_path else None
        )
        matches = language_tool.check(text_for_lt)

        issues = [
            {
                "message": match.message,
                "context": match.context.strip(),
                "error_text_segment": match.context[match.contextOffset : match.contextOffset + match.errorLength],
                "suggestions": match.replacements[:3] if match.replacements else [],
                "category": match.category,
                "rule_id": match.ruleId,
                "offset": match.offset, 
                "length": match.errorLength,
            }
            for match in matches
            if match.ruleId not in ignored_rule_ids
        ]

        # Extra check: a word glued to a numeric citation, e.g. "results[3]".
        text_length = len(text_for_lt)
        for match_re in re.finditer(r'\b(\w+)\[(\d+)\]', text_for_lt):
            word = match_re.group(1)
            number = match_re.group(2)
            issues.append({
                "message": f"Missing space before '[' in '{word}[{number}]'. Suggestion: '{word} [{number}]'.",
                # Up to 40 characters of surrounding text on each side.
                "context": text_for_lt[max(match_re.start() - 40, 0):min(match_re.end() + 40, text_length)].strip(),
                "error_text_segment": match_re.group(0), 
                "suggestions": [f"{word} [{number}]"],
                "category": "Formatting",
                "rule_id": "MISSING_SPACE_BEFORE_BRACKET_CITATION",
                "offset": match_re.start(),
                "length": match_re.end() - match_re.start(),
            })

        return {"total_issues": len(issues), "issues": issues}
    except ConnectionRefusedError:
        error_msg = "LanguageTool Error: Connection to LT server (e.g., http://localhost:8081) refused. Ensure it's running, or configure LT_PATH for local JAR usage."
        print(error_msg)
        return {"error": error_msg, "issues": []}
    except Exception as e:
        error_msg = f"Error checking language issues: {type(e).__name__} - {e}"
        print(error_msg)
        # print(traceback.format_exc()) # For server-side debugging
        return {"error": error_msg, "issues": []}
    finally:
        # Release the LanguageTool client (and any server it started) so
        # repeated calls do not accumulate resources.
        if language_tool is not None:
            try:
                language_tool.close()
            except Exception as close_err:
                print(f"Warning: could not close LanguageTool: {close_err}")


def check_figure_table_order(full_text: str) -> Dict[str, Any]:
    """Check numbering and first-mention order of figures and tables.

    Args:
        full_text: Full document text.

    Returns:
        Dict with, for ``figure`` and ``table`` respectively:
        ``<kind>_references_in_order_of_appearance`` (every referenced
        number, in text order) and ``<kind>_numbering_issues`` (list of
        messages; ``["Not mentioned."]`` when there are no references and
        ``["Appears OK based on simple checks."]`` when nothing was found).
        The legacy key ``references_in_order_of_appearance`` is kept and
        carries the table list, which is what the previous merged result
        ended up containing.
    """
    fig_pattern = r'Fig(?:ure)?\.?\s*(\d+)'
    tbl_pattern = r'Table\s*(\d+)'
    fig_refs_in_order = [int(num) for num in re.findall(fig_pattern, full_text, re.IGNORECASE)]
    tbl_refs_in_order = [int(num) for num in re.findall(tbl_pattern, full_text, re.IGNORECASE)]

    def analyze_numbering(refs_list, item_type="Item"):
        """Return the reference list and numbering issues for one item kind."""
        # Keys are prefixed with the item kind so that the figure and table
        # results can be merged without overwriting each other.
        label = item_type.lower()
        refs_key = f"{label}_references_in_order_of_appearance"
        issues_key = f"{label}_numbering_issues"
        if not refs_list:
            return {refs_key: [], issues_key: ["Not mentioned."]}

        issues = []
        seen_numbers = set(refs_list)
        max_ref_num = max(seen_numbers)

        # Numbers in 1..max that are never referenced.
        missing_numbers = [num for num in range(1, max_ref_num + 1) if num not in seen_numbers]
        if missing_numbers:
            issues.append(f"Missing {label}(s) in sequence up to {max_ref_num}: {missing_numbers}")

        # dict.fromkeys keeps the first occurrence of each number, in order.
        first_mentions = list(dict.fromkeys(refs_list))
        if first_mentions != sorted(first_mentions):
            issues.append(f"{item_type}s may not be first mentioned in strict numerical order. Sequence of first mentions: {first_mentions}")

        return {
            refs_key: refs_list,
            issues_key: issues if issues else ["Appears OK based on simple checks."]
        }

    result = {
        **analyze_numbering(fig_refs_in_order, "Figure"),
        **analyze_numbering(tbl_refs_in_order, "Table"),
    }
    # Backward-compatible alias for consumers of the old, colliding key.
    result["references_in_order_of_appearance"] = tbl_refs_in_order
    return result


# ------------------------------
# Highlighting Function
# ------------------------------
def highlight_issues_in_pdf(
    pdf_file_or_stream, 
    word_coords_data: List[Dict[str, Any]], 
    language_issues_list: List[Dict[str, Any]]
    # text_for_lt is implicitly used via offsets stored in language_issues_list
) -> Tuple[List[Dict[str, Any]], bytes]:
    """Highlight language issues in the PDF and attach their location.

    For every issue, the words whose character range overlaps the issue's
    ``offset``/``length`` are collected, the issue text is searched on the
    page of the first such word (restricted to the padded bounding box of
    those words), and each hit is marked with a yellow highlight annotation.

    Args:
        pdf_file_or_stream: Path of an existing PDF, an object with
            ``seek``/``read``, or raw ``bytes``.
        word_coords_data: Word records providing ``'text'``, ``'page_num'``,
            ``'rect'`` and ``'start_offset'``.
        language_issues_list: Issue dicts providing ``"offset"``,
            ``"length"`` and ``"error_text_segment"``.

    Returns:
        ``(issues, pdf_bytes)``. Each returned issue is a copy of the input
        with ``"page"`` (1-based, ``0`` when not located) and
        ``"coordinates"`` (``[x0, y0, x1, y1]`` or ``[]``). ``pdf_bytes`` is
        the annotated document, or ``b""`` when saving fails. When the PDF
        cannot be opened, the input list itself and ``b""`` are returned.
    """
    try:
        if isinstance(pdf_file_or_stream, str) and os.path.exists(pdf_file_or_stream):
            doc = fitz.open(pdf_file_or_stream)
        elif hasattr(pdf_file_or_stream, 'read'):
            pdf_file_or_stream.seek(0)
            doc = fitz.open(stream=pdf_file_or_stream.read(), filetype="pdf")
        elif isinstance(pdf_file_or_stream, bytes):
            doc = fitz.open(stream=pdf_file_or_stream, filetype="pdf")
        else:
            print(f"Unsupported PDF input type in highlight_issues_in_pdf: {type(pdf_file_or_stream)}")
            return language_issues_list, b"" # Return original issues, no PDF bytes
    except Exception as e:
        print(f"Error opening PDF in highlight_issues_in_pdf: {e}")
        return language_issues_list, b""

    issues_with_coords_and_page = []
    annotated_pdf_bytes_content = b""

    # Everything that touches `doc` runs inside try/finally so the document
    # is closed even if an issue record is malformed or PyMuPDF raises.
    try:
        for issue_details in language_issues_list:
            issue_start = issue_details["offset"]
            issue_end = issue_start + issue_details["length"]
            error_text_to_search = issue_details["error_text_segment"]

            # Defaults for "not located"; filled in below when a hit is found.
            current_issue_output = issue_details.copy()
            current_issue_output["page"] = 0
            current_issue_output["coordinates"] = [] # [x0, y0, x1, y1]
            issues_with_coords_and_page.append(current_issue_output)

            # Words whose [start, end) character range overlaps the issue.
            candidate_pdf_words_info = [
                word_info for word_info in word_coords_data
                if word_info['start_offset'] < issue_end
                and issue_start < word_info['start_offset'] + len(word_info['text'])
            ]
            if not candidate_pdf_words_info or not error_text_to_search.strip():
                continue

            # The search is limited to the page of the first overlapping word.
            page_num_for_issue = candidate_pdf_words_info[0]["page_num"]
            page_to_search_on = doc[page_num_for_issue]

            # Bounding box of all candidate words, padded by 3 units per side.
            clip_search_rect = fitz.Rect(candidate_pdf_words_info[0]['rect'])
            for word_info in candidate_pdf_words_info[1:]:
                clip_search_rect.include_rect(word_info['rect'])
            clip_search_rect.x0 -= 3
            clip_search_rect.y0 -= 3
            clip_search_rect.x1 += 3
            clip_search_rect.y1 += 3
            clip_search_rect.normalize()

            try:
                found_rects_on_page = page_to_search_on.search_for(error_text_to_search, clip=clip_search_rect, quads=False)
            except Exception as search_e:
                print(f"PyMuPDF search_for error: '{search_e}' for text '{error_text_to_search}' on page {page_num_for_issue+1}. Skipping this highlight.")
                continue
            if not found_rects_on_page:
                continue

            current_issue_output["page"] = page_num_for_issue + 1

            # Report the union of all hits as the issue's coordinates.
            overall_bounds = fitz.Rect(found_rects_on_page[0])
            for found_rect in found_rects_on_page[1:]:
                overall_bounds.include_rect(found_rect)
            current_issue_output["coordinates"] = [
                round(overall_bounds.x0, 2), round(overall_bounds.y0, 2),
                round(overall_bounds.x1, 2), round(overall_bounds.y1, 2)
            ]

            for rect_to_highlight in found_rects_on_page:
                # Skip degenerate rectangles (min width/height of 0.1).
                if rect_to_highlight.is_empty or rect_to_highlight.width <= 0.1 or rect_to_highlight.height <= 0.1:
                    continue
                highlight_annot = page_to_search_on.add_highlight_annot(rect_to_highlight)
                if highlight_annot:
                    highlight_annot.set_colors(stroke=(1, 1, 0)) # Yellow
                    highlight_annot.update(opacity=0.4) # Make highlight slightly transparent

        output_pdf_bytes = io.BytesIO()
        try:
            doc.save(output_pdf_bytes, garbage=3, deflate=True) # Options for smaller size
            annotated_pdf_bytes_content = output_pdf_bytes.getvalue()
        except Exception as e:
            print(f"Error saving annotated PDF: {e}")
            annotated_pdf_bytes_content = b""
        finally:
            output_pdf_bytes.close()
    finally:
        doc.close()

    return issues_with_coords_and_page, annotated_pdf_bytes_content

# ------------------------------
# Main Analysis Function
# ------------------------------
def analyze_pdf(pdf_input_data) -> Tuple[Dict[str, Any], Optional[bytes]]:
    """Run all document checks on a PDF and produce an annotated copy.

    Args:
        pdf_input_data: An object with ``seek``/``read``, raw ``bytes``, or
            the path of a PDF file.

    Returns:
        ``(results, annotated_pdf_bytes)``. ``results`` contains
        ``"language_issues"`` (list), ``"general_document_checks"`` (dict)
        and, only when something went wrong, ``"analysis_errors"`` (list of
        messages). ``annotated_pdf_bytes`` is the highlighted PDF, or
        ``None`` when the input was unusable, an unexpected error occurred,
        or no language issues were found.
    """
    results = {"language_issues": [], "general_document_checks": {}, "analysis_errors": []}
    annotated_pdf_bytes = None

    # Read the input into bytes once so every stage works on the same data.
    if hasattr(pdf_input_data, 'read'):
        pdf_input_data.seek(0)
        input_bytes_content = pdf_input_data.read()
    elif isinstance(pdf_input_data, bytes):
        input_bytes_content = pdf_input_data
    elif isinstance(pdf_input_data, str): # Path
        try:
            with open(pdf_input_data, "rb") as f_path:
                input_bytes_content = f_path.read()
        except OSError as read_err:
            # A missing or unreadable file is reported like any other
            # analysis error rather than as an invalid input type.
            results["analysis_errors"].append(f"Could not read PDF file '{pdf_input_data}': {read_err}")
            return results, None
    else:
        results["analysis_errors"].append(f"Invalid PDF input data type: {type(pdf_input_data)}")
        return results, None

    if not input_bytes_content:
        results["analysis_errors"].append("PDF input data is empty or unreadable.")
        return results, None

    try:
        # General checks use PyMuPDF4LLM text
        full_text_for_general_checks = extract_pdf_text_for_general_checks(input_bytes_content)
        if full_text_for_general_checks:
            results["general_document_checks"] = {
                "metadata": check_metadata(full_text_for_general_checks),
                "disclosures": check_disclosures(full_text_for_general_checks),
                "figures_tables_overview": check_figures_and_tables_overview(full_text_for_general_checks),
                "references_overview": check_references_overview(full_text_for_general_checks),
                "structure_overview": check_structure_overview(full_text_for_general_checks),
                "figure_table_order": check_figure_table_order(full_text_for_general_checks),
            }
        else:
            results["analysis_errors"].append("Failed to extract text using PyMuPDF4LLM for general checks.")

        # Language checks and highlighting use word-based extraction
        text_for_lt, word_coords_data = extract_word_data_and_text_for_lt(input_bytes_content)
        if not text_for_lt and not word_coords_data:
            results["analysis_errors"].append("Could not extract word data for language analysis and highlighting.")
        else:
            language_issues_result = check_language_issues(text_for_lt)
            if "error" in language_issues_result:
                results["analysis_errors"].append(f"Language check error: {language_issues_result['error']}")

            lt_issues_list = language_issues_result.get("issues", [])
            results["language_issues"] = lt_issues_list
            if lt_issues_list:
                # Only annotate the PDF when there is something to highlight.
                results["language_issues"], annotated_pdf_bytes = highlight_issues_in_pdf(
                    input_bytes_content,
                    word_coords_data,
                    lt_issues_list
                )
    except Exception as e:
        error_msg = f"Critical error in analyze_pdf: {type(e).__name__} - {e}"
        print(error_msg)
        # print(traceback.format_exc()) # Server-side debug
        results["analysis_errors"].append(error_msg)
        return results, None

    # Keep the report compact: drop the error list when it is empty.
    if not results["analysis_errors"]:
        del results["analysis_errors"]
    return results, annotated_pdf_bytes

# ------------------------------
# Gradio Interface
# ------------------------------
def process_upload(file_bytes_from_gradio): 
    """Gradio callback: analyze the uploaded PDF and return the report.

    Args:
        file_bytes_from_gradio: Uploaded file content, or ``None`` when
            nothing was uploaded.

    Returns:
        ``(report_json, annotated_pdf_path)``. ``report_json`` is a JSON
        string (either the analysis results or an ``error_message`` object).
        ``annotated_pdf_path`` is the path of a temporary ``.pdf`` file with
        the annotated document, or ``None`` when no annotated PDF exists.
    """
    if file_bytes_from_gradio is None:
        return json.dumps({"error_message": "No file uploaded"}, indent=2), None

    try:
        report, pdf_bytes = analyze_pdf(file_bytes_from_gradio)

        # The report is assumed to be JSON-serializable as returned.
        report_json = json.dumps(report, indent=2, ensure_ascii=False)

        if not pdf_bytes:
            return report_json, None

        # gr.File output needs a path, so the file is deliberately kept
        # on disk after it is closed (delete=False).
        with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as annotated_file:
            annotated_file.write(pdf_bytes)
            download_path = annotated_file.name
        return report_json, download_path

    except Exception as e:
        error_msg = f"Error processing file in Gradio interface: {type(e).__name__} - {e}"
        print(error_msg)
        # print(traceback.format_exc()) # Server-side debug
        return json.dumps({"error_message": error_msg}, indent=2), None


def create_interface():
    """Build the Gradio Blocks UI for the PDF analyzer.

    The layout is: title and usage notes, a PDF upload field, an "Analyze
    PDF" button, and two result tabs (JSON report and annotated PDF
    download). Clicking the button calls ``process_upload`` with the
    uploaded file and routes its two return values to the two tabs.

    Returns:
        The configured ``gr.Blocks`` instance (not yet launched).
    """
    with gr.Blocks(title="PDF Analyzer", theme=gr.themes.Glass()) as interface: # Changed theme
        gr.Markdown("# PDF Document Analyzer")
        gr.Markdown(
            "Upload a PDF to check for common manuscript issues. "
            "Language checks use LanguageTool (EN-US). Ensure your LanguageTool setup is correct "
            "(e.g., local server on port 8081, or LT_PATH environment variable for local JAR)."
        )
        
        with gr.Row():
            # Only .pdf uploads are offered; the callback receives the content as bytes.
            file_input = gr.File(
                label="Upload PDF Document",
                file_types=[".pdf"],
                type="binary" # Receives bytes
            )
        
        analyze_btn = gr.Button("Analyze PDF", variant="primary", scale=0) # scale=0 for smaller button
        
        gr.Markdown("## Analysis Results")
        with gr.Tabs():
            with gr.TabItem("Detailed Report"):
                # Shows the JSON string returned as the first output of process_upload.
                results_output = gr.JSON(label="JSON Report", scale=2) # Increased scale for more space
            with gr.TabItem("Annotated PDF"):
                # A download component is used instead of an inline PDF viewer.
                pdf_output_display = gr.File(label="Download Annotated PDF (if issues were highlighted)", interactive=False)

        # Wire the button: one input (the upload), two outputs (report, PDF path).
        analyze_btn.click(
            fn=process_upload,
            inputs=[file_input],
            outputs=[results_output, pdf_output_display]
        )
        gr.Markdown("---")
        gr.Markdown("Developed with PyMuPDF, LanguageTool, and Gradio. Alpha version.")
    return interface

# Script entry point: build the UI and start the Gradio server.
if __name__ == "__main__":
    print("PDF Analyzer launching...")
    print("Ensure LanguageTool is accessible (e.g., server at http://localhost:8081 or LT_PATH set).")
    # Example: To run LT server: java -cp languagetool-server.jar org.languagetool.server.HTTPServer --port 8081 --allow-origin "*"
    # Example: os.environ['LT_PATH'] = '/path/to/languagetool-6.X/' (if you have the full distribution)
    
    interface = create_interface()
    interface.launch(
        share=True, # Ask Gradio to create a public share link
        # server_name="0.0.0.0", # To allow access from network
        # server_port=7860 
    )