Update main_analyzer.py
Browse files- main_analyzer.py +17 -10
    	
        main_analyzer.py
    CHANGED
    
    | @@ -19,8 +19,6 @@ from content_analysis import ( | |
| 19 | 
             
            )
         | 
| 20 | 
             
            from language_checker import perform_language_checks
         | 
| 21 | 
             
            from regex_checker import perform_regex_checks
         | 
| 22 | 
            -
            # text_utils.convert_markdown_to_plain_text is used by language_checker
         | 
| 23 | 
            -
            # config.py is imported in app.py
         | 
| 24 |  | 
| 25 | 
             
            def analyze_pdf(filepath_or_stream: Any) -> Tuple[Dict[str, Any], None]:
         | 
| 26 | 
             
                original_pdf_access_path = None
         | 
| @@ -28,21 +26,32 @@ def analyze_pdf(filepath_or_stream: Any) -> Tuple[Dict[str, Any], None]: | |
| 28 | 
             
                doc_for_mapping = None
         | 
| 29 |  | 
| 30 | 
             
                try:
         | 
| 31 | 
            -
                    if isinstance(filepath_or_stream, str):
         | 
| 32 | 
             
                        original_pdf_access_path = filepath_or_stream
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 33 | 
             
                    elif hasattr(filepath_or_stream, 'read') and callable(filepath_or_stream.read):
         | 
| 34 | 
             
                        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as temp_file_obj:
         | 
| 35 | 
             
                            temp_file_for_stream_path = temp_file_obj.name
         | 
| 36 | 
            -
                            filepath_or_stream.seek | 
|  | |
| 37 | 
             
                            temp_file_obj.write(filepath_or_stream.read())
         | 
| 38 | 
             
                        original_pdf_access_path = temp_file_for_stream_path
         | 
| 39 | 
            -
                        print(f"Analyzer:  | 
| 40 | 
             
                    else:
         | 
| 41 | 
            -
                        return {"error": "Invalid PDF input type. Must be path or file-like object."}, None
         | 
| 42 |  | 
| 43 | 
             
                    if not original_pdf_access_path or not os.path.exists(original_pdf_access_path):
         | 
| 44 | 
            -
                         return {"error": f"PDF path '{original_pdf_access_path}' does not exist or is invalid."}, None
         | 
| 45 |  | 
|  | |
| 46 | 
             
                    # 1. Unfiltered Plain Text (for general and regex checks)
         | 
| 47 | 
             
                    print(f"Analyzer: Extracting plain text from original PDF: {original_pdf_access_path}")
         | 
| 48 | 
             
                    raw_unfiltered_plain_text = extract_plain_text_from_original_pdf(original_pdf_access_path)
         | 
| @@ -124,10 +133,8 @@ def analyze_pdf(filepath_or_stream: Any) -> Tuple[Dict[str, Any], None]: | |
| 124 | 
             
                    for issue_data in detailed_issues_for_mapping:
         | 
| 125 | 
             
                        coords = issue_data.get('pdf_coordinates_list', [{}])[0] if issue_data.get('is_mapped_to_pdf') else {}
         | 
| 126 | 
             
                        coords_for_json = [coords.get("x0"), coords.get("y0"), coords.get("x1"), coords.get("y1")] if coords else []
         | 
| 127 | 
            -
                        # Filter out None coordinates that might arise from empty coords dict
         | 
| 128 | 
             
                        coords_for_json = [c for c in coords_for_json if c is not None]
         | 
| 129 |  | 
| 130 | 
            -
             | 
| 131 | 
             
                        final_formatted_issues_list.append({
         | 
| 132 | 
             
                            "message": issue_data.get('message', 'N/A'),
         | 
| 133 | 
             
                            "context": issue_data.get('context_text', 'N/A'), 
         | 
| @@ -136,7 +143,7 @@ def analyze_pdf(filepath_or_stream: Any) -> Tuple[Dict[str, Any], None]: | |
| 136 | 
             
                            "rule_id": issue_data.get('ruleId', 'N/A'),
         | 
| 137 | 
             
                            "offset": issue_data.get('offset_in_text', -1), 
         | 
| 138 | 
             
                            "length": issue_data.get('error_length', 0),   
         | 
| 139 | 
            -
                            "coordinates": coords_for_json if len(coords_for_json) == 4 else [], | 
| 140 | 
             
                            "page": issue_data.get('mapped_page_number', 0) if issue_data.get('is_mapped_to_pdf') else 0,
         | 
| 141 | 
             
                            "source_check_type": issue_data.get('source_check_type', 'N/A')
         | 
| 142 | 
             
                        })
         | 
|  | |
| 19 | 
             
            )
         | 
| 20 | 
             
            from language_checker import perform_language_checks
         | 
| 21 | 
             
            from regex_checker import perform_regex_checks
         | 
|  | |
|  | |
| 22 |  | 
| 23 | 
             
            def analyze_pdf(filepath_or_stream: Any) -> Tuple[Dict[str, Any], None]:
         | 
| 24 | 
             
                original_pdf_access_path = None
         | 
|  | |
| 26 | 
             
                doc_for_mapping = None
         | 
| 27 |  | 
| 28 | 
             
                try:
         | 
| 29 | 
            +
                    if isinstance(filepath_or_stream, str): 
         | 
| 30 | 
             
                        original_pdf_access_path = filepath_or_stream
         | 
| 31 | 
            +
                        print(f"Analyzer: Input is a string path: {original_pdf_access_path}")
         | 
| 32 | 
            +
                    # Handle objects that expose a filesystem path through a .name attribute (e.g. Gradio's NamedString or a TemporaryFileWrapper)
         | 
| 33 | 
            +
                    elif hasattr(filepath_or_stream, 'name') and isinstance(getattr(filepath_or_stream, 'name'), str) and \
         | 
| 34 | 
            +
                         os.path.exists(getattr(filepath_or_stream, 'name')): # Ensure the .name path is valid
         | 
| 35 | 
            +
                        original_pdf_access_path = filepath_or_stream.name
         | 
| 36 | 
            +
                        print(f"Analyzer: Input is an object with .name attribute, using path: {original_pdf_access_path}")
         | 
| 37 | 
            +
                         # If this object also has a .read method, it might be a TemporaryFileWrapper.
         | 
| 38 | 
            +
                         # The next elif would handle it if we prefer processing it as a stream,
         | 
| 39 | 
            +
                         # but using its .name path is usually fine and simpler.
         | 
| 40 | 
             
                    elif hasattr(filepath_or_stream, 'read') and callable(filepath_or_stream.read):
         | 
| 41 | 
             
                        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as temp_file_obj:
         | 
| 42 | 
             
                            temp_file_for_stream_path = temp_file_obj.name
         | 
| 43 | 
            +
                            if hasattr(filepath_or_stream, 'seek') and callable(filepath_or_stream.seek):
         | 
| 44 | 
            +
                                filepath_or_stream.seek(0)
         | 
| 45 | 
             
                            temp_file_obj.write(filepath_or_stream.read())
         | 
| 46 | 
             
                        original_pdf_access_path = temp_file_for_stream_path
         | 
| 47 | 
            +
                        print(f"Analyzer: Input stream saved to temp file: {original_pdf_access_path}")
         | 
| 48 | 
             
                    else:
         | 
| 49 | 
            +
                        return {"error": f"Invalid PDF input type: {type(filepath_or_stream)}. Must be path string, an object with a .name attribute as path, or file-like stream object."}, None
         | 
| 50 |  | 
| 51 | 
             
                    if not original_pdf_access_path or not os.path.exists(original_pdf_access_path):
         | 
| 52 | 
            +
                         return {"error": f"PDF path '{original_pdf_access_path}' (derived from input) does not exist or is invalid."}, None
         | 
| 53 |  | 
| 54 | 
            +
                    # --- Input has been resolved to an existing file path; the analysis steps follow ---
         | 
| 55 | 
             
                    # 1. Unfiltered Plain Text (for general and regex checks)
         | 
| 56 | 
             
                    print(f"Analyzer: Extracting plain text from original PDF: {original_pdf_access_path}")
         | 
| 57 | 
             
                    raw_unfiltered_plain_text = extract_plain_text_from_original_pdf(original_pdf_access_path)
         | 
|  | |
| 133 | 
             
                    for issue_data in detailed_issues_for_mapping:
         | 
| 134 | 
             
                        coords = issue_data.get('pdf_coordinates_list', [{}])[0] if issue_data.get('is_mapped_to_pdf') else {}
         | 
| 135 | 
             
                        coords_for_json = [coords.get("x0"), coords.get("y0"), coords.get("x1"), coords.get("y1")] if coords else []
         | 
|  | |
| 136 | 
             
                        coords_for_json = [c for c in coords_for_json if c is not None]
         | 
| 137 |  | 
|  | |
| 138 | 
             
                        final_formatted_issues_list.append({
         | 
| 139 | 
             
                            "message": issue_data.get('message', 'N/A'),
         | 
| 140 | 
             
                            "context": issue_data.get('context_text', 'N/A'), 
         | 
|  | |
| 143 | 
             
                            "rule_id": issue_data.get('ruleId', 'N/A'),
         | 
| 144 | 
             
                            "offset": issue_data.get('offset_in_text', -1), 
         | 
| 145 | 
             
                            "length": issue_data.get('error_length', 0),   
         | 
| 146 | 
            +
                            "coordinates": coords_for_json if len(coords_for_json) == 4 else [],
         | 
| 147 | 
             
                            "page": issue_data.get('mapped_page_number', 0) if issue_data.get('is_mapped_to_pdf') else 0,
         | 
| 148 | 
             
                            "source_check_type": issue_data.get('source_check_type', 'N/A')
         | 
| 149 | 
             
                        })
         |