Update app.py
Browse files
    	
        app.py
    CHANGED
    
    | 
         @@ -226,27 +226,38 @@ def check_reference_order(full_text: str) -> Dict[str, Any]: 
     | 
|
| 226 | 
         
             
                    "missing_references": missing_refs,
         
     | 
| 227 | 
         
             
                    "is_ordered": len(out_of_order) == 0 and len(missing_refs) == 0
         
     | 
| 228 | 
         
             
                }
         
     | 
| 
         | 
|
| 229 | 
         
             
            def highlight_issues_in_pdf(file, language_matches: List[Dict[str, Any]]) -> bytes:
         
     | 
| 230 | 
         
             
                """
         
     | 
| 231 | 
         
            -
                Highlights language issues in the PDF 
     | 
| 232 | 
         
            -
                 
     | 
| 233 | 
         
            -
                 
     | 
| 234 | 
         
             
                """
         
     | 
| 235 | 
         
             
                try:
         
     | 
| 236 | 
         
             
                    # Open the PDF
         
     | 
| 237 | 
         
             
                    doc = fitz.open(stream=file.read(), filetype="pdf") if not isinstance(file, str) else fitz.open(file)
         
     | 
| 238 | 
         
            -
             
     | 
| 
         | 
|
| 239 | 
         
             
                    # Extract words with positions from each page
         
     | 
| 240 | 
         
             
                    word_list = []  # List of tuples: (page_number, word, x0, y0, x1, y1)
         
     | 
| 241 | 
         
             
                    for page_number in range(len(doc)):
         
     | 
| 242 | 
         
             
                        page = doc[page_number]
         
     | 
| 
         | 
|
| 243 | 
         
             
                        words = page.get_text("words")  # List of tuples: (x0, y0, x1, y1, "word", block_no, line_no, word_no)
         
     | 
| 244 | 
         
             
                        for w in words:
         
     | 
| 
         | 
|
| 245 | 
         
             
                            word_text = w[4]
         
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 246 | 
         
             
                            word_list.append((page_number, word_text, w[0], w[1], w[2], w[3]))
         
     | 
| 
         | 
|
| 247 | 
         | 
| 248 | 
         
             
                    # Concatenate all words to form the full text
         
     | 
| 
         | 
|
| 249 | 
         
             
                    concatenated_text = " ".join([w[1] for w in word_list])
         
     | 
| 
         | 
|
| 
         | 
|
| 250 | 
         | 
| 251 | 
         
             
                    # Find "Abstract" section and set the processing start point
         
     | 
| 252 | 
         
             
                    abstract_start = concatenated_text.lower().find("abstract")
         
     | 
| 
         @@ -258,14 +269,16 @@ def highlight_issues_in_pdf(file, language_matches: List[Dict[str, Any]]) -> byt 
     | 
|
| 258 | 
         | 
| 259 | 
         
             
                    # Iterate over each language issue
         
     | 
| 260 | 
         
             
                    for idx, issue in enumerate(language_matches, start=1):
         
     | 
| 261 | 
         
            -
                        offset = issue["offset"]
         
     | 
| 262 | 
         
             
                        length = issue["length"]
         
     | 
| 263 | 
         | 
| 264 | 
         
             
                        # Skip issues in the references section
         
     | 
| 265 | 
         
             
                        if offset < abstract_offset or offset >= references_offset:
         
     | 
| 266 | 
         
             
                            continue
         
     | 
| 267 | 
         
            -
             
     | 
| 268 | 
         
            -
                         
     | 
| 
         | 
|
| 
         | 
|
| 269 | 
         | 
| 270 | 
         
             
                        # Find the words that fall within the error span
         
     | 
| 271 | 
         
             
                        current_pos = 0
         
     | 
| 
         @@ -279,48 +292,32 @@ def highlight_issues_in_pdf(file, language_matches: List[Dict[str, Any]]) -> byt 
     | 
|
| 279 | 
         
             
                            current_pos += word_length
         
     | 
| 280 | 
         | 
| 281 | 
         
             
                        if not target_words:
         
     | 
| 
         | 
|
| 282 | 
         
             
                            continue
         
     | 
| 283 | 
         | 
| 284 | 
         
             
                        initial_x = target_words[0][2]
         
     | 
| 285 | 
         
             
                        initial_y = target_words[0][3]
         
     | 
| 286 | 
         
            -
                        final_x = target_words[len(target_words) 
     | 
| 287 | 
         
            -
                        final_y = target_words[len(target_words) 
     | 
| 288 | 
         
             
                        issue["coordinates"] = [initial_x, initial_y, final_x, final_y]
         
     | 
| 289 | 
         
             
                        issue["page"] = target_words[0][0] + 1
         
     | 
| 290 | 
         
            -
             
     | 
| 291 | 
         
             
                        # Add highlight annotations to the target words
         
     | 
| 292 | 
         
            -
                         
     | 
| 293 | 
         
            -
                         
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 294 | 
         | 
| 295 | 
         
            -
                        # Create a rectangle around the highlighted text
         
     | 
| 296 | 
         
            -
                        rect = fitz.Rect(initial_x - 1, initial_y - 1, final_x + 1, final_y + 1)
         
     | 
| 297 | 
         
            -
                        highlight = page.add_highlight_annot(rect)
         
     | 
| 298 | 
         
            -
                        highlight.set_colors(stroke=(1, 1, 0))  # Yellow color
         
     | 
| 299 | 
         
            -
                        highlight.update()
         
     | 
| 300 | 
         
            -
             
     | 
| 301 | 
         
            -
                        # Dynamically calculate the position of the comment box
         
     | 
| 302 | 
         
            -
                        page_width, page_height = page.rect.width, page.rect.height
         
     | 
| 303 | 
         
            -
                        comment_box_width = min(140, page_width / 3)  # Ensure the comment box width is a reasonable fraction of the page width
         
     | 
| 304 | 
         
            -
                        comment_box_height = 100  # Set a reasonable height for the comment box
         
     | 
| 305 | 
         
            -
             
     | 
| 306 | 
         
            -
                        # Position the comment box dynamically
         
     | 
| 307 | 
         
            -
                        if initial_x < page_width / 2:  # If the highlighted text is on the left half of the page
         
     | 
| 308 | 
         
            -
                            comment_x = page_width - comment_box_width - 10  # Position it on the right side
         
     | 
| 309 | 
         
            -
                        else:  # If the highlighted text is on the right half of the page
         
     | 
| 310 | 
         
            -
                            comment_x = 10  # Position it on the left side
         
     | 
| 311 | 
         
            -
             
     | 
| 312 | 
         
            -
                        comment_y = initial_y  # Position the comment box near the highlighted text
         
     | 
| 313 | 
         
            -
                        comment_rect = fitz.Rect(comment_x, comment_y, comment_x + comment_box_width, comment_y + comment_box_height)
         
     | 
| 314 | 
         
            -
                        page.add_freetext_annot(comment_rect, error_text)
         
     | 
| 315 | 
         
            -
             
     | 
| 316 | 
         
            -
                        # Draw an arrow from the highlighted word to the comment box
         
     | 
| 317 | 
         
            -
                        arrow_start_x = (initial_x + final_x) / 2  # Center X of the highlighted word
         
     | 
| 318 | 
         
            -
                        arrow_start_y = (initial_y + final_y) / 2  # Center Y of the highlighted word
         
     | 
| 319 | 
         
            -
                        arrow_end_x = (comment_rect.x0 + comment_rect.x1) / 2  # Center X of the comment box
         
     | 
| 320 | 
         
            -
                        arrow_end_y = (comment_rect.y0 + comment_rect.y1) / 2  # Center Y of the comment box
         
     | 
| 321 | 
         
            -
             
     | 
| 322 | 
         
            -
                        # Draw the arrow
         
     | 
| 323 | 
         
            -
                        page.add_arrow((arrow_start_x, arrow_start_y), (arrow_end_x, arrow_end_y), color=(0, 0, 0), width=2)
         
     | 
| 324 | 
         | 
| 325 | 
         
             
                    # Save annotated PDF to bytes
         
     | 
| 326 | 
         
             
                    byte_stream = io.BytesIO()
         
     | 
| 
         @@ -328,9 +325,10 @@ def highlight_issues_in_pdf(file, language_matches: List[Dict[str, Any]]) -> byt 
     | 
|
| 328 | 
         
             
                    annotated_pdf_bytes = byte_stream.getvalue()
         
     | 
| 329 | 
         
             
                    doc.close()
         
     | 
| 330 | 
         | 
| 331 | 
         
            -
                    # Save annotated PDF locally for verification 
     | 
| 332 | 
         
             
                    with open("annotated_temp.pdf", "wb") as f:
         
     | 
| 333 | 
         
             
                        f.write(annotated_pdf_bytes)
         
     | 
| 
         | 
|
| 334 | 
         | 
| 335 | 
         
             
                    return language_matches, annotated_pdf_bytes
         
     | 
| 336 | 
         
             
                except Exception as e:
         
     | 
| 
         @@ -340,8 +338,6 @@ def highlight_issues_in_pdf(file, language_matches: List[Dict[str, Any]]) -> byt 
     | 
|
| 340 | 
         | 
| 341 | 
         | 
| 342 | 
         | 
| 343 | 
         
            -
             
     | 
| 344 | 
         
            -
             
     | 
| 345 | 
         
             
            # ------------------------------
         
     | 
| 346 | 
         
             
            # Main Analysis Function
         
     | 
| 347 | 
         
             
            # ------------------------------
         
     | 
| 
         | 
|
| 226 | 
         
             
                    "missing_references": missing_refs,
         
     | 
| 227 | 
         
             
                    "is_ordered": len(out_of_order) == 0 and len(missing_refs) == 0
         
     | 
| 228 | 
         
             
                }
         
     | 
| 229 | 
         
            +
             
     | 
| 230 | 
         
             
            def highlight_issues_in_pdf(file, language_matches: List[Dict[str, Any]]) -> bytes:
         
     | 
| 231 | 
         
             
                """
         
     | 
| 232 | 
         
            +
                Highlights language issues in the PDF and returns the annotated PDF as bytes.
         
     | 
| 233 | 
         
            +
                This function maps LanguageTool matches to specific words in the PDF
         
     | 
| 234 | 
         
            +
                and highlights those words.
         
     | 
| 235 | 
         
             
                """
         
     | 
| 236 | 
         
             
                try:
         
     | 
| 237 | 
         
             
                    # Open the PDF
         
     | 
| 238 | 
         
             
                    doc = fitz.open(stream=file.read(), filetype="pdf") if not isinstance(file, str) else fitz.open(file)
         
     | 
| 239 | 
         
            +
                    # print(f"Opened PDF with {len(doc)} pages.")
         
     | 
| 240 | 
         
            +
                    # print(language_matches)
         
     | 
| 241 | 
         
             
                    # Extract words with positions from each page
         
     | 
| 242 | 
         
             
                    word_list = []  # List of tuples: (page_number, word, x0, y0, x1, y1)
         
     | 
| 243 | 
         
             
                    for page_number in range(len(doc)):
         
     | 
| 244 | 
         
             
                        page = doc[page_number]
         
     | 
| 245 | 
         
            +
                        print(page.get_text("words"))
         
     | 
| 246 | 
         
             
                        words = page.get_text("words")  # List of tuples: (x0, y0, x1, y1, "word", block_no, line_no, word_no)
         
     | 
| 247 | 
         
             
                        for w in words:
         
     | 
| 248 | 
         
            +
            #                 print(w)
         
     | 
| 249 | 
         
             
                            word_text = w[4]
         
     | 
| 250 | 
         
            +
                            # **Fix:** Insert a space before '[' to ensure "globally [2]" instead of "globally[2]"
         
     | 
| 251 | 
         
            +
                            # if '[' in word_text:
         
     | 
| 252 | 
         
            +
                            #     word_text = word_text.replace('[', ' [')
         
     | 
| 253 | 
         
             
                            word_list.append((page_number, word_text, w[0], w[1], w[2], w[3]))
         
     | 
| 254 | 
         
            +
                    # print(f"Total words extracted: {len(word_list)}")
         
     | 
| 255 | 
         | 
| 256 | 
         
             
                    # Concatenate all words to form the full text
         
     | 
| 257 | 
         
            +
                    concatenated_text=""
         
     | 
| 258 | 
         
             
                    concatenated_text = " ".join([w[1] for w in word_list])
         
     | 
| 259 | 
         
            +
                    
         
     | 
| 260 | 
         
            +
                    # print(f"Concatenated text length: {concatenated_text} characters.")
         
     | 
| 261 | 
         | 
| 262 | 
         
             
                    # Find "Abstract" section and set the processing start point
         
     | 
| 263 | 
         
             
                    abstract_start = concatenated_text.lower().find("abstract")
         
     | 
| 
         | 
|
| 269 | 
         | 
| 270 | 
         
             
                    # Iterate over each language issue
         
     | 
| 271 | 
         
             
                    for idx, issue in enumerate(language_matches, start=1):
         
     | 
| 272 | 
         
            +
                        offset = issue["offset"]  # offset+line_no-1
         
     | 
| 273 | 
         
             
                        length = issue["length"]
         
     | 
| 274 | 
         | 
| 275 | 
         
             
                        # Skip issues in the references section
         
     | 
| 276 | 
         
             
                        if offset < abstract_offset or offset >= references_offset:
         
     | 
| 277 | 
         
             
                            continue
         
     | 
| 278 | 
         
            +
                        
         
     | 
| 279 | 
         
            +
                        
         
     | 
| 280 | 
         
            +
                        error_text = concatenated_text[offset:offset+length]
         
     | 
| 281 | 
         
            +
                        print(f"\nIssue {idx}: '{error_text}' at offset {offset} with length {length}")
         
     | 
| 282 | 
         | 
| 283 | 
         
             
                        # Find the words that fall within the error span
         
     | 
| 284 | 
         
             
                        current_pos = 0
         
     | 
| 
         | 
|
| 292 | 
         
             
                            current_pos += word_length
         
     | 
| 293 | 
         | 
| 294 | 
         
             
                        if not target_words:
         
     | 
| 295 | 
         
            +
                            # print("No matching words found for this issue.")
         
     | 
| 296 | 
         
             
                            continue
         
     | 
| 297 | 
         | 
| 298 | 
         
             
                        initial_x = target_words[0][2]
         
     | 
| 299 | 
         
             
                        initial_y = target_words[0][3]
         
     | 
| 300 | 
         
            +
                        final_x = target_words[len(target_words)-1][4]
         
     | 
| 301 | 
         
            +
                        final_y = target_words[len(target_words)-1][5]
         
     | 
| 302 | 
         
             
                        issue["coordinates"] = [initial_x, initial_y, final_x, final_y]
         
     | 
| 303 | 
         
             
                        issue["page"] = target_words[0][0] + 1
         
     | 
| 
         | 
|
| 304 | 
         
             
                        # Add highlight annotations to the target words
         
     | 
| 305 | 
         
            +
                        print()
         
     | 
| 306 | 
         
            +
                        print("issue", issue)
         
     | 
| 307 | 
         
            +
                        print("error text", error_text)
         
     | 
| 308 | 
         
            +
                        print(target_words)
         
     | 
| 309 | 
         
            +
                        print()
         
     | 
| 310 | 
         
            +
                        for target in target_words:
         
     | 
| 311 | 
         
            +
                            page_num, word_text, x0, y0, x1, y1 = target
         
     | 
| 312 | 
         
            +
                            page = doc[page_num]
         
     | 
| 313 | 
         
            +
                            # Define a rectangle around the word with some padding
         
     | 
| 314 | 
         
            +
                            rect = fitz.Rect(x0 - 1, y0 - 1, x1 + 1, y1 + 1)
         
     | 
| 315 | 
         
            +
                            # Add a highlight annotation
         
     | 
| 316 | 
         
            +
                            highlight = page.add_highlight_annot(rect)
         
     | 
| 317 | 
         
            +
                            highlight.set_colors(stroke=(1, 1, 0))  # Yellow color
         
     | 
| 318 | 
         
            +
                            highlight.update()
         
     | 
| 319 | 
         
            +
                            # print(f"Highlighted '{word_text}' on page {page_num + 1} at position ({x0}, {y0}, {x1}, {y1})")
         
     | 
| 320 | 
         | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 321 | 
         | 
| 322 | 
         
             
                    # Save annotated PDF to bytes
         
     | 
| 323 | 
         
             
                    byte_stream = io.BytesIO()
         
     | 
| 
         | 
|
| 325 | 
         
             
                    annotated_pdf_bytes = byte_stream.getvalue()
         
     | 
| 326 | 
         
             
                    doc.close()
         
     | 
| 327 | 
         | 
| 328 | 
         
            +
                    # Save annotated PDF locally for verification
         
     | 
| 329 | 
         
             
                    with open("annotated_temp.pdf", "wb") as f:
         
     | 
| 330 | 
         
             
                        f.write(annotated_pdf_bytes)
         
     | 
| 331 | 
         
            +
                    # print("Annotated PDF saved as 'annotated_temp.pdf' for manual verification.")
         
     | 
| 332 | 
         | 
| 333 | 
         
             
                    return language_matches, annotated_pdf_bytes
         
     | 
| 334 | 
         
             
                except Exception as e:
         
     | 
| 
         | 
|
| 338 | 
         | 
| 339 | 
         | 
| 340 | 
         | 
| 
         | 
|
| 
         | 
|
| 341 | 
         
             
            # ------------------------------
         
     | 
| 342 | 
         
             
            # Main Analysis Function
         
     | 
| 343 | 
         
             
            # ------------------------------
         
     |