Spaces:

samyak152002
/

texmetrics-regex-checks-gradio-1

Sleeping

App Files Files Community

samyak152002 committed on Dec 26, 2024

Commit

dde32e5

verified ·

1 Parent(s): 28757b4

Update app.py

Browse files

Files changed (1) hide show

app.py +40 -44

app.py CHANGED Viewed

@@ -226,27 +226,38 @@ def check_reference_order(full_text: str) -> Dict[str, Any]:
         "missing_references": missing_refs,
         "is_ordered": len(out_of_order) == 0 and len(missing_refs) == 0
     }
 def highlight_issues_in_pdf(file, language_matches: List[Dict[str, Any]]) -> bytes:
     """
-    Highlights language issues in the PDF, adds a dynamic comment box with text on the side of the page,
-    and draws arrows pointing from the highlighted text to the comment box.
-    Returns the annotated PDF as bytes.
     """
     try:
         # Open the PDF
         doc = fitz.open(stream=file.read(), filetype="pdf") if not isinstance(file, str) else fitz.open(file)
         # Extract words with positions from each page
         word_list = []  # List of tuples: (page_number, word, x0, y0, x1, y1)
         for page_number in range(len(doc)):
             page = doc[page_number]
             words = page.get_text("words")  # List of tuples: (x0, y0, x1, y1, "word", block_no, line_no, word_no)
             for w in words:
                 word_text = w[4]
                 word_list.append((page_number, word_text, w[0], w[1], w[2], w[3]))
         # Concatenate all words to form the full text
         concatenated_text = " ".join([w[1] for w in word_list])
         # Find "Abstract" section and set the processing start point
         abstract_start = concatenated_text.lower().find("abstract")
@@ -258,14 +269,16 @@ def highlight_issues_in_pdf(file, language_matches: List[Dict[str, Any]]) -> byt
         # Iterate over each language issue
         for idx, issue in enumerate(language_matches, start=1):
-            offset = issue["offset"]
             length = issue["length"]
             # Skip issues in the references section
             if offset < abstract_offset or offset >= references_offset:
                 continue
-            error_text = concatenated_text[offset:offset + length]
             # Find the words that fall within the error span
             current_pos = 0
@@ -279,48 +292,32 @@ def highlight_issues_in_pdf(file, language_matches: List[Dict[str, Any]]) -> byt
                 current_pos += word_length
             if not target_words:
                 continue
             initial_x = target_words[0][2]
             initial_y = target_words[0][3]
-            final_x = target_words[len(target_words) - 1][4]
-            final_y = target_words[len(target_words) - 1][5]
             issue["coordinates"] = [initial_x, initial_y, final_x, final_y]
             issue["page"] = target_words[0][0] + 1
             # Add highlight annotations to the target words
-            page_num = target_words[0][0]
-            page = doc[page_num]
-            # Create a rectangle around the highlighted text
-            rect = fitz.Rect(initial_x - 1, initial_y - 1, final_x + 1, final_y + 1)
-            highlight = page.add_highlight_annot(rect)
-            highlight.set_colors(stroke=(1, 1, 0))  # Yellow color
-            highlight.update()
-            # Dynamically calculate the position of the comment box
-            page_width, page_height = page.rect.width, page.rect.height
-            comment_box_width = min(140, page_width / 3)  # Ensure the comment box width is a reasonable fraction of the page width
-            comment_box_height = 100  # Set a reasonable height for the comment box
-            # Position the comment box dynamically
-            if initial_x < page_width / 2:  # If the highlighted text is on the left half of the page
-                comment_x = page_width - comment_box_width - 10  # Position it on the right side
-            else:  # If the highlighted text is on the right half of the page
-                comment_x = 10  # Position it on the left side
-            comment_y = initial_y  # Position the comment box near the highlighted text
-            comment_rect = fitz.Rect(comment_x, comment_y, comment_x + comment_box_width, comment_y + comment_box_height)
-            page.add_freetext_annot(comment_rect, error_text)
-            # Draw an arrow from the highlighted word to the comment box
-            arrow_start_x = (initial_x + final_x) / 2  # Center X of the highlighted word
-            arrow_start_y = (initial_y + final_y) / 2  # Center Y of the highlighted word
-            arrow_end_x = (comment_rect.x0 + comment_rect.x1) / 2  # Center X of the comment box
-            arrow_end_y = (comment_rect.y0 + comment_rect.y1) / 2  # Center Y of the comment box
-            # Draw the arrow
-            page.add_arrow((arrow_start_x, arrow_start_y), (arrow_end_x, arrow_end_y), color=(0, 0, 0), width=2)
         # Save annotated PDF to bytes
         byte_stream = io.BytesIO()
@@ -328,9 +325,10 @@ def highlight_issues_in_pdf(file, language_matches: List[Dict[str, Any]]) -> byt
         annotated_pdf_bytes = byte_stream.getvalue()
         doc.close()
-        # Save annotated PDF locally for verification (optional)
         with open("annotated_temp.pdf", "wb") as f:
             f.write(annotated_pdf_bytes)
         return language_matches, annotated_pdf_bytes
     except Exception as e:
@@ -340,8 +338,6 @@ def highlight_issues_in_pdf(file, language_matches: List[Dict[str, Any]]) -> byt
 # ------------------------------
 # Main Analysis Function
 # ------------------------------

         "missing_references": missing_refs,
         "is_ordered": len(out_of_order) == 0 and len(missing_refs) == 0
     }
 def highlight_issues_in_pdf(file, language_matches: List[Dict[str, Any]]) -> bytes:
     """
+    Highlights language issues in the PDF and returns the annotated PDF as bytes.
+    This function maps LanguageTool matches to specific words in the PDF
+    and highlights those words.
     """
     try:
         # Open the PDF
         doc = fitz.open(stream=file.read(), filetype="pdf") if not isinstance(file, str) else fitz.open(file)
+        # print(f"Opened PDF with {len(doc)} pages.")
+        # print(language_matches)
         # Extract words with positions from each page
         word_list = []  # List of tuples: (page_number, word, x0, y0, x1, y1)
         for page_number in range(len(doc)):
             page = doc[page_number]
+            print(page.get_text("words"))
             words = page.get_text("words")  # List of tuples: (x0, y0, x1, y1, "word", block_no, line_no, word_no)
             for w in words:
+#                 print(w)
                 word_text = w[4]
+                # **Fix:** Insert a space before '[' to ensure "globally [2]" instead of "globally[2]"
+                # if '[' in word_text:
+                #     word_text = word_text.replace('[', ' [')
                 word_list.append((page_number, word_text, w[0], w[1], w[2], w[3]))
+        # print(f"Total words extracted: {len(word_list)}")
         # Concatenate all words to form the full text
+        concatenated_text=""
         concatenated_text = " ".join([w[1] for w in word_list])
+        # print(f"Concatenated text length: {concatenated_text} characters.")
         # Find "Abstract" section and set the processing start point
         abstract_start = concatenated_text.lower().find("abstract")
         # Iterate over each language issue
         for idx, issue in enumerate(language_matches, start=1):
+            offset = issue["offset"]  # offset+line_no-1
             length = issue["length"]
             # Skip issues in the references section
             if offset < abstract_offset or offset >= references_offset:
                 continue
+            error_text = concatenated_text[offset:offset+length]
+            print(f"\nIssue {idx}: '{error_text}' at offset {offset} with length {length}")
             # Find the words that fall within the error span
             current_pos = 0
                 current_pos += word_length
             if not target_words:
+                # print("No matching words found for this issue.")
                 continue
             initial_x = target_words[0][2]
             initial_y = target_words[0][3]
+            final_x = target_words[len(target_words)-1][4]
+            final_y = target_words[len(target_words)-1][5]
             issue["coordinates"] = [initial_x, initial_y, final_x, final_y]
             issue["page"] = target_words[0][0] + 1
             # Add highlight annotations to the target words
+            print()
+            print("issue", issue)
+            print("error text", error_text)
+            print(target_words)
+            print()
+            for target in target_words:
+                page_num, word_text, x0, y0, x1, y1 = target
+                page = doc[page_num]
+                # Define a rectangle around the word with some padding
+                rect = fitz.Rect(x0 - 1, y0 - 1, x1 + 1, y1 + 1)
+                # Add a highlight annotation
+                highlight = page.add_highlight_annot(rect)
+                highlight.set_colors(stroke=(1, 1, 0))  # Yellow color
+                highlight.update()
+                # print(f"Highlighted '{word_text}' on page {page_num + 1} at position ({x0}, {y0}, {x1}, {y1})")
         # Save annotated PDF to bytes
         byte_stream = io.BytesIO()
         annotated_pdf_bytes = byte_stream.getvalue()
         doc.close()
+        # Save annotated PDF locally for verification
         with open("annotated_temp.pdf", "wb") as f:
             f.write(annotated_pdf_bytes)
+        # print("Annotated PDF saved as 'annotated_temp.pdf' for manual verification.")
         return language_matches, annotated_pdf_bytes
     except Exception as e:
 # ------------------------------
 # Main Analysis Function
 # ------------------------------