Spaces:

APEXlogistics
/

ErrorMsgIdentifier

Sleeping

App Files Files Community

joycecast committed on May 2, 2025

Commit

6d303d7

verified ·

1 Parent(s): 00b95cb

Update app.py

Browse files

Files changed (1) hide show

app.py +14 -13

app.py CHANGED Viewed

@@ -6,8 +6,8 @@ from io import BytesIO
 import pandas as pd
 def check_latest_section(pdf_url, identifiers_input, split_marker):
-    # Step 1: Prepare identifiers
-    identifiers = [id.strip() for id in identifiers_input.split(',') if id.strip().isdigit()]
     if not identifiers:
         return "❌ No valid Message Identifiers entered.", None
@@ -29,25 +29,26 @@ def check_latest_section(pdf_url, identifiers_input, split_marker):
     # Step 4: Split by user-defined marker (optional)
     if split_marker.strip() and split_marker in full_text:
         parts = full_text.split(split_marker)
-        latest_block = parts[1]
         note = f"✅ Found marker '{split_marker}', using the latest block."
     else:
         latest_block = full_text
         note = f"⚠️ Marker '{split_marker}' not found. Using entire PDF content."
-    # Step 5: Match Line# and Message Identifier
     id_pattern = "|".join(re.escape(i) for i in identifiers)
-    regex = re.compile(rf"Line#\s+(\d+)\s+({id_pattern})")
-    matches = []
     for match in regex.finditer(latest_block):
         line_num, msg_id = match.groups()
-        matches.append({"Line#": int(line_num), "Message Identifier": msg_id})
-    if not matches:
         return note + " No matching Message Identifiers found.", None
-    df = pd.DataFrame(matches).sort_values("Line#").reset_index(drop=True)
     return note + " Matches found:", df
 # Gradio Interface
@@ -55,15 +56,15 @@ demo = gr.Interface(
     fn=check_latest_section,
     inputs=[
         gr.Textbox(label="PDF URL", placeholder="https://.../yourfile.pdf"),
-        gr.Textbox(label="Message Identifier List", value="523"),
         gr.Textbox(label="Split Marker (optional)", value="Record #"),
     ],
     outputs=[
         gr.Textbox(label="Status"),
         gr.Dataframe(label="Matching Lines", type="pandas"),
     ],
-    title="PDF Line# Identifier Checker (Latest Only)",
-    description="Checks Line# entries with specified Message Identifiers. If a split marker is provided, only the latest section is used; otherwise, the full document is scanned."
 )
-demo.launch()

 import pandas as pd
 def check_latest_section(pdf_url, identifiers_input, split_marker):
+    # Step 1: Prepare identifiers (alphanumeric-safe)
+    identifiers = [id.strip() for id in identifiers_input.split(',') if id.strip()]
     if not identifiers:
         return "❌ No valid Message Identifiers entered.", None
     # Step 4: Split by user-defined marker (optional)
     if split_marker.strip() and split_marker in full_text:
         parts = full_text.split(split_marker)
+        latest_block = parts[1]  # First block *after* the split
         note = f"✅ Found marker '{split_marker}', using the latest block."
     else:
         latest_block = full_text
         note = f"⚠️ Marker '{split_marker}' not found. Using entire PDF content."
+    # Step 5: Match Line# and Message Identifier (alphanumeric-safe, unique results)
     id_pattern = "|".join(re.escape(i) for i in identifiers)
+    regex = re.compile(rf"Line#\s+(\d+)[\s\S]*?\b({id_pattern})\b")
+    matches_set = set()
     for match in regex.finditer(latest_block):
         line_num, msg_id = match.groups()
+        matches_set.add((int(line_num), msg_id))  # Add to set to keep unique
+    if not matches_set:
         return note + " No matching Message Identifiers found.", None
+    # Convert to sorted DataFrame
+    df = pd.DataFrame(sorted(matches_set), columns=["Line#","Message Identifier"])
     return note + " Matches found:", df
 # Gradio Interface
     fn=check_latest_section,
     inputs=[
         gr.Textbox(label="PDF URL", placeholder="https://.../yourfile.pdf"),
+        gr.Textbox(label="Message Identifier List", value="523, P00"),
         gr.Textbox(label="Split Marker (optional)", value="Record #"),
     ],
     outputs=[
         gr.Textbox(label="Status"),
         gr.Dataframe(label="Matching Lines", type="pandas"),
     ],
+    title="PDF Line# Identifier Checker (Unique Matches Only)",
+    description="Checks Line# entries with specified Message Identifiers. Removes duplicates. Uses latest section if marker is provided."
 )
+demo.launch()