Spaces:

APEXlogistics
/

ErrorMsgIdentifier

Sleeping

App Files Community

joycecast committed on May 7, 2025

Commit

042f009

verified ·

1 Parent(s): 6d303d7

Update app.py

Browse files

Files changed (1) hide show

app.py +24 -13

app.py CHANGED Viewed

@@ -22,7 +22,7 @@ def check_latest_section(pdf_url, identifiers_input, split_marker):
     # Step 3: Extract full text from PDF
     try:
         doc = fitz.open(stream=pdf_bytes, filetype="pdf")
-        full_text = "\n".join([page.get_text() for page in doc])
     except Exception as e:
         return f"❌ Failed to extract text: {str(e)}", None
@@ -35,20 +35,31 @@ def check_latest_section(pdf_url, identifiers_input, split_marker):
         latest_block = full_text
         note = f"⚠️ Marker '{split_marker}' not found. Using entire PDF content."
-    # Step 5: Match Line# and Message Identifier (alphanumeric-safe, unique results)
-    id_pattern = "|".join(re.escape(i) for i in identifiers)
-    regex = re.compile(rf"Line#\s+(\d+)[\s\S]*?\b({id_pattern})\b")
     matches_set = set()
-    for match in regex.finditer(latest_block):
-        line_num, msg_id = match.groups()
-        matches_set.add((int(line_num), msg_id))  # Add to set to keep unique
     if not matches_set:
         return note + " No matching Message Identifiers found.", None
-    # Convert to sorted DataFrame
-    df = pd.DataFrame(sorted(matches_set), columns=["Line#","Message Identifier"])
     return note + " Matches found:", df
 # Gradio Interface
@@ -56,15 +67,15 @@ demo = gr.Interface(
     fn=check_latest_section,
     inputs=[
         gr.Textbox(label="PDF URL", placeholder="https://.../yourfile.pdf"),
-        gr.Textbox(label="Message Identifier List", value="523, P00"),
         gr.Textbox(label="Split Marker (optional)", value="Record #"),
     ],
     outputs=[
         gr.Textbox(label="Status"),
         gr.Dataframe(label="Matching Lines", type="pandas"),
     ],
-    title="PDF Line# Identifier Checker (Unique Matches Only)",
-    description="Checks Line# entries with specified Message Identifiers. Removes duplicates. Uses latest section if marker is provided."
 )
 demo.launch()

     # Step 3: Extract full text from PDF
     try:
         doc = fitz.open(stream=pdf_bytes, filetype="pdf")
+        full_text = "\n".join([page.get_text("text") for page in doc])
     except Exception as e:
         return f"❌ Failed to extract text: {str(e)}", None
         latest_block = full_text
         note = f"⚠️ Marker '{split_marker}' not found. Using entire PDF content."
+    # Step 5: Track Line# context and find message identifiers below it
+    id_pattern = set(identifiers)
     matches_set = set()
+    current_line = None
+    for line in latest_block.splitlines():
+        line = line.strip()
+        if not line:
+            continue
+        line_match = re.match(r"Line#\s+(\d+)", line)
+        if line_match:
+            current_line = int(line_match.group(1))
+            continue
+        if current_line is not None:
+            for ident in id_pattern:
+                if re.search(rf"\b{re.escape(ident)}\b", line):
+                    matches_set.add((current_line, ident))
     if not matches_set:
         return note + " No matching Message Identifiers found.", None
+    df = pd.DataFrame(sorted(matches_set), columns=["Line#", "Message Identifier"])
     return note + " Matches found:", df
 # Gradio Interface
     fn=check_latest_section,
     inputs=[
         gr.Textbox(label="PDF URL", placeholder="https://.../yourfile.pdf"),
+        gr.Textbox(label="Message Identifier List", value="523, P00, P02, 831"),
         gr.Textbox(label="Split Marker (optional)", value="Record #"),
     ],
     outputs=[
         gr.Textbox(label="Status"),
         gr.Dataframe(label="Matching Lines", type="pandas"),
     ],
+    title="PDF Line# Identifier Checker (Reliable Contextual Matching)",
+    description="Scans a PDF from URL, tracks Line# blocks and matches identifiers in the lines that follow."
 )
 demo.launch()