Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -22,7 +22,7 @@ def check_latest_section(pdf_url, identifiers_input, split_marker):
|
|
| 22 |
# Step 3: Extract full text from PDF
|
| 23 |
try:
|
| 24 |
doc = fitz.open(stream=pdf_bytes, filetype="pdf")
|
| 25 |
-
full_text = "\n".join([page.get_text() for page in doc])
|
| 26 |
except Exception as e:
|
| 27 |
return f"❌ Failed to extract text: {str(e)}", None
|
| 28 |
|
|
@@ -35,20 +35,31 @@ def check_latest_section(pdf_url, identifiers_input, split_marker):
|
|
| 35 |
latest_block = full_text
|
| 36 |
note = f"⚠️ Marker '{split_marker}' not found. Using entire PDF content."
|
| 37 |
|
| 38 |
-
# Step 5:
|
| 39 |
-
id_pattern =
|
| 40 |
-
regex = re.compile(rf"Line#\s+(\d+)[\s\S]*?\b({id_pattern})\b")
|
| 41 |
-
|
| 42 |
matches_set = set()
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
|
| 47 |
if not matches_set:
|
| 48 |
return note + " No matching Message Identifiers found.", None
|
| 49 |
|
| 50 |
-
|
| 51 |
-
df = pd.DataFrame(sorted(matches_set), columns=["Line#","Message Identifier"])
|
| 52 |
return note + " Matches found:", df
|
| 53 |
|
| 54 |
# Gradio Interface
|
|
@@ -56,15 +67,15 @@ demo = gr.Interface(
|
|
| 56 |
fn=check_latest_section,
|
| 57 |
inputs=[
|
| 58 |
gr.Textbox(label="PDF URL", placeholder="https://.../yourfile.pdf"),
|
| 59 |
-
gr.Textbox(label="Message Identifier List", value="523, P00"),
|
| 60 |
gr.Textbox(label="Split Marker (optional)", value="Record #"),
|
| 61 |
],
|
| 62 |
outputs=[
|
| 63 |
gr.Textbox(label="Status"),
|
| 64 |
gr.Dataframe(label="Matching Lines", type="pandas"),
|
| 65 |
],
|
| 66 |
-
title="PDF Line# Identifier Checker (
|
| 67 |
-
description="
|
| 68 |
)
|
| 69 |
|
| 70 |
demo.launch()
|
|
|
|
| 22 |
# Step 3: Extract full text from PDF
|
| 23 |
try:
|
| 24 |
doc = fitz.open(stream=pdf_bytes, filetype="pdf")
|
| 25 |
+
full_text = "\n".join([page.get_text("text") for page in doc])
|
| 26 |
except Exception as e:
|
| 27 |
return f"❌ Failed to extract text: {str(e)}", None
|
| 28 |
|
|
|
|
| 35 |
latest_block = full_text
|
| 36 |
note = f"⚠️ Marker '{split_marker}' not found. Using entire PDF content."
|
| 37 |
|
| 38 |
+
# Step 5: Track Line# context and find message identifiers below it
|
| 39 |
+
id_pattern = set(identifiers)
|
|
|
|
|
|
|
| 40 |
matches_set = set()
|
| 41 |
+
|
| 42 |
+
current_line = None
|
| 43 |
+
for line in latest_block.splitlines():
|
| 44 |
+
line = line.strip()
|
| 45 |
+
|
| 46 |
+
if not line:
|
| 47 |
+
continue
|
| 48 |
+
|
| 49 |
+
line_match = re.match(r"Line#\s+(\d+)", line)
|
| 50 |
+
if line_match:
|
| 51 |
+
current_line = int(line_match.group(1))
|
| 52 |
+
continue
|
| 53 |
+
|
| 54 |
+
if current_line is not None:
|
| 55 |
+
for ident in id_pattern:
|
| 56 |
+
if re.search(rf"\b{re.escape(ident)}\b", line):
|
| 57 |
+
matches_set.add((current_line, ident))
|
| 58 |
|
| 59 |
if not matches_set:
|
| 60 |
return note + " No matching Message Identifiers found.", None
|
| 61 |
|
| 62 |
+
df = pd.DataFrame(sorted(matches_set), columns=["Line#", "Message Identifier"])
|
|
|
|
| 63 |
return note + " Matches found:", df
|
| 64 |
|
| 65 |
# Gradio Interface
|
|
|
|
| 67 |
fn=check_latest_section,
|
| 68 |
inputs=[
|
| 69 |
gr.Textbox(label="PDF URL", placeholder="https://.../yourfile.pdf"),
|
| 70 |
+
gr.Textbox(label="Message Identifier List", value="523, P00, P02, 831"),
|
| 71 |
gr.Textbox(label="Split Marker (optional)", value="Record #"),
|
| 72 |
],
|
| 73 |
outputs=[
|
| 74 |
gr.Textbox(label="Status"),
|
| 75 |
gr.Dataframe(label="Matching Lines", type="pandas"),
|
| 76 |
],
|
| 77 |
+
title="PDF Line# Identifier Checker (Reliable Contextual Matching)",
|
| 78 |
+
description="Scans a PDF from URL, tracks Line# blocks and matches identifiers in the lines that follow."
|
| 79 |
)
|
| 80 |
|
| 81 |
demo.launch()
|