joycecast committed on
Commit
042f009
·
verified ·
1 Parent(s): 6d303d7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +24 -13
app.py CHANGED
@@ -22,7 +22,7 @@ def check_latest_section(pdf_url, identifiers_input, split_marker):
22
  # Step 3: Extract full text from PDF
23
  try:
24
  doc = fitz.open(stream=pdf_bytes, filetype="pdf")
25
- full_text = "\n".join([page.get_text() for page in doc])
26
  except Exception as e:
27
  return f"❌ Failed to extract text: {str(e)}", None
28
 
@@ -35,20 +35,31 @@ def check_latest_section(pdf_url, identifiers_input, split_marker):
35
  latest_block = full_text
36
  note = f"⚠️ Marker '{split_marker}' not found. Using entire PDF content."
37
 
38
- # Step 5: Match Line# and Message Identifier (alphanumeric-safe, unique results)
39
- id_pattern = "|".join(re.escape(i) for i in identifiers)
40
- regex = re.compile(rf"Line#\s+(\d+)[\s\S]*?\b({id_pattern})\b")
41
-
42
  matches_set = set()
43
- for match in regex.finditer(latest_block):
44
- line_num, msg_id = match.groups()
45
- matches_set.add((int(line_num), msg_id)) # Add to set to keep unique
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
 
47
  if not matches_set:
48
  return note + " No matching Message Identifiers found.", None
49
 
50
- # Convert to sorted DataFrame
51
- df = pd.DataFrame(sorted(matches_set), columns=["Line#","Message Identifier"])
52
  return note + " Matches found:", df
53
 
54
  # Gradio Interface
@@ -56,15 +67,15 @@ demo = gr.Interface(
56
  fn=check_latest_section,
57
  inputs=[
58
  gr.Textbox(label="PDF URL", placeholder="https://.../yourfile.pdf"),
59
- gr.Textbox(label="Message Identifier List", value="523, P00"),
60
  gr.Textbox(label="Split Marker (optional)", value="Record #"),
61
  ],
62
  outputs=[
63
  gr.Textbox(label="Status"),
64
  gr.Dataframe(label="Matching Lines", type="pandas"),
65
  ],
66
- title="PDF Line# Identifier Checker (Unique Matches Only)",
67
- description="Checks Line# entries with specified Message Identifiers. Removes duplicates. Uses latest section if marker is provided."
68
  )
69
 
70
  demo.launch()
 
22
  # Step 3: Extract full text from PDF
23
  try:
24
  doc = fitz.open(stream=pdf_bytes, filetype="pdf")
25
+ full_text = "\n".join([page.get_text("text") for page in doc])
26
  except Exception as e:
27
  return f"❌ Failed to extract text: {str(e)}", None
28
 
 
35
  latest_block = full_text
36
  note = f"⚠️ Marker '{split_marker}' not found. Using entire PDF content."
37
 
38
+ # Step 5: Track Line# context and find message identifiers below it
39
+ id_pattern = set(identifiers)
 
 
40
  matches_set = set()
41
+
42
+ current_line = None
43
+ for line in latest_block.splitlines():
44
+ line = line.strip()
45
+
46
+ if not line:
47
+ continue
48
+
49
+ line_match = re.match(r"Line#\s+(\d+)", line)
50
+ if line_match:
51
+ current_line = int(line_match.group(1))
52
+ continue
53
+
54
+ if current_line is not None:
55
+ for ident in id_pattern:
56
+ if re.search(rf"\b{re.escape(ident)}\b", line):
57
+ matches_set.add((current_line, ident))
58
 
59
  if not matches_set:
60
  return note + " No matching Message Identifiers found.", None
61
 
62
+ df = pd.DataFrame(sorted(matches_set), columns=["Line#", "Message Identifier"])
 
63
  return note + " Matches found:", df
64
 
65
  # Gradio Interface
 
67
  fn=check_latest_section,
68
  inputs=[
69
  gr.Textbox(label="PDF URL", placeholder="https://.../yourfile.pdf"),
70
+ gr.Textbox(label="Message Identifier List", value="523, P00, P02, 831"),
71
  gr.Textbox(label="Split Marker (optional)", value="Record #"),
72
  ],
73
  outputs=[
74
  gr.Textbox(label="Status"),
75
  gr.Dataframe(label="Matching Lines", type="pandas"),
76
  ],
77
+ title="PDF Line# Identifier Checker (Reliable Contextual Matching)",
78
+ description="Scans a PDF from URL, tracks Line# blocks and matches identifiers in the lines that follow."
79
  )
80
 
81
  demo.launch()