joycecast committed on
Commit
6d303d7
·
verified ·
1 Parent(s): 00b95cb

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +14 -13
app.py CHANGED
@@ -6,8 +6,8 @@ from io import BytesIO
6
  import pandas as pd
7
 
8
  def check_latest_section(pdf_url, identifiers_input, split_marker):
9
- # Step 1: Prepare identifiers
10
- identifiers = [id.strip() for id in identifiers_input.split(',') if id.strip().isdigit()]
11
  if not identifiers:
12
  return "❌ No valid Message Identifiers entered.", None
13
 
@@ -29,25 +29,26 @@ def check_latest_section(pdf_url, identifiers_input, split_marker):
29
  # Step 4: Split by user-defined marker (optional)
30
  if split_marker.strip() and split_marker in full_text:
31
  parts = full_text.split(split_marker)
32
- latest_block = parts[1]
33
  note = f"✅ Found marker '{split_marker}', using the latest block."
34
  else:
35
  latest_block = full_text
36
  note = f"⚠️ Marker '{split_marker}' not found. Using entire PDF content."
37
 
38
- # Step 5: Match Line# and Message Identifier
39
  id_pattern = "|".join(re.escape(i) for i in identifiers)
40
- regex = re.compile(rf"Line#\s+(\d+)\s+({id_pattern})")
41
 
42
- matches = []
43
  for match in regex.finditer(latest_block):
44
  line_num, msg_id = match.groups()
45
- matches.append({"Line#": int(line_num), "Message Identifier": msg_id})
46
 
47
- if not matches:
48
  return note + " No matching Message Identifiers found.", None
49
 
50
- df = pd.DataFrame(matches).sort_values("Line#").reset_index(drop=True)
 
51
  return note + " Matches found:", df
52
 
53
  # Gradio Interface
@@ -55,15 +56,15 @@ demo = gr.Interface(
55
  fn=check_latest_section,
56
  inputs=[
57
  gr.Textbox(label="PDF URL", placeholder="https://.../yourfile.pdf"),
58
- gr.Textbox(label="Message Identifier List", value="523"),
59
  gr.Textbox(label="Split Marker (optional)", value="Record #"),
60
  ],
61
  outputs=[
62
  gr.Textbox(label="Status"),
63
  gr.Dataframe(label="Matching Lines", type="pandas"),
64
  ],
65
- title="PDF Line# Identifier Checker (Latest Only)",
66
- description="Checks Line# entries with specified Message Identifiers. If a split marker is provided, only the latest section is used; otherwise, the full document is scanned."
67
  )
68
 
69
- demo.launch()
 
6
  import pandas as pd
7
 
8
  def check_latest_section(pdf_url, identifiers_input, split_marker):
9
+ # Step 1: Prepare identifiers (alphanumeric-safe)
10
+ identifiers = [id.strip() for id in identifiers_input.split(',') if id.strip()]
11
  if not identifiers:
12
  return "❌ No valid Message Identifiers entered.", None
13
 
 
29
  # Step 4: Split by user-defined marker (optional)
30
  if split_marker.strip() and split_marker in full_text:
31
  parts = full_text.split(split_marker)
32
+ latest_block = parts[1] # First block *after* the split
33
  note = f"✅ Found marker '{split_marker}', using the latest block."
34
  else:
35
  latest_block = full_text
36
  note = f"⚠️ Marker '{split_marker}' not found. Using entire PDF content."
37
 
38
+ # Step 5: Match Line# and Message Identifier (alphanumeric-safe, unique results)
39
  id_pattern = "|".join(re.escape(i) for i in identifiers)
40
+ regex = re.compile(rf"Line#\s+(\d+)[\s\S]*?\b({id_pattern})\b")
41
 
42
+ matches_set = set()
43
  for match in regex.finditer(latest_block):
44
  line_num, msg_id = match.groups()
45
+ matches_set.add((int(line_num), msg_id)) # Add to set to keep unique
46
 
47
+ if not matches_set:
48
  return note + " No matching Message Identifiers found.", None
49
 
50
+ # Convert to sorted DataFrame
51
+ df = pd.DataFrame(sorted(matches_set), columns=["Line#","Message Identifier"])
52
  return note + " Matches found:", df
53
 
54
  # Gradio Interface
 
56
  fn=check_latest_section,
57
  inputs=[
58
  gr.Textbox(label="PDF URL", placeholder="https://.../yourfile.pdf"),
59
+ gr.Textbox(label="Message Identifier List", value="523, P00"),
60
  gr.Textbox(label="Split Marker (optional)", value="Record #"),
61
  ],
62
  outputs=[
63
  gr.Textbox(label="Status"),
64
  gr.Dataframe(label="Matching Lines", type="pandas"),
65
  ],
66
+ title="PDF Line# Identifier Checker (Unique Matches Only)",
67
+ description="Checks Line# entries with specified Message Identifiers. Removes duplicates. Uses latest section if marker is provided."
68
  )
69
 
70
+ demo.launch()