vaibhavbalar committed on
Commit
cbbc2a6
·
verified ·
1 Parent(s): 969dffc

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +6 -29
app.py CHANGED
@@ -4,12 +4,12 @@ import pandas as pd
4
  import re
5
  import io
6
 
7
- def extract_with_lines(pdf_file):
8
  """
9
  Extract all PDF text, displaying page+line number prefix.
10
  Returns raw text for training.
11
  """
12
- with open(pdf_file.name, "rb") as f:
13
  reader = PyPDF2.PdfReader(f)
14
  result = []
15
  for i, page in enumerate(reader.pages):
@@ -40,89 +40,66 @@ def guess_extraction_regex(sample_value, all_lines):
40
  Use the sample_value to build a simple extraction pattern.
41
  If the value is after a colon or consistent header, match similar lines.
42
  """
43
- # Try to extract prefix
44
  for line in all_lines:
45
  if sample_value in line:
46
- # If the sample is after "Some Label: ", extract that
47
  if ':' in line:
48
  prefix, suffix = line.split(':', 1)
49
  if sample_value.strip() == suffix.strip():
50
- return re.compile(f"{re.escape(prefix.strip())}\s*:\s*(.+)", re.IGNORECASE)
51
- # If the sample is always after the same start
52
- match = re.match(r"(.*?)(\s+)?"+re.escape(sample_value)+r"(.*)?", line)
53
  if match and match.group(1).strip():
54
- # Return a regex that matches that prefix and captures the rest
55
- return re.compile(f"{re.escape(match.group(1).strip())}\s*(.+)", re.IGNORECASE)
56
- # Fallback: find lines that contain the sample and grab same structure
57
  return None
58
 
59
  def extract_table_from_sample(raw_text, label, sample_value):
60
- # Split lines
61
  lines = raw_text.splitlines()
62
  if not label or not sample_value:
63
  return pd.DataFrame([{"Error": "Please supply both label and sample value!"}])
64
-
65
- # Try to pattern match (e.g. "Customer Name: Ramesh Kumar")
66
  regex = guess_extraction_regex(sample_value, lines)
67
  found = []
68
-
69
  if regex:
70
  for line in lines:
71
  m = regex.match(line)
72
  if m:
73
  found.append({label: m.group(1).strip()})
74
  else:
75
- # Fallback, just grab lines that contain the sample's prefix
76
- # Try to find all lines which have the non-digit prefix of this sample
77
  prefix = sample_value[:5]
78
  for line in lines:
79
  if prefix in line:
80
  found.append({label: line.strip()})
81
-
82
  if not found:
83
  return pd.DataFrame([{"Error": f"No matches found for sample: {sample_value}"}])
84
  return pd.DataFrame(found)
85
 
86
  def export_xlsx(df):
87
- """Export pandas df to xlsx in-memory file"""
88
  buf = io.BytesIO()
89
  with pd.ExcelWriter(buf, engine="xlsxwriter") as writer:
90
  df.to_excel(writer, index=False)
91
  buf.seek(0)
92
  return buf
93
 
94
- ### Gradio Interface
95
-
96
  with gr.Blocks() as demo:
97
  gr.Markdown("# 🧑‍🏫 PDF Teach-&-Extract System\n**1. Upload PDF → 2. Teach a sample field → 3. Preview all auto-extracted matches → 4. Download as Excel**")
98
-
99
- file_in = gr.File(label="Upload your PDF", file_count="single", type="file")
100
  raw_text = gr.Textbox(label="Raw extracted PDF text (preview/copy here)", lines=18, show_copy_button=True)
101
-
102
  file_in.change(extract_with_lines, inputs=file_in, outputs=raw_text)
103
-
104
  with gr.Row():
105
  teach_label = gr.Textbox(label="Your Desired Field Name (e.g. Customer Name)")
106
  teach_sample = gr.Textbox(label="Example Value (copy-paste from above)")
107
  teach_search = gr.Button("Show Context")
108
  context_out = gr.Textbox(label="System shows the found context(s)", lines=4)
109
-
110
  teach_search.click(get_sample_context, inputs=[raw_text, teach_sample], outputs=context_out)
111
-
112
  with gr.Row():
113
  extract_btn = gr.Button("Extract All Similar Values")
114
  results_table = gr.Dataframe(label="Extracted Results Table")
115
  download_btn = gr.Button("Download as Excel")
116
  xlsx_file = gr.File(label="Excel Download (.xlsx)", visible=True)
117
-
118
  def extract_and_preview(raw_text, teach_label, teach_sample):
119
  df = extract_table_from_sample(raw_text, teach_label, teach_sample)
120
  return df
121
  extract_btn.click(extract_and_preview, inputs=[raw_text, teach_label, teach_sample], outputs=results_table)
122
-
123
  def save_xlsx(df):
124
  buf = export_xlsx(df)
125
  return ("results.xlsx", buf)
126
  download_btn.click(save_xlsx, inputs=results_table, outputs=xlsx_file)
127
-
128
  demo.launch()
 
4
  import re
5
  import io
6
 
7
+ def extract_with_lines(pdf_path):
8
  """
9
  Extract all PDF text, displaying page+line number prefix.
10
  Returns raw text for training.
11
  """
12
+ with open(pdf_path, "rb") as f:
13
  reader = PyPDF2.PdfReader(f)
14
  result = []
15
  for i, page in enumerate(reader.pages):
 
40
  Use the sample_value to build a simple extraction pattern.
41
  If the value is after a colon or consistent header, match similar lines.
42
  """
 
43
  for line in all_lines:
44
  if sample_value in line:
 
45
  if ':' in line:
46
  prefix, suffix = line.split(':', 1)
47
  if sample_value.strip() == suffix.strip():
48
+ return re.compile(f"{re.escape(prefix.strip())}\\s*:\\s*(.+)", re.IGNORECASE)
49
+ match = re.match(r"(.*?)(\\s+)?"+re.escape(sample_value)+r"(.*)?", line)
 
50
  if match and match.group(1).strip():
51
+ return re.compile(f"{re.escape(match.group(1).strip())}\\s*(.+)", re.IGNORECASE)
 
 
52
  return None
53
 
54
  def extract_table_from_sample(raw_text, label, sample_value):
 
55
  lines = raw_text.splitlines()
56
  if not label or not sample_value:
57
  return pd.DataFrame([{"Error": "Please supply both label and sample value!"}])
 
 
58
  regex = guess_extraction_regex(sample_value, lines)
59
  found = []
 
60
  if regex:
61
  for line in lines:
62
  m = regex.match(line)
63
  if m:
64
  found.append({label: m.group(1).strip()})
65
  else:
 
 
66
  prefix = sample_value[:5]
67
  for line in lines:
68
  if prefix in line:
69
  found.append({label: line.strip()})
 
70
  if not found:
71
  return pd.DataFrame([{"Error": f"No matches found for sample: {sample_value}"}])
72
  return pd.DataFrame(found)
73
 
74
  def export_xlsx(df):
 
75
  buf = io.BytesIO()
76
  with pd.ExcelWriter(buf, engine="xlsxwriter") as writer:
77
  df.to_excel(writer, index=False)
78
  buf.seek(0)
79
  return buf
80
 
 
 
81
  with gr.Blocks() as demo:
82
  gr.Markdown("# 🧑‍🏫 PDF Teach-&-Extract System\n**1. Upload PDF → 2. Teach a sample field → 3. Preview all auto-extracted matches → 4. Download as Excel**")
83
+ file_in = gr.File(label="Upload your PDF", file_count="single", type="filepath")
 
84
  raw_text = gr.Textbox(label="Raw extracted PDF text (preview/copy here)", lines=18, show_copy_button=True)
 
85
  file_in.change(extract_with_lines, inputs=file_in, outputs=raw_text)
 
86
  with gr.Row():
87
  teach_label = gr.Textbox(label="Your Desired Field Name (e.g. Customer Name)")
88
  teach_sample = gr.Textbox(label="Example Value (copy-paste from above)")
89
  teach_search = gr.Button("Show Context")
90
  context_out = gr.Textbox(label="System shows the found context(s)", lines=4)
 
91
  teach_search.click(get_sample_context, inputs=[raw_text, teach_sample], outputs=context_out)
 
92
  with gr.Row():
93
  extract_btn = gr.Button("Extract All Similar Values")
94
  results_table = gr.Dataframe(label="Extracted Results Table")
95
  download_btn = gr.Button("Download as Excel")
96
  xlsx_file = gr.File(label="Excel Download (.xlsx)", visible=True)
 
97
  def extract_and_preview(raw_text, teach_label, teach_sample):
98
  df = extract_table_from_sample(raw_text, teach_label, teach_sample)
99
  return df
100
  extract_btn.click(extract_and_preview, inputs=[raw_text, teach_label, teach_sample], outputs=results_table)
 
101
  def save_xlsx(df):
102
  buf = export_xlsx(df)
103
  return ("results.xlsx", buf)
104
  download_btn.click(save_xlsx, inputs=results_table, outputs=xlsx_file)
 
105
  demo.launch()