# Spaces:
# Sleeping
# Sleeping
import io
import os
import re
import tempfile

import gradio as gr
import pandas as pd
import PyPDF2
def extract_with_lines(pdf_path):
    """
    Extract all text from a PDF, prefixing every line with its page and line number.

    Args:
        pdf_path: Filesystem path of the PDF to read. May be ``None`` or empty,
            which is what the upload widget's change event delivers when the
            file is cleared.

    Returns:
        One ``"[Page P Line L] text"`` entry per extracted line, joined with
        newlines. ``"[NO TEXT FOUND]"`` when the PDF yields no text at all, and
        ``""`` when no path was supplied (so the preview is simply emptied
        instead of the handler raising ``TypeError`` from ``open(None)``).
    """
    if not pdf_path:
        return ""

    result = []
    with open(pdf_path, "rb") as f:
        reader = PyPDF2.PdfReader(f)
        for page_no, page in enumerate(reader.pages, start=1):
            page_text = page.extract_text()
            if not page_text:
                # Pages without extractable text contribute nothing.
                continue
            result.extend(
                f"[Page {page_no} Line {line_no}] {line}"
                for line_no, line in enumerate(page_text.splitlines(), start=1)
            )
    return "\n".join(result) if result else "[NO TEXT FOUND]"
def get_sample_context(raw_text, example):
    """
    Show every place the taught example occurs, with one line of context each side.

    Args:
        raw_text: The extracted text to search, one logical line per text line.
        example: The sample value to look for; surrounding whitespace is ignored.

    Returns:
        One snippet per matching line (previous line, the highlighted match,
        next line), separated by ``---`` lines, or a fixed "no match" message
        when the example is blank or does not occur.
    """
    rows = raw_text.splitlines()
    needle = example.strip()
    if not needle:
        return "No match for example in extracted text."

    # Pad with empty neighbours so the first and last rows need no special case:
    # rows[pos] lives at padded[pos + 1].
    padded = [""] + rows + [""]
    snippets = [
        f"...\n{padded[pos]}\n>>> {padded[pos + 1]} <<<\n{padded[pos + 2]}\n..."
        for pos, row in enumerate(rows)
        if needle in row
    ]
    if snippets:
        return "\n---\n".join(snippets)
    return "No match for example in extracted text."
# Shape of the "[Page 3 Line 12]" marker that extract_with_lines puts in front
# of every extracted line. It is unique per line, so it must never become part
# of a learned pattern.
_LINE_MARKER = r"\[Page \d+ Line \d+\]"
_LINE_MARKER_RE = re.compile(r"^\s*" + _LINE_MARKER + r"\s?")


def guess_extraction_regex(sample_value, all_lines):
    """
    Build a simple extraction pattern from one taught sample value.

    The first line containing the sample decides the pattern:

    * ``Label: <sample>``  ->  matches ``Label : <anything>``
    * ``Label <sample>``   ->  matches ``Label <anything>``

    The page/line marker in front of a line is ignored while learning and is
    optional in the resulting pattern, so the pattern generalises to the same
    label on other pages and lines instead of matching only the sample's line.

    Args:
        sample_value: The example value the user copied from the text.
            Surrounding whitespace is ignored.
        all_lines: Iterable of text lines to search for the sample.

    Returns:
        A compiled, case-insensitive pattern intended for ``pattern.match(line)``
        whose group 1 is the extracted value, or ``None`` when the sample is
        blank, does not occur, or has no label text in front of it.
    """
    sample = sample_value.strip() if sample_value else ""
    if not sample:
        return None

    # Anchored by .match(): tolerate leading blanks and an optional line marker.
    line_start = r"\s*(?:" + _LINE_MARKER + r"\s*)?"

    for line in all_lines:
        # Search only the real content so digits inside the marker cannot be
        # mistaken for the sample.
        content = _LINE_MARKER_RE.sub("", line, count=1)
        if sample not in content:
            continue

        if ":" in content:
            label, _, value = content.partition(":")
            if value.strip() == sample:
                return re.compile(
                    line_start + re.escape(label.strip()) + r"\s*:\s*(.+)",
                    re.IGNORECASE,
                )

        # No "label: value" shape: treat whatever precedes the sample as the label.
        label = content.partition(sample)[0].strip()
        if label:
            return re.compile(
                line_start + re.escape(label) + r"\s*(.+)",
                re.IGNORECASE,
            )
    return None
def extract_table_from_sample(raw_text, label, sample_value):
    """
    Extract every value that looks like the taught sample into a one-column table.

    Args:
        raw_text: The extracted text to scan (``None`` is treated as empty).
        label: Column name for the extracted values. Surrounding whitespace is
            ignored; blank is rejected.
        sample_value: Example value used to learn the pattern. Surrounding
            whitespace is ignored; blank is rejected.

    Returns:
        A DataFrame with one ``label`` column holding each extracted value.
        On bad input or when nothing matches, a single-row DataFrame with an
        ``Error`` column describing the problem.
    """
    # Normalise first so whitespace-only input is rejected rather than
    # producing a blank column name or a sample that matches almost any line.
    label = (label or "").strip()
    sample_value = (sample_value or "").strip()
    if not label or not sample_value:
        return pd.DataFrame([{"Error": "Please supply both label and sample value!"}])

    lines = (raw_text or "").splitlines()
    found = []
    if lines:
        regex = guess_extraction_regex(sample_value, lines)
        if regex:
            found = [
                {label: m.group(1).strip()}
                for m in map(regex.match, lines)
                if m
            ]
        else:
            # No pattern could be learned: fall back to keeping whole lines
            # that contain the first few characters of the sample.
            prefix = sample_value[:5]
            found = [{label: line.strip()} for line in lines if prefix in line]

    if not found:
        return pd.DataFrame([{"Error": f"No matches found for sample: {sample_value}"}])
    return pd.DataFrame(found)
def export_xlsx(df):
    """
    Serialise a DataFrame to an in-memory Excel workbook.

    Args:
        df: The DataFrame to write; the index is not exported.

    Returns:
        An ``io.BytesIO`` holding the workbook, rewound to position 0 so it
        can be read immediately.
    """
    workbook_bytes = io.BytesIO()
    excel_writer = pd.ExcelWriter(workbook_bytes, engine="xlsxwriter")
    with excel_writer:
        df.to_excel(excel_writer, index=False)
    # Rewind only after the writer has been closed by the with-block.
    workbook_bytes.seek(0)
    return workbook_bytes
# Gradio front end: upload a PDF, teach one sample field, preview every
# similar value, then download the table as an Excel file.
# NOTE(review): the original indentation was lost, so which widgets sit inside
# each gr.Row() is reconstructed here - confirm against the intended layout.
with gr.Blocks() as demo:
    gr.Markdown(
        "# 🧑‍🏫 PDF Teach-&-Extract System\n"
        "**1. Upload PDF → 2. Teach a sample field → "
        "3. Preview all auto-extracted matches → 4. Download as Excel**"
    )

    # Step 1: upload, and show the numbered raw text so a sample can be copied.
    file_in = gr.File(label="Upload your PDF", file_count="single", type="filepath")
    raw_text = gr.Textbox(
        label="Raw extracted PDF text (preview/copy here)",
        lines=18,
        show_copy_button=True,
    )
    file_in.change(extract_with_lines, inputs=file_in, outputs=raw_text)

    # Step 2: teach a field name plus one example value, and show its context.
    with gr.Row():
        teach_label = gr.Textbox(label="Your Desired Field Name (e.g. Customer Name)")
        teach_sample = gr.Textbox(label="Example Value (copy-paste from above)")
    teach_search = gr.Button("Show Context")
    context_out = gr.Textbox(label="System shows the found context(s)", lines=4)
    teach_search.click(
        get_sample_context,
        inputs=[raw_text, teach_sample],
        outputs=context_out,
    )

    # Step 3: extract every similar value into a preview table.
    with gr.Row():
        extract_btn = gr.Button("Extract All Similar Values")
    results_table = gr.Dataframe(label="Extracted Results Table")

    # Step 4: offer the preview table as an .xlsx download.
    download_btn = gr.Button("Download as Excel")
    xlsx_file = gr.File(label="Excel Download (.xlsx)", visible=True)

    def extract_and_preview(raw_text, teach_label, teach_sample):
        """Return the table of values extracted from the taught sample."""
        df = extract_table_from_sample(raw_text, teach_label, teach_sample)
        return df

    extract_btn.click(
        extract_and_preview,
        inputs=[raw_text, teach_label, teach_sample],
        outputs=results_table,
    )

    def save_xlsx(df):
        """
        Write the results table to ``results.xlsx`` and return its path.

        A ``gr.File`` output expects a file path, not a ``(name, buffer)``
        tuple, so the in-memory workbook is saved to disk first.
        """
        buf = export_xlsx(df)
        # A fresh directory per export keeps the download name "results.xlsx"
        # without concurrent users overwriting each other's file.
        out_path = os.path.join(tempfile.mkdtemp(), "results.xlsx")
        with open(out_path, "wb") as fh:
            fh.write(buf.getvalue())
        return out_path

    download_btn.click(save_xlsx, inputs=results_table, outputs=xlsx_file)

demo.launch()