"""Gradio app: upload a PDF, teach one sample value, extract similar values, export to Excel."""

import io
import os
import re
import tempfile

import gradio as gr
import pandas as pd
import PyPDF2

# extract_with_lines() prefixes every line with "[Page N Line M] ".  The marker
# differs on every line, so it must never become part of a learned pattern.
# Non-capturing and optional: group(1) of the generated patterns stays the
# extracted value, and the patterns also work on text without markers.
_LINE_TAG = r"(?:\[Page \d+ Line \d+\] ?)?"
_LINE_TAG_RE = re.compile(r"^\[Page \d+ Line \d+\] ?")


def _strip_line_tag(line):
    """Return *line* without a leading "[Page N Line M] " marker, if present."""
    return _LINE_TAG_RE.sub("", line, count=1)


def extract_with_lines(pdf_path):
    """Extract all text of a PDF, one output line per extracted text line.

    Every line is prefixed with "[Page N Line M] " (both 1-based).

    Args:
        pdf_path: Filesystem path of the PDF, or None/"" when the upload
            widget has been cleared.

    Returns:
        The tagged lines joined with newlines, "[NO TEXT FOUND]" if no page
        yielded any text, or "" when no file was supplied.
    """
    # The File component's change event also fires with None when the user
    # clears the upload; open(None) would raise.
    if not pdf_path:
        return ""

    tagged = []
    with open(pdf_path, "rb") as f:
        reader = PyPDF2.PdfReader(f)
        # Pages are read inside the `with` so the file is still open.
        for page_no, page in enumerate(reader.pages, start=1):
            page_text = page.extract_text()
            if not page_text:
                continue
            for line_no, line in enumerate(page_text.splitlines(), start=1):
                tagged.append(f"[Page {page_no} Line {line_no}] {line}")
    return "\n".join(tagged) if tagged else "[NO TEXT FOUND]"


def get_sample_context(raw_text, example):
    """Show where the sample occurs, for user feedback (teaching phase).

    Args:
        raw_text: Text to search, one record per line.
        example: Sample value; surrounding whitespace is ignored.

    Returns:
        One snippet (previous line, marked matching line, next line) per
        line containing the sample, joined by "---" separators, or a
        "no match" message.
    """
    lines = (raw_text or "").splitlines()
    needle = (example or "").strip()

    snippets = []
    if needle:
        for idx, line in enumerate(lines):
            if needle not in line:
                continue
            prev_line = lines[idx - 1] if idx > 0 else ""
            next_line = lines[idx + 1] if idx + 1 < len(lines) else ""
            snippets.append(f"...\n{prev_line}\n>>> {line} <<<\n{next_line}\n...")

    if not snippets:
        return "No match for example in extracted text."
    return "\n---\n".join(snippets)


def guess_extraction_regex(sample_value, all_lines):
    """Build a simple extraction pattern from one sample value.

    The first line containing the sample decides the pattern:

    * "Label: value" where the sample is everything after the first colon
      gives a pattern matching "Label : <anything>".
    * Otherwise the text in front of the sample is used as a fixed label
      and everything after it is captured.

    A leading "[Page N Line M] " marker is never part of the label; the
    returned pattern accepts it as an optional prefix, so it can be applied
    with ``.match()`` to tagged and untagged lines alike.

    Args:
        sample_value: Example value picked by the user.
        all_lines: Iterable of text lines to learn from.

    Returns:
        A compiled, case-insensitive pattern whose group 1 is the value, or
        None when no usable label was found.
    """
    sample_value = (sample_value or "").strip()
    if not sample_value:
        return None

    for raw_line in all_lines:
        line = _strip_line_tag(raw_line)
        if sample_value not in line:
            continue

        if ":" in line:
            prefix, suffix = line.split(":", 1)
            if sample_value == suffix.strip():
                return re.compile(
                    _LINE_TAG + r"\s*" + re.escape(prefix.strip()) + r"\s*:\s*(.+)",
                    re.IGNORECASE,
                )

        # Text before the first occurrence of the sample acts as the label.
        leading = line.partition(sample_value)[0].strip()
        if leading:
            return re.compile(
                _LINE_TAG + r"\s*" + re.escape(leading) + r"\s*(.+)",
                re.IGNORECASE,
            )
    return None


def extract_table_from_sample(raw_text, label, sample_value):
    """Extract every value that looks like the taught sample.

    Args:
        raw_text: Text to search, one record per line.
        label: Column name for the extracted values.
        sample_value: Example value used to learn the pattern.

    Returns:
        A DataFrame with one column named *label* and one row per match.
        When input is missing or nothing matches, a one-row DataFrame with
        an "Error" column is returned instead.
    """
    label = (label or "").strip()
    sample_value = (sample_value or "").strip()
    if not label or not sample_value:
        return pd.DataFrame([{"Error": "Please supply both label and sample value!"}])

    lines = (raw_text or "").splitlines()
    regex = guess_extraction_regex(sample_value, lines)

    if regex is not None:
        matches = (regex.match(line) for line in lines)
        found = [{label: m.group(1).strip()} for m in matches if m]
    else:
        # No label could be learned: fall back to lines sharing the sample's
        # first characters.  Search the marker-free text so that a sample
        # such as "Page " does not hit the marker of every line.
        prefix = sample_value[:5]
        contents = (_strip_line_tag(line) for line in lines)
        found = [{label: text.strip()} for text in contents if prefix in text]

    if not found:
        return pd.DataFrame([{"Error": f"No matches found for sample: {sample_value}"}])
    return pd.DataFrame(found)


def export_xlsx(df):
    """Serialize *df* to an in-memory .xlsx workbook.

    Args:
        df: DataFrame to write (without its index).

    Returns:
        An io.BytesIO positioned at offset 0 holding the workbook bytes.
    """
    buf = io.BytesIO()
    with pd.ExcelWriter(buf, engine="xlsxwriter") as writer:
        df.to_excel(writer, index=False)
    buf.seek(0)
    return buf


with gr.Blocks() as demo:
    gr.Markdown("# 🧑‍🏫 PDF Teach-&-Extract System\n**1. Upload PDF → 2. Teach a sample field → 3. Preview all auto-extracted matches → 4. Download as Excel**")

    # Step 1: upload and show the tagged raw text.
    file_in = gr.File(label="Upload your PDF", file_count="single", type="filepath")
    raw_text = gr.Textbox(label="Raw extracted PDF text (preview/copy here)", lines=18, show_copy_button=True)
    file_in.change(extract_with_lines, inputs=file_in, outputs=raw_text)

    # Step 2: teach one sample and show where it occurs.
    with gr.Row():
        teach_label = gr.Textbox(label="Your Desired Field Name (e.g. Customer Name)")
        teach_sample = gr.Textbox(label="Example Value (copy-paste from above)")
        teach_search = gr.Button("Show Context")
    context_out = gr.Textbox(label="System shows the found context(s)", lines=4)
    teach_search.click(get_sample_context, inputs=[raw_text, teach_sample], outputs=context_out)

    # Step 3: extract all similar values.
    with gr.Row():
        extract_btn = gr.Button("Extract All Similar Values")
    results_table = gr.Dataframe(label="Extracted Results Table")

    # Step 4: download.
    download_btn = gr.Button("Download as Excel")
    xlsx_file = gr.File(label="Excel Download (.xlsx)", visible=True)

    def extract_and_preview(raw_text, teach_label, teach_sample):
        """Click handler: return the extraction result table for preview."""
        return extract_table_from_sample(raw_text, teach_label, teach_sample)

    extract_btn.click(extract_and_preview, inputs=[raw_text, teach_label, teach_sample], outputs=results_table)

    def save_xlsx(df):
        """Click handler: write the table to results.xlsx and return its path.

        The File output component is given a path on disk, so the in-memory
        workbook is written to a fresh temporary directory first.
        """
        if df is None:
            return None
        out_path = os.path.join(tempfile.mkdtemp(), "results.xlsx")
        with open(out_path, "wb") as fh:
            fh.write(export_xlsx(df).getvalue())
        return out_path

    download_btn.click(save_xlsx, inputs=results_table, outputs=xlsx_file)

# Only start the server when run as a script, so the helpers can be imported.
if __name__ == "__main__":
    demo.launch()