# Hugging Face Space: vaibhavbalar / PDF_Extractor
# Space status when captured: Sleeping
# File size: 4,779 Bytes

import io
import re
import tempfile
from pathlib import Path

import gradio as gr
import pandas as pd
import PyPDF2

def extract_with_lines(pdf_path):
    """
    Extract all PDF text, prefixing every line with its page and line number.

    Args:
        pdf_path: Filesystem path of the PDF to read. ``None`` or an empty
            string (no file selected) is accepted and yields an empty preview.

    Returns:
        One ``[Page P Line L] text`` entry per extracted line, joined with
        newlines (page and line numbers are 1-based); ``"[NO TEXT FOUND]"``
        when the PDF yields no text at all; ``""`` when no path was supplied.
    """
    if not pdf_path:
        # No file to read: return an empty preview instead of letting
        # open() raise on a missing path.
        return ""

    result = []
    with open(pdf_path, "rb") as f:
        reader = PyPDF2.PdfReader(f)
        for page_no, page in enumerate(reader.pages, start=1):
            page_text = page.extract_text()
            if not page_text:
                # Pages without extractable text contribute no lines.
                continue
            result.extend(
                f"[Page {page_no} Line {line_no}] {line}"
                for line_no, line in enumerate(page_text.splitlines(), start=1)
            )
    return "\n".join(result) if result else "[NO TEXT FOUND]"

def get_sample_context(raw_text, example):
    """
    Show every place the example occurs, for user feedback (teaching phase).

    Each hit is rendered as the matching line wrapped in ``>>> ... <<<`` with
    the previous and next line around it (empty at the text boundaries); hits
    are separated by ``---``. A fixed message is returned when nothing matches
    or the example is blank.
    """
    all_lines = raw_text.splitlines()
    needle = example.strip()
    last_index = len(all_lines) - 1

    hits = []
    if needle:
        for idx, current in enumerate(all_lines):
            if needle not in current:
                continue
            before = all_lines[idx - 1] if idx > 0 else ""
            after = all_lines[idx + 1] if idx < last_index else ""
            hits.append(
                "\n".join(["...", before, f">>> {current} <<<", after, "..."])
            )

    if hits:
        return "\n---\n".join(hits)
    return "No match for example in extracted text."

def guess_extraction_regex(sample_value, all_lines):
    """
    Use the sample value to build a simple extraction pattern.

    The first line containing the sample decides the pattern:

    * ``<header>: <sample>`` -> capture whatever follows ``<header>:``.
    * ``<header> <sample> ...`` -> capture whatever follows ``<header>``.

    A leading ``[Page N Line M]`` marker (the format produced by
    ``extract_with_lines``) is not treated as part of the header; otherwise
    the pattern would be tied to the single line the sample came from. The
    returned pattern tolerates leading whitespace and an optional marker, so
    it can be applied with ``.match`` to marked or unmarked lines alike.

    Args:
        sample_value: Example value copied from the extracted text.
        all_lines: Lines of the extracted text to search for the sample.

    Returns:
        A compiled, case-insensitive pattern whose group 1 is the extracted
        value, or ``None`` when the sample is blank, does not occur, or has
        no header text in front of it.
    """
    sample = sample_value.strip() if sample_value else ""
    if not sample:
        # A blank sample would be "contained" in every line.
        return None

    marker = r"\[Page \d+ Line \d+\]"
    leading_marker = re.compile(r"^\s*" + marker + r"\s*")
    # Prepended to every learned pattern: optional indentation + marker.
    optional_marker = r"\s*(?:" + marker + r"\s*)?"

    for line in all_lines:
        if sample not in line:
            continue
        body = leading_marker.sub("", line, count=1)
        if sample not in body:
            # The sample only matched inside the page/line marker itself.
            continue

        header, colon, value = body.partition(":")
        if colon and value.strip() == sample:
            return re.compile(
                optional_marker + re.escape(header.strip()) + r"\s*:\s*(.+)",
                re.IGNORECASE,
            )

        # No exact "header: value" shape: use the text in front of the
        # first occurrence of the sample as the header.
        lead = body.partition(sample)[0].strip()
        if lead:
            return re.compile(
                optional_marker + re.escape(lead) + r"\s*(.+)",
                re.IGNORECASE,
            )
    return None

def extract_table_from_sample(raw_text, label, sample_value):
    """
    Build a one-column table of every value that looks like the taught sample.

    Args:
        raw_text: Extracted text to scan, one candidate per line.
        label: Column name for the extracted values.
        sample_value: Example value used to infer the extraction pattern.

    Returns:
        A DataFrame with a single ``label`` column holding the matches, or a
        single-row DataFrame with an ``Error`` column when an input is blank
        or nothing matched.
    """
    # Surrounding whitespace is not significant (copy-pasted values often
    # carry it), and whitespace-only input counts as missing.
    label = (label or "").strip()
    sample_value = (sample_value or "").strip()
    if not label or not sample_value:
        return pd.DataFrame([{"Error": "Please supply both label and sample value!"}])

    lines = (raw_text or "").splitlines()
    regex = guess_extraction_regex(sample_value, lines)
    if regex:
        matches = (regex.match(line) for line in lines)
        found = [{label: m.group(1).strip()} for m in matches if m]
    else:
        # No pattern could be inferred: fall back to returning whole lines
        # that contain the first few characters of the sample.
        prefix = sample_value[:5]
        found = [{label: line.strip()} for line in lines if prefix in line]

    if not found:
        return pd.DataFrame([{"Error": f"No matches found for sample: {sample_value}"}])
    return pd.DataFrame(found)

def export_xlsx(df):
    """
    Serialise *df* as an .xlsx workbook held in memory.

    The frame is written without its index using the ``xlsxwriter`` engine.
    Returns the ``io.BytesIO`` buffer, rewound to the start so it can be read
    immediately.
    """
    workbook_bytes = io.BytesIO()
    excel_writer = pd.ExcelWriter(workbook_bytes, engine="xlsxwriter")
    # Leaving the context closes the writer, which finalises the workbook
    # into the buffer.
    with excel_writer:
        df.to_excel(excel_writer, index=False)
    workbook_bytes.seek(0)
    return workbook_bytes

# UI wiring: upload -> raw text preview -> teach a sample -> extract -> download.
with gr.Blocks() as demo:
    gr.Markdown("# 🧑‍🏫 PDF Teach-&-Extract System\n**1. Upload PDF → 2. Teach a sample field → 3. Preview all auto-extracted matches → 4. Download as Excel**")

    # Step 1: upload a PDF and show its text with page/line prefixes.
    file_in = gr.File(label="Upload your PDF", file_count="single", type="filepath")
    raw_text = gr.Textbox(label="Raw extracted PDF text (preview/copy here)", lines=18, show_copy_button=True)
    file_in.change(extract_with_lines, inputs=file_in, outputs=raw_text)

    # Step 2: name a field, paste an example value, and preview where it occurs.
    with gr.Row():
        teach_label = gr.Textbox(label="Your Desired Field Name (e.g. Customer Name)")
        teach_sample = gr.Textbox(label="Example Value (copy-paste from above)")
        teach_search = gr.Button("Show Context")
    context_out = gr.Textbox(label="System shows the found context(s)", lines=4)
    teach_search.click(get_sample_context, inputs=[raw_text, teach_sample], outputs=context_out)

    # Steps 3 and 4: extract all similar values, then export the table.
    with gr.Row():
        extract_btn = gr.Button("Extract All Similar Values")
        results_table = gr.Dataframe(label="Extracted Results Table")
        download_btn = gr.Button("Download as Excel")
        xlsx_file = gr.File(label="Excel Download (.xlsx)", visible=True)

    def extract_and_preview(raw_text, teach_label, teach_sample):
        """Return the table of values extracted from the taught sample."""
        df = extract_table_from_sample(raw_text, teach_label, teach_sample)
        return df

    extract_btn.click(extract_and_preview, inputs=[raw_text, teach_label, teach_sample], outputs=results_table)

    def save_xlsx(df):
        """Write the results table to ``results.xlsx`` and return its path."""
        buf = export_xlsx(df)
        # A gr.File output is populated from a file path, not from a
        # (name, buffer) tuple, so persist the workbook first. A fresh temp
        # directory keeps the download name "results.xlsx" without clashing
        # with other exports.
        out_path = Path(tempfile.mkdtemp(prefix="pdf_extract_")) / "results.xlsx"
        out_path.write_bytes(buf.getvalue())
        return str(out_path)

    download_btn.click(save_xlsx, inputs=results_table, outputs=xlsx_file)
demo.launch()