File size: 4,779 Bytes
71c0901
 
fa988d9
 
 
71c0901
cbbc2a6
fa988d9
 
 
 
cbbc2a6
71c0901
 
 
 
 
 
 
 
 
 
fa988d9
 
71c0901
 
 
 
 
 
 
 
 
 
 
 
 
fa988d9
 
 
 
 
 
 
 
 
 
cbbc2a6
 
fa988d9
cbbc2a6
fa988d9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
71c0901
fa988d9
cbbc2a6
fa988d9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
71c0901
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
import gradio as gr  
import PyPDF2  
import pandas as pd  
import re  
import io  

def extract_with_lines(pdf_path):
    """
    Extract all text from a PDF, tagging each line with its page and line number.

    Every output line has the form ``[Page P Line L] <text>`` where both
    counters are 1-based and the line counter restarts on every page.

    Args:
        pdf_path: Filesystem path of the PDF, or a falsy value (``None`` / ``""``)
            when no file is selected.

    Returns:
        The tagged lines joined with newlines; ``"[NO TEXT FOUND]"`` when the
        PDF yields no text at all; ``""`` when no path was supplied.
    """
    # The upload widget reports None once the file is cleared; opening that
    # would raise TypeError, so just hand back an empty preview instead.
    if not pdf_path:
        return ""

    with open(pdf_path, "rb") as f:
        reader = PyPDF2.PdfReader(f)
        result = []
        for page_no, page in enumerate(reader.pages, start=1):
            page_text = page.extract_text()
            if not page_text:
                # Pages without extractable text contribute nothing.
                continue
            for line_no, line in enumerate(page_text.splitlines(), start=1):
                result.append(f"[Page {page_no} Line {line_no}] {line}")
    return "\n".join(result) if result else "[NO TEXT FOUND]"

def get_sample_context(raw_text, example):
    """
    Locate every line of *raw_text* containing *example* and show its neighbours.

    The example is stripped of surrounding whitespace before searching. Each
    hit is rendered as the previous line, the matching line wrapped in
    ``>>> ... <<<`` and the next line (an empty string stands in for a missing
    neighbour); hits are separated by a ``---`` line. A fixed "no match"
    message is returned when nothing is found or the example is blank.
    """
    text_lines = raw_text.splitlines()
    needle = example.strip()
    snippets = []
    if needle:
        final_idx = len(text_lines) - 1
        for idx, current in enumerate(text_lines):
            if needle not in current:
                continue
            before = "" if idx == 0 else text_lines[idx - 1]
            after = "" if idx == final_idx else text_lines[idx + 1]
            snippets.append(f"...\n{before}\n>>> {current} <<<\n{after}\n...")
    if snippets:
        return "\n---\n".join(snippets)
    return "No match for example in extracted text."

# Shape of the page/line tag that extract_with_lines() puts in front of every
# line of the preview text.
_LINE_TAG = r"\[Page \d+ Line \d+\]"


def guess_extraction_regex(sample_value, all_lines):
    """
    Build a simple extraction pattern from one example value.

    The first line containing the (whitespace-stripped) sample decides the
    pattern:

    * if the line looks like ``<header>: <sample>``, lines with the same
      header followed by a colon are matched;
    * otherwise, if some text precedes the sample on that line, lines that
      start with the same text are matched.

    A leading ``[Page N Line M]`` tag is ignored while learning the pattern and
    is accepted (optionally) by the returned pattern, so one sample generalizes
    to every page and line rather than only to the line it was copied from.

    Args:
        sample_value: Example value copied from the extracted text.
        all_lines: Lines of the extracted text to search for the sample.

    Returns:
        A compiled, case-insensitive pattern meant to be used with ``.match()``
        whose group 1 is the extracted value, or ``None`` when the sample is
        blank or no usable line is found.
    """
    needle = sample_value.strip()
    if not needle:
        return None

    leading_tag = re.compile(r"\s*" + _LINE_TAG + r"\s*")
    # Non-capturing, so group 1 of the returned pattern stays the value.
    optional_tag = r"\s*(?:" + _LINE_TAG + r"\s*)?"

    for line in all_lines:
        tag = leading_tag.match(line)
        body = line[tag.end():] if tag else line
        if needle not in body:
            continue

        header, colon, value = body.partition(":")
        if colon and value.strip() == needle:
            return re.compile(
                optional_tag + re.escape(header.strip()) + r"\s*:\s*(.+)",
                re.IGNORECASE,
            )

        # Fall back to whatever text precedes the first occurrence of the sample.
        lead = body[: body.index(needle)].strip()
        if lead:
            return re.compile(
                optional_tag + re.escape(lead) + r"\s*(.+)",
                re.IGNORECASE,
            )
    return None

def extract_table_from_sample(raw_text, label, sample_value):
    """
    Collect every value resembling *sample_value* into a one-column table.

    The column is named after *label*. When guess_extraction_regex() yields a
    pattern, group 1 of each line it matches becomes a row; otherwise every
    line containing the first five characters of the sample is taken whole.
    A single-row table with an "Error" column is returned when the label or
    sample is missing, or when nothing matched.
    """
    text_lines = raw_text.splitlines()
    if not (label and sample_value):
        return pd.DataFrame([{"Error": "Please supply both label and sample value!"}])

    pattern = guess_extraction_regex(sample_value, text_lines)
    if pattern:
        hits = (pattern.match(entry) for entry in text_lines)
        rows = [{label: hit.group(1).strip()} for hit in hits if hit]
    else:
        # No learnable pattern: fall back to a crude substring search.
        head = sample_value[:5]
        rows = [{label: entry.strip()} for entry in text_lines if head in entry]

    if rows:
        return pd.DataFrame(rows)
    return pd.DataFrame([{"Error": f"No matches found for sample: {sample_value}"}])

def export_xlsx(df):
    """
    Serialize *df* into an in-memory Excel workbook.

    The sheet is written without the index column using the xlsxwriter
    engine. Returns the ``io.BytesIO`` holding the workbook, rewound to the
    start so it can be read immediately.
    """
    stream = io.BytesIO()
    workbook_writer = pd.ExcelWriter(stream, engine="xlsxwriter")
    with workbook_writer:
        df.to_excel(workbook_writer, index=False)
    stream.seek(0)
    return stream

# UI wiring: upload a PDF -> preview tagged text -> teach one sample value ->
# extract all similar values -> download the result table as an Excel file.
with gr.Blocks() as demo:
    # Non-ASCII characters are spelled as escapes (teacher emoji, right arrows)
    # so the heading cannot be garbled by a wrong source-file encoding.
    gr.Markdown(
        "# \U0001F9D1\u200D\U0001F3EB PDF Teach-&-Extract System\n"
        "**1. Upload PDF \u2192 2. Teach a sample field \u2192 "
        "3. Preview all auto-extracted matches \u2192 4. Download as Excel**"
    )
    file_in = gr.File(label="Upload your PDF", file_count="single", type="filepath")
    raw_text = gr.Textbox(label="Raw extracted PDF text (preview/copy here)", lines=18, show_copy_button=True)
    file_in.change(extract_with_lines, inputs=file_in, outputs=raw_text)
    with gr.Row():
        teach_label = gr.Textbox(label="Your Desired Field Name (e.g. Customer Name)")
        teach_sample = gr.Textbox(label="Example Value (copy-paste from above)")
        teach_search = gr.Button("Show Context")
    context_out = gr.Textbox(label="System shows the found context(s)", lines=4)
    teach_search.click(get_sample_context, inputs=[raw_text, teach_sample], outputs=context_out)
    with gr.Row():
        extract_btn = gr.Button("Extract All Similar Values")
        results_table = gr.Dataframe(label="Extracted Results Table")
        download_btn = gr.Button("Download as Excel")
        xlsx_file = gr.File(label="Excel Download (.xlsx)", visible=True)

    def extract_and_preview(raw_text, teach_label, teach_sample):
        """Return the table of values extracted from the previewed text."""
        return extract_table_from_sample(raw_text, teach_label, teach_sample)

    extract_btn.click(extract_and_preview, inputs=[raw_text, teach_label, teach_sample], outputs=results_table)

    def save_xlsx(df):
        """Write the results table to ``results.xlsx`` in a fresh temp dir and return its path."""
        import os
        import tempfile

        # A file output component is fed a path on disk; the previous
        # ("results.xlsx", BytesIO) tuple is not a value it can serve.
        out_path = os.path.join(tempfile.mkdtemp(), "results.xlsx")
        with open(out_path, "wb") as fh:
            fh.write(export_xlsx(df).getvalue())
        return out_path

    download_btn.click(save_xlsx, inputs=results_table, outputs=xlsx_file)
demo.launch()