File size: 4,923 Bytes
ea15866
cf677a1
9d15142
 
cf677a1
9d15142
ea15866
9d15142
 
ea15866
 
9d15142
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ea15866
9d15142
 
 
 
cf677a1
9d15142
ea15866
6bf4435
9d15142
6bf4435
9d15142
6168767
 
 
 
 
 
 
 
 
 
 
9d15142
 
6bf4435
 
 
 
 
 
9d15142
6bf4435
9d15142
ea15866
9d15142
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
import pdfplumber
import re
import pandas as pd
import gradio as gr

def extract_po_data(pdf_file):
    """
    Extracts Purchase Order data with enhanced multi-line Material Description handling,
    and cleans unwanted text or symbols.
    """
    data = []
    purchase_order_no = None
    purchase_order_date = None
    
    with pdfplumber.open(pdf_file) as pdf:
        for page in pdf.pages:
            # Extract text from page
            lines = page.extract_text().split("\n")
            temp_row = None  # Temporary row to handle multi-line descriptions
            
            # Extract Purchase Order Number and Date (Assume it's on the first page)
            if purchase_order_no is None:  # Only extract once
                po_no_match = re.search(r"Purchase Order No[:\s]+(\S+)", "\n".join(lines))
                po_date_match = re.search(r"Purchase Order Date[:\s]+(\S+)", "\n".join(lines))
                
                if po_no_match:
                    purchase_order_no = po_no_match.group(1)
                if po_date_match:
                    purchase_order_date = po_date_match.group(1)
            
            # Process each line to extract data
            for line in lines:
                # Regex pattern for rows (excluding multi-line descriptions)
                pattern = r"^\s*(\d+)\s+(\d+)\s+([A-Z0-9_(),\- ]+?)\s+(\d+)\s+(\w+)\s+([\d.]+)\s+([\d\-A-Za-z]+)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)\s*$"
                match = re.match(pattern, line)
                
                if match:
                    # If there's a match, capture the full row
                    if temp_row:  # Append the previous temp_row if it exists
                        data.append(temp_row)
                        temp_row = None
                    temp_row = {
                        "S. No": match[1],
                        "Material No": match[2],
                        "Material Description": match[3].strip(),
                        "Qty": int(match[4]),
                        "Unit": match[5],
                        "Price": float(match[6]),
                        "Delivery Date": match[7],
                        "Total Value": float(match[8]),
                        "Vat%": float(match[9]),
                        "Amount Incl. VAT": float(match[10]),
                    }
                elif temp_row:
                    # If no match, treat it as a continuation of Material Description
                    temp_row["Material Description"] += f" {line.strip()}"
            
            # Append the last row
            if temp_row:
                data.append(temp_row)
    
    # Create DataFrame
    df = pd.DataFrame(data)
    
    # Insert Purchase Order No and Purchase Order Date at the beginning
    if purchase_order_no and purchase_order_date:
        df.insert(0, "Purchase Order No", purchase_order_no)
        df.insert(1, "Purchase Order Date", purchase_order_date)
    
    # Filter unwanted text from Material Description
    def clean_description(description):
        # Define unwanted patterns
        unwanted_patterns = [
            r"This document is electronically approved",  # Matches exact phrase
            r"does not require any signature or stamp",   # Matches approval notes
            r"Total Amount Excl\. VAT.*",                # Matches totals
            r"TWO THOUSAND.*ONLY",                       # Matches written totals
            r"&",                                        # Removes stray symbols like `&`
            r"\.+$",                                     # Removes trailing periods
        ]
        for pattern in unwanted_patterns:
            description = re.sub(pattern, "", description, flags=re.IGNORECASE).strip()
        return description

    df["Material Description"] = df["Material Description"].apply(clean_description)
    
    # Strip extra spaces
    df["Material Description"] = df["Material Description"].str.strip()

    return df

def process_pdf(file):
    """
    Processes the uploaded PDF and saves the extracted data.
    """
    try:
        # Process the extracted text into a DataFrame
        df = extract_po_data(file.name)
        
        # Save the DataFrame to an Excel file
        output_path = "federal_electric_extracted_data.xlsx"
        df.to_excel(output_path, index=False, engine="openpyxl")
        return output_path, "Data extraction successful!"
    except Exception as e:
        return None, f"Error during processing: {str(e)}"

# Gradio app interface
iface = gr.Interface(
    fn=process_pdf,
    inputs=[gr.File(label="Upload PDF")],
    outputs=[
        gr.File(label="Download Extracted Data"),
        gr.Textbox(label="Status")
    ],
    title="Enhanced PO Data Extractor",
    description="Extract data from Purchase Orders, including multi-line descriptions, and clean unwanted text or symbols."
)

if __name__ == "__main__":
    iface.launch()