File size: 5,057 Bytes
ea15866
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
import pdfplumber
import re
import pandas as pd
import gradio as gr

def extract_po_data(pdf_file):
    """
    Extracts Purchase Order data with enhanced multi-line Material Description handling,
    and cleans unwanted text or symbols.
    """
    data = []
    purchase_order_no = None
    purchase_order_date = None
    
    with pdfplumber.open(pdf_file) as pdf:
        for page in pdf.pages:
            # Extract text from page
            lines = page.extract_text().split("\n")
            temp_row = None  # Temporary row to handle multi-line descriptions
            
            # Extract Purchase Order Number and Date (Assume it's on the first page)
            if purchase_order_no is None:  # Only extract once
                po_no_match = re.search(r"Purchase Order No[:\s]+(\S+)", "\n".join(lines))
                po_date_match = re.search(r"Purchase Order Date[:\s]+(\S+)", "\n".join(lines))
                
                if po_no_match:
                    purchase_order_no = po_no_match.group(1)
                if po_date_match:
                    purchase_order_date = po_date_match.group(1)
            
            # Process each line to extract data
            for line in lines:
                # Regex pattern for rows (excluding multi-line descriptions)
                pattern = r"^\s*(\d+)\s+(\d+)\s+([A-Z0-9_(),\- ]+?)\s+(\d+)\s+(\w+)\s+([\d.]+)\s+([\d\-A-Za-z]+)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)\s*$"
                match = re.match(pattern, line)
                
                if match:
                    # If there's a match, capture the full row
                    if temp_row:  # Append the previous temp_row if it exists
                        data.append(temp_row)
                        temp_row = None
                    temp_row = {
                        "S. No": match[1],
                        "Material No": match[2],
                        "Material Description": match[3].strip(),
                        "Qty": int(match[4]),
                        "Unit": match[5],
                        "Price": float(match[6]),
                        "Delivery Date": match[7],
                        "Total Value": float(match[8]),
                        "Vat%": float(match[9]),
                        "Amount Incl. VAT": float(match[10]),
                    }
                elif temp_row:
                    # If no match, treat it as a continuation of Material Description
                    temp_row["Material Description"] += f" {line.strip()}"
            
            # Append the last row
            if temp_row:
                data.append(temp_row)
    
    # Create DataFrame
    df = pd.DataFrame(data)
    
    # Insert Purchase Order No and Purchase Order Date at the beginning
    if purchase_order_no and purchase_order_date:
        df.insert(0, "Purchase Order No", purchase_order_no)
        df.insert(1, "Purchase Order Date", purchase_order_date)
    
    # Filter unwanted text from Material Description
    def clean_description(description):
        # Define unwanted patterns
        unwanted_patterns = [
            r"This document is electronically approved",  # Matches exact phrase
            r"does not require any signature or stamp",   # Matches approval notes
            r"Total Amount Excl\. VAT.*",                # Matches totals
            r"TWO THOUSAND.*ONLY",                       # Matches written totals
            r"&",                                        # Removes stray symbols like `&`
            r"\.+$",                                     # Removes trailing periods
        ]
        for pattern in unwanted_patterns:
            description = re.sub(pattern, "", description, flags=re.IGNORECASE).strip()
        return description

    df["Material Description"] = df["Material Description"].apply(clean_description)
    
    # Strip extra spaces
    df["Material Description"] = df["Material Description"].str.strip()

    return df

def process_and_save(pdf_file, output_format):
    """
    Processes the uploaded PDF and saves the extracted data as an Excel or CSV file.
    """
    df = extract_po_data(pdf_file.name)
    
    # Save the file in the desired format
    output_file = f"output.{output_format}"
    if output_format == "csv":
        df.to_csv(output_file, index=False)
    elif output_format == "xlsx":
        df.to_excel(output_file, index=False, engine="openpyxl")
    
    return output_file

# Gradio interface function
def gradio_interface(pdf_file, output_format):
    output_file = process_and_save(pdf_file, output_format)
    return output_file

# Gradio app interface
iface = gr.Interface(
    fn=gradio_interface,
    inputs=[gr.File(label="Upload PDF"), gr.Radio(["csv", "xlsx"], label="Output Format")],
    outputs=gr.File(label="Download Output"),
    title="Enhanced PO Data Extractor",
    description="Extract data from Purchase Orders, including multi-line descriptions, and clean unwanted text or symbols. Download as CSV or Excel."
)

if __name__ == "__main__":
    iface.launch()