# Spaces: DSatishchandra / PMP_PO_Extraction (status: Sleeping)
# File size: 4,923 Bytes

import pdfplumber
import re
import pandas as pd
import gradio as gr

# Column order of one extracted line item; also used to give an empty result
# the same schema as a populated one.
_PO_ROW_COLUMNS = [
    "S. No",
    "Material No",
    "Material Description",
    "Qty",
    "Unit",
    "Price",
    "Delivery Date",
    "Total Value",
    "Vat%",
    "Amount Incl. VAT",
]

# A complete table row on a single text line (ten whitespace-separated fields).
# Compiled once instead of on every line of every page.
_PO_ROW_RE = re.compile(
    r"^\s*(\d+)\s+(\d+)\s+([A-Z0-9_(),\- ]+?)\s+(\d+)\s+(\w+)\s+([\d.]+)\s+([\d\-A-Za-z]+)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)\s*$"
)
_PO_NO_RE = re.compile(r"Purchase Order No[:\s]+(\S+)")
_PO_DATE_RE = re.compile(r"Purchase Order Date[:\s]+(\S+)")

# Footer/boilerplate fragments that get swept into a description because every
# non-row line following a row is treated as a continuation of that row.
_UNWANTED_DESCRIPTION_RES = [
    re.compile(pattern, re.IGNORECASE)
    for pattern in (
        r"This document is electronically approved",  # Matches exact phrase
        r"does not require any signature or stamp",   # Matches approval notes
        r"Total Amount Excl\. VAT.*",                # Matches totals
        r"TWO THOUSAND.*ONLY",                       # Matches written totals
        r"&",                                        # Removes stray symbols like `&`
        r"\.+$",                                     # Removes trailing periods
    )
]


def _parse_po_row(match):
    """Convert a successful ``_PO_ROW_RE`` match into a row dict."""
    return {
        "S. No": match[1],
        "Material No": match[2],
        "Material Description": match[3].strip(),
        "Qty": int(match[4]),
        "Unit": match[5],
        "Price": float(match[6]),
        "Delivery Date": match[7],
        "Total Value": float(match[8]),
        "Vat%": float(match[9]),
        "Amount Incl. VAT": float(match[10]),
    }


def _clean_description(description):
    """Remove the known boilerplate fragments from a Material Description."""
    # Patterns are applied in order, stripping after each one, so that the
    # trailing-period rule sees the text left over by the earlier removals.
    for pattern in _UNWANTED_DESCRIPTION_RES:
        description = pattern.sub("", description).strip()
    return description


def extract_po_data(pdf_file):
    """
    Extract Purchase Order line items from a PDF into a DataFrame.

    Each page's text is scanned line by line. A line matching the ten-field
    row pattern starts a new item; every following non-matching line on the
    same page is appended to that item's "Material Description" (multi-line
    descriptions). Known footer/boilerplate fragments are then removed from
    the descriptions.

    Args:
        pdf_file: Anything accepted by ``pdfplumber.open`` (e.g. a file path).

    Returns:
        A DataFrame with one row per line item and the columns listed in
        ``_PO_ROW_COLUMNS``. When both a "Purchase Order No" and a
        "Purchase Order Date" are found in the text, they are inserted as the
        first two columns. If no line item is found, the DataFrame is empty
        but still has the expected columns.

    Raises:
        ValueError: If a matched numeric field cannot be converted
            (e.g. a price such as "1.2.3").
    """
    rows = []
    purchase_order_no = None
    purchase_order_date = None

    with pdfplumber.open(pdf_file) as pdf:
        for page in pdf.pages:
            # A page without a text layer may yield no text at all; treat it
            # as empty rather than failing on None.split().
            text = page.extract_text() or ""

            # Header fields are looked up independently until each is found,
            # so a date appearing on a later page than the number is not lost.
            if purchase_order_no is None:
                po_no_match = _PO_NO_RE.search(text)
                if po_no_match:
                    purchase_order_no = po_no_match.group(1)
            if purchase_order_date is None:
                po_date_match = _PO_DATE_RE.search(text)
                if po_date_match:
                    purchase_order_date = po_date_match.group(1)

            # Row currently being assembled; reset per page so a description
            # never continues across a page break.
            current_row = None
            for line in text.split("\n"):
                match = _PO_ROW_RE.match(line)
                if match:
                    if current_row:
                        rows.append(current_row)
                    current_row = _parse_po_row(match)
                elif current_row:
                    # Not a row: treat as continuation of the description.
                    current_row["Material Description"] += f" {line.strip()}"

            if current_row:
                rows.append(current_row)

    # Passing the columns explicitly keeps the schema stable when no row was
    # matched, so the description clean-up below cannot raise KeyError.
    df = pd.DataFrame(rows, columns=_PO_ROW_COLUMNS)

    # Insert Purchase Order No and Purchase Order Date at the beginning
    if purchase_order_no and purchase_order_date:
        df.insert(0, "Purchase Order No", purchase_order_no)
        df.insert(1, "Purchase Order Date", purchase_order_date)

    # _clean_description already strips surrounding whitespace.
    df["Material Description"] = df["Material Description"].apply(_clean_description)

    return df

def process_pdf(file):
    """
    Process the uploaded PDF and save the extracted data as an Excel file.

    Args:
        file: The upload handed over by the UI. Either an object exposing the
            file path as ``.name`` or a plain path string is accepted;
            ``None`` means nothing was uploaded.

    Returns:
        A ``(output_path, status_message)`` tuple. ``output_path`` is the path
        of the written ``.xlsx`` file, or ``None`` when processing failed or
        produced no rows.
    """
    import os
    import tempfile

    if file is None:
        return None, "Error during processing: No PDF file was uploaded."

    # NOTE(review): depending on the Gradio version/config the File input is
    # presumably either a temp-file wrapper with `.name` or a path string;
    # fall back to the value itself when there is no `.name`.
    pdf_path = getattr(file, "name", file)

    try:
        # Process the extracted text into a DataFrame
        df = extract_po_data(pdf_path)

        if df.empty:
            return None, "Error during processing: No purchase order line items were found in the PDF."

        # Write into a fresh directory per call so simultaneous requests do
        # not overwrite each other's workbook; the file name is unchanged.
        output_dir = tempfile.mkdtemp(prefix="po_extract_")
        output_path = os.path.join(output_dir, "federal_electric_extracted_data.xlsx")
        df.to_excel(output_path, index=False, engine="openpyxl")
        return output_path, "Data extraction successful!"
    except Exception as e:
        # UI boundary: report the failure in the status box instead of raising.
        return None, f"Error during processing: {str(e)}"

# Gradio app interface: a single-function UI around process_pdf.
# The one File input is passed to process_pdf as its argument; the two outputs
# receive, in order, the (output path, status message) pair it returns.
iface = gr.Interface(
    fn=process_pdf,
    inputs=[gr.File(label="Upload PDF")],
    outputs=[
        gr.File(label="Download Extracted Data"),
        gr.Textbox(label="Status")
    ],
    title="Enhanced PO Data Extractor",
    description="Extract data from Purchase Orders, including multi-line descriptions, and clean unwanted text or symbols."
)

# Launch the UI only when this file is run as a script, not when imported.
if __name__ == "__main__":
    iface.launch()