PMP_PO_Extraction / federal_electric.py
DSatishchandra's picture
Update federal_electric.py
6bf4435 verified
raw
history blame
4.92 kB
import pdfplumber
import re
import pandas as pd
import gradio as gr
def extract_po_data(pdf_file):
"""
Extracts Purchase Order data with enhanced multi-line Material Description handling,
and cleans unwanted text or symbols.
"""
data = []
purchase_order_no = None
purchase_order_date = None
with pdfplumber.open(pdf_file) as pdf:
for page in pdf.pages:
# Extract text from page
lines = page.extract_text().split("\n")
temp_row = None # Temporary row to handle multi-line descriptions
# Extract Purchase Order Number and Date (Assume it's on the first page)
if purchase_order_no is None: # Only extract once
po_no_match = re.search(r"Purchase Order No[:\s]+(\S+)", "\n".join(lines))
po_date_match = re.search(r"Purchase Order Date[:\s]+(\S+)", "\n".join(lines))
if po_no_match:
purchase_order_no = po_no_match.group(1)
if po_date_match:
purchase_order_date = po_date_match.group(1)
# Process each line to extract data
for line in lines:
# Regex pattern for rows (excluding multi-line descriptions)
pattern = r"^\s*(\d+)\s+(\d+)\s+([A-Z0-9_(),\- ]+?)\s+(\d+)\s+(\w+)\s+([\d.]+)\s+([\d\-A-Za-z]+)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)\s*$"
match = re.match(pattern, line)
if match:
# If there's a match, capture the full row
if temp_row: # Append the previous temp_row if it exists
data.append(temp_row)
temp_row = None
temp_row = {
"S. No": match[1],
"Material No": match[2],
"Material Description": match[3].strip(),
"Qty": int(match[4]),
"Unit": match[5],
"Price": float(match[6]),
"Delivery Date": match[7],
"Total Value": float(match[8]),
"Vat%": float(match[9]),
"Amount Incl. VAT": float(match[10]),
}
elif temp_row:
# If no match, treat it as a continuation of Material Description
temp_row["Material Description"] += f" {line.strip()}"
# Append the last row
if temp_row:
data.append(temp_row)
# Create DataFrame
df = pd.DataFrame(data)
# Insert Purchase Order No and Purchase Order Date at the beginning
if purchase_order_no and purchase_order_date:
df.insert(0, "Purchase Order No", purchase_order_no)
df.insert(1, "Purchase Order Date", purchase_order_date)
# Filter unwanted text from Material Description
def clean_description(description):
# Define unwanted patterns
unwanted_patterns = [
r"This document is electronically approved", # Matches exact phrase
r"does not require any signature or stamp", # Matches approval notes
r"Total Amount Excl\. VAT.*", # Matches totals
r"TWO THOUSAND.*ONLY", # Matches written totals
r"&", # Removes stray symbols like `&`
r"\.+$", # Removes trailing periods
]
for pattern in unwanted_patterns:
description = re.sub(pattern, "", description, flags=re.IGNORECASE).strip()
return description
df["Material Description"] = df["Material Description"].apply(clean_description)
# Strip extra spaces
df["Material Description"] = df["Material Description"].str.strip()
return df
def process_pdf(file):
"""
Processes the uploaded PDF and saves the extracted data.
"""
try:
# Process the extracted text into a DataFrame
df = extract_po_data(file.name)
# Save the DataFrame to an Excel file
output_path = "federal_electric_extracted_data.xlsx"
df.to_excel(output_path, index=False, engine="openpyxl")
return output_path, "Data extraction successful!"
except Exception as e:
return None, f"Error during processing: {str(e)}"
# Gradio app interface
iface = gr.Interface(
fn=process_pdf,
inputs=[gr.File(label="Upload PDF")],
outputs=[
gr.File(label="Download Extracted Data"),
gr.Textbox(label="Status")
],
title="Enhanced PO Data Extractor",
description="Extract data from Purchase Orders, including multi-line descriptions, and clean unwanted text or symbols."
)
if __name__ == "__main__":
iface.launch()