|
import pdfplumber |
|
import re |
|
import pandas as pd |
|
import gradio as gr |
|
|
|
# Column order of the extracted line-item table (one entry per regex group below).
_PO_COLUMNS = [
    "S. No",
    "Material No",
    "Material Description",
    "Qty",
    "Unit",
    "Price",
    "Delivery Date",
    "Total Value",
    "Vat%",
    "Amount Incl. VAT",
]

# A complete line item on a single text line: serial no, material no,
# description, quantity, unit, price, delivery date, total value, VAT %,
# amount incl. VAT. Compiled once instead of once per scanned line.
_PO_LINE_ITEM_RE = re.compile(
    r"^\s*(\d+)\s+(\d+)\s+([A-Z0-9_(),\- ]+?)\s+(\d+)\s+(\w+)\s+([\d.]+)"
    r"\s+([\d\-A-Za-z]+)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)\s*$"
)
_PO_NUMBER_RE = re.compile(r"Purchase Order No[:\s]+(\S+)")
_PO_DATE_RE = re.compile(r"Purchase Order Date[:\s]+(\S+)")

# Text that leaks into a description because every non-item line following an
# item is treated as a continuation of that item's description. Applied in
# this order; the trailing-dots rule must stay last.
_DESCRIPTION_NOISE_RES = tuple(
    re.compile(noise, re.IGNORECASE)
    for noise in (
        r"This document is electronically approved",
        r"does not require any signature or stamp",
        r"Total Amount Excl\. VAT.*",
        r"TWO THOUSAND.*ONLY",
        r"&",
        r"\.+$",
    )
)


def _clean_description(description):
    """Strip known footer/boilerplate fragments and stray symbols from a description."""
    for noise_re in _DESCRIPTION_NOISE_RES:
        description = noise_re.sub("", description).strip()
    return description


def _row_from_match(match):
    """Convert a successful ``_PO_LINE_ITEM_RE`` match into a row dictionary."""
    return {
        "S. No": match[1],
        "Material No": match[2],
        "Material Description": match[3].strip(),
        "Qty": int(match[4]),
        "Unit": match[5],
        "Price": float(match[6]),
        "Delivery Date": match[7],
        "Total Value": float(match[8]),
        "Vat%": float(match[9]),
        "Amount Incl. VAT": float(match[10]),
    }


def extract_po_data(pdf_file):
    """
    Extract purchase-order line items from a PDF into a DataFrame.

    Each page's text is scanned line by line. A line matching the line-item
    layout starts a new row; every following non-matching line on the same
    page is appended to that row's "Material Description" (multi-line
    descriptions). Descriptions are then cleaned of known boilerplate.

    Args:
        pdf_file: Anything ``pdfplumber.open`` accepts (e.g. a file path).

    Returns:
        A DataFrame with the columns listed in ``_PO_COLUMNS``, one row per
        line item. "Purchase Order No" and "Purchase Order Date" are inserted
        as the first two columns only when both were found in the text. If no
        line item matched, the frame is empty but still has its columns.
    """
    rows = []
    purchase_order_no = None
    purchase_order_date = None

    with pdfplumber.open(pdf_file) as pdf:
        for page in pdf.pages:
            # A page without extractable text can yield None (or ""); treat
            # it as an empty page instead of crashing on `.split`.
            text = page.extract_text() or ""

            # Header fields are looked up until a PO number has been found.
            if purchase_order_no is None:
                po_no_match = _PO_NUMBER_RE.search(text)
                po_date_match = _PO_DATE_RE.search(text)
                if po_no_match:
                    purchase_order_no = po_no_match.group(1)
                if po_date_match:
                    purchase_order_date = po_date_match.group(1)

            # The open row never carries over to the next page.
            current_row = None
            for line in text.split("\n"):
                match = _PO_LINE_ITEM_RE.match(line)
                if match:
                    if current_row is not None:
                        rows.append(current_row)
                    current_row = _row_from_match(match)
                elif current_row is not None:
                    # Continuation of a wrapped description.
                    current_row["Material Description"] += f" {line.strip()}"

            # Flush the last item of the page.
            if current_row is not None:
                rows.append(current_row)

    for row in rows:
        row["Material Description"] = _clean_description(row["Material Description"])

    # Passing `columns` keeps the schema stable even when nothing matched.
    df = pd.DataFrame(rows, columns=_PO_COLUMNS)

    if purchase_order_no and purchase_order_date:
        df.insert(0, "Purchase Order No", purchase_order_no)
        df.insert(1, "Purchase Order Date", purchase_order_date)

    return df
|
|
|
def process_pdf(file):
    """
    Run the extraction on an uploaded PDF and save the result as an Excel file.

    Args:
        file: The upload handed over by the UI. Either a filesystem path or an
            object exposing the path as ``.name``; ``None`` when nothing was
            uploaded.

    Returns:
        A ``(output_path, status_message)`` tuple. ``output_path`` is ``None``
        whenever no spreadsheet was produced.
    """
    if file is None:
        return None, "Error during processing: no PDF file was uploaded"

    # Depending on how the File component is configured, the upload arrives
    # as a plain path string or as a temp-file wrapper with a `.name` path.
    pdf_path = getattr(file, "name", file)

    output_path = "federal_electric_extracted_data.xlsx"
    try:
        df = extract_po_data(pdf_path)
        if df.empty:
            # Do not hand back a blank workbook labelled as a success.
            return None, "Error during processing: no line items were found in the PDF"
        df.to_excel(output_path, index=False, engine="openpyxl")
    except Exception as e:
        # UI boundary: report any failure as a status message, never raise.
        return None, f"Error during processing: {str(e)}"

    return output_path, "Data extraction successful!"
|
|
|
|
|
# Web UI: a single PDF upload goes in; the generated spreadsheet and a
# status line come out (matching the two values returned by process_pdf).
iface = gr.Interface(
    title="Enhanced PO Data Extractor",
    description="Extract data from Purchase Orders, including multi-line descriptions, and clean unwanted text or symbols.",
    fn=process_pdf,
    inputs=[gr.File(label="Upload PDF")],
    outputs=[
        gr.File(label="Download Extracted Data"),
        gr.Textbox(label="Status"),
    ],
)


# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    iface.launch()
|
|